# Libraries + Loading Files

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from catboost import CatBoostClassifier

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

# Features

In [12]:

# Date of Birth
train['dob'] = pd.to_datetime(train['dob'], errors='coerce')
test['dob'] = pd.to_datetime(test['dob'], errors='coerce')

# Age 
train['age'] = 2024 - train['dob'].dt.year
test['age'] = 2024 - test['dob'].dt.year

# Name of Individuals
train['name'] = train['first'] + ' ' + train['last']
test['name'] = test['first'] + ' ' + test['last']

# Time
train['hours'] = pd.to_datetime(train['unix_time'], unit='s').dt.hour
test['hours'] = pd.to_datetime(test['unix_time'], unit='s').dt.hour


# Distance from merch
train['distance'] = np.sqrt((train['lat'] - train['merch_lat'])**2 + (train['long'] - train['merch_long'])**2)
test['distance'] = np.sqrt((test['lat'] - test['merch_lat'])**2 + (test['long'] - test['merch_long'])**2)


# Checking if high amount
high_value_threshold = train['amt'].quantile(0.9)
train['is_high_value_transaction'] = (train['amt'] > high_value_threshold).astype(int)
test['is_high_value_transaction'] = (test['amt'] > high_value_threshold).astype(int)

# Checking if low amount - ('test' fraud amount)
low_value_threshold = train['amt'].quantile(0.1) 
train['is_low_value_transaction'] = (train['amt'] < low_value_threshold).astype(int)
test['is_low_value_transaction'] = (test['amt'] < low_value_threshold).astype(int)

def extract_street_name(street):
    # Remove any numeric values and keep only street name
    street_name = ' '.join([word for word in street.split() if not word.isdigit()])
    return street_name

# Street name
train['street_name'] = train['street'].apply(extract_street_name)
test['street_name'] = test['street'].apply(extract_street_name)

# Drop fields that are not needed (will tweak if needed)
dropped = ['trans_num', 'trans_date', 'trans_time', 'first', 'last', 
                'street', 'long', 'lat', 'city_pop', 'merch_lat', 'merch_long']

train.drop(columns=dropped, inplace=True)
test.drop(columns=dropped, inplace=True)

cat_col = ['merchant', 'name', 'category', 'gender', 'city', 'state', 'job', 'street_name'] 

# Split + Model

In [13]:

# Split data into features and target
X = train.drop(columns=['is_fraud'])
y = train['is_fraud']
X_test = test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# Train model now 
model = CatBoostClassifier(iterations=50000, depth=6, learning_rate=0.1, loss_function='Logloss', verbose=100)

idxs = [X.columns.get_loc(col) for col in cat_col]

model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True, cat_features=idxs)

0:	learn: 0.1869285	test: 0.1879978	best: 0.1879978 (0)	total: 109ms	remaining: 9m 4s
100:	learn: 0.0121386	test: 0.0135001	best: 0.0135001 (100)	total: 10.2s	remaining: 8m 17s
200:	learn: 0.0095318	test: 0.0120014	best: 0.0119911 (198)	total: 20.6s	remaining: 8m 12s
300:	learn: 0.0073993	test: 0.0109953	best: 0.0109642 (265)	total: 31.7s	remaining: 8m 14s
400:	learn: 0.0061612	test: 0.0105509	best: 0.0105262 (389)	total: 43.3s	remaining: 8m 17s
500:	learn: 0.0052351	test: 0.0104513	best: 0.0103381 (430)	total: 55.8s	remaining: 8m 21s
600:	learn: 0.0048292	test: 0.0104398	best: 0.0103381 (430)	total: 1m 8s	remaining: 8m 20s
700:	learn: 0.0043316	test: 0.0104426	best: 0.0103381 (430)	total: 1m 21s	remaining: 8m 17s
800:	learn: 0.0039270	test: 0.0103668	best: 0.0103297 (709)	total: 1m 32s	remaining: 8m 6s
900:	learn: 0.0037427	test: 0.0104371	best: 0.0103297 (709)	total: 1m 43s	remaining: 7m 48s
1000:	learn: 0.0036259	test: 0.0105119	best: 0.0103297 (709)	total: 1m 53s	remaining: 7m 33s


<catboost.core.CatBoostClassifier at 0x10f0d0390>

# Score + Report 

In [14]:
y_predict = model.predict(X_val)

f1 = f1_score(y_val, y_predict)
print(f'F1 Score: {f1}')

print("Classification Report:")
print(classification_report(y_val, y_predict))

F1 Score: 0.9852604380762229
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     65592
           1       0.99      0.98      0.99      8549

    accuracy                           1.00     74141
   macro avg       0.99      0.99      0.99     74141
weighted avg       1.00      1.00      1.00     74141



# Submission Generation

In [15]:
y_test_pred = model.predict(X_test)

submission = pd.DataFrame({
    'id': sample_submission['id'],
    'is_fraud': y_test_pred
})

# Save the submission file
submission.to_csv('submission.csv', index=False)
print("Submission file saved as submission.csv")


Submission file saved as submission.csv
