# Libraries + Loading Files

In [148]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from catboost import CatBoostClassifier

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

# Features

In [149]:

# Date of Birth
train['dob'] = pd.to_datetime(train['dob'], errors='coerce')
test['dob'] = pd.to_datetime(test['dob'], errors='coerce')

# Age 
train['age'] = 2024 - train['dob'].dt.year
test['age'] = 2024 - test['dob'].dt.year

# Name of Individuals
train['name'] = train['first'] + ' ' + train['last']
test['name'] = test['first'] + ' ' + test['last']

# Time
train['hours'] = pd.to_datetime(train['unix_time'], unit='s').dt.hour
test['hours'] = pd.to_datetime(test['unix_time'], unit='s').dt.hour


# Distance from merch
train['distance'] = np.sqrt((train['lat'] - train['merch_lat'])**2 + (train['long'] - train['merch_long'])**2)
test['distance'] = np.sqrt((test['lat'] - test['merch_lat'])**2 + (test['long'] - test['merch_long'])**2)


# Checking if high amount
high_value_threshold = train['amt'].quantile(0.9)
train['is_high_value_transaction'] = (train['amt'] > high_value_threshold).astype(int)
test['is_high_value_transaction'] = (test['amt'] > high_value_threshold).astype(int)

# Checking if low amount - ('test' fraud amount)
low_value_threshold = train['amt'].quantile(0.1) 
train['is_low_value_transaction'] = (train['amt'] < low_value_threshold).astype(int)
test['is_low_value_transaction'] = (test['amt'] < low_value_threshold).astype(int)

def extract_street_name(street):
    # Remove any numeric values and keep only street name
    street_name = ' '.join([word for word in street.split() if not word.isdigit()])
    return street_name

# Street name
train['street_name'] = train['street'].apply(extract_street_name)
test['street_name'] = test['street'].apply(extract_street_name)

# Drop fields that are not needed (will tweak if needed)
dropped = ['trans_num', 'trans_date', 'trans_time', 'first', 'last', 
                'street', 'long', 'lat', 'city_pop', 'merch_lat', 'merch_long']

train.drop(columns=dropped, inplace=True)
test.drop(columns=dropped, inplace=True)

cat_col = ['merchant', 'name', 'category', 'gender', 'city', 'state', 'job', 'street_name'] 

# Split + Model

In [150]:

# Split data into features and target
X = train.drop(columns=['is_fraud'])
y = train['is_fraud']
X_test = test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# Train model now 
model = CatBoostClassifier(iterations=1250, depth=6, learning_rate=0.1, loss_function='Logloss', verbose=100)

idxs = [X.columns.get_loc(col) for col in cat_col]

model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True, cat_features=idxs)

0:	learn: 0.5096478	test: 0.5098628	best: 0.5098628 (0)	total: 161ms	remaining: 3m 21s
100:	learn: 0.0254661	test: 0.0248901	best: 0.0248901 (100)	total: 12.2s	remaining: 2m 18s
200:	learn: 0.0164606	test: 0.0164707	best: 0.0164707 (200)	total: 24.8s	remaining: 2m 9s
300:	learn: 0.0145468	test: 0.0149492	best: 0.0149459 (297)	total: 36.7s	remaining: 1m 55s
400:	learn: 0.0131180	test: 0.0137444	best: 0.0137437 (399)	total: 48.1s	remaining: 1m 41s
500:	learn: 0.0121021	test: 0.0129167	best: 0.0129157 (499)	total: 1m	remaining: 1m 29s
600:	learn: 0.0115726	test: 0.0126704	best: 0.0126704 (599)	total: 1m 10s	remaining: 1m 16s
700:	learn: 0.0109612	test: 0.0122355	best: 0.0122355 (699)	total: 1m 21s	remaining: 1m 3s
800:	learn: 0.0104235	test: 0.0117415	best: 0.0117415 (800)	total: 1m 30s	remaining: 50.7s
900:	learn: 0.0101569	test: 0.0116741	best: 0.0116741 (900)	total: 1m 41s	remaining: 39.2s
1000:	learn: 0.0098775	test: 0.0115819	best: 0.0115819 (998)	total: 1m 52s	remaining: 27.9s
1100:

<catboost.core.CatBoostClassifier at 0x122271e10>

# Score + Report 

In [151]:
y_predict = model.predict(X_val)

f1 = f1_score(y_val, y_predict)
print(f'F1 Score: {f1}')

print("Classification Report:")
print(classification_report(y_val, y_predict))

F1 Score: 0.9852309502794939
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     65592
           1       0.99      0.98      0.99      8549

    accuracy                           1.00     74141
   macro avg       0.99      0.99      0.99     74141
weighted avg       1.00      1.00      1.00     74141



# Submission Generation

In [152]:
y_test_pred = model.predict(X_test)

submission = pd.DataFrame({
    'id': sample_submission['id'],
    'is_fraud': y_test_pred
})

# Save the submission file
submission.to_csv('submission.csv', index=False)
print("Submission file saved as submission.csv")


Submission file saved as submission.csv
