# Libraries + Loading Files

In [143]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from catboost import CatBoostClassifier

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

# Features

In [144]:

# Date of Birth
train['dob'] = pd.to_datetime(train['dob'], errors='coerce')
test['dob'] = pd.to_datetime(test['dob'], errors='coerce')

# Age 
train['age'] = 2024 - train['dob'].dt.year
test['age'] = 2024 - test['dob'].dt.year

# Name of Individuals
train['name'] = train['first'] + ' ' + train['last']
test['name'] = test['first'] + ' ' + test['last']

# Time
train['hours'] = pd.to_datetime(train['unix_time'], unit='s').dt.hour
test['hours'] = pd.to_datetime(test['unix_time'], unit='s').dt.hour


# Distance from merch
train['distance'] = np.sqrt((train['lat'] - train['merch_lat'])**2 + (train['long'] - train['merch_long'])**2)
test['distance'] = np.sqrt((test['lat'] - test['merch_lat'])**2 + (test['long'] - test['merch_long'])**2)


# Checking if high amount
high_value_threshold = train['amt'].quantile(0.9)
train['is_high_value_transaction'] = (train['amt'] > high_value_threshold).astype(int)
test['is_high_value_transaction'] = (test['amt'] > high_value_threshold).astype(int)

def extract_street_name(street):
    # Remove any numeric values and keep only street name
    street_name = ' '.join([word for word in street.split() if not word.isdigit()])
    return street_name

# Street name
train['street_name'] = train['street'].apply(extract_street_name)
test['street_name'] = test['street'].apply(extract_street_name)

# Drop fields that are not needed (will tweak if needed)
dropped = ['trans_num', 'trans_date', 'trans_time', 'first', 'last', 
                'street', 'long', 'lat', 'city_pop', 'merch_lat', 'merch_long']

train.drop(columns=dropped, inplace=True)
test.drop(columns=dropped, inplace=True)

cat_col = ['merchant', 'name', 'category', 'gender', 'city', 'state', 'job', 'street_name'] 

# Split + Model

In [145]:

# Split data into features and target
X = train.drop(columns=['is_fraud'])
y = train['is_fraud']
X_test = test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# Train model now 
model = CatBoostClassifier(iterations=1250, depth=6, learning_rate=0.1, loss_function='Logloss', verbose=100)

idxs = [X.columns.get_loc(col) for col in cat_col]

model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True, cat_features=idxs)

0:	learn: 0.5040894	test: 0.5037348	best: 0.5037348 (0)	total: 154ms	remaining: 3m 12s
100:	learn: 0.0252160	test: 0.0246609	best: 0.0246609 (100)	total: 10.6s	remaining: 2m
200:	learn: 0.0169532	test: 0.0168011	best: 0.0168011 (200)	total: 20.6s	remaining: 1m 47s
300:	learn: 0.0143943	test: 0.0146012	best: 0.0146012 (300)	total: 31.1s	remaining: 1m 38s
400:	learn: 0.0124212	test: 0.0127506	best: 0.0127499 (397)	total: 41.8s	remaining: 1m 28s
500:	learn: 0.0116487	test: 0.0121946	best: 0.0121946 (500)	total: 52.9s	remaining: 1m 19s
600:	learn: 0.0110805	test: 0.0118833	best: 0.0118816 (594)	total: 1m 3s	remaining: 1m 8s
700:	learn: 0.0102975	test: 0.0112753	best: 0.0112753 (700)	total: 1m 14s	remaining: 58.7s
800:	learn: 0.0098922	test: 0.0110416	best: 0.0110383 (770)	total: 1m 25s	remaining: 47.7s
900:	learn: 0.0093256	test: 0.0107059	best: 0.0107059 (900)	total: 1m 35s	remaining: 37.1s
1000:	learn: 0.0088839	test: 0.0104934	best: 0.0104888 (998)	total: 1m 47s	remaining: 26.6s
1100:	l

<catboost.core.CatBoostClassifier at 0x16a62b890>

# Score + Report 

In [146]:
y_predict = model.predict(X_val)

f1 = f1_score(y_val, y_predict)
print(f'F1 Score: {f1}')

print("Classification Report:")
print(classification_report(y_val, y_predict))

F1 Score: 0.9854611807640238
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     65592
           1       0.99      0.98      0.99      8549

    accuracy                           1.00     74141
   macro avg       0.99      0.99      0.99     74141
weighted avg       1.00      1.00      1.00     74141



# Submission Generation

In [147]:
y_test_pred = model.predict(X_test)

submission = pd.DataFrame({
    'id': sample_submission['id'],
    'is_fraud': y_test_pred
})

# Save the submission file
submission.to_csv('submission.csv', index=False)
print("Submission file saved as submission.csv")


Submission file saved as submission.csv
