In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report, 
                             precision_recall_curve, confusion_matrix, 
                             roc_auc_score)
from xgboost import XGBClassifier

def calc_haversine_distance(lat1, lon1, lat2, lon2, radius_earth=6371):
    """
    Compute the Haversine (great-circle) distance between two points.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.
    radius_earth : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6371, i.e. the result is in kilometers.

    Returns
    -------
    float or array-like
        Distance along the sphere, element-wise for array-like inputs.
    """
    # Convert from degrees to radians
    rad_lat1, rad_lon1, rad_lat2, rad_lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = rad_lat2 - rad_lat1
    dlon = rad_lon2 - rad_lon1
    a = np.sin(dlat / 2)**2 + np.cos(rad_lat1) * np.cos(rad_lat2) * np.sin(dlon / 2)**2

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    return c * radius_earth


In [56]:
# Read the raw training transactions from disk
train = pd.read_csv('train.csv')

# Work on a time-ordered copy; `train` itself is left in file order
df = train.sort_values(by='unix_time')

# Per-card grouping, reused by the lag/diff features computed later
df_by_cc = df.groupby('cc_num')


In [57]:
# Calculate distance features
# Great-circle distance between the (lat, long) and (merch_lat, merch_long) columns.
df['distance'] = calc_haversine_distance(df['lat'], df['long'], df['merch_lat'], df['merch_long'])

# Category-based features
# NOTE(review): the column names are the reverse of what the shifts compute.
# With rows sorted by unix_time, shift(-1) reads the card's NEXT transaction and
# shift(1) its PREVIOUS one. Likewise 'both_category_different' is True when
# EITHER neighbour shares the category. The names are kept because later cells
# and the trained model reference them.
df['prev_category_same'] = df_by_cc['category'].shift(-1) == df['category']
df['next_category_same'] = df_by_cc['category'].shift(1) == df['category']
df['both_category_different'] = df['prev_category_same'] | df['next_category_same']

# Time-based features
# Offset of each transaction from the mean timestamp of its (card, category) group.
df['avg_time'] = df.groupby(['cc_num', 'category'])['unix_time'].transform('mean')
df['cat_time_diff'] = df['unix_time'] - df['avg_time']

# Seconds since the same card's previous transaction (NaN on a card's first row).
gap_since_prev = df.groupby('cc_num')['unix_time'].diff()

# Per-card mean gap. The previous `.diff().mean()` collapsed to one global scalar
# that was broadcast to every row.
df['avg_time_cc'] = gap_since_prev.groupby(df['cc_num']).transform('mean')

# Per-card gap with the first transaction of each card set to 0. Assigning the
# result back (instead of a chained `inplace=True` fillna) keeps this working
# under pandas copy-on-write.
df['time_between_last_transaction'] = gap_since_prev.fillna(0)
df['diff_time'] = df['avg_time_cc'] - df['time_between_last_transaction']

# Amount-based features
df['log_amt'] = np.log1p(df['amt'])
df['counts'] = df.groupby('cc_num')['cc_num'].transform('count')
df['counts_per_cc_category'] = df.groupby(['cc_num', 'category'])['category'].transform('count')

# Rolling means
# Per-card rolling mean of the amount; rows without a full window fall back to
# the raw amount.
df['ma_10'] = df.groupby('cc_num')['amt'].transform(lambda s: s.rolling(window=10).mean()).fillna(df['amt'])
df['ma_3'] = df.groupby('cc_num')['amt'].transform(lambda s: s.rolling(window=3).mean()).fillna(df['amt'])

# Despite its name, 'max_time_ma_3' is the per-card maximum of ma_3 (an amount).
df['max_time_ma_3'] = df.groupby('cc_num')['ma_3'].transform('max')
df['avg_money_per_category'] = df.groupby('category')['amt'].transform('mean')

# Amount change versus the card's previous row (diff) and next row (diff(-1));
# the latter looks ahead in time.
df['prev_amt_diff'] = df_by_cc['amt'].diff().fillna(0)
df['next_amt_diff'] = df_by_cc['amt'].diff(-1).fillna(0)

# High amount flags
# Flag amounts above the 90th percentile of this frame, then take the per-card
# share of flagged transactions.
high_amt_thresh = df['amt'].quantile(0.9)
df['high_amt_flag'] = (df['amt'] > high_amt_thresh).astype(int)
df['large_txn_ratio'] = df.groupby('cc_num')['high_amt_flag'].transform('mean')

# Time of day in fractional hours.
# NOTE(review): assumes 'trans_time' holds values pd.to_timedelta can parse
# (e.g. 'HH:MM:SS') - confirm against the CSV.
df['Hour'] = pd.to_timedelta(df['trans_time']).dt.total_seconds() / 3600

# Max unix time for ma_3
# Timestamp at which each card's ma_3 reaches its maximum, mapped back onto
# every row of that card.
max_unix_times = df.loc[df.groupby('cc_num')['ma_3'].idxmax(), ['cc_num', 'unix_time']]
df['max_unix_time'] = df['cc_num'].map(max_unix_times.set_index('cc_num')['unix_time'])

# v5: absolute time distance to that peak; v6: amount damped by that distance
# (+1 avoids division by zero on the peak row itself).
df['v5'] = (df['unix_time'] - df['max_unix_time']).abs()
df['v6'] = df['amt'] / (df['v5'] + 1)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['time_between_last_transaction'].fillna(0, inplace=True)


In [58]:
target_column = 'is_fraud'

# Everything that must not reach the model: the label itself, identifiers and
# personal details, raw coordinates, and intermediate feature columns.
drop_columns = [
    target_column,
    'id', 'zip', 'avg_time_cc', 'state',
    'long', 'lat', 'merch_lat', 'merch_long',
    'first', 'last', 'street', 'city', 'dob',
    'merchant', 'job', 'trans_num', 'gender',
    'time_between_last_transaction', 'cc_num', 'city_pop',
    'counts_per_cc_category', 'diff_time', 'avg_time',
    'trans_date', 'ma_10', 'amt', 'next_category_same',
    'distance', 'trans_time', 'max_unix_time',
]

# Label vector and feature matrix
y = df[target_column]
X = df.drop(columns=drop_columns)

# Replace each remaining object-dtype column with integer label codes.
# The single encoder is re-fitted per column, so afterwards it only holds the
# mapping of the last column processed.
encoder = LabelEncoder()
non_numeric_cols = X.select_dtypes(include=['object']).columns
for col in non_numeric_cols:
    col_as_text = X[col].astype(str)
    X[col] = encoder.fit_transform(col_as_text)


In [59]:
# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest with class weights balancing the two classes
rf_params = dict(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

# Score the held-out rows at the default decision threshold
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.9911385063595042
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     65702
           1       0.98      0.94      0.96      8439

    accuracy                           0.99     74141
   macro avg       0.98      0.97      0.98     74141
weighted avg       0.99      0.99      0.99     74141



In [60]:
# Pair each feature name with the fitted forest's importance score and list
# them from most to least important.
feature_importances = rf_model.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
)
ranked_importances = importance_df.sort_values(by='Importance', ascending=False)
print(ranked_importances)


                    Feature  Importance
5                   log_amt    0.193916
15                       v5    0.170388
16                       v6    0.154542
12            high_amt_flag    0.083539
4             cat_time_diff    0.060992
14                     Hour    0.053917
3   both_category_different    0.049851
7                      ma_3    0.044628
11            next_amt_diff    0.042899
10            prev_amt_diff    0.035173
0                 unix_time    0.031847
2        prev_category_same    0.016946
13          large_txn_ratio    0.014489
9    avg_money_per_category    0.013958
8             max_time_ma_3    0.012925
6                    counts    0.011177
1                  category    0.008813


In [61]:
# Ratio of negative to positive training rows, passed to XGBoost as the
# positive-class weight.
train_class_counts = np.bincount(y_train)
class_0_count, class_1_count = train_class_counts
scale_pos_weight = class_0_count / class_1_count

xgb_params = dict(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    min_child_weight=5,
    gamma=0.2,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    random_state=43,
)
xgb_model = XGBClassifier(**xgb_params)

xgb_model.fit(X_train, y_train)

# Evaluate on the same held-out split as the random forest
y_pred_xgb = xgb_model.predict(X_test)
xgb_report = classification_report(y_test, y_pred_xgb)
print("XGBoost Classification Report:")
print(xgb_report)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))


XGBoost Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     65702
           1       0.88      0.98      0.93      8439

    accuracy                           0.98     74141
   macro avg       0.94      0.98      0.96     74141
weighted avg       0.98      0.98      0.98     74141

XGBoost Accuracy: 0.9824523542978919


In [62]:
# Probability of the positive class for every held-out row
y_probs_rf = rf_model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_probs_rf)

# F1 at every point of the curve. Where precision + recall == 0 the plain
# formula gives 0/0 = NaN, and np.argmax would then return the position of that
# NaN; such points are scored 0 instead.
pr_sum = precision + recall
f1_scores = np.divide(
    2 * (precision * recall),
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# precision/recall carry one trailing point that has no matching entry in
# `thresholds`; exclude it so the chosen index is always valid.
optimal_idx = np.argmax(f1_scores[:-1])
optimal_threshold = thresholds[optimal_idx]

print("Optimal Threshold:", optimal_threshold)

# Re-score the held-out rows with the F1-maximising threshold
y_pred_adj = (y_probs_rf >= optimal_threshold).astype(int)
print("Classification Report with Adjusted Threshold:")
print(classification_report(y_test, y_pred_adj))
print("Accuracy with Adjusted Threshold:", accuracy_score(y_test, y_pred_adj))


Optimal Threshold: 0.42
Classification Report with Adjusted Threshold:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     65702
           1       0.97      0.96      0.96      8439

    accuracy                           0.99     74141
   macro avg       0.98      0.98      0.98     74141
weighted avg       0.99      0.99      0.99     74141

Accuracy with Adjusted Threshold: 0.9918533604887984


In [63]:
final_train = pd.read_csv('train.csv')
final_test = pd.read_csv('test.csv')

# ignore_index=True gives every row a unique label. Without it the two files
# share labels 0..k, so the `.loc[idxmax]` lookup below returns rows belonging
# to unrelated cards and can attach the wrong peak timestamp to a card.
combined = pd.concat([final_train, final_test], axis=0, ignore_index=True)
combined = combined.sort_values(by='unix_time')
combined_by_cc = combined.groupby('cc_num')

# NOTE(review): every per-card/per-category aggregate below is computed over
# train + test rows together, whereas the model was fitted on features computed
# from train.csv alone - confirm this difference is intended.

# Repeat feature engineering for combined data
combined['distance'] = calc_haversine_distance(combined['lat'], combined['long'], combined['merch_lat'], combined['merch_long'])

# NOTE(review): names are the reverse of what the shifts compute. With rows
# sorted by unix_time, shift(-1) reads the card's NEXT transaction and shift(1)
# its PREVIOUS one; 'both_category_different' is True when EITHER neighbour
# shares the category. Names are kept to match the trained model's features.
combined['prev_category_same'] = combined_by_cc['category'].shift(-1) == combined['category']
combined['next_category_same'] = combined_by_cc['category'].shift(1) == combined['category']
combined['both_category_different'] = combined['prev_category_same'] | combined['next_category_same']

# Offset of each transaction from the mean timestamp of its (card, category) group.
combined['avg_time'] = combined.groupby(['cc_num', 'category'])['unix_time'].transform('mean')
combined['cat_time_diff'] = combined['unix_time'] - combined['avg_time']

# Seconds since the same card's previous transaction (NaN on a card's first row).
combined_gap = combined.groupby('cc_num')['unix_time'].diff()

# Per-card mean gap. The previous `.diff().mean()` collapsed to one global scalar.
combined['avg_time_cc'] = combined_gap.groupby(combined['cc_num']).transform('mean')

# Assign the filled series back instead of a chained `inplace=True` fillna,
# which stops working under pandas copy-on-write.
combined['time_between_last_transaction'] = combined_gap.fillna(0)
combined['diff_time'] = combined['avg_time_cc'] - combined['time_between_last_transaction']

combined['log_amt'] = np.log1p(combined['amt'])
combined['counts'] = combined.groupby('cc_num')['cc_num'].transform('count')
combined['counts_per_cc_category'] = combined.groupby(['cc_num', 'category'])['category'].transform('count')

# Per-card rolling mean of the amount; rows without a full window fall back to
# the raw amount.
combined['ma_10'] = combined.groupby('cc_num')['amt'].transform(lambda s: s.rolling(window=10).mean()).fillna(combined['amt'])
combined['ma_3'] = combined.groupby('cc_num')['amt'].transform(lambda s: s.rolling(window=3).mean()).fillna(combined['amt'])

# Despite its name, 'max_time_ma_3' is the per-card maximum of ma_3 (an amount).
combined['max_time_ma_3'] = combined.groupby('cc_num')['ma_3'].transform('max')
combined['avg_money_per_category'] = combined.groupby('category')['amt'].transform('mean')

# Amount change versus the card's previous row (diff) and next row (diff(-1)).
combined['prev_amt_diff'] = combined_by_cc['amt'].diff().fillna(0)
combined['next_amt_diff'] = combined_by_cc['amt'].diff(-1).fillna(0)

# 90th-percentile amount of the combined frame (this rebinds `high_amt_thresh`).
high_amt_thresh = combined['amt'].quantile(0.9)
combined['high_amt_flag'] = (combined['amt'] > high_amt_thresh).astype(int)
combined['large_txn_ratio'] = combined.groupby('cc_num')['high_amt_flag'].transform('mean')

# Time of day in fractional hours.
combined['Hour'] = pd.to_timedelta(combined['trans_time']).dt.total_seconds() / 3600

# Timestamp at which each card's ma_3 peaks. With unique row labels this lookup
# yields exactly one row per card; drop_duplicates stays as a cheap guard so the
# mapping index is guaranteed unique.
max_unix_times_combined = combined.loc[combined.groupby('cc_num')['ma_3'].idxmax(), ['cc_num', 'unix_time']]
max_unix_times_combined = max_unix_times_combined.drop_duplicates(subset='cc_num').set_index('cc_num')
combined['max_unix_time'] = combined['cc_num'].map(max_unix_times_combined['unix_time'])

# v5: absolute time distance to that peak; v6: amount damped by that distance
# (+1 avoids division by zero on the peak row itself).
combined['v5'] = abs(combined['unix_time'] - combined['max_unix_time'])
combined['v6'] = combined['amt'] / (combined['v5'] + 1)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined['time_between_last_transaction'].fillna(0, inplace=True)


In [64]:
# Keep only the rows whose id appears in test.csv.
# NOTE(review): assumes ids are not shared between train.csv and test.csv -
# otherwise training rows would be selected too; confirm.
final = combined[combined['id'].isin(final_test['id'])]

# Same exclusions as for the training feature matrix.
final_drop_cols = [
    'is_fraud', 'id', 'zip', 'avg_time_cc', 'state', 'long', 'lat', 'merch_lat', 'merch_long',
    'first', 'last', 'street', 'city', 'dob', 'merchant', 'job', 'trans_num', 'gender', 
    'time_between_last_transaction', 'cc_num', 'city_pop', 'counts_per_cc_category', 'diff_time',
    'avg_time', 'trans_date', 'ma_10', 'amt', 'next_category_same', 'distance', 'trans_time', 
    'max_unix_time'
]

X_final = final.drop(columns=final_drop_cols)

# Label encode if needed
# Fit each encoder on the TRAINING column (`df`) and only transform the test
# values, so every label gets the integer code the model saw during training.
# Re-fitting on the test rows would silently shift the codes whenever the set
# of labels differs. A label absent from training raises ValueError here
# rather than being mis-encoded.
final_non_numeric = X_final.select_dtypes(include=['object']).columns
for c in final_non_numeric:
    train_fitted_encoder = LabelEncoder().fit(df[c].astype(str))
    X_final[c] = train_fitted_encoder.transform(X_final[c].astype(str))

# Score with the random forest, using the F1-tuned decision threshold
final_probs = rf_model.predict_proba(X_final)[:, 1]
final_preds = (final_probs >= optimal_threshold).astype(int)

# Write the predictions and report the file that was actually written
submission_path = 'submission2.csv'
submission = pd.DataFrame({'id': final['id'], 'is_fraud': final_preds})
submission.to_csv(submission_path, index=False)
print("Submission file created:", submission_path)


Submission file created: submission.csv
