In [27]:
from math import radians, cos, sin, asin, sqrt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, f1_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.feature_selection import SelectFromModel
from scipy.stats import randint 
from sklearn.tree import export_graphviz 
from IPython.display import Image 
import graphviz 
import seaborn as sns 
import matplotlib.pyplot as plt 
import warnings 
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline  
from imblearn.over_sampling import SMOTE

In [2]:
def preprocess_fraud_data(file_path):
    """
    Preprocess the fraud dataset and split into training and validation sets.

    Parameters:
        file_path (str): Path to the input CSV file.

    Returns:
        tuple: Processed training and validation sets (X_train, X_val, y_train, y_val).
    """
    # Load the dataset
    df = pd.read_csv(file_path)

    # Convert 'trans_date_trans_time' to datetime
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

    # Calculate age using 'dob' and transaction year
    df['transaction_year'] = df['trans_date_trans_time'].dt.year
    df['year_of_birth'] = pd.to_datetime(df['dob']).dt.year
    df['age'] = df['transaction_year'] - df['year_of_birth']
    df.drop(columns=['dob', 'transaction_year', 'year_of_birth'], inplace=True)

    # Drop irrelevant columns
    irrelevant_columns = ['Unnamed: 0', 'cc_num', 'trans_num', 'street']
    df_cleaned = df.drop(columns=irrelevant_columns)

    # Haversine function to calculate distance
    def haversine(lat1, lon1, lat2, lon2):
        lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        c = 2 * asin(sqrt(a))
        r = 6371  # Radius of Earth in kilometers.
        return c * r

    # Calculate distance and add to the dataset
    df_cleaned['distance'] = df_cleaned.apply(
        lambda row: haversine(row['lat'], row['long'], row['merch_lat'], row['merch_long']), axis=1)

    # Create bins for latitude and longitude
    n_bins = 10
    df_cleaned['lat_bucket'] = pd.cut(df_cleaned['lat'], bins=n_bins, labels=False)
    df_cleaned['long_bucket'] = pd.cut(df_cleaned['long'], bins=n_bins, labels=False)
    df_cleaned['merch_lat_bucket'] = pd.cut(df_cleaned['merch_lat'], bins=n_bins, labels=False)
    df_cleaned['merch_long_bucket'] = pd.cut(df_cleaned['merch_long'], bins=n_bins, labels=False)

    # Encode categorical columns
    categorical_columns = ['merchant', 'category', 'gender', 'job']
    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        df_cleaned[col] = le.fit_transform(df_cleaned[col])
        label_encoders[col] = le

    # Drop columns that are no longer needed
    columns_to_drop = ['trans_date_trans_time', 'first', 'last', 'city', 'state', 'zip', 'lat', 'long', 'merch_lat',
                       'merch_long']
    df_cleaned = df_cleaned.drop(columns=columns_to_drop)

    # Separate features and target variable
    X = df_cleaned.drop(columns=['is_fraud'])
    y = df_cleaned['is_fraud']

    # Normalize numerical columns
    numerical_columns = ['amt', 'age', 'distance', 'lat_bucket', 'long_bucket', 'merch_lat_bucket', 'merch_long_bucket']
    scaler = StandardScaler()
    X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

    # Split into training and validation sets (fixed parameters)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_val, y_train, y_val, scaler


In [3]:
# declare data 
X_train, X_val, y_train, y_val, scaler = preprocess_fraud_data('fraudTrain.csv') 

# run Random Forest 
rf = RandomForestClassifier() 
rf.fit(X_train, y_train) 

# calculate metrics 
y_pred = rf.predict(X_val) 
accuracy = accuracy_score(y_val, y_pred) 
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

print("Accuracy:", accuracy) 
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.9975437175853625
Precision: 0.8883025505716798
Recall: 0.6644736842105263
F1 Score: 0.7602559277380504


In [14]:
# Run Random Forest using Test Set 

X_test, X_val, y_test, y_val, scaler = preprocess_fraud_data('fraudTest.csv') 
y_test_pred = rf.predict(X_test)

# Get accuracy, f1 score, precision, and recall  
acc_test = accuracy_score(y_test, y_test_pred) 
prec_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred) 
f1_test = f1_score(y_test, y_test_pred) 

print('accuracy: ', acc_test) 
print("Precision:", prec_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)

accuracy:  0.9975639655851094
Precision: 0.8087378640776699
Recall: 0.48458406050029085
F1 Score: 0.6060385594761731


In [19]:
# Start the Feature Importance Journey 
feature_scores = pd.Series(rf.feature_importances_, index = X_train.columns).sort_values(ascending = False) 
print(feature_scores) # most important features are amount and category 

# visualize feature scores 
'''
sns.barplot(x = feature_scores, y = feature_scores.index) 

plt.xlabel('Feature Importance Score') 
plt.ylabel('Features') 
plt.show()
'''

amt                  0.433424
category             0.156438
unix_time            0.077960
distance             0.060924
city_pop             0.055601
age                  0.054368
merchant             0.051538
job                  0.043614
merch_long_bucket    0.013821
merch_lat_bucket     0.013256
long_bucket          0.013102
lat_bucket           0.013092
gender               0.012862
dtype: float64


"\nsns.barplot(x = feature_scores, y = feature_scores.index) \n\nplt.xlabel('Feature Importance Score') \nplt.ylabel('Features') \nplt.show()\n"

In [15]:
# Continue with feature seletion 

# select all features with feature importance score above 0.02
selector = SelectFromModel(rf, threshold = 0.02, prefit = True)

# Transform training and validation sets to include only selected features
X_train_sel = selector.transform(X_train)
X_val_sel = selector.transform(X_val)

selected_features = X_train.columns[selector.get_support()]
print("Selected features based on importance:", list(selected_features))

warnings.filterwarnings(
    "ignore",
    message="X has feature names, but SelectFromModel was fitted without feature names"
)

Selected features based on importance: ['merchant', 'category', 'amt', 'city_pop', 'job', 'unix_time', 'age', 'distance']


In [5]:
# more feature selection 

# define hyper parameter distribution 
param_dist = { 
    'n_estimators': randint(50, 150), 
    'max_depth': [None] + list(range(5, 15)), 
    'min_samples_split': randint(2, 6), # min samples for split 
    'min_samples_leaf': randint(1, 6), # min samples for leaf 
    'bootstrap': [True, False]
}

rf_tuned = RandomForestClassifier(random_state = 42) # new instance 

# set up RandomizedSearchCV using defined hyper parameters 
random_search = RandomizedSearchCV(
    estimator = rf_tuned, 
    param_distributions = param_dist, 
    n_iter = 10, # hyper parameter combos 
    cv = 2, # three fold cross validation 
    scoring = 'f1', 
    random_state = 42, 
    n_jobs = -1
)

random_search.fit(X_train_sel, y_train) 

print("Best cross-validation F1 score:", random_search.best_score_)


Best parameters found: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 3, 'n_estimators': 71}
Best cross-validation F1 score: 0.6985929718691194


In [16]:
# test on Testing Set 

X_test_sel = selector.transform(X_test) 
selected_features = X_test.columns[selector.get_support()]

random_search.fit(X_test_sel, y_test) 
print("Best cross-validation F1 score:", random_search.best_score_)

Best cross-validation F1 score: 0.58132604288487


In [17]:
def preprocess_fraud_test(file_path, scaler):
    """
    Preprocess the fraud dataset for testing.
    
    Parameters:
        file_path (str): Path to the input CSV file.
        scaler (StandardScaler): A scaler fitted on the training data.
    
    Returns:
        tuple: Processed features and target (X, y).
    """
    # Load the dataset
    df = pd.read_csv(file_path)

    # Convert 'trans_date_trans_time' to datetime
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

    # Calculate age using 'dob' and transaction year
    df['transaction_year'] = df['trans_date_trans_time'].dt.year
    df['year_of_birth'] = pd.to_datetime(df['dob']).dt.year
    df['age'] = df['transaction_year'] - df['year_of_birth']
    df.drop(columns=['dob', 'transaction_year', 'year_of_birth'], inplace=True)

    # Drop irrelevant columns
    irrelevant_columns = ['Unnamed: 0', 'cc_num', 'trans_num', 'street']
    df_cleaned = df.drop(columns=irrelevant_columns)

    # Haversine function to calculate distance
    def haversine(lat1, lon1, lat2, lon2):
        lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        c = 2 * asin(sqrt(a))
        r = 6371  # Radius of Earth in kilometers.
        return c * r

    # Calculate distance and add to the dataset
    df_cleaned['distance'] = df_cleaned.apply(
        lambda row: haversine(row['lat'], row['long'], row['merch_lat'], row['merch_long']), axis=1)

    # Create bins for latitude and longitude
    n_bins = 10
    df_cleaned['lat_bucket'] = pd.cut(df_cleaned['lat'], bins=n_bins, labels=False)
    df_cleaned['long_bucket'] = pd.cut(df_cleaned['long'], bins=n_bins, labels=False)
    df_cleaned['merch_lat_bucket'] = pd.cut(df_cleaned['merch_lat'], bins=n_bins, labels=False)
    df_cleaned['merch_long_bucket'] = pd.cut(df_cleaned['merch_long'], bins=n_bins, labels=False)

    # Encode categorical columns
    categorical_columns = ['merchant', 'category', 'gender', 'job']
    for col in categorical_columns:
        le = LabelEncoder()
        df_cleaned[col] = le.fit_transform(df_cleaned[col])

    # Drop columns that are no longer needed
    columns_to_drop = ['trans_date_trans_time', 'first', 'last', 'city', 'state', 'zip', 'lat', 'long', 'merch_lat', 'merch_long']
    df_cleaned = df_cleaned.drop(columns=columns_to_drop)

    # Separate features and target variable
    X = df_cleaned.drop(columns=['is_fraud'])
    y = df_cleaned['is_fraud']

    # Normalize numerical columns using the scaler from training
    numerical_columns = ['amt', 'age', 'distance', 'lat_bucket', 'long_bucket', 'merch_lat_bucket', 'merch_long_bucket']
    X[numerical_columns] = scaler.transform(X[numerical_columns])

    return X, y


In [20]:
# Apply feature selection on the training set
X_train_sel = selector.transform(X_train)

# Train a new model on the selected features
rf_sel = RandomForestClassifier(random_state=42)
rf_sel.fit(X_train_sel, y_train)

# Preprocess the test data using the scaler fitted on the training data
X_test, y_test = preprocess_fraud_test('fraudTest.csv', scaler)

# If you have a feature selector (like SelectFromModel), apply it to the test features as well:
X_test_sel = selector.transform(X_test)

# Predict and evaluate on the test set
y_test_pred = rf_sel.predict(X_test_sel)

# Calculate metrics
acc_test = accuracy_score(y_test, y_test_pred)
prec_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)

print('Test Accuracy: ', acc_test)
print("Test Precision:", prec_test)
print("Test Recall:", recall_test)
print("Test F1 Score:", f1_test)


Test Accuracy:  0.9976534903431411
Test Precision: 0.7704180064308682
Test Recall: 0.5585081585081585
Test F1 Score: 0.6475675675675676


In [22]:
# define hyper parameter distribution 
param_dist = { 
    'n_estimators': randint(50, 150), 
    'max_depth': [None] + list(range(5, 15)), 
    'min_samples_split': randint(2, 6), # min samples for split 
    'min_samples_leaf': randint(1, 6), # min samples for leaf 
    'bootstrap': [True, False]
}

rf_tuned = RandomForestClassifier(random_state = 42) # new instance 

# set up RandomizedSearchCV using defined hyper parameters 
random_search = RandomizedSearchCV(
    estimator = rf_tuned, 
    param_distributions = param_dist, 
    n_iter = 10, # hyper parameter combos 
    cv = 2, # three fold cross validation 
    scoring = 'f1', 
    random_state = 42, 
    n_jobs = -1
)

# Apply feature selection on the training set
X_train_sel = selector.transform(X_train)

# Fit RandomizedSearchCV on the training set with selected features
random_search.fit(X_train_sel, y_train)

print("Best parameters found:", random_search.best_params_)
print("Best cross-validation F1 score:", random_search.best_score_)

# Use the best estimator from the search for predictions
best_rf = random_search.best_estimator_

# Preprocess the test data using the scaler fitted on the training data
X_test, y_test = preprocess_fraud_test('fraudTest.csv', scaler)

# Apply the same feature selector to the test features
X_test_sel = selector.transform(X_test)

# Predict and evaluate on the test set
y_test_pred = best_rf.predict(X_test_sel)

# Calculate metrics
acc_test = accuracy_score(y_test, y_test_pred)
prec_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)

print('Test Accuracy:', acc_test)
print("Test Precision:", prec_test)
print("Test Recall:", recall_test)
print("Test F1 Score:", f1_test)

Best parameters found: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 3, 'n_estimators': 71}
Best cross-validation F1 score: 0.6985929718691194
Test Accuracy: 0.9976444929901623
Test Precision: 0.7622333751568381
Test Recall: 0.5664335664335665
Test F1 Score: 0.6499063920834448


In [25]:
# Commence Over Sampling 

sm = SMOTE(random_state = 42) 
X_train_res, y_train_res = sm.fit_resample(X_train, y_train) 

print("Before oversampling:", y_train.value_counts())
print("After oversampling:", pd.Series(y_train_res).value_counts())

Before oversampling: is_fraud
0    1031354
1       5986
Name: count, dtype: int64
After oversampling: is_fraud
0    1031354
1    1031354
Name: count, dtype: int64


In [26]:
# Apply oversampling on the training data
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Then, apply feature selector on the oversampled training set
X_train_sel = selector.transform(X_train_res)

# Train model on the oversampled, selected features
rf_sel = RandomForestClassifier(random_state=42)
rf_sel.fit(X_train_sel, y_train_res)


In [29]:
# Apply Randomized Search CV 

# Create a new selector instance without pre-fitting
selector_pipeline = SelectFromModel(RandomForestClassifier(random_state=42), threshold=0.02)

# Create piplearn to apply SMOTE, feature selector, and the classifier
pipeline = Pipeline([
    ('smote', SMOTE(random_state = 42)),
    ('selector', selector_pipeline),  # your pre-fitted selector or a new instance if you want to refit it
    ('classifier', RandomForestClassifier(random_state = 42))
])

# Hyperparameters 

param_dist = { 
    'classifier__n_estimators': randint(50, 150), 
    'classifier__max_depth': [None] + list(range(5, 15)), 
    'classifier__min_samples_split': randint(2, 6),
    'classifier__min_samples_leaf': randint(1, 6),
    'classifier__bootstrap': [True, False]
}

# Randomized Search CV using the created pipline 
random_search = RandomizedSearchCV(
    estimator = pipeline,
    param_distributions = param_dist,
    n_iter = 10,
    cv = 2,
    scoring = 'f1',
    random_state = 42,
    n_jobs = -1
)

# Fit RandomizedSearchCV on original training set (X_train, y_train)
random_search.fit(X_train, y_train)

print("Best parameters found:", random_search.best_params_)
print("Best cross-validation F1 score:", random_search.best_score_)

Best parameters found: {'classifier__bootstrap': True, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 71}
Best cross-validation F1 score: 0.5397242492692361


In [30]:
# Preprocess test data
X_test, y_test = preprocess_fraud_test('fraudTest.csv', scaler)

# Apply the selector to the test set (if it's a separate step)
X_test_sel = selector.transform(X_test)

# Predict using the best estimator from RandomizedSearchCV
y_test_pred = random_search.best_estimator_.predict(X_test)

# Evaluate metrics
acc_test = accuracy_score(y_test, y_test_pred)
prec_test = precision_score(y_test, y_test_pred)
recall_test = recall_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred)

print('Test Accuracy:', acc_test)
print("Test Precision:", prec_test)
print("Test Recall:", recall_test)
print("Test F1 Score:", f1_test)


Test Accuracy: 0.33799636147045536
Test Precision: 0.005221819086966286
Test Recall: 0.8997668997668997
Test F1 Score: 0.010383377987238667
