# First dataset (Base.csv)

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, precision_recall_fscore_support
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
!pip install xgboost
import xgboost as xgb
from xgboost import XGBClassifier

import random
from sklearn.impute import SimpleImputer
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.under_sampling import InstanceHardnessThreshold
from imblearn.under_sampling import OneSidedSelection
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import learning_curve
from scipy import stats
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression


#Set Matplotlib options
%matplotlib inline
plt.style.use('ggplot')
sns.set_style("whitegrid")

#Increase max Pandas columns
pd.set_option('display.max_columns', 200)

#Define color palette
my_palette = sns.color_palette("Paired", 2)

# ignore FutureWarning
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)






### Read dataset

In [2]:
df = pd.read_csv('Fraud Detection/neurips-2022/Base.csv')

### Preprocessing

In [3]:
numerical_features = []

for col in df.columns:
    if df[col].dtypes =='float64' or df[col].dtypes =='int64':
        numerical_features.append(col)

numerical_features

['fraud_bool',
 'income',
 'name_email_similarity',
 'prev_address_months_count',
 'current_address_months_count',
 'customer_age',
 'days_since_request',
 'intended_balcon_amount',
 'zip_count_4w',
 'velocity_6h',
 'velocity_24h',
 'velocity_4w',
 'bank_branch_count_8w',
 'date_of_birth_distinct_emails_4w',
 'credit_risk_score',
 'email_is_free',
 'phone_home_valid',
 'phone_mobile_valid',
 'bank_months_count',
 'has_other_cards',
 'proposed_credit_limit',
 'foreign_request',
 'session_length_in_minutes',
 'keep_alive_session',
 'device_distinct_emails_8w',
 'device_fraud_count',
 'month']

In [4]:
# Columns with missing values
cols = ['prev_address_months_count',
'current_address_months_count',
'bank_months_count',
'session_length_in_minutes',
'device_distinct_emails_8w']

# Replace -1 with NULL
for col in cols:
    df[col] = df[col].replace(-1, np.nan)

# Replace any other missing representations with NULL
for col in cols:
    df[col] = df[col].replace(['NA', '()', 'none'], np.nan, regex=True)

In [5]:
# Replace negative values with NULL
df['intended_balcon_amount'] = df['intended_balcon_amount'].replace([-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1], np.nan)

In [6]:
# Columns with missing values
cols = ['prev_address_months_count',
'current_address_months_count',
'bank_months_count',
'session_length_in_minutes',
'device_distinct_emails_8w'
]

# Median imputer
imputer = SimpleImputer(strategy="median")

# Fit imputer on each column
for col in cols:
    imputer = imputer.fit(df[col].values.reshape(-1,1))

# Transform data (impute nulls)
for col in cols:
    df[col] = imputer.transform(df[col].values.reshape(-1,1))

In [7]:
df_dumm = pd.get_dummies(df) # one hot encoding

In [8]:
# Take log of columns
df_dumm['proposed_credit_limit'] = np.log(df_dumm['proposed_credit_limit'])
df_dumm['days_since_request'] = np.log(df_dumm['days_since_request'])
df_dumm['zip_count_4w'] = np.log(df_dumm['zip_count_4w'])

In [9]:
dataset_2 = df_dumm.drop(columns='fraud_bool')

In [10]:
# Identify features (X) and target (y)
X = df_dumm.drop('fraud_bool', axis=1)
y = df_dumm['fraud_bool']

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Check class distribution in training set
print(y_train.value_counts()/len(y_train))

fraud_bool
0    0.989032
1    0.010968
Name: count, dtype: float64


In [11]:
#Instantiate oversampler and logistic regression model
randomover = RandomOverSampler(random_state=42)
log_reg = LogisticRegression()

# Instantiate samplers
enn = SMOTEENN(random_state=42)

# Apply ENN cleaning
X_train_enn, y_train_enn = enn.fit_resample(X_train, y_train)

# Check the new class distribution
print(y_train_enn.value_counts()/len(y_train_enn))

# Train a model using the oversampled training data
log_reg.fit(X_train_enn, y_train_enn)

# Make predictions on test set
y_pred = log_reg.predict(X_test)

# Generate classification report
report = classification_report(y_test, y_pred)
report

fraud_bool
1    0.533585
0    0.466415
Name: count, dtype: float64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


'              precision    recall  f1-score   support\n\n           0       0.99      0.61      0.76    247197\n           1       0.02      0.72      0.04      2803\n\n    accuracy                           0.61    250000\n   macro avg       0.51      0.67      0.40    250000\nweighted avg       0.98      0.61      0.75    250000\n'

In [12]:
accuracy_SMOTEENN = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_SMOTEENN}")

precision_SMOTEENN = precision_score(y_test, y_pred)
print(f"Precision: {precision_SMOTEENN}")

recall_SMOTEENN = recall_score(y_test, y_pred)
print(f"Recall: {recall_SMOTEENN}")

f1_SMOTEENN = f1_score(y_test, y_pred)
print(f"F1 Score: {f1_SMOTEENN}")

Accuracy: 0.612264
Precision: 0.02060564083237418
Recall: 0.7217267213699607
F1 Score: 0.04006734006734007


In [17]:
# #Feature selection using SMOTE Sampling
# model = ExtraTreesClassifier()
# model.fit(X_train_enn, y_train_enn)
# # Make predictions on the test set
# y_pred = model.predict(X_test)


In [18]:
# #select the top 25 features
# importances = model.feature_importances_
# n_features = 25
# feature_indices = np.argsort(importances)[::-1][:n_features]
# selected_features = [X.columns[i] for i in feature_indices]

In [19]:
# Get the top 25 important features
selected_features =['housing_status_BA',
 'device_os_windows',
 'keep_alive_session',
 'phone_home_valid',
 'payment_type_AC',
 'has_other_cards',
 'payment_type_AB',
 'employment_status_CA',
 'device_os_other',
 'device_os_linux',
 'housing_status_BC',
 'housing_status_BB',
 'payment_type_AA',
 'payment_type_AD',
 'proposed_credit_limit',
 'employment_status_CC',
 'phone_mobile_valid',
 'employment_status_CB',
 'income',
 'customer_age',
 'device_os_macintosh',
 'housing_status_BE',
 'current_address_months_count',
 'bank_months_count',
 'credit_risk_score']

In [21]:
# Create a PCA model and fit it to the selected features
pca = PCA()
pca.fit(X[selected_features])

# Transform the selected features using the fitted PCA model
X_reduced = pca.transform(X[selected_features])

# Get the proportion of variance explained
var_exp = pca.explained_variance_ratio_

# Print the number of dimensions after reduction
print(X_reduced.shape[1])

25


In [22]:
pca = PCA(n_components=15)
X_reduced = pca.fit_transform(X)

In [23]:
model = ExtraTreesClassifier()
model.fit(X_reduced, y)
# Make predictions on the test set
y_pred = model.predict(X_reduced)

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)

# Calculate precision, recall, f1 score
prec, rec, fscore, sup = precision_recall_fscore_support(y, y_pred)

print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(prec))
print("Recall: {}".format(rec))
print("F1 Score: {}".format(fscore))

Accuracy: 1.0
Precision: [1. 1.]
Recall: [1. 1.]
F1 Score: [1. 1.]


In [24]:
model = xgb.XGBClassifier(
    tree_method='approx', gpu_id=0,
    scale_pos_weight=1
)
model.fit(X_reduced, y)

In [25]:
# Make predictions on the test set
y_pred = model.predict(X_reduced)

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)

# Calculate precision, recall, f1 score
prec, rec, fscore, sup = precision_recall_fscore_support(y, y_pred)

print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(prec))
print("Recall: {}".format(rec))
print("F1 Score: {}".format(fscore))

Accuracy: 0.989188
Precision: [0.98920229 0.93227092]
Recall: [0.99998281 0.02121679]
F1 Score: [0.99456334 0.04148936]


# Second dataset (Variant 1)

### Read dataset

In [27]:
df_1 = pd.read_csv('Fraud Detection/neurips-2022/Variant I.csv')

### Preprocessing

In [28]:
numerical_features = []

for col in df_1.columns:
    if df_1[col].dtypes =='float64' or df_1[col].dtypes =='int64':
        numerical_features.append(col)

numerical_features

['fraud_bool',
 'income',
 'name_email_similarity',
 'prev_address_months_count',
 'current_address_months_count',
 'customer_age',
 'days_since_request',
 'intended_balcon_amount',
 'zip_count_4w',
 'velocity_6h',
 'velocity_24h',
 'velocity_4w',
 'bank_branch_count_8w',
 'date_of_birth_distinct_emails_4w',
 'credit_risk_score',
 'email_is_free',
 'phone_home_valid',
 'phone_mobile_valid',
 'bank_months_count',
 'has_other_cards',
 'proposed_credit_limit',
 'foreign_request',
 'session_length_in_minutes',
 'keep_alive_session',
 'device_distinct_emails_8w',
 'device_fraud_count',
 'month']

In [29]:
# Columns with missing values
cols = ['prev_address_months_count',
'current_address_months_count',
'bank_months_count',
'session_length_in_minutes',
'device_distinct_emails_8w']

# Replace -1 with NULL
for col in cols:
    df_1[col] = df_1[col].replace(-1, np.nan)

# Replace any other missing representations with NULL
for col in cols:
    df_1[col] = df_1[col].replace(['NA', '()', 'none'], np.nan, regex=True)

In [30]:
# Replace negative values with NULL
df_1['intended_balcon_amount'] = df_1['intended_balcon_amount'].replace([-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1], np.nan)

In [31]:
# Columns with missing values
cols = ['prev_address_months_count',
'current_address_months_count',
'bank_months_count',
'session_length_in_minutes',
'device_distinct_emails_8w'
]

# Median imputer
imputer = SimpleImputer(strategy="median")

# Fit imputer on each column
for col in cols:
    imputer = imputer.fit(df_1[col].values.reshape(-1,1))

# Transform data (impute nulls)
for col in cols:
    df_1[col] = imputer.transform(df_1[col].values.reshape(-1,1))

In [32]:
df_dumm = pd.get_dummies(df_1) # one hot encoding

In [33]:
# Take log of columns
df_dumm['proposed_credit_limit'] = np.log(df_dumm['proposed_credit_limit'])
df_dumm['days_since_request'] = np.log(df_dumm['days_since_request'])
df_dumm['zip_count_4w'] = np.log(df_dumm['zip_count_4w'])

In [34]:
dataset_2 = df_dumm.drop(columns='fraud_bool')

In [35]:
# Identify features (X) and target (y)
X = df_dumm.drop('fraud_bool', axis=1)
y = df_dumm['fraud_bool']

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Check class distribution in training set
print(y_train.value_counts()/len(y_train))

fraud_bool
0    0.989036
1    0.010964
Name: count, dtype: float64


In [36]:
#Instantiate oversampler and logistic regression model
randomover = RandomOverSampler(random_state=42)
log_reg = LogisticRegression()

# Instantiate samplers
enn = SMOTEENN(random_state=42)

# Apply ENN cleaning
X_train_enn, y_train_enn = enn.fit_resample(X_train, y_train)

# Check the new class distribution
print(y_train_enn.value_counts()/len(y_train_enn))

# Train a model using the oversampled training data
log_reg.fit(X_train_enn, y_train_enn)

# Make predictions on test set
y_pred = log_reg.predict(X_test)

# Generate classification report
report = classification_report(y_test, y_pred)
report

fraud_bool
1    0.534584
0    0.465416
Name: count, dtype: float64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


'              precision    recall  f1-score   support\n\n           0       0.99      0.53      0.70    247194\n           1       0.02      0.73      0.03      2806\n\n    accuracy                           0.54    250000\n   macro avg       0.51      0.63      0.36    250000\nweighted avg       0.98      0.54      0.69    250000\n'

In [37]:
accuracy_SMOTEENN = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_SMOTEENN}")

precision_SMOTEENN = precision_score(y_test, y_pred)
print(f"Precision: {precision_SMOTEENN}")

recall_SMOTEENN = recall_score(y_test, y_pred)
print(f"Recall: {recall_SMOTEENN}")

f1_SMOTEENN = f1_score(y_test, y_pred)
print(f"F1 Score: {f1_SMOTEENN}")

Accuracy: 0.536392
Precision: 0.017450890038059153
Recall: 0.7287954383464006
F1 Score: 0.034085605707047134


In [38]:
#select the top 25 features
importances = model.feature_importances_
n_features = 25
feature_indices = np.argsort(importances)[::-1][:n_features]
selected_features = [X.columns[i] for i in feature_indices]

In [39]:
# Get the top 25 important features
selected_features = [X.columns[i] for i in feature_indices[:25]]

# Create a PCA model and fit it to the selected features
pca = PCA()
pca.fit(X[selected_features])

# Transform the selected features using the fitted PCA model
X_reduced = pca.transform(X[selected_features])

# Get the proportion of variance explained
var_exp = pca.explained_variance_ratio_

# Print the number of dimensions after reduction
print(X_reduced.shape[1])

15


In [40]:
pca = PCA(n_components=15)
X_reduced = pca.fit_transform(X)

In [41]:
model = ExtraTreesClassifier()
model.fit(X_reduced, y)
# Make predictions on the test set
y_pred = model.predict(X_reduced)

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)

# Calculate precision, recall, f1 score
prec, rec, fscore, sup = precision_recall_fscore_support(y, y_pred)

print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(prec))
print("Recall: {}".format(rec))
print("F1 Score: {}".format(fscore))

Accuracy: 1.0
Precision: [1. 1.]
Recall: [1. 1.]
F1 Score: [1. 1.]


In [42]:
model = xgb.XGBClassifier(
    tree_method='approx', gpu_id=0,
    scale_pos_weight=1
)
model.fit(X_reduced, y)

In [43]:
# Make predictions on the test set
y_pred = model.predict(X_reduced)

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)

# Calculate precision, recall, f1 score
prec, rec, fscore, sup = precision_recall_fscore_support(y, y_pred)

print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(prec))
print("Recall: {}".format(rec))
print("F1 Score: {}".format(fscore))

Accuracy: 0.989126
Precision: [0.98913214 0.95321637]
Recall: [0.99999191 0.01477922]
F1 Score: [0.99453238 0.02910714]


# Third Dataset(Variant 2)

### Read dataset

In [63]:
df_2 = pd.read_csv('Fraud Detection/neurips-2022/Variant II.csv')

### Preprocessing

In [64]:
numerical_features = []

for col in df_2.columns:
    if df_2[col].dtypes =='float64' or df_2[col].dtypes =='int64':
        numerical_features.append(col)

numerical_features

['fraud_bool',
 'income',
 'name_email_similarity',
 'prev_address_months_count',
 'current_address_months_count',
 'customer_age',
 'days_since_request',
 'intended_balcon_amount',
 'zip_count_4w',
 'velocity_6h',
 'velocity_24h',
 'velocity_4w',
 'bank_branch_count_8w',
 'date_of_birth_distinct_emails_4w',
 'credit_risk_score',
 'email_is_free',
 'phone_home_valid',
 'phone_mobile_valid',
 'bank_months_count',
 'has_other_cards',
 'proposed_credit_limit',
 'foreign_request',
 'session_length_in_minutes',
 'keep_alive_session',
 'device_distinct_emails_8w',
 'device_fraud_count',
 'month']

In [65]:
# Columns with missing values
cols = ['prev_address_months_count',
'current_address_months_count',
'bank_months_count',
'session_length_in_minutes',
'device_distinct_emails_8w']

# Replace -1 with NULL
for col in cols:
    df_2[col] = df_2[col].replace(-1, np.nan)

# Replace any other missing representations with NULL
for col in cols:
    df_2[col] = df_2[col].replace(['NA', '()', 'none'], np.nan, regex=True)

In [66]:
# Replace negative values with NULL
df_2['intended_balcon_amount'] = df_2['intended_balcon_amount'].replace([-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1], np.nan)

In [67]:
# Columns with missing values
cols = ['prev_address_months_count',
'current_address_months_count',
'bank_months_count',
'session_length_in_minutes',
'device_distinct_emails_8w'
]

# Median imputer
imputer = SimpleImputer(strategy="median")

# Fit imputer on each column
for col in cols:
    imputer = imputer.fit(df_2[col].values.reshape(-1,1))

# Transform data (impute nulls)
for col in cols:
    df_2[col] = imputer.transform(df_2[col].values.reshape(-1,1))

In [68]:
df_dumm = pd.get_dummies(df_2) # one hot encoding

In [69]:
# Take log of columns
df_dumm['proposed_credit_limit'] = np.log(df_dumm['proposed_credit_limit'])
df_dumm['days_since_request'] = np.log(df_dumm['days_since_request'])
df_dumm['zip_count_4w'] = np.log(df_dumm['zip_count_4w'])

In [70]:
dataset_2 = df_dumm.drop(columns='fraud_bool')

In [71]:
# Identify features (X) and target (y)
X = df_dumm.drop('fraud_bool', axis=1)
y = df_dumm['fraud_bool']

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Check class distribution in training set
print(y_train.value_counts()/len(y_train))

fraud_bool
0    0.988969
1    0.011031
Name: count, dtype: float64


In [72]:
#Instantiate oversampler and logistic regression model
randomover = RandomOverSampler(random_state=42)
log_reg = LogisticRegression()

# Instantiate samplers
enn = SMOTEENN(random_state=42)

# Apply ENN cleaning
X_train_enn, y_train_enn = enn.fit_resample(X_train, y_train)

# Check the new class distribution
print(y_train_enn.value_counts()/len(y_train_enn))

# Train a model using the oversampled training data
log_reg.fit(X_train_enn, y_train_enn)

# Make predictions on test set
y_pred = log_reg.predict(X_test)

# Generate classification report
report = classification_report(y_test, y_pred)
report

fraud_bool
1    0.533144
0    0.466856
Name: count, dtype: float64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


'              precision    recall  f1-score   support\n\n           0       1.00      0.59      0.74    247244\n           1       0.02      0.78      0.04      2756\n\n    accuracy                           0.60    250000\n   macro avg       0.51      0.69      0.39    250000\nweighted avg       0.99      0.60      0.74    250000\n'

In [73]:
accuracy_SMOTEENN = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_SMOTEENN}")

precision_SMOTEENN = precision_score(y_test, y_pred)
print(f"Precision: {precision_SMOTEENN}")

recall_SMOTEENN = recall_score(y_test, y_pred)
print(f"Recall: {recall_SMOTEENN}")

f1_SMOTEENN = f1_score(y_test, y_pred)
print(f"F1 Score: {f1_SMOTEENN}")

Accuracy: 0.596024
Precision: 0.020939804158701673
Recall: 0.7790275761973875
F1 Score: 0.04078337512347086


In [74]:
#select the top 25 features
importances = model.feature_importances_
n_features = 25
feature_indices = np.argsort(importances)[::-1][:n_features]
selected_features = [X.columns[i] for i in feature_indices]

In [75]:
# Get the top 25 important features
selected_features = [X.columns[i] for i in feature_indices[:25]]

# Create a PCA model and fit it to the selected features
pca = PCA()
pca.fit(X[selected_features])

# Transform the selected features using the fitted PCA model
X_reduced = pca.transform(X[selected_features])

# Get the proportion of variance explained
var_exp = pca.explained_variance_ratio_

# Print the number of dimensions after reduction
print(X_reduced.shape[1])

15


In [76]:
pca = PCA(n_components=15)
X_reduced = pca.fit_transform(X)

In [77]:
model = ExtraTreesClassifier()
model.fit(X_reduced, y)
# Make predictions on the test set
y_pred = model.predict(X_reduced)

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)

# Calculate precision, recall, f1 score
prec, rec, fscore, sup = precision_recall_fscore_support(y, y_pred)

print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(prec))
print("Recall: {}".format(rec))
print("F1 Score: {}".format(fscore))

Accuracy: 1.0
Precision: [1. 1.]
Recall: [1. 1.]
F1 Score: [1. 1.]


In [78]:
model = xgb.XGBClassifier(
    tree_method='approx', gpu_id=0,
    scale_pos_weight=1
)
model.fit(X_reduced, y)

In [79]:
# Make predictions on the test set
y_pred = model.predict(X_reduced)

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)

# Calculate precision, recall, f1 score
prec, rec, fscore, sup = precision_recall_fscore_support(y, y_pred)

print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(prec))
print("Recall: {}".format(rec))
print("F1 Score: {}".format(fscore))

Accuracy: 0.989214
Precision: [0.98924172 0.89836066]
Recall: [0.99996865 0.02484359]
F1 Score: [0.99457626 0.0483501 ]


# Fourth Dataset(Variant 3)

### Read dataset

In [81]:
df_3 = pd.read_csv('Fraud Detection/neurips-2022/Variant III.csv')

### Preprocessing

In [82]:
numerical_features = []

for col in df_3.columns:
    if df_3[col].dtypes =='float64' or df_3[col].dtypes =='int64':
        numerical_features.append(col)

numerical_features

['fraud_bool',
 'income',
 'name_email_similarity',
 'prev_address_months_count',
 'current_address_months_count',
 'customer_age',
 'days_since_request',
 'intended_balcon_amount',
 'zip_count_4w',
 'velocity_6h',
 'velocity_24h',
 'velocity_4w',
 'bank_branch_count_8w',
 'date_of_birth_distinct_emails_4w',
 'credit_risk_score',
 'email_is_free',
 'phone_home_valid',
 'phone_mobile_valid',
 'bank_months_count',
 'has_other_cards',
 'proposed_credit_limit',
 'foreign_request',
 'session_length_in_minutes',
 'keep_alive_session',
 'device_distinct_emails_8w',
 'device_fraud_count',
 'month',
 'x1',
 'x2']

In [83]:
# Columns with missing values
cols = ['prev_address_months_count',
'current_address_months_count',
'bank_months_count',
'session_length_in_minutes',
'device_distinct_emails_8w']

# Replace -1 with NULL
for col in cols:
    df_3[col] = df_3[col].replace(-1, np.nan)

# Replace any other missing representations with NULL
for col in cols:
    df_3[col] = df_3[col].replace(['NA', '()', 'none'], np.nan, regex=True)

In [84]:
# Replace negative values with NULL
df_3['intended_balcon_amount'] = df_3['intended_balcon_amount'].replace([-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1], np.nan)

In [85]:
# Columns with missing values
cols = ['prev_address_months_count',
'current_address_months_count',
'bank_months_count',
'session_length_in_minutes',
'device_distinct_emails_8w'
]

# Median imputer
imputer = SimpleImputer(strategy="median")

# Fit imputer on each column
for col in cols:
    imputer = imputer.fit(df_3[col].values.reshape(-1,1))

# Transform data (impute nulls)
for col in cols:
    df_3[col] = imputer.transform(df_3[col].values.reshape(-1,1))

In [86]:
df_dumm = pd.get_dummies(df_3) # one hot encoding

In [87]:
# Take log of columns
df_dumm['proposed_credit_limit'] = np.log(df_dumm['proposed_credit_limit'])
df_dumm['days_since_request'] = np.log(df_dumm['days_since_request'])
df_dumm['zip_count_4w'] = np.log(df_dumm['zip_count_4w'])

In [88]:
dataset_2 = df_dumm.drop(columns='fraud_bool')

In [89]:
# Identify features (X) and target (y)
X = df_dumm.drop('fraud_bool', axis=1)
y = df_dumm['fraud_bool']

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Check class distribution in training set
print(y_train.value_counts()/len(y_train))

fraud_bool
0    0.988983
1    0.011017
Name: count, dtype: float64


In [90]:
#Instantiate oversampler and logistic regression model
randomover = RandomOverSampler(random_state=42)
log_reg = LogisticRegression()

# Instantiate samplers
enn = SMOTEENN(random_state=42)

# Apply ENN cleaning
X_train_enn, y_train_enn = enn.fit_resample(X_train, y_train)

# Check the new class distribution
print(y_train_enn.value_counts()/len(y_train_enn))

# Train a model using the oversampled training data
log_reg.fit(X_train_enn, y_train_enn)

# Make predictions on test set
y_pred = log_reg.predict(X_test)

# Generate classification report
report = classification_report(y_test, y_pred)
report

fraud_bool
1    0.53462
0    0.46538
Name: count, dtype: float64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


'              precision    recall  f1-score   support\n\n           0       0.99      0.55      0.71    247233\n           1       0.02      0.72      0.03      2767\n\n    accuracy                           0.56    250000\n   macro avg       0.51      0.64      0.37    250000\nweighted avg       0.98      0.56      0.70    250000\n'

In [91]:
accuracy_SMOTEENN = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_SMOTEENN}")

precision_SMOTEENN = precision_score(y_test, y_pred)
print(f"Precision: {precision_SMOTEENN}")

recall_SMOTEENN = recall_score(y_test, y_pred)
print(f"Recall: {recall_SMOTEENN}")

f1_SMOTEENN = f1_score(y_test, y_pred)
print(f"F1 Score: {f1_SMOTEENN}")

Accuracy: 0.55664
Precision: 0.017690584360523755
Recall: 0.7162992410552945
F1 Score: 0.0345284136441239


In [92]:
#select the top 25 features
importances = model.feature_importances_
n_features = 25
feature_indices = np.argsort(importances)[::-1][:n_features]
selected_features = [X.columns[i] for i in feature_indices]

In [93]:
# Get the top 25 important features
selected_features = [X.columns[i] for i in feature_indices[:25]]

# Create a PCA model and fit it to the selected features
pca = PCA()
pca.fit(X[selected_features])

# Transform the selected features using the fitted PCA model
X_reduced = pca.transform(X[selected_features])

# Get the proportion of variance explained
var_exp = pca.explained_variance_ratio_

# Print the number of dimensions after reduction
print(X_reduced.shape[1])

15


In [94]:
pca = PCA(n_components=15)
X_reduced = pca.fit_transform(X)

In [95]:
model = ExtraTreesClassifier()
model.fit(X_reduced, y)
# Make predictions on the test set
y_pred = model.predict(X_reduced)

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)

# Calculate precision, recall, f1 score
prec, rec, fscore, sup = precision_recall_fscore_support(y, y_pred)

print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(prec))
print("Recall: {}".format(rec))
print("F1 Score: {}".format(fscore))

Accuracy: 1.0
Precision: [1. 1.]
Recall: [1. 1.]
F1 Score: [1. 1.]


In [96]:
model = xgb.XGBClassifier(
    tree_method='approx', gpu_id=0,
    scale_pos_weight=1
)
model.fit(X_reduced, y)

In [97]:
# Make predictions on the test set
y_pred = model.predict(X_reduced)

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)

# Calculate precision, recall, f1 score
prec, rec, fscore, sup = precision_recall_fscore_support(y, y_pred)

print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(prec))
print("Recall: {}".format(rec))
print("F1 Score: {}".format(fscore))

Accuracy: 0.9927
Precision: [0.99304681 0.91573785]
Recall: [0.99961778 0.3724388 ]
F1 Score: [0.99632146 0.52951792]


# The five  Dataset (Variant 4)

### Read dataset (Variant 4)

In [98]:
df_4 = pd.read_csv('Fraud Detection/neurips-2022/Variant IV.csv')

### Preprocessing

In [99]:
numerical_features = []

for col in df_4.columns:
    if df_4[col].dtypes =='float64' or df_4[col].dtypes =='int64':
        numerical_features.append(col)

numerical_features

['fraud_bool',
 'income',
 'name_email_similarity',
 'prev_address_months_count',
 'current_address_months_count',
 'customer_age',
 'days_since_request',
 'intended_balcon_amount',
 'zip_count_4w',
 'velocity_6h',
 'velocity_24h',
 'velocity_4w',
 'bank_branch_count_8w',
 'date_of_birth_distinct_emails_4w',
 'credit_risk_score',
 'email_is_free',
 'phone_home_valid',
 'phone_mobile_valid',
 'bank_months_count',
 'has_other_cards',
 'proposed_credit_limit',
 'foreign_request',
 'session_length_in_minutes',
 'keep_alive_session',
 'device_distinct_emails_8w',
 'device_fraud_count',
 'month']

In [100]:
# Columns with missing values
cols = ['prev_address_months_count',
'current_address_months_count',
'bank_months_count',
'session_length_in_minutes',
'device_distinct_emails_8w']

# Replace -1 with NULL
for col in cols:
    df_4[col] = df_4[col].replace(-1, np.nan)

# Replace any other missing representations with NULL
for col in cols:
    df_4[col] = df_4[col].replace(['NA', '()', 'none'], np.nan, regex=True)

In [101]:
# Replace negative values with NULL
df_4['intended_balcon_amount'] = df_4['intended_balcon_amount'].replace([-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1], np.nan)

In [102]:
# Columns with missing values
cols = ['prev_address_months_count',
'current_address_months_count',
'bank_months_count',
'session_length_in_minutes',
'device_distinct_emails_8w'
]

# Median imputer
imputer = SimpleImputer(strategy="median")

# Fit imputer on each column
for col in cols:
    imputer = imputer.fit(df_4[col].values.reshape(-1,1))

# Transform data (impute nulls)
for col in cols:
    df_4[col] = imputer.transform(df_4[col].values.reshape(-1,1))

In [103]:
df_dumm = pd.get_dummies(df_4) # one hot encoding

In [104]:
# Take log of columns
df_dumm['proposed_credit_limit'] = np.log(df_dumm['proposed_credit_limit'])
df_dumm['days_since_request'] = np.log(df_dumm['days_since_request'])
df_dumm['zip_count_4w'] = np.log(df_dumm['zip_count_4w'])

In [105]:
dataset_2 = df_dumm.drop(columns='fraud_bool')

In [106]:
# Identify features (X) and target (y)
X = df_dumm.drop('fraud_bool', axis=1)
y = df_dumm['fraud_bool']

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Check class distribution in training set
print(y_train.value_counts()/len(y_train))

fraud_bool
0    0.988929
1    0.011071
Name: count, dtype: float64


In [107]:
#Instantiate oversampler and logistic regression model
randomover = RandomOverSampler(random_state=42)
log_reg = LogisticRegression()

# Instantiate samplers
enn = SMOTEENN(random_state=42)

# Apply ENN cleaning
X_train_enn, y_train_enn = enn.fit_resample(X_train, y_train)

# Check the new class distribution
print(y_train_enn.value_counts()/len(y_train_enn))

# Train a model using the oversampled training data
log_reg.fit(X_train_enn, y_train_enn)

# Make predictions on test set
y_pred = log_reg.predict(X_test)

# Generate classification report
report = classification_report(y_test, y_pred)
report

fraud_bool
1    0.533201
0    0.466799
Name: count, dtype: float64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


'              precision    recall  f1-score   support\n\n           0       1.00      0.58      0.73    247273\n           1       0.02      0.75      0.04      2727\n\n    accuracy                           0.58    250000\n   macro avg       0.51      0.67      0.39    250000\nweighted avg       0.98      0.58      0.73    250000\n'

In [108]:
accuracy_SMOTEENN = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_SMOTEENN}")

precision_SMOTEENN = precision_score(y_test, y_pred)
print(f"Precision: {precision_SMOTEENN}")

recall_SMOTEENN = recall_score(y_test, y_pred)
print(f"Recall: {recall_SMOTEENN}")

f1_SMOTEENN = f1_score(y_test, y_pred)
print(f"F1 Score: {f1_SMOTEENN}")

Accuracy: 0.584328
Precision: 0.019515484183436054
Recall: 0.7535753575357536
F1 Score: 0.03804569185766653


In [109]:
#select the top 25 features
importances = model.feature_importances_
n_features = 25
feature_indices = np.argsort(importances)[::-1][:n_features]
selected_features = [X.columns[i] for i in feature_indices]

In [110]:
# Get the top 25 important features
selected_features = [X.columns[i] for i in feature_indices[:25]]

# Create a PCA model and fit it to the selected features
pca = PCA()
pca.fit(X[selected_features])

# Transform the selected features using the fitted PCA model
X_reduced = pca.transform(X[selected_features])

# Get the proportion of variance explained
var_exp = pca.explained_variance_ratio_

# Print the number of dimensions after reduction
print(X_reduced.shape[1])

15


In [111]:
pca = PCA(n_components=15)
X_reduced = pca.fit_transform(X)

In [112]:
model = ExtraTreesClassifier()
model.fit(X_reduced, y)
# Make predictions on the test set
y_pred = model.predict(X_reduced)

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)

# Calculate precision, recall, f1 score
prec, rec, fscore, sup = precision_recall_fscore_support(y, y_pred)

print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(prec))
print("Recall: {}".format(rec))
print("F1 Score: {}".format(fscore))

Accuracy: 1.0
Precision: [1. 1.]
Recall: [1. 1.]
F1 Score: [1. 1.]


In [113]:
model = xgb.XGBClassifier(
    tree_method='approx', gpu_id=0,
    scale_pos_weight=1
)
model.fit(X_reduced, y)

In [114]:
# Make predictions on the test set
y_pred = model.predict(X_reduced)

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)

# Calculate precision, recall, f1 score
prec, rec, fscore, sup = precision_recall_fscore_support(y, y_pred)

print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(prec))
print("Recall: {}".format(rec))
print("F1 Score: {}".format(fscore))

Accuracy: 0.989194
Precision: [0.98921017 0.92748092]
Recall: [0.99998079 0.02203083]
F1 Score: [0.99456632 0.04303932]


# The six dataset (Variant 5)

### Read dataset

In [115]:
df_5 = pd.read_csv('Fraud Detection/neurips-2022/Variant V.csv')

### Preprocessing

In [116]:
numerical_features = []

for col in df_5.columns:
    if df_5[col].dtypes =='float64' or df_5[col].dtypes =='int64':
        numerical_features.append(col)

numerical_features

['fraud_bool',
 'income',
 'name_email_similarity',
 'prev_address_months_count',
 'current_address_months_count',
 'customer_age',
 'days_since_request',
 'intended_balcon_amount',
 'zip_count_4w',
 'velocity_6h',
 'velocity_24h',
 'velocity_4w',
 'bank_branch_count_8w',
 'date_of_birth_distinct_emails_4w',
 'credit_risk_score',
 'email_is_free',
 'phone_home_valid',
 'phone_mobile_valid',
 'bank_months_count',
 'has_other_cards',
 'proposed_credit_limit',
 'foreign_request',
 'session_length_in_minutes',
 'keep_alive_session',
 'device_distinct_emails_8w',
 'device_fraud_count',
 'month',
 'x1',
 'x2']

In [117]:
# Columns with missing values
cols = ['prev_address_months_count',
'current_address_months_count',
'bank_months_count',
'session_length_in_minutes',
'device_distinct_emails_8w']

# Replace -1 with NULL
for col in cols:
    df_5[col] = df_5[col].replace(-1, np.nan)

# Replace any other missing representations with NULL
for col in cols:
    df_5[col] = df_5[col].replace(['NA', '()', 'none'], np.nan, regex=True)

In [118]:
# Replace negative values with NULL
df_5['intended_balcon_amount'] = df_5['intended_balcon_amount'].replace([-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1], np.nan)

In [119]:
# Columns with missing values
cols = ['prev_address_months_count',
'current_address_months_count',
'bank_months_count',
'session_length_in_minutes',
'device_distinct_emails_8w'
]

# Median imputer
imputer = SimpleImputer(strategy="median")

# Fit imputer on each column
for col in cols:
    imputer = imputer.fit(df_5[col].values.reshape(-1,1))

# Transform data (impute nulls)
for col in cols:
    df_5[col] = imputer.transform(df_5[col].values.reshape(-1,1))

In [120]:
df_dumm = pd.get_dummies(df_5) # one hot encoding

In [121]:
# Take log of columns
df_dumm['proposed_credit_limit'] = np.log(df_dumm['proposed_credit_limit'])
df_dumm['days_since_request'] = np.log(df_dumm['days_since_request'])
df_dumm['zip_count_4w'] = np.log(df_dumm['zip_count_4w'])

In [122]:
dataset_2 = df_dumm.drop(columns='fraud_bool')

In [123]:
# Identify features (X) and target (y)
X = df_dumm.drop('fraud_bool', axis=1)
y = df_dumm['fraud_bool']

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Check class distribution in training set
print(y_train.value_counts()/len(y_train))

fraud_bool
0    0.989013
1    0.010987
Name: count, dtype: float64


In [124]:
#Instantiate oversampler and logistic regression model
randomover = RandomOverSampler(random_state=42)
log_reg = LogisticRegression()

# Instantiate samplers
enn = SMOTEENN(random_state=42)

# Apply ENN cleaning
X_train_enn, y_train_enn = enn.fit_resample(X_train, y_train)

# Check the new class distribution
print(y_train_enn.value_counts()/len(y_train_enn))

# Train a model using the oversampled training data
log_reg.fit(X_train_enn, y_train_enn)

# Make predictions on test set
y_pred = log_reg.predict(X_test)

# Generate classification report
report = classification_report(y_test, y_pred)
report

fraud_bool
1    0.534477
0    0.465523
Name: count, dtype: float64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


'              precision    recall  f1-score   support\n\n           0       0.99      0.59      0.74    247210\n           1       0.02      0.68      0.04      2790\n\n    accuracy                           0.59    250000\n   macro avg       0.51      0.63      0.39    250000\nweighted avg       0.98      0.59      0.73    250000\n'

In [125]:
accuracy_SMOTEENN = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_SMOTEENN}")

precision_SMOTEENN = precision_score(y_test, y_pred)
print(f"Precision: {precision_SMOTEENN}")

recall_SMOTEENN = recall_score(y_test, y_pred)
print(f"Recall: {recall_SMOTEENN}")

f1_SMOTEENN = f1_score(y_test, y_pred)
print(f"F1 Score: {f1_SMOTEENN}")

Accuracy: 0.586504
Precision: 0.018275862068965518
Recall: 0.6838709677419355
F1 Score: 0.035600335852225024


In [126]:
#select the top 25 features
importances = model.feature_importances_
n_features = 25
feature_indices = np.argsort(importances)[::-1][:n_features]
selected_features = [X.columns[i] for i in feature_indices]

In [127]:
# Get the top 25 important features
selected_features = [X.columns[i] for i in feature_indices[:25]]

# Create a PCA model and fit it to the selected features
pca = PCA()
pca.fit(X[selected_features])

# Transform the selected features using the fitted PCA model
X_reduced = pca.transform(X[selected_features])

# Get the proportion of variance explained
var_exp = pca.explained_variance_ratio_

# Print the number of dimensions after reduction
print(X_reduced.shape[1])

15


In [128]:
pca = PCA(n_components=15)
X_reduced = pca.fit_transform(X)

In [129]:
model = ExtraTreesClassifier()
model.fit(X_reduced, y)
# Make predictions on the test set
y_pred = model.predict(X_reduced)

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)

# Calculate precision, recall, f1 score
prec, rec, fscore, sup = precision_recall_fscore_support(y, y_pred)

print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(prec))
print("Recall: {}".format(rec))
print("F1 Score: {}".format(fscore))

Accuracy: 1.0
Precision: [1. 1.]
Recall: [1. 1.]
F1 Score: [1. 1.]


In [130]:
model = xgb.XGBClassifier(
    tree_method='approx', gpu_id=0,
    scale_pos_weight=1
)
model.fit(X_reduced, y)

In [131]:
# Make predictions on the test set
y_pred = model.predict(X_reduced)

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)

# Calculate precision, recall, f1 score
prec, rec, fscore, sup = precision_recall_fscore_support(y, y_pred)

print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(prec))
print("Recall: {}".format(rec))
print("F1 Score: {}".format(fscore))

Accuracy: 0.991847
Precision: [0.99206035 0.9285076 ]
Recall: [0.99975732 0.28259293]
F1 Score: [0.99589396 0.43330785]
