# DSCI445 Term Project Paper - Bank Account Fraud Detection
### Jakob Wickham, Nick Brady, Noah Sturgeon

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import imblearn as skim
import xgboost as xgb
import kagglehub
import seaborn as sns
import sklearn as sk

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTETomek

## Introduction



### Motivation

- {Importance of fraud detection in financial services (provide reference)}
- {Challenges associated with fraud detection: imbalanced datasets, costs associated with false positives / false negatives}
- {Objective of the project}: Our goal in this project is to work with various machine learning methods to get the highest recall (how many fraudulent transactions are successfully identified) possible, while locking our false positive rate (how many valid transactions are misidentified) at 5% or lower. The reason for using recall, as opposed to accuracy, is the nature of the dataset: there are far fewer fraudulent transactions than legitimate ones, so aiming for accuracy will inherently create a model that is useless at detecting fraud.

## Dataset overview 

- {Description of the dataset}:
    - Source of the data set 
    - {Key Features} 
    - {Target Variables}

&nbsp;&nbsp;&nbsp;&nbsp;The Bank Account Fraud NeurIPS 2022 datasets (called BAF for short) are a suite of synthetic datasets meant to evaluate machine learning methods. 
BAF was generated with a fraud label, a boolean value called fraud_bool, which is the response on which machine learning methods can be tested.



## Methodology

### Data Preprocessing 

- {handling missing values}
- {removing or imputing outliers}
- {flagging imputed values}

In [2]:
# Fetch the dataset through kagglehub; the call returns the local directory it was stored in.
# NOTE(review): this cell was originally marked "Only run if you want to locally have the
# dataset on your machine", but later cells read `data`, so it appears to be required
# on a fresh kernel — confirm.
path = kagglehub.dataset_download("sgpjesus/bank-account-fraud-dataset-neurips-2022")

# Read Base.csv from the downloaded directory into the raw DataFrame used by later cells.
data: pd.DataFrame = pd.read_csv(f"{path}/Base.csv")

- {Checking for missing values: there are none, but based on the variable descriptions some missing values could be hidden behind the placeholder value -1}

In [5]:
# Per-column count and percentage of null entries in the raw data.
# This only detects true nulls (NaN/None); values encoded as negative
# numbers are not counted here.
null_mask = data.isnull()  # computed once and reused for both statistics
missing_data_summary = pd.DataFrame({
    'Column': data.columns,
    'Missing_Count': null_mask.sum(),
    'Missing_Percentage': null_mask.mean() * 100
}).reset_index(drop=True)

# End the cell with the frame so the result is actually rendered; previously
# the summary was built but never shown.
missing_data_summary


- {Checking for negative values in the columns whose descriptions state that negative values denote missing data; because of their high percentage of missing values, we drop `prev_address_months_count` and `intended_balcon_amount`}

In [8]:
# Numeric columns in which a negative value is treated as "missing"
numeric_missing_value_columns = [
    'prev_address_months_count',
    'current_address_months_count',
    'intended_balcon_amount',
    'bank_months_count',
    'session_length_in_minutes',
    'device_distinct_emails_8w'
]

# Build the boolean "value is negative" mask once for all listed columns,
# then derive the count and the percentage per column from it.
negative_mask = data[numeric_missing_value_columns] < 0
missing_data_summary = pd.DataFrame({
    'Missing Data Count': negative_mask.sum(),
    'Percentage Missing': negative_mask.mean() * 100
})

print(missing_data_summary)


                              Missing Data Count  Percentage Missing
prev_address_months_count                 712920             71.2920
current_address_months_count                4254              0.4254
intended_balcon_amount                    742523             74.2523
bank_months_count                         253635             25.3635
session_length_in_minutes                   2015              0.2015
device_distinct_emails_8w                    359              0.0359


- {What imputation is and why we are doing it; why flagging the imputed data is so important}

In [9]:
# Work on a copy so the raw `data` frame is left untouched and this cell can be re-run.
cleaned_data = data.copy()

# Removing the 'device_fraud_count' column due to it only containing one value
cleaned_data = cleaned_data.drop(columns='device_fraud_count')

# Removing the 'prev_address_months_count' and 'intended_balcon_amount' columns due to high missing data
cleaned_data = cleaned_data.drop(columns=['prev_address_months_count', 'intended_balcon_amount'])

# List of columns to impute with the median
columns_to_impute = [
    "current_address_months_count",
    "bank_months_count",
    "session_length_in_minutes",
    "device_distinct_emails_8w"
]

# Impute missing values and create an is_imputed flag.
# NOTE(review): the median is taken over the whole frame; if a train/test split
# follows, consider computing it on the training portion only — confirm.
for col in columns_to_impute:
    imputed_flag_col = f"{col}_is_imputed"
    # A value is usable only when it is non-negative. NaN compares False here,
    # so it is treated as missing as well.
    is_valid = cleaned_data[col] >= 0
    # Derive the flag from the same mask that drives the imputation, so every
    # replaced value is flagged (previously a NaN would have been imputed
    # without being flagged, because `NaN < 0` is False).
    cleaned_data[imputed_flag_col] = ~is_valid
    # Median of the valid (non-negative) observations only
    median_value = cleaned_data.loc[is_valid, col].median()
    # Keep valid values, replace everything else with the median
    cleaned_data[col] = cleaned_data[col].where(is_valid, median_value)
