In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from utils import evaluate_model, evaluate_model_full

In [3]:
# Import the dataset
df = pd.read_csv('data\credit_card_transactions.csv')

  df = pd.read_csv('data\credit_card_transactions.csv')


In [4]:
# Drop the columns that are not needed
df_dropped = df.copy()
df_dropped = df_dropped.drop(columns=['Unnamed: 0','trans_date_trans_time','cc_num','first', 'last', 'gender', 'street', 'lat', 'long', 'dob','trans_num','merch_lat', 'merch_long', 'unix_time','city', 'state'])

In [5]:
# Fill the missing values in the zipcode column with the value in the zip column
df_dropped['merch_zipcode'] = df_dropped['merch_zipcode'].fillna(df_dropped['zip'])

In [6]:
# Filter out the rows where the amount is greater than 10000
df_copy = df_dropped.copy()
df_copy = df_copy[df_copy['amt'] < 10000]

In [7]:
# One-hot encode the category column
dummies = pd.get_dummies(df_copy['category'], drop_first=True, dtype=int)
df_copy = pd.concat([df_copy, dummies], axis=1)
df_copy = df_copy.drop(columns=['category'])

In [8]:
# Scale the numerical columns
scaler = StandardScaler()
df_scaled = df_copy.copy()
df_scaled[['amt', 'zip', 'city_pop', 'merch_zipcode']] = scaler.fit_transform(df_scaled[['amt', 'zip', 'city_pop', 'merch_zipcode']])
df_scaled.head()

Unnamed: 0,merchant,amt,zip,city_pop,job,is_fraud,merch_zipcode,food_dining,gas_transport,grocery_net,grocery_pos,health_fitness,home,kids_pets,misc_net,misc_pos,personal_care,shopping_net,shopping_pos,travel
0,"fraud_Rippin, Kub and Mann",-0.469521,-0.749142,-0.282589,"Psychologist, counselling",0,-0.754769,0,0,0,0,0,0,0,1,0,0,0,0,0
1,"fraud_Heller, Gutmann and Zieme",0.269247,1.872566,-0.29367,Special educational needs teacher,0,1.874991,0,0,0,1,0,0,0,0,0,0,0,0,0
2,fraud_Lind-Buckridge,1.084737,1.28104,-0.280406,Nature conservation officer,0,1.280622,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"fraud_Kutch, Hermiston and Farrell",-0.180328,0.40275,-0.287742,Patent attorney,0,0.399593,0,1,0,0,0,0,0,0,0,0,0,0,0
4,fraud_Keeling-Crist,-0.20229,-0.906096,-0.293835,Dance movement psychotherapist,0,-0.973533,0,0,0,0,0,0,0,0,1,0,0,0,0


In [9]:
# Scale the merchant column
merch_freq = df_scaled['merchant'].value_counts()
df_scaled['merchant_encoded'] = df_scaled['merchant'].apply(lambda x: merch_freq[x])
df_scaled = df_scaled.drop(columns=['merchant'])

In [10]:
# Scale the job column
job_freq = df_scaled['job'].value_counts()
df_scaled['job_encoded'] = df_scaled['job'].apply(lambda x: job_freq[x])
df_scaled = df_scaled.drop(columns=['job'])

In [12]:
# Split the data into training and testing sets
X = df_scaled.drop(columns=['is_fraud'])
y = df_scaled['is_fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [13]:
# Resample the data using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [14]:
# Perform a PCA on the data
pca_model = PCA(n_components = 10)
pca_model.fit(X_train)

X_train_pca = pd.DataFrame(pca_model.transform(X_train))
X_test_pca = pd.DataFrame(pca_model.transform(X_test))

In [16]:
XGB = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
evaluate_model_full(XGB, X_train, y_train, X_test, y_test, X_train_resampled, y_train_resampled, X_train_pca, X_test_pca)

Accuracy Score:
0.9968410758851312

Confusion Matrix:
[[322032    255]
 [   769   1105]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    322287
           1       0.81      0.59      0.68      1874

    accuracy                           1.00    324161
   macro avg       0.91      0.79      0.84    324161
weighted avg       1.00      1.00      1.00    324161

Accuracy Score:
0.9845663111848741

Confusion Matrix:
[[317554   4733]
 [   270   1604]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    322287
           1       0.25      0.86      0.39      1874

    accuracy                           0.98    324161
   macro avg       0.63      0.92      0.69    324161
weighted avg       0.99      0.98      0.99    324161

Accuracy Score:
0.9970354237554795

Confusion Matrix:
[[322007    280]
 [   681   1193]]

Classification Report:
       

In [None]:
# Define a Logistic Regression model
LogReg = LogisticRegression(solver='saga',max_iter=5000)

Accuracy Score:
0.993611199373151

Confusion Matrix:
[[322090    197]
 [  1874      0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    322287
           1       0.00      0.00      0.00      1874

    accuracy                           0.99    324161
   macro avg       0.50      0.50      0.50    324161
weighted avg       0.99      0.99      0.99    324161



In [None]:
# Evaluate the Logistic Regression model
evaluate_model_full(LogReg, X_train, y_train, X_test, y_test, X_train_resampled, y_train_resampled, X_train_pca, X_test_pca)

In [None]:
# Define a SGD model
SGD = SGDClassifier(loss = 'log_loss', max_iter=5000, tol=1e-3, random_state=42)

In [None]:
# Evaluate the SGD model
evaluate_model_full(SGD, X_train, y_train, X_test, y_test, X_train_resampled, y_train_resampled, X_train_pca, X_test_pca)

In [None]:
# Define a Decision Tree model
DTC = DecisionTreeClassifier(random_state=42, max_depth=5)

In [None]:
# Evaluate the Decision Tree model
evaluate_model_full(DTC, X_train, y_train, X_test, y_test, X_train_resampled, y_train_resampled, X_train_pca, X_test_pca)

In [None]:
# Define an XGBoost model
XGB = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

Accuracy Score:
0.9845663111848741

Confusion Matrix:
[[317554   4733]
 [   270   1604]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    322287
           1       0.25      0.86      0.39      1874

    accuracy                           0.98    324161
   macro avg       0.63      0.92      0.69    324161
weighted avg       0.99      0.98      0.99    324161



In [None]:
# Evaluate the XGBoost model
evaluate_model_full(XGB, X_train, y_train, X_test, y_test, X_train_resampled, y_train_resampled, X_train_pca, X_test_pca)

In [21]:
# Define a Random Forest model
RFC = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Evaluate the Random Forest model
evaluate_model_full(RFC, X_train, y_train, X_test, y_test, X_train_resampled, y_train_resampled, X_train_pca, X_test_pca)