In [35]:
import numpy as np
import pandas as pd
import sqlalchemy
import os
import psycopg2 as ps
from pathlib import Path
from collections import Counter

In [36]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sqlalchemy import create_engine
from config import db_password

In [37]:
"postgresql://[user]:[password]@[location]:[port]/[database]"

'postgresql://[user]:[password]@[location]:[port]/[database]'

In [38]:
from config import db_password

In [39]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/finalproject"

In [40]:
engine = create_engine(db_string, pool_size=10, max_overflow=20)

In [41]:
df = pd.read_sql_table('mergedfrauddata', con= engine)

In [42]:
df.head()

Unnamed: 0,zip,city_pop,amt,category,is_fraud,gender,dob,age,agegroup,time,timeofday,avg_income
0,28654,3495,4.97,misc_net,0,F,1988-03-09,34,Middle_Aged,0,Early Morning,48427.56
1,99160,149,107.23,grocery_pos,0,F,1978-06-21,44,Middle_Aged,0,Early Morning,
2,83252,4154,220.11,entertainment,0,M,1962-01-19,60,Older_Adult,0,Early Morning,49807.39
3,59632,1939,45.0,gas_transport,0,M,1967-01-12,55,Older_Adult,0,Early Morning,52056.04
4,24433,99,41.96,misc_pos,0,M,1986-03-28,36,Middle_Aged,0,Early Morning,


In [43]:
# Cleaning
df2= df.drop(['category','dob', 'amt','time', 'timeofday','age'], axis=1)

In [44]:
df2.dropna(inplace=True)

In [45]:
# Converting string categories into numeric indicators
import warnings
warnings.filterwarnings('ignore')

df2.gender[df2.gender == 'M'] = 1
df2.gender[df2.gender == 'F'] = 2

df2.agegroup[df2.agegroup == 'Minor'] = 1
df2.agegroup[df2.agegroup == 'Young_Adult'] = 2
df2.agegroup[df2.agegroup == 'Middle_Aged'] = 3
df2.agegroup[df2.agegroup == 'Older_Adult'] = 4
df2.agegroup[df2.agegroup == 'Senior'] = 5


In [46]:
df2.head()

Unnamed: 0,zip,city_pop,is_fraud,gender,agegroup,avg_income
0,28654,3495,0,2,3,48427.56
2,83252,4154,0,1,4,49807.39
3,59632,1939,0,1,4,52056.04
5,18917,2158,0,2,4,65388.07
6,67851,2691,0,2,2,55165.0


## Logistic Regression

In [47]:
from sklearn.datasets import make_blobs
X, y = make_blobs(centers=2, random_state=5)

In [48]:
X=df2.drop(columns='is_fraud')
y = df2["is_fraud"]

In [49]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1, stratify=y)

In [50]:
Counter(y_train)

Counter({0: 802285, 1: 4660})

In [70]:
Counter(y_test)

Counter({0: 267428, 1: 1554})

In [51]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', 
                                random_state=1)
classifier

LogisticRegression(random_state=1)

In [52]:
classifier.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [53]:
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
260827,0,0
1161050,0,0
824221,0,0
905172,0,0
703902,0,0
...,...,...
317205,0,0
571781,0,0
1101443,0,0
1004597,0,0


In [54]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,267428,0
Actual 1,1554,0


In [55]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      1.00      0.00      1.00      0.00      0.00    267428
          1       0.00      0.00      1.00      0.00      0.00      0.00      1554

avg / total       0.99      0.99      0.01      0.99      0.00      0.00    268982



In [56]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.9942226617394473

## Random Forest Classifier

In [57]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brfcmodel = BalancedRandomForestClassifier(n_estimators =200, random_state=1)
brfcmodel.fit( X_train, y_train)

BalancedRandomForestClassifier(n_estimators=500, random_state=1)

In [58]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = brfcmodel.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6648701908753212

In [59]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.61      0.72      0.76      0.66      0.43    267428
          1       0.01      0.72      0.61      0.02      0.66      0.44      1554

avg / total       0.99      0.61      0.72      0.75      0.66      0.43    268982



In [60]:
# List the features sorted in descending order by feature importance
sorted_features = sorted(zip(brfcmodel.feature_importances_, X.columns), reverse=True)
sorted_features

[(0.3046484772749871, 'city_pop'),
 (0.30046000456952826, 'avg_income'),
 (0.28683358227173883, 'zip'),
 (0.07877550977205293, 'agegroup'),
 (0.029282426111692874, 'gender')]