In [54]:
import warnings
warnings.filterwarnings('ignore')

In [55]:
import os
from collections import Counter
from pathlib import Path
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import psycopg2 as ps
import sqlalchemy

In [56]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sqlalchemy import create_engine
from config import db_password

In [57]:
# Reference template for the PostgreSQL connection URL format. As a bare
# string expression it has no effect other than being echoed as cell output.
"postgresql://[user]:[password]@[location]:[port]/[database]"

'postgresql://[user]:[password]@[location]:[port]/[database]'

In [58]:
from config import db_password

In [59]:
# Connection URL for the local `finalproject` PostgreSQL database.
# The password is percent-encoded before interpolation: characters such as
# "@", ":", "/" or "#" in a raw password would otherwise be read as URL
# delimiters and the URL would be parsed incorrectly.
# NOTE(review): assumes config.db_password holds the raw password, not one
# that is already URL-encoded — confirm.
db_string = (
    f"postgresql://postgres:{quote_plus(str(db_password))}"
    "@127.0.0.1:5432/finalproject"
)

In [60]:
# Create the SQLAlchemy engine for the connection URL, with an explicit
# connection-pool configuration (pool of 10, up to 20 overflow connections).
engine = create_engine(
    db_string,
    max_overflow=20,
    pool_size=10,
)

In [61]:
# Load the entire `mergedfrauddata` table into a DataFrame.
df = pd.read_sql_table(table_name='mergedfrauddata', con=engine)

In [62]:
# Preview the first five rows of the table as loaded.
df.head(n=5)

Unnamed: 0,zip,city_pop,amt,category,is_fraud,gender,dob,age,agegroup,time,timeofday,avg_income
0,28654,3495,4.97,misc_net,0,F,1988-03-09,34,Middle_Aged,0,Early Morning,48427.56
1,99160,149,107.23,grocery_pos,0,F,1978-06-21,44,Middle_Aged,0,Early Morning,
2,83252,4154,220.11,entertainment,0,M,1962-01-19,60,Older_Adult,0,Early Morning,49807.39
3,59632,1939,45.0,gas_transport,0,M,1967-01-12,55,Older_Adult,0,Early Morning,52056.04
4,24433,99,41.96,misc_pos,0,M,1986-03-28,36,Middle_Aged,0,Early Morning,


In [94]:
# Cleaning: build the modelling frame without the `category`, `dob` and
# `amt` columns.
df2 = df.drop(columns=['category', 'dob', 'amt'])

In [95]:
# Remove, in place, every row that has a missing value in any remaining
# column (the preview above shows gaps in `avg_income`).
df2.dropna(axis=0, how='any', inplace=True)

In [96]:
# Converting string categories into numeric indicators.
#
# Each column is re-assigned as a whole rather than written through chained
# indexing (`df2.gender[mask] = 1`). Chained assignment may write to a
# temporary copy, and under pandas Copy-on-Write it never updates `df2`;
# with warnings silenced that failure would go unnoticed.
CATEGORY_CODES = {
    'gender': {'M': 1, 'F': 2},
    'agegroup': {
        'Minor': 1,
        'Young_Adult': 2,
        'Middle_Aged': 3,
        'Older_Adult': 4,
        'Senior': 5,
    },
    'timeofday': {
        'Early Morning': 1,
        'Morning': 2,
        'Afternoon': 3,
        'Evening': 4,
        'Night': 5,
    },
}

for column, codes in CATEGORY_CODES.items():
    # Labels that are not in the mapping pass through unchanged, so values
    # outside the known categories are kept as-is and re-running this cell
    # on already-encoded data is a no-op.
    df2[column] = df2[column].map(
        lambda label, codes=codes: codes.get(label, label)
    )

In [97]:
# Preview the first five rows of the cleaned, numerically encoded frame.
df2.head(n=5)

Unnamed: 0,zip,city_pop,is_fraud,gender,age,agegroup,time,timeofday,avg_income
0,28654,3495,0,2,34,3,0,1,48427.56
2,83252,4154,0,1,60,4,0,1,49807.39
3,59632,1939,0,1,55,4,0,1,52056.04
5,18917,2158,0,2,61,4,0,1,65388.07
6,67851,2691,0,2,29,2,0,1,55165.0


## Beginning of Logistic Regression Model

In [98]:
from sklearn.datasets import make_blobs
# Split the cleaned frame into the feature matrix and the fraud label.
# A synthetic `make_blobs` sample was previously assigned to X and y here,
# but it was overwritten before any use, so that dead call has been removed.
X = df2.drop(columns='is_fraud')
y = df2["is_fraud"]

In [100]:
from sklearn.model_selection import train_test_split

# Stratify on the label so both splits keep the same fraud ratio; the seed
# is fixed for repeatability. No test size is given, so the library default
# applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)

In [101]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline with the lbfgs solver and a fixed seed.
classifier = LogisticRegression(random_state=1, solver='lbfgs')
classifier

LogisticRegression(random_state=1)

In [102]:
# Fit the baseline on the training split; the fitted estimator is echoed.
classifier.fit(X=X_train, y=y_train)

LogisticRegression(random_state=1)

In [103]:
# Predict on the held-out split and show predictions next to the true labels.
predictions = classifier.predict(X=X_test)
pd.DataFrame(dict(Prediction=predictions, Actual=y_test))

Unnamed: 0,Prediction,Actual
260827,0,0
1161050,0,0
824221,0,0
905172,0,0
703902,0,0
...,...,...
317205,0,0
571781,0,0
1101443,0,0
1004597,0,0


In [104]:
from sklearn.metrics import accuracy_score

# Plain accuracy of the baseline. Read it together with the per-class report
# below: the recorded report shows recall 0.00 for the fraud class, so this
# score mostly reflects the majority class.
accuracy_score(y_true=y_test, y_pred=predictions)

0.9942226617394473

In [105]:
from imblearn.metrics import classification_report_imbalanced

# Per-class precision / recall / specificity report for the baseline model.
print(classification_report_imbalanced(y_true=y_test, y_pred=predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      1.00      0.00      1.00      0.00      0.00    267428
          1       0.00      0.00      1.00      0.00      0.00      0.00      1554

avg / total       0.99      0.99      0.01      0.99      0.00      0.00    268982



## Beginning of Balanced Random Forest Classifier

In [106]:
# Train a BalancedRandomForestClassifier (100 estimators, fixed seed) on the
# training split; the fitted estimator is echoed as the cell output.
from imblearn.ensemble import BalancedRandomForestClassifier

brfcmodel = BalancedRandomForestClassifier(random_state=1, n_estimators=100)
brfcmodel.fit(X=X_train, y=y_train)

BalancedRandomForestClassifier(random_state=1)

In [107]:
# Calculate the balanced accuracy score of the forest on the test split.
from sklearn.metrics import balanced_accuracy_score

y_pred = brfcmodel.predict(X=X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.8116594521290365

In [108]:
# Per-class report for the balanced random forest predictions.
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.78      0.84      0.88      0.81      0.65    267428
          1       0.02      0.84      0.78      0.04      0.81      0.66      1554

avg / total       0.99      0.78      0.84      0.87      0.81      0.65    268982



In [109]:
# Pair each importance with its feature name, then order the pairs from the
# most to the least important.
sorted_features = list(zip(brfcmodel.feature_importances_, X.columns))
sorted_features.sort(reverse=True)
sorted_features

[(0.39983327283238834, 'time'),
 (0.14321961453874074, 'timeofday'),
 (0.113523267410041, 'city_pop'),
 (0.10888477241156184, 'avg_income'),
 (0.10725725402802479, 'zip'),
 (0.09435533192086078, 'age'),
 (0.01936477870215177, 'agegroup'),
 (0.01356170815623074, 'gender')]