In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import sqlalchemy
import os
import psycopg2 as ps
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sqlalchemy import create_engine
from config import db_password

In [4]:
"postgresql://[user]:[password]@[location]:[port]/[database]"

'postgresql://[user]:[password]@[location]:[port]/[database]'

In [5]:
from config import db_password

In [6]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/finalproject"

In [7]:
engine = create_engine(db_string, pool_size=10, max_overflow=20)

In [8]:
df = pd.read_sql_table('mergedfrauddata', con= engine)

In [9]:
df.head()

Unnamed: 0,zip,city_pop,amt,category,is_fraud,gender,dob,age,agegroup,time,timeofday,avg_income
0,28654,3495,4.97,misc_net,0,F,1988-03-09,34,Middle_Aged,0,Early Morning,48427.56
1,99160,149,107.23,grocery_pos,0,F,1978-06-21,44,Middle_Aged,0,Early Morning,
2,83252,4154,220.11,entertainment,0,M,1962-01-19,60,Older_Adult,0,Early Morning,49807.39
3,59632,1939,45.0,gas_transport,0,M,1967-01-12,55,Older_Adult,0,Early Morning,52056.04
4,24433,99,41.96,misc_pos,0,M,1986-03-28,36,Middle_Aged,0,Early Morning,


In [10]:
df.tail()

Unnamed: 0,zip,city_pop,amt,category,is_fraud,gender,dob,age,agegroup,time,timeofday,avg_income
1296670,84735,258,15.56,entertainment,0,M,1961-11-24,60,Older_Adult,12,Afternoon,
1296671,21790,100,51.7,food_dining,0,M,1979-12-11,42,Middle_Aged,12,Afternoon,
1296672,88325,899,105.93,food_dining,0,M,1967-08-30,55,Older_Adult,12,Afternoon,
1296673,57756,1126,74.9,food_dining,0,M,1980-08-18,42,Middle_Aged,12,Afternoon,
1296674,59871,218,4.3,food_dining,0,M,1995-08-16,27,Young_Adult,12,Afternoon,50154.55


In [20]:
# Cleaning
df2= df.drop(['category','dob', 'amt','time', 'timeofday'], axis=1)

In [21]:
df2.dropna(inplace=True)

In [24]:
# Converting string categories into numeric indicators

df2.gender[df2.gender == 'M'] = 1
df2.gender[df2.gender == 'F'] = 2

df2.agegroup[df2.agegroup == 'Minor'] = 1
df2.agegroup[df2.agegroup == 'Young_Adult'] = 2
df2.agegroup[df2.agegroup == 'Middle_Aged'] = 3
df2.agegroup[df2.agegroup == 'Older_Adult'] = 4
df2.agegroup[df2.agegroup == 'Senior'] = 5


In [25]:
df2.head()

Unnamed: 0,zip,city_pop,is_fraud,gender,age,agegroup,avg_income
0,28654,3495,0,2,34,3,48427.56
2,83252,4154,0,1,60,4,49807.39
3,59632,1939,0,1,55,4,52056.04
5,18917,2158,0,2,61,4,65388.07
6,67851,2691,0,2,29,2,55165.0


## Beginning of Logistic Regression Model

In [None]:
from sklearn.datasets import make_blobs
X, y = make_blobs(centers=2, random_state=5)

In [27]:
X=df2.drop(columns='is_fraud')
y = df2["is_fraud"]

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1, stratify=y)

In [29]:
Counter(y_train)

Counter({0: 802285, 1: 4660})

In [30]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [31]:
Counter(y_resampled)

Counter({0: 802285, 1: 802285})

In [32]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [33]:
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[174934,  92494],
       [   973,    581]], dtype=int64)

In [34]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.5140044093033309

In [35]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.65      0.37      0.79      0.49      0.25    267428
          1       0.01      0.37      0.65      0.01      0.49      0.24      1554

avg / total       0.99      0.65      0.38      0.78      0.49      0.25    268982



## Undersampling Example

In [41]:
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 4660, 1: 4660})

In [42]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [43]:
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[197027,  70401],
       [  1086,    468]], dtype=int64)

In [44]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.5189530680447862

In [45]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.99      0.74      0.30      0.85      0.47      0.23    267428
          1       0.01      0.30      0.74      0.01      0.47      0.21      1554

avg / total       0.99      0.73      0.30      0.84      0.47      0.23    268982



## Beginning of Balanced Random Forest Classifier

In [37]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brfcmodel = BalancedRandomForestClassifier(n_estimators =100, random_state=1)
brfcmodel.fit( X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [38]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = brfcmodel.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6663125425558678

In [39]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.61      0.72      0.76      0.66      0.44    267428
          1       0.01      0.72      0.61      0.02      0.66      0.45      1554

avg / total       0.99      0.61      0.72      0.75      0.66      0.44    268982



In [40]:
# List the features sorted in descending order by feature importance
sorted_features = sorted(zip(brfcmodel.feature_importances_, X.columns), reverse=True)
sorted_features

[(0.2548965682386973, 'city_pop'),
 (0.24114343845726535, 'avg_income'),
 (0.23074499121400113, 'zip'),
 (0.20508447896826015, 'age'),
 (0.03725104276032766, 'agegroup'),
 (0.030879480361448455, 'gender')]