In [68]:
#Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

In [69]:
#Set file path
file = "Resources/PCA.csv"

In [70]:
#Load data into dataframe
data_df = pd.read_csv(file,index_col=None)


In [71]:
#Display first few rows
data_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,fraud_bool
0,0.04485,0.867448,0.265031,1.061007,1.931991,2.603333,0.357424,-1.254235,-1.688642,-0.957369,-2.248566,-0.857269,0.373771,0.926526,1.03638,-1.037806,0
1,2.706031,-1.599373,0.344212,0.675761,-0.763198,-1.087848,1.460787,-0.151737,0.239009,1.254337,0.695271,0.744711,-2.082136,-0.209089,-0.172229,-0.615665,0
2,0.230808,-0.546442,-0.568492,0.770089,-0.102704,-0.761389,1.895435,-0.434829,0.373212,-0.084965,1.431698,0.026007,-1.881884,0.208164,0.469362,1.128934,0
3,1.336715,2.274095,-0.687472,0.805492,1.150722,-0.335813,1.206365,3.425307,-0.470821,0.623067,-1.609263,0.723076,0.179099,0.280377,-1.666089,0.001679,0
4,3.923968,1.584103,0.663011,-0.066138,-0.166105,1.315669,0.052427,0.14739,1.427424,-0.10098,1.551504,-1.043255,-0.443029,0.167595,-0.197517,0.445158,0


In [72]:
#Display column names
data_df.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', 'fraud_bool'],
      dtype='object')

In [73]:
#Separate features into X
X = data_df.drop(columns=['fraud_bool'])

In [74]:
#Set target variable into y
y = data_df['fraud_bool']

In [75]:
#Split Dataset into training and testing and set testing size to 20% of training data with random state 42
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,test_size=0.2)

In [76]:
#Initialize RandomForestClassifier with 100 estimators and random state of 42
model = RandomForestClassifier(n_estimators=100,random_state=42)

In [77]:
#Train RandomForestClassifier on training data
model.fit(X_train,y_train)

In [78]:
#Calculate and print balanced accuracy score for predictions on testing data
y_test_pred = model.predict(X_test)
print(f"Testing Score: {balanced_accuracy_score(y_test,y_test_pred)}")

Testing Score: 0.5


In [79]:
#Calculate and print balanced accuracy score for predictions on training data
y_train_pred = model.predict(X_train)
print(f"Training Score: {balanced_accuracy_score(y_train,y_train_pred)}")

Training Score: 0.9994347088750707


In [80]:
#Display models accuracy score
accuracy = accuracy_score(y_test,y_test_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9884


In [81]:
#Displays classification report for evaluation
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     39537
           1       0.00      0.00      0.00       463

    accuracy                           0.99     40000
   macro avg       0.49      0.50      0.50     40000
weighted avg       0.98      0.99      0.98     40000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [82]:
#Initialize SMOTE for balancing the dataset
smote = SMOTE(random_state=42)

In [83]:
#Resample the dataset using SMOTE to balance fraud and non-fraud transactions
X_resampled, y_resampled = smote.fit_resample(X, y)

In [99]:
#Split resampled dataset into training and testing and set testing size to 20% of training data with random state 42
X_train, X_test, y_train, y_test = train_test_split(X_resampled,y_resampled,random_state=42,test_size=0.2)

In [100]:
#Retrain RandomForestClassifier on resampled data
model.fit(X_train,y_train)

In [101]:
#Calculate and print balanced accuracy score of resampled data for predictions on testing data
y_test_pred = model.predict(X_test)
print(f"Testing Score: {balanced_accuracy_score(y_test,y_test_pred)}")

Testing Score: 0.989952003529644


In [102]:
#Calculate and print balanced accuracy score of resampled data for predictions on training data
y_train_pred = model.predict(X_train)
print(f"Training Score: {balanced_accuracy_score(y_train,y_train_pred)}")

Training Score: 1.0


In [103]:
#Display resampled models accuracy score
accuracy = accuracy_score(y_test,y_test_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9899


In [104]:
#Displays resampled models classification report for evaluation
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     39627
           1       0.98      1.00      0.99     39481

    accuracy                           0.99     79108
   macro avg       0.99      0.99      0.99     79108
weighted avg       0.99      0.99      0.99     79108



In [105]:
# Split all data without utilizing SMOTE for oversampling.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

In [106]:
# Check balanced accuracy score for unbalanced training data after training new model.
y_pred = model.predict(X_test)
print(f"Testing Score: {balanced_accuracy_score(y_test,y_pred)}")

Testing Score: 0.9975924090072228
