In [None]:
#Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, ExtraTreesRegressor

In [None]:
#Set file path
file = "Resources/PCA.csv"

In [None]:
#Load data into dataframe
data_df = pd.read_csv(file,index_col=None)


In [None]:
#Display first few rows
data_df.head()

In [None]:
#Display column names
data_df.columns

In [None]:
#Seperate features into X
X = data_df.drop(columns=['fraud_bool'])

In [None]:
#Set target variable into y
y = data_df['fraud_bool']

In [None]:
#Split Dataset into training and testing and set testing size to 20% of training data with random state 42
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,test_size=0.2)

In [None]:
#Initialize RandomForestClassifier with 100 estimators and random state of 42
model = RandomForestClassifier(n_estimators=100,random_state=42)

In [43]:
#Train RandomForestClassifier on training data
model.fit(X_train,y_train)

In [44]:
#Caculate and print balanced accuracy score for predictions on testing data
y_test_pred = model.predict(X_test)
print(f"Testing Score: {balanced_accuracy_score(y_test,y_test_pred)}")

Testing Score: 0.5


In [45]:
#Caculate and print balanced accuracy score for predictions on training data
y_train_pred = model.predict(X_train)
print(f"Training Score: {balanced_accuracy_score(y_train,y_train_pred)}")

Training Score: 0.9997173544375353


In [46]:
#Display models accuracy score
accuracy = accuracy_score(y_test,y_test_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9884


In [47]:
#Displays classification report for evaluation
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     39537
           1       0.00      0.00      0.00       463

    accuracy                           0.99     40000
   macro avg       0.49      0.50      0.50     40000
weighted avg       0.98      0.99      0.98     40000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [48]:
#Initialize SMOTE for balancing the dataset
smote = SMOTE(random_state=42)

In [49]:
#Resample the dataset using SMOTE to balance fraud and non-fraud transactions
X_resampled, y_resampled = smote.fit_resample(X, y)

In [50]:
#Split resampled dataset into training and testing and set testing size to 20% of training data with random state 42
X_train, X_test, y_train, y_test = train_test_split(X_resampled,y_resampled,random_state=42,test_size=0.2)

In [51]:
#Retrain RandomForestClassifier on resampled data
model.fit(X_resampled,y_resampled)

In [52]:
#Caculate and print balanced accuracy score of resampled data for predictions on testing data
y_test_pred = model.predict(X_test)
print(f"Testing Score: {balanced_accuracy_score(y_test,y_test_pred)}")

Testing Score: 1.0


In [53]:
#Caculate and print balanced accuracy score of resampled data for predictions on training data
y_train_pred = model.predict(X_train)
print(f"Training Score: {balanced_accuracy_score(y_train,y_train_pred)}")

Training Score: 0.9999968382645867


In [54]:
#Display resampled models accuracy score
accuracy = accuracy_score(y_test,y_test_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 1.0000


In [55]:
#Displays resampled models classification report for evaluation
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39627
           1       1.00      1.00      1.00     39481

    accuracy                           1.00     79108
   macro avg       1.00      1.00      1.00     79108
weighted avg       1.00      1.00      1.00     79108

