In [42]:
# We will be importing our necessary libraries to be able to use them in our code.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  # For confusion matrix visualization
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [43]:
# Load the cleaned dataset into a DataFrame named `data`, then print its first five rows as a
# quick reminder of which columns we are working with before modelling.
clean_data_path = '../Data/clean_data.csv'
data = pd.read_csv(clean_data_path)
print(data.head(n=5))

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  isFraud  isFlaggedFraud  
0  M1979787155        0               0  
1  M2044282225        0               0  
2   C553264065        1               0  
3    C38997010        1               0  
4  M1230701703        0               0  


In [44]:
# Separate the features (X) from the target (y).
# 'isFraud' is the label we want to predict; 'isFlaggedFraud' is excluded from the features as well.
LABEL_COLS = ['isFraud', 'isFlaggedFraud']

# 'nameOrig' and 'nameDest' are account identifiers, not descriptive attributes
# (presumably close to one distinct value per row -- TODO confirm with data['nameOrig'].nunique()).
# If they stay in X they are picked up as object columns and one-hot encoded further down, which
# creates one feature per identifier: a very wide matrix that can only memorise training accounts
# and carries no information for accounts first seen in the test set.
IDENTIFIER_COLS = ['nameOrig', 'nameDest']

X = data.drop(columns=LABEL_COLS + IDENTIFIER_COLS)  # Features
y = data['isFraud']  # Target variable

In [45]:
# Hold out 20% of the rows as a test set. Stratifying on y keeps the proportion of each target
# class the same in both subsets, and the fixed random_state makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [46]:
# Identify numeric and categorical columns by dtype family instead of by exact dtype name.
# 'number' matches every numeric width (e.g. int32 or float32), not only int64/float64, and
# 'category' columns are treated as categorical alongside 'object' columns.
# This matters because a column that lands in neither list is never handed to a transformer:
# ColumnTransformer drops unlisted columns by default (remainder='drop'), without any warning.
numeric_cols = X_train.select_dtypes(include='number').columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

In [8]:
# Build the preprocessing stage that turns the raw columns into model-ready features.
# Numeric columns: fill missing values with the column mean, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]

# Categorical columns: fill missing values with the most frequent category, then one-hot encode.
# handle_unknown='ignore' lets transform() accept categories that were not present during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]

numeric_transformer = Pipeline(steps=numeric_steps)
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer and combine the results.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [11]:
# Chain preprocessing and a random forest classifier into a single estimator, so that one
# fit/predict call both prepares the data and classifies it.
# The step names 'preprocessor' and 'classifier' are what the 'classifier__...' keys of the
# hyper-parameter grid refer to later on.
rf_classifier = RandomForestClassifier(random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', rf_classifier)])

In [None]:
# Train the random forest pipeline on the training split, predict the test split, and report
# how well it did with a classification report followed by the confusion matrix.
pipeline.fit(X_train, y_train)
y_pred_rf = pipeline.predict(X_test)

print("Evaluation for Random Forest:")
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)
print("Confusion Matrix:")
rf_confusion = confusion_matrix(y_test, y_pred_rf)
print(rf_confusion)

In [None]:
# Second model: the same preprocessing followed by a decision tree classifier.
# The pipeline is fitted on the training split, used to predict the test split, and evaluated
# with the same classification report and confusion matrix as the random forest.
tree_classifier = DecisionTreeClassifier(random_state=42)
decision_tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', tree_classifier),
])

decision_tree_pipeline.fit(X_train, y_train)
y_pred_tree = decision_tree_pipeline.predict(X_test)

print("Evaluation for Decision Tree:")
tree_report = classification_report(y_test, y_pred_tree)
print(tree_report)
print("Confusion Matrix:")
tree_confusion = confusion_matrix(y_test, y_pred_tree)
print(tree_confusion)

In [None]:
# Set up hyper-parameter tuning for the random forest pipeline.
# Each key targets a parameter of the pipeline's 'classifier' step: the number of trees, the
# maximum tree depth, and the minimum sample counts needed to split a node or form a leaf.
# The grid holds 3 * 4 * 3 * 3 = 108 combinations, each scored with 3-fold cross-validation.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}

# Candidates are ranked by F1 score; n_jobs=-1 uses all available cores and verbose=2 logs progress.
search_options = {'cv': 3, 'n_jobs': -1, 'verbose': 2, 'scoring': 'f1'}
grid_search = GridSearchCV(pipeline, param_grid, **search_options)

In [None]:
# Run the grid search on the training split: every parameter combination is cross-validated,
# and the best-scoring set of parameters is printed at the end.
best_rf_params = grid_search.fit(X_train, y_train).best_params_
print("The best parameters found for Random Forest: ", best_rf_params)

In [None]:
# Take the best pipeline found by the grid search and use it to predict the held-out test split,
# i.e. rows the model did not see during tuning.
best_rf_model = grid_search.best_estimator_
y_pred_best_rf = best_rf_model.predict(X=X_test)

In [None]:
# Evaluate the tuned random forest on the test split: print its F1 score (two decimals) and the
# full classification report.
f1 = f1_score(y_test, y_pred_best_rf)
print(f"F1 Score of the best Random Forest model: {f1:.2f}")
print("Best Random Forest Model Evaluation:")
print(classification_report(y_test, y_pred_best_rf))

# Draw the confusion matrix as an annotated heatmap: rows are the true classes, columns the
# predicted classes, and each cell shows the number of test rows that fall into it.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    confusion_matrix(y_test, y_pred_best_rf),
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=ax,
)
ax.set_title('Confusion Matrix for Best Random Forest Model')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()