In [68]:
#pip install imbalanced-learn before running the code - used for balancing the set in the later [random forest] part of the code
# pip install xgboost before running the code - used for the xgboost model
# Import the modules
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

In [69]:
#load csv file into dataframe DataCoSupplyChainDataset_VL.csv from Resources folder
file_path = Path("Resources/DataCoSupplyChainDataset_VL.csv")
sales_df = pd.read_csv(file_path)
sales_df.head()


Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Delivery Status,Late_delivery_risk,Category Name,Customer City,Customer Country,Customer Id,Customer Segment,...,Order State,Order Status,Order Zipcode,Product Card Id,Product Category Id,Product Name,Product Price,shipping date (DateOrders),Shipping Mode,order_date_VL
0,DEBIT,3,4,Advance shipping,0,Sporting Goods,Caguas,Puerto Rico,20755,Consumer,...,Java Occidental,COMPLETE,,1360,73,Smart watch,327.75,2/3/2018 22:56,Standard Class,2018-01-31
1,TRANSFER,5,4,Late delivery,1,Sporting Goods,Caguas,Puerto Rico,19492,Consumer,...,Rajastán,PENDING,,1360,73,Smart watch,327.75,1/18/2018 12:27,Standard Class,2018-01-13
2,CASH,4,4,Shipping on time,0,Sporting Goods,San Jose,EE. UU.,19491,Consumer,...,Rajastán,CLOSED,,1360,73,Smart watch,327.75,1/17/2018 12:06,Standard Class,2018-01-13
3,DEBIT,3,4,Advance shipping,0,Sporting Goods,Los Angeles,EE. UU.,19490,Home Office,...,Queensland,COMPLETE,,1360,73,Smart watch,327.75,1/16/2018 11:45,Standard Class,2018-01-13
4,PAYMENT,2,4,Advance shipping,0,Sporting Goods,Caguas,Puerto Rico,19489,Corporate,...,Queensland,PENDING_PAYMENT,,1360,73,Smart watch,327.75,1/15/2018 11:24,Standard Class,2018-01-13


In [70]:
sales_df.columns

Index(['Type', 'Days for shipping (real)', 'Days for shipment (scheduled)',
       'Delivery Status', 'Late_delivery_risk', 'Category Name',
       'Customer City', 'Customer Country', 'Customer Id', 'Customer Segment',
       'Customer State', 'Customer Zipcode', 'Department Id',
       'Department Name', 'Latitude', 'Longitude', 'Market', 'Order City',
       'Order Country', 'Order Customer Id', 'order date (DateOrders)',
       'Order Id', 'Order Item Cardprod Id', 'Order Item Discount',
       'Order Item Discount Rate', 'Order Item Id', 'Order Item Product Price',
       'Order Item Profit Ratio', 'Order Item Quantity', 'Sales',
       'Order Item Total', 'Order Profit Per Order', 'Order Region',
       'Order State', 'Order Status', 'Order Zipcode', 'Product Card Id',
       'Product Category Id', 'Product Name', 'Product Price',
       'shipping date (DateOrders)', 'Shipping Mode', 'order_date_VL'],
      dtype='object')

Unique Order dataframe preparation 

In [71]:
#remove rows with "Delivery Status"="Shipping canceled"
sales_df = sales_df[sales_df["Order Status"] != "CANCELED"]

In [72]:
#remove columns 'Days for shipping (real)', 'Late_delivery_risk'
sales_df = sales_df.drop(columns=['Days for shipping (real)', 'Late_delivery_risk'])

In [73]:
# create a new dataframe orders_df listing all unique "Order ID" as index with min "Type", "Days for shipment (scheduled)"
orders_df = sales_df.groupby("Order Id").agg({"Type": "min", "Days for shipment (scheduled)": "min"})
#add columns to orders_df from sales_df with 'Customer Segment', 'Department Id', 'Order Country'
orders_df = orders_df.join(sales_df.groupby("Order Id").agg({"Customer Segment": "min","Market":"min", "Order Country": "min", "Order State":"min",
                                                             "Shipping Mode":"min","Order Region":"min",
                                                             "Order Status":"min"}))
#add columns to orders_df from sales_df with sum of "Order Item Product Price", "Order Item Discount", 'Order Item Quantity',"Order Item Total"
orders_df = orders_df.join(sales_df.groupby("Order Id").agg({"Order Item Product Price": "sum", "Order Item Discount": "sum", "Order Item Quantity": "sum", "Order Item Total": "sum"}))
# add column with number of unique "Product Card Id" per "Order Id"
orders_df["Number of Products"] = sales_df.groupby("Order Id")["Product Card Id"].nunique()
#add column with number of unique "Category Name" per "Order Id"
orders_df["Number of Categories"] = sales_df.groupby("Order Id")["Category Name"].nunique()
#add column with number of unique "Department Name" per "Order Id"
orders_df["Number of Departments"] = sales_df.groupby("Order Id")["Department Name"].nunique()
orders_df.tail()

Unnamed: 0_level_0,Type,Days for shipment (scheduled),Customer Segment,Market,Order Country,Order State,Shipping Mode,Order Region,Order Status,Order Item Product Price,Order Item Discount,Order Item Quantity,Order Item Total,Number of Products,Number of Categories,Number of Departments
Order Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
77200,TRANSFER,4,Consumer,Pacific Asia,Indonesia,Java Occidental,Standard Class,Southeast Asia,PROCESSING,215.820007,53.959999,1,161.869995,1,1,1
77201,DEBIT,4,Consumer,Pacific Asia,Indonesia,Java Occidental,Standard Class,Southeast Asia,COMPLETE,215.820007,43.16,1,172.660004,1,1,1
77202,DEBIT,4,Consumer,Pacific Asia,Indonesia,Java Occidental,Standard Class,Southeast Asia,COMPLETE,327.75,13.11,1,314.640015,1,1,1
77203,PAYMENT,4,Corporate,Pacific Asia,Indonesia,Java Occidental,Standard Class,Southeast Asia,PENDING_PAYMENT,11.54,0.63,1,10.91,1,1,1
77204,CASH,0,Corporate,Pacific Asia,Australia,Queensland,Same Day,Oceania,CLOSED,39.75,4.77,1,34.98,1,1,1


In [74]:
orders_df.columns

Index(['Type', 'Days for shipment (scheduled)', 'Customer Segment', 'Market',
       'Order Country', 'Order State', 'Shipping Mode', 'Order Region',
       'Order Status', 'Order Item Product Price', 'Order Item Discount',
       'Order Item Quantity', 'Order Item Total', 'Number of Products',
       'Number of Categories', 'Number of Departments'],
      dtype='object')

In [75]:
#change column name "Order Item Total" to "order_total", change column "Order Item Product Price" to "order_original_price", "Order Item Discount" to "order_discount", "Order Item Quantity" to "no_of_items"
orders_df = orders_df.rename(columns={"Order Item Total": "order_total", "Order Item Product Price": "order_original_price", 
                                      "Order Item Discount": "order_discount", "Order Item Quantity": "no_of_items",
                                      "Number of Products": "no_of_unique_items", "Number of Categories": "no_of_unique_categories","Number of Departments": "no_of_unique_departments"})

In [76]:
#add column "fraud" to orders_df - mark status "SUSPECTED_FRAUD" as 1, others as 0
orders_df["fraud"] = np.where(orders_df["Order Status"] == "SUSPECTED_FRAUD", 1, 0)
#drop column "Order Status" column from orders_df
orders_df = orders_df.drop(columns=["Order Status"])
# make "fraud" an integer
orders_df["fraud"] = orders_df["fraud"].astype(int)

DATA CLEANING 

In [77]:
#show data type for each column
orders_df.dtypes

Type                              object
Days for shipment (scheduled)      int64
Customer Segment                  object
Market                            object
Order Country                     object
Order State                       object
Shipping Mode                     object
Order Region                      object
order_original_price             float64
order_discount                   float64
no_of_items                        int64
order_total                      float64
no_of_unique_items                 int64
no_of_unique_categories            int64
no_of_unique_departments           int64
fraud                              int32
dtype: object

In [78]:
#save orders_df to csv file
orders_df.to_csv("Resources/orders_df.csv")

################### --------------------JESS START COPYING HERE - PART 1--------###############################

In [79]:
#read csv file into dataframe orders_df from Resources folder
file_path = Path("Resources/orders_df.csv")
orders_df = pd.read_csv(file_path)

In [80]:
#dummy code categorical columns
orders_df = pd.get_dummies(orders_df, columns=["Type", "Customer Segment", "Order Country", "Order State", "Shipping Mode", "Market", "Order Region"])
orders_df.head()

Unnamed: 0,Order Id,Days for shipment (scheduled),order_original_price,order_discount,no_of_items,order_total,no_of_unique_items,no_of_unique_categories,no_of_unique_departments,fraud,...,Order Region_South Asia,Order Region_South of USA,Order Region_Southeast Asia,Order Region_Southern Africa,Order Region_Southern Europe,Order Region_US Center,Order Region_West Africa,Order Region_West Asia,Order Region_West of USA,Order Region_Western Europe
0,1,4,299.980011,60.0,1,239.979996,1,1,1,0,...,False,False,False,False,False,False,False,False,False,False
1,2,4,379.980011,50.6,7,529.380005,3,3,3,0,...,False,False,False,False,False,False,False,False,False,False
2,4,4,184.960001,78.98,14,620.870014,4,4,4,0,...,False,False,False,False,False,False,False,False,False,False
3,5,4,839.920029,142.789999,10,987.070007,4,4,2,0,...,False,False,False,False,False,False,False,False,False,False
4,7,2,515.960016,54.4,7,525.520004,3,3,2,0,...,False,False,False,False,False,False,False,False,False,False


################### --------------------JESS END COPYING HERE - PART 1--------###############################

In [81]:
#show row count
orders_df.count()

Order Id                         64385
Days for shipment (scheduled)    64385
order_original_price             64385
order_discount                   64385
no_of_items                      64385
                                 ...  
Order Region_US Center           64385
Order Region_West Africa         64385
Order Region_West Asia           64385
Order Region_West of USA         64385
Order Region_Western Europe      64385
Length: 1301, dtype: int64

LOGISTIC REGRESSION - FRAUD PREDICTION MODEL

In [82]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = orders_df["fraud"]

# Separate the X variable, the features
X = orders_df.drop(columns="fraud")

In [83]:
# Review the y variable Series
print(f"Labels: {y[:10]}")

Labels: 0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: fraud, dtype: int64


In [84]:
# Review the X variable DataFrame
print(f"Data: {X[:10]}")

Data:    Order Id  Days for shipment (scheduled)  order_original_price  \
0         1                              4            299.980011   
1         2                              4            379.980011   
2         4                              4            184.960001   
3         5                              4            839.920029   
4         7                              2            515.960016   
5         8                              4            219.960003   
6         9                              4            499.970009   
7        10                              4            601.940016   
8        11                              4            299.930000   
9        12                              4            524.950008   

   order_discount  no_of_items  order_total  no_of_unique_items  \
0       60.000000            1   239.979996                   1   
1       50.600000            7   529.380005                   3   
2       78.980000           14   620.870014 

In [85]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(48288, 1300)

In [86]:
X_test.shape

(16097, 1300)

In [87]:
#concatenate our training data back together
train_data = pd.concat([X_train, y_train], axis=1)

In [88]:
#Create a logistic regression model
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model with class weight to handle imbalance in the dataset
classifier = LogisticRegression(max_iter=10000)


# Fit the model using scaled training data
classifier.fit(X_train, y_train)

In [89]:
from sklearn.calibration import CalibratedClassifierCV

# Instantiate the calibrated classifier with sigmoid calibration
calibrated_clf = CalibratedClassifierCV(classifier, method='sigmoid', cv='prefit')

# Fit the calibrated classifier using the validation data (you might need to split your data again)
calibrated_clf.fit(X_train, y_train)



In [90]:
# Get calibrated probabilities
calibrated_probs = calibrated_clf.predict_proba(X_test)

# Predict labels (if needed)
calibrated_preds = calibrated_clf.predict(X_test)

In [91]:
from sklearn.metrics import accuracy_score, log_loss

print("Accuracy: ", accuracy_score(y_test, calibrated_preds))
print("Log loss: ", log_loss(y_test, calibrated_probs))

Accuracy:  0.9768901037460397
Log loss:  0.10963621413930995


In [92]:
# Make a prediction using the testing data with the calibrated classifier
calibrated_predictions = calibrated_clf.predict(X_test)

pd.DataFrame({"Prediction": calibrated_predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
55756,0,0
35954,0,0
41049,0,0
22551,0,0
29847,0,0
...,...,...
38723,0,0
43000,0,0
58541,0,0
7968,0,0


In [93]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Generate a confusion matrix for the model
confusion_matrix(y_test, calibrated_predictions)

array([[15725,     0],
       [  372,     0]], dtype=int64)

In [94]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, calibrated_predictions)
cm_df = pd.DataFrame(
    cm, index=["Legitemate [0]", "Fraud [1]"], columns=["Legitemate [0]", "Fraud [1]"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, calibrated_predictions)

# Display results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, calibrated_predictions))


Confusion Matrix


Unnamed: 0,Legitemate [0],Fraud [1]
Legitemate [0],15725,0
Fraud [1],372,0


Accuracy Score : 0.9768901037460397
Classification Report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     15725
           1       0.00      0.00      0.00       372

    accuracy                           0.98     16097
   macro avg       0.49      0.50      0.49     16097
weighted avg       0.95      0.98      0.97     16097



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Terrible score for fraud recall - model was not able to identify any true fraud cases after training - the imbalanced data 

#################------------------- JESS START COPYING HERE Part 2 ----------------------------------################

TESTING RANDOM FOREST MODEL - To improve accuracy and see feature importance breakdown

In [95]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [96]:
# Define features set 
X = orders_df.copy()
X.drop('fraud', axis=1, inplace=True)
X.head()

Unnamed: 0,Order Id,Days for shipment (scheduled),order_original_price,order_discount,no_of_items,order_total,no_of_unique_items,no_of_unique_categories,no_of_unique_departments,Type_CASH,...,Order Region_South Asia,Order Region_South of USA,Order Region_Southeast Asia,Order Region_Southern Africa,Order Region_Southern Europe,Order Region_US Center,Order Region_West Africa,Order Region_West Asia,Order Region_West of USA,Order Region_Western Europe
0,1,4,299.980011,60.0,1,239.979996,1,1,1,True,...,False,False,False,False,False,False,False,False,False,False
1,2,4,379.980011,50.6,7,529.380005,3,3,3,False,...,False,False,False,False,False,False,False,False,False,False
2,4,4,184.960001,78.98,14,620.870014,4,4,4,True,...,False,False,False,False,False,False,False,False,False,False
3,5,4,839.920029,142.789999,10,987.070007,4,4,2,False,...,False,False,False,False,False,False,False,False,False,False
4,7,2,515.960016,54.4,7,525.520004,3,3,2,False,...,False,False,False,False,False,False,False,False,False,False


In [97]:
# Define target vector
y = orders_df["fraud"].values.reshape(-1, 1)
y[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]], dtype=int64)

In [98]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [99]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [100]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [101]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [102]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [103]:
# Fit the model and use .ravel()on the "y_train" data. 
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

KeyboardInterrupt: 

In [None]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Legitemate [0]", "Fraud [1]"], columns=["Legitemate [0]", "Fraud [1]"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [None]:
print(np.unique(y_test))
print(np.unique(predictions))

[0 1]
[0 1]


In [None]:
print(cm)
print(cm.shape)


[[15673    85]
 [  329    10]]
(2, 2)


In [None]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Legitemate [0],Fraud [1]
Legitemate [0],15744,14
Fraud [1],337,2


Accuracy Score : 0.9781946946636019
Classification Report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     15758
           1       0.12      0.01      0.01       339

    accuracy                           0.98     16097
   macro avg       0.55      0.50      0.50     16097
weighted avg       0.96      0.98      0.97     16097



In [105]:
# Get the feature importance array
importances = rf_model.feature_importances_
# List the top 10 most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted [:10]

[(0.0, 'order_total'),
 (0.0, 'order_original_price'),
 (0.0, 'order_discount'),
 (0.0, 'no_of_unique_items'),
 (0.0, 'no_of_unique_departments'),
 (0.0, 'no_of_unique_categories'),
 (0.0, 'no_of_items'),
 (0.0, 'Type_TRANSFER'),
 (0.0, 'Type_PAYMENT'),
 (0.0, 'Type_DEBIT')]

Despite high accuracy, the recall for fraud cases is ridiculously low - A recall of 0.01 for fraud means the model is missing out on 99% of actual fraud cases, which is a significant risk in a real-world scenario. Assuming that data imbalance is the most likely issue, adding a resampling strategy to improve the recall


In [106]:

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline


In [107]:
# Rebalance the dataset
# Use SMOTE to oversample the minority class and RandomUnderSampler to undersample the majority class
resampling = Pipeline([('SMOTE', SMOTE()), ('RandomUnderSampler', RandomUnderSampler())])
X_train_balanced, y_train_balanced = resampling.fit_resample(X_train_scaled, y_train)


In [108]:
# Create the random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
rf_model = rf_model.fit(X_train_balanced, y_train_balanced)

In [109]:
# Predict
predictions = rf_model.predict(X_test_scaled)

In [110]:
# Evaluations
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Legitemate [0]", "Fraud [1]"], columns=["Legitemate [0]", "Fraud [1]"]
)
acc_score = accuracy_score(y_test, predictions)


In [111]:

# Display results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Legitemate [0],Fraud [1]
Legitemate [0],15675,83
Fraud [1],331,8


Accuracy Score : 0.9742809219109151
Classification Report
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     15758
           1       0.09      0.02      0.04       339

    accuracy                           0.97     16097
   macro avg       0.53      0.51      0.51     16097
weighted avg       0.96      0.97      0.97     16097



Testing balanced model approach to improve fraud detection

In [112]:
# Use class_weight='balanced' for cost-sensitive training
rf_model = RandomForestClassifier(n_estimators=500, random_state=78, class_weight='balanced')
rf_model = rf_model.fit(X_train_balanced, y_train_balanced)


In [113]:
# Predict probabilities
probabilities = rf_model.predict_proba(X_test_scaled)[:, 1]

In [114]:
# Adjust the decision threshold to increase recall
threshold = 0.3
predictions = np.where(probabilities > threshold, 1, 0)

In [115]:
# Evaluations
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Legitemate [0]", "Fraud [1]"], columns=["Legitemate [0]", "Fraud [1]"]
)
acc_score = accuracy_score(y_test, predictions)

# Display results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Legitemate [0],Fraud [1]
Legitemate [0],15444,314
Fraud [1],317,22


Accuracy Score : 0.9608001490961049
Classification Report
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     15758
           1       0.07      0.06      0.07       339

    accuracy                           0.96     16097
   macro avg       0.52      0.52      0.52     16097
weighted avg       0.96      0.96      0.96     16097



Increased a recall at cost of accuracy - dismall results - attempting to improve the model with XBoost to further improve fraud detection recall

In [116]:
import xgboost as xgb

In [117]:
# Convert the dataset into an optimized data structure that XGBoost supports
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dtest = xgb.DMatrix(X_test_scaled, label=y_test)

In [118]:
scale_pos_weight_val = float((y_train == 0).sum() / (y_train == 1).sum())


In [119]:
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'scale_pos_weight': scale_pos_weight_val,  # corrected this line
    'eta': 0.1
}

In [120]:
# Train the model
num_rounds = 300
bst = xgb.train(params, dtrain, num_rounds)

# Predict probabilities
probabilities = bst.predict(dtest)
predictions = [1 if prob > 0.5 else 0 for prob in probabilities]

# Evaluation
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Legitemate [0]", "Fraud [1]"], columns=["Legitemate [0]", "Fraud [1]"]
)
acc_score = accuracy_score(y_test, predictions)

In [121]:
# Display results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Legitemate [0],Fraud [1]
Legitemate [0],12616,3142
Fraud [1],61,278


Accuracy Score : 0.8010188233832392
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.80      0.89     15758
           1       0.08      0.82      0.15       339

    accuracy                           0.80     16097
   macro avg       0.54      0.81      0.52     16097
weighted avg       0.98      0.80      0.87     16097



lassification Report:

Class 0 (Legitimate Transactions):

Precision: 1.00 (When the model predicted a transaction was legitimate, it was correct 100% of the time.)
Recall: 0.80 (The model correctly identified 80% of all legitimate transactions.)
F1-Score: 0.89 (The harmonic mean of precision and recall, giving a balanced measure for this class.)
Class 1 (Fraudulent Transactions):

Precision: 0.08 (When the model predicted a transaction was fraudulent, it was correct only 8% of the time.)
Recall: 0.84 (The model correctly identified 84% of all fraudulent transactions, which is a good recall rate for fraud detection. However, this came at the cost of having many false positives.)
F1-Score: 0.15 (Given the low precision and high recall, the F1-score for this class is low, indicating a potential area of improvement.)

Interpretation:
The model seems to be taking a conservative approach to fraud detection. While it has done a commendable job in identifying 84% of the fraudulent transactions (recall for Class 1), this high recall rate for fraudulent transactions has come at the expense of a large number of false positives (3,146 legitimate transactions incorrectly flagged as fraudulent). This is reflected in the very low precision (0.08) for Class 1.

Such a model could be problematic in real-world scenarios, especially if there are high costs or inconveniences associated with falsely flagging a transaction as fraudulent.

#############################################------- JESS END COPYING HERE ----------------------------------##################

Failed attempt at imporveing the model by increasing the accuracy for fraud detection

In [122]:
#Predict probabilities
y_prob = rf_model.predict_proba(X_test_scaled)[:, 1]  # probabilities of the positive class

In [123]:
#Determine an optimal threshold.
from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(y_test, y_prob)

In [124]:
# Turn these into a dataframe for ease of use
thresholds_df = pd.DataFrame({'Threshold': thresholds, 'Precision': precisions[:-1], 'Recall': recalls[:-1]})

In [125]:
#set a minimum precision value and get the threshold for it:
min_precision = 0.2  # for example
optimal_threshold = thresholds_df[thresholds_df['Precision'] > min_precision]['Threshold'].min()


In [126]:
#Convert probabilities to class predictions based on the optimal threshold
y_pred_new_threshold = [1 if prob > optimal_threshold else 0 for prob in y_prob]

In [127]:
#ompute the confusion matrix and other metrics using y_pred_new_threshold
cm_new_threshold = confusion_matrix(y_test, y_pred_new_threshold)
print(cm_new_threshold)
print(classification_report(y_test, y_pred_new_threshold))

[[15752     6]
 [  337     2]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     15758
           1       0.25      0.01      0.01       339

    accuracy                           0.98     16097
   macro avg       0.61      0.50      0.50     16097
weighted avg       0.96      0.98      0.97     16097



Increased accuracy brought the model back to extremely low recall for fraud cases. 