In [1]:
# all imports needed

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    confusion_matrix,
    classification_report
)
from sklearn.dummy import DummyClassifier

from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_csv("cleaned_dataset.csv")
df.head()

Unnamed: 0,transaction_id,client_id,card_id,use_chip,merchant_id,mcc,fraud,has_chip,num_cards_issued,year_pin_last_changed,...,card_brand_Mastercard,card_brand_Visa,card_type_Debit,card_type_Debit (Prepaid),outlier_iqr,amount_norm,credit_limit_norm,total_debt_norm,yearly_income_norm,credit_score_norm
0,22326462,496,3186,1,30286,4814,0,1,1,2016,...,0,1,1,0,0,0.087068,0.159246,0.096864,0.129615,0.66759
1,22326465,1129,2677,1,27092,4829,0,1,1,2011,...,1,0,0,0,0,0.084347,0.053752,0.079116,0.122942,0.545706
2,22326466,114,5283,1,61195,5541,0,1,2,2013,...,0,1,1,0,0,0.077559,0.075507,0.001964,0.122913,0.65374
3,22326467,641,2774,0,75781,5411,0,1,2,2017,...,0,1,1,0,0,0.085092,0.130143,0.0,0.13403,0.68144
4,22326468,114,5283,1,61195,5541,0,1,2,2013,...,0,1,1,0,0,0.081817,0.075507,0.001964,0.122913,0.65374


Prompt: Give me the code to make sure the fraud column only has 0 and 1 in it

In [3]:
# convert fraud to numeric if needed
if df["fraud"].dtype == "object":
    df["fraud"] = df["fraud"].map({"no": 0, "yes": 1})

if df["fraud"].dtype == "bool":
    df["fraud"] = df["fraud"].astype(int)

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777336 entries, 0 to 777335
Data columns (total 28 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   transaction_id             777336 non-null  int64  
 1   client_id                  777336 non-null  int64  
 2   card_id                    777336 non-null  int64  
 3   use_chip                   777336 non-null  int64  
 4   merchant_id                777336 non-null  int64  
 5   mcc                        777336 non-null  int64  
 6   fraud                      777336 non-null  int64  
 7   has_chip                   777336 non-null  int64  
 8   num_cards_issued           777336 non-null  int64  
 9   year_pin_last_changed      777336 non-null  int64  
 10  card_on_dark_web           777336 non-null  int64  
 11  current_age                777336 non-null  int64  
 12  gender                     777336 non-null  int64  
 13  per_capita_income          77

Prompt: Show how many fraud and non fraud rows I have so I can see the class imbalance.

In [4]:
fraud_counts = df["fraud"].value_counts()
fraud_percent = df["fraud"].value_counts(normalize=True) * 100

print("Fraud counts:")
print(fraud_counts)
print("\nFraud percentages:")
print(fraud_percent.round(4))

Fraud counts:
fraud
0    775976
1      1360
Name: count, dtype: int64

Fraud percentages:
fraud
0    99.825
1     0.175
Name: proportion, dtype: float64


Prompt: code to split the data into train and test sets with stratification on fraud so that the imbalance stays.

In [5]:
RANDOM_STATE = 42

X = df.drop(columns=["fraud"])
y = df["fraud"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

print("\nTrain fraud distribution:")
print(y_train.value_counts(normalize=True) * 100)

print("\nTest fraud distribution:")
print(y_test.value_counts(normalize=True) * 100)

Train shape: (621868, 27)
Test shape: (155468, 27)

Train fraud distribution:
fraud
0    99.825043
1     0.174957
Name: proportion, dtype: float64

Test fraud distribution:
fraud
0    99.825044
1     0.174956
Name: proportion, dtype: float64


Prompt: scale all features with StandardScaler fitted only on the training data, then transform both train and test.

In [6]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Prompt: code to apply SMOTE on the scaled training data with sampling strategy 0.1.

In [7]:
smote = SMOTE(
    sampling_strategy=0.1,
    random_state=RANDOM_STATE
)

X_train_res, y_train_res = smote.fit_resample(
    X_train_scaled,
    y_train
)

print("After SMOTE, train shape:", X_train_res.shape)
print("After SMOTE fraud distribution:")
print(y_train_res.value_counts(normalize=True) * 100)

After SMOTE, train shape: (682858, 27)
After SMOTE fraud distribution:
fraud
0    90.909091
1     9.090909
Name: proportion, dtype: float64


Prompt: create Logistic Regression and Decision Tree models without class_weight since I already applied SMOTE

In [8]:
log_model = LogisticRegression(
    solver="lbfgs",
    max_iter=1000,
    class_weight="balanced",
    n_jobs=-1
)

tree_model = DecisionTreeClassifier(
    random_state=RANDOM_STATE,
    class_weight="balanced",
)

Prompt: give me the code to run 5 fold cross validation on the resampled training data for both models and print mean scores

In [9]:
scoring = {
    "accuracy": "accuracy",
    "precision": "precision",
    "recall": "recall",
    "f1": "f1",
    "roc_auc": "roc_auc",
    "pr_auc": "average_precision"
}

models = [
    ("Logistic Regression", log_model),
    ("Decision Tree", tree_model)
]

for name, model in models:
    print(f"\n{name}")
    cv_results = cross_validate(
        model,
        X_train_res,
        y_train_res,
        cv=5,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False
    )
    for metric_name in scoring.keys():
        scores = cv_results[f"test_{metric_name}"]
        print(f"{metric_name}: {scores.mean():.4f} (+/- {scores.std():.4f})")


Logistic Regression
accuracy: 0.6701 (+/- 0.0021)
precision: 0.1653 (+/- 0.0014)
recall: 0.6488 (+/- 0.0036)
f1: 0.2634 (+/- 0.0020)
roc_auc: 0.7246 (+/- 0.0027)
pr_auc: 0.1946 (+/- 0.0019)

Decision Tree
accuracy: 0.9973 (+/- 0.0014)
precision: 0.9860 (+/- 0.0026)
recall: 0.9839 (+/- 0.0172)
f1: 0.9848 (+/- 0.0078)
roc_auc: 0.9912 (+/- 0.0085)
pr_auc: 0.9715 (+/- 0.0136)


In [10]:
log_model.fit(X_train_res, y_train_res)
tree_model.fit(X_train_res, y_train_res)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


Prompt: give me the code to get predicted probabilities and whether or not it is fraud on the scaled test set for both models with threshold 0.5.

In [11]:
# Logistic Regression
y_prob_log = log_model.predict_proba(X_test_scaled)[:, 1]
# later changed to 0.2 manually
y_pred_log = (y_prob_log >= 0.2).astype(int)

# Decision Tree
y_prob_tree = tree_model.predict_proba(X_test_scaled)[:, 1]
y_pred_tree = (y_prob_tree >= 0.5).astype(int)


Prompt: give me the code to print a confusion matrix, accuracy, precision, recall, f1 score, roc auc, pr auc, and a report.

In [12]:
def evaluate_model(model_name, y_true, y_pred, y_prob):
    print(f"\n==== {model_name} ====")

    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    print("Confusion matrix:")
    print(cm)
    print(f"TN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}\n")

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    roc = roc_auc_score(y_true, y_prob)
    pr_auc = average_precision_score(y_true, y_prob)

    print(f"Accuracy:      {acc:.4f}")
    print(f"Precision:     {prec:.4f}")
    print(f"Recall:        {rec:.4f}")
    print(f"F1 score:      {f1:.4f}")
    print(f"ROC AUC:       {roc:.4f}")
    print(f"PR AUC:        {pr_auc:.4f}\n")

    print("Classification report:")
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))


In [13]:
evaluate_model("Logistic Regression", y_test, y_pred_log, y_prob_log)
evaluate_model("Decision Tree", y_test, y_pred_tree, y_prob_tree)



==== Logistic Regression ====
Confusion matrix:
[[ 15291 139905]
 [     7    265]]
TN: 15291, FP: 139905, FN: 7, TP: 265

Accuracy:      0.1001
Precision:     0.0019
Recall:        0.9743
F1 score:      0.0038
ROC AUC:       0.6957
PR AUC:        0.0055

Classification report:
              precision    recall  f1-score   support

           0     0.9995    0.0985    0.1794    155196
           1     0.0019    0.9743    0.0038       272

    accuracy                         0.1001    155468
   macro avg     0.5007    0.5364    0.0916    155468
weighted avg     0.9978    0.1001    0.1791    155468


==== Decision Tree ====
Confusion matrix:
[[154989    207]
 [   155    117]]
TN: 154989, FP: 207, FN: 155, TP: 117

Accuracy:      0.9977
Precision:     0.3611
Recall:        0.4301
F1 score:      0.3926
ROC AUC:       0.7144
PR AUC:        0.1563

Classification report:
              precision    recall  f1-score   support

           0     0.9990    0.9987    0.9988    155196
           1

Prompt: give me the code to create a dummy model so we have something to compare against.

In [14]:
dummy = DummyClassifier(strategy="most_frequent", random_state=RANDOM_STATE)
dummy.fit(X_train, y_train)

y_dummy = dummy.predict(X_test)
y_dummy_prob = dummy.predict_proba(X_test)[:, 1]

evaluate_model("Dummy majority baseline", y_test, y_dummy, y_dummy_prob)



==== Dummy majority baseline ====
Confusion matrix:
[[155196      0]
 [   272      0]]
TN: 155196, FP: 0, FN: 272, TP: 0

Accuracy:      0.9983
Precision:     0.0000
Recall:        0.0000
F1 score:      0.0000
ROC AUC:       0.5000
PR AUC:        0.0017

Classification report:
              precision    recall  f1-score   support

           0     0.9983    1.0000    0.9991    155196
           1     0.0000    0.0000    0.0000       272

    accuracy                         0.9983    155468
   macro avg     0.4991    0.5000    0.4996    155468
weighted avg     0.9965    0.9983    0.9974    155468

