In [2]:
import pandas as pd
# Location of the preprocessed dataset (relative path).
CLEANED_DATA_PATH = '../data/processed/cleaned_data.csv'
df = pd.read_csv(CLEANED_DATA_PATH)

# Preview the first rows to confirm the file loaded as expected.
df.head()

Unnamed: 0,amt,gender,city_pop,is_fraud,trans_hour,trans_day,trans_weekday,trans_month,age,merchant_enc,...,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,state_enc
0,-0.424463,0,0.816521,0,-0.118789,0.506526,1.502388,-1.773597,0.30331,-0.622497,...,0,0,0,0,0,1,0,0,0,0.022567
1,-0.252337,1,-0.292685,0,-0.118789,0.506526,1.502388,-1.773597,-0.900787,-0.406216,...,0,0,0,0,0,1,0,0,0,0.008382
2,-0.179353,1,-0.178853,0,-0.118789,0.506526,1.502388,-1.773597,0.188635,-0.538633,...,1,0,0,0,0,0,0,0,0,0.064633
3,-0.059605,0,-0.111371,0,-0.118789,0.506526,1.502388,-1.773597,-0.786111,-0.732844,...,0,0,0,0,1,0,0,0,0,0.032578
4,-0.422358,0,-0.289942,0,-0.118789,0.506526,1.502388,-1.773597,1.048704,-2.237981,...,0,0,0,0,0,0,0,0,1,0.035397


In [3]:
# Summarise the frame: number of rows, each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 25 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   amt                      555719 non-null  float64
 1   gender                   555719 non-null  int64  
 2   city_pop                 555719 non-null  float64
 3   is_fraud                 555719 non-null  int64  
 4   trans_hour               555719 non-null  float64
 5   trans_day                555719 non-null  float64
 6   trans_weekday            555719 non-null  float64
 7   trans_month              555719 non-null  float64
 8   age                      555719 non-null  float64
 9   merchant_enc             555719 non-null  float64
 10  job_enc                  555719 non-null  float64
 11  category_food_dining     555719 non-null  int64  
 12  category_gas_transport   555719 non-null  int64  
 13  category_grocery_net     555719 non-null  int64  
 14  cate

In [4]:
# Pull out the label first, then keep every other column as a feature.
y = df['is_fraud']
X = df.drop(columns=['is_fraud'])

# Sanity check: X keeps all rows and has one column fewer than df.
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (555719, 24)
y shape: (555719,)


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
#
# Why stratify=y: fraud makes up only about 0.4% of the rows, so a purely
# random split could leave the training and test sets with noticeably
# different fraud rates (or very few fraud rows in one of them). Stratifying
# on y keeps the fraud / non-fraud proportions the same in both sets.
# random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (444575, 24)
Test shape: (111144, 24)


In [6]:
# Class proportions in each split; sep="\n" places each Series under its heading.
print("Train distribution:", y_train.value_counts(normalize=True), sep="\n")
print("\nTest distribution:", y_test.value_counts(normalize=True), sep="\n")

Train distribution:
is_fraud
0    0.99614
1    0.00386
Name: proportion, dtype: float64

Test distribution:
is_fraud
0    0.99614
1    0.00386
Name: proportion, dtype: float64


In [7]:
# Both the training and test sets have the same class proportions (about 99.6% non-fraud, 0.4% fraud), so the stratified split worked as intended.

In [8]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier for the heavily imbalanced fraud target.
#
# class_weight='balanced' weights each class by
#     n_samples / (n_classes * n_samples_in_class)
# so a mistake on the rare fraud class costs more than a mistake on a
# non-fraud row. This improves fraud recall and stops the model from simply
# predicting "not fraud" for everything.
#
# max_iter=1000 raises the solver's iteration cap (the default is 100) to give
# it room to converge; it does not by itself guarantee convergence.
#
# n_jobs is deliberately not passed: for LogisticRegression it only ever
# parallelised across classes in one-vs-rest fits, so it has no effect on this
# binary problem, and recent scikit-learn releases deprecate the parameter.
lr = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
)

lr.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [9]:
from sklearn.metrics import classification_report, roc_auc_score

# Score the held-out test set.
# Column 1 of predict_proba is the probability of class 1 (fraud).
y_prob = lr.predict_proba(X_test)[:, 1]
# Hard 0/1 labels from the model's default decision rule.
y_pred = lr.predict(X_test)

# Per-class precision / recall / F1 on the hard labels.
print(classification_report(y_test, y_pred))
# ROC-AUC is computed from the probabilities, so it does not depend on a threshold.
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

              precision    recall  f1-score   support

           0       1.00      0.89      0.94    110715
           1       0.02      0.74      0.05       429

    accuracy                           0.89    111144
   macro avg       0.51      0.81      0.49    111144
weighted avg       1.00      0.89      0.94    111144

ROC-AUC: 0.9144184542369069


Precision: Of all the times the model predicted a given class, how often was it actually right? Here, nearly all non-fraud predictions were correct (precision rounds to 1.00), but only about 2% of the transactions flagged as fraud were actually fraud.

Recall: Of all the actual cases of a given class, how many did the model find? Here, 89% of genuine transactions were correctly cleared and the remaining 11% were wrongly flagged as fraud (false positives), while 74% of fraudulent transactions were detected and 26% were missed (false negatives).

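The F1-score is the "middle ground": it combines precision and recall into a single number using their harmonic mean, $F_1 = 2 \times \dfrac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}$. Because the harmonic mean is pulled towards the smaller of the two values, F1 penalizes a large gap between precision and recall. Precision for the fraud class is extremely low, so its F1-score is only 0.05 even though recall is 0.74.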

Support is the number of actual samples of each class in the test set: 110,715 non-fraud and 429 fraud transactions.