In [8]:
!pip install pandas numpy scikit-learn matplotlib seaborn imbalanced-learn xgboost

# Data handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Handle class imbalance
from imblearn.over_sampling import SMOTE

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')




In [11]:
df = pd.read_csv("Fraud.csv")
df.head()


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [12]:
# Shape of dataset
df.shape



(6362620, 11)

In [13]:

df.dtypes

step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object

In [14]:

df.isnull().sum()


step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [15]:

df['isFraud'].value_counts()

df['isFraud'].value_counts(normalize=True) * 100


isFraud
0    99.870918
1     0.129082
Name: proportion, dtype: float64

In [16]:
df.drop(['nameOrig', 'nameDest'], axis=1, inplace=True)


In [17]:
df.columns


Index(['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
       'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud'],
      dtype='object')

In [21]:

df = pd.get_dummies(df, columns=['type'], drop_first=True)

df.columns

Index(['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest', 'isFraud', 'isFlaggedFraud', 'type_CASH_OUT',
       'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER'],
      dtype='object')

In [22]:
# sender
df['orig_balance_diff'] = df['oldbalanceOrg'] - df['newbalanceOrig']
# receiver
df['dest_balance_diff'] = df['newbalanceDest'] - df['oldbalanceDest']


In [23]:
df[['orig_balance_diff', 'dest_balance_diff']].head()


Unnamed: 0,orig_balance_diff,dest_balance_diff
0,9839.64,0.0
1,1864.28,0.0
2,181.0,0.0
3,181.0,-21182.0
4,11668.14,0.0


In [24]:
from sklearn.model_selection import train_test_split

X = df.drop('isFraud', axis=1)
y = df['isFraud']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)


In [25]:
y_train.value_counts(normalize=True) * 100


isFraud
0    99.870926
1     0.129074
Name: proportion, dtype: float64

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

X = df.drop('isFraud', axis=1)
y = df['isFraud']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)


scaler = StandardScaler()
X_train_res = scaler.fit_transform(X_train_res)
X_test = scaler.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_res, y_train_res)

y_pred = model.predict(X_test)


print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[1216347   54534]
 [     45    1598]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98   1270881
           1       0.03      0.97      0.06      1643

    accuracy                           0.96   1272524
   macro avg       0.51      0.96      0.52   1272524
weighted avg       1.00      0.96      0.98   1272524




Fraud is highly imbalanced (~0.13%).
Accuracy alone is misleading.
Priority: maximize recall for fraud class; tolerate some false alarms.
Dropped identifiers: nameOrig, nameDest.
Encoded 'type' using one-hot encoding.
Created balance change features:
- orig_balance_diff
- dest_balance_diff
These capture abnormal account balance movements.
Model used: Random Forest.
Reasons:
- handles nonlinearity
- robust to noise
- works well on tabular financial data
- avoids heavy SMOTE on large dataset
High recall for fraud → most fraud detected.
Low precision → some false alarms.
Acceptable for fraud detection since missing fraud is costly.



In [34]:
importance = pd.Series(model.coef_[0], index=X.columns)
importance.sort_values(ascending=False).head(10)


orig_balance_diff    66.206992
type_CASH_OUT        29.188178
type_TRANSFER        27.546699
oldbalanceOrg        18.701647
isFlaggedFraud       11.842737
oldbalanceDest        1.735457
step                  0.561248
type_DEBIT           -1.262817
newbalanceDest       -2.185604
type_PAYMENT         -7.553241
dtype: float64


Dropped identifiers: nameOrig, nameDest.
Encoded 'type' using one-hot encoding.
Created balance change features:
- orig_balance_diff
- dest_balance_diff
These capture abnormal account balance movements.

Key predictors of fraud:

1) orig_balance_diff – large drop in sender balance
2) type_CASH_OUT – cash-out transactions associated with fraud
3) type_TRANSFER – transfers strongly linked to fraud behaviour
4) oldbalanceOrg – high sender balance increases risk
5) isFlaggedFraud – aligns with rule-based thresholds

Low-risk transactions:
- PAYMENT and DEBIT types show negative fraud association.

