1. Import Required Libraries

In [72]:
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Feature Selection
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif

# ML model + evaluation tools
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Ignore warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

2. Load Dataset & Display First Rows

In [73]:
import pandas as pd

df = pd.read_csv("PS_20174392719_1491204439457_log.csv")
df.head()


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


3. Dataset Structure & Statistical Summary

In [74]:
df.info() # View dataset summary
df.describe() # Statistical summary for numerical features

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


4. Data Understanding & Data Cleaning

In [75]:
df.isnull().sum()
df['type'].value_counts()
df['isFraud'].value_counts()

isFraud
0    6354407
1       8213
Name: count, dtype: int64

This block checks dataset quality and distribution of key columns.

0 (normal): 6,354,407 transactions

1 (fraud): 8,213 transactions

In [76]:
df['type'].value_counts()

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

Counts how many times each category appears in the type column.

In [None]:
# Remove account ID columns that do not provide useful info
df = df.drop(['nameOrig', 'nameDest'], axis=1)

# Convert transaction type (categorical) into numeric dummy variables
df = pd.get_dummies(df, columns=['type'], drop_first=True)

df.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,0,False,False,True,False
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0,False,False,True,False
2,1,181.0,181.0,0.0,0.0,0.0,1,0,False,False,False,True
3,1,181.0,181.0,0.0,21182.0,0.0,1,0,True,False,False,False
4,1,11668.14,41554.0,29885.86,0.0,0.0,0,0,False,False,True,False


ML models cannot use text columns like type, nameOrig (Sender Account), nameDest (Receiver Account)

5. Feature Engineering

Feature Engineering is the process of selecting, creating or modifying features like input variables or data to help machine learning models learn patterns more effectively.

In [78]:
# 1. Balance difference for origin account
df['balanceDiffOrig'] = df['newbalanceOrig'] - df['oldbalanceOrg']
df[['balanceDiffOrig', 'newbalanceOrig', 'oldbalanceOrg', 'isFraud']].head()

Unnamed: 0,balanceDiffOrig,newbalanceOrig,oldbalanceOrg,isFraud
0,-9839.64,160296.36,170136.0,0
1,-1864.28,19384.72,21249.0,0
2,-181.0,0.0,181.0,1
3,-181.0,0.0,181.0,1
4,-11668.14,29885.86,41554.0,0


This feature measures how much money left or entered the sender’s (origin) account after the transaction.

* For normal transactions: newbalanceOrig = oldbalanceOrg - amount
    - The difference should be negative (money is deducted).

* For suspicious/fraud transactions:
Balance inconsistencies may occur:
    - money not deducted properly
    - negative balances
    - unchanged balances despite large transfers

In [None]:
# 2. Balance difference for destination account
df['balanceDiffDest'] = df['newbalanceDest'] - df['oldbalanceDest']
df[['balanceDiffDest', 'newbalanceOrig', 'oldbalanceOrg', 'isFraud']].head()

Unnamed: 0,balanceDiffDest,newbalanceOrig,oldbalanceOrg,isFraud
0,0.0,160296.36,170136.0,0
1,0.0,19384.72,21249.0,0
2,0.0,0.0,181.0,1
3,-21182.0,0.0,181.0,1
4,0.0,29885.86,41554.0,0


This feature measures how much the receiver’s (destination) account balance changed.

Normal behavior: balanceDiffDest should be positive

- Destination balance increases by amount.

Abnormal behavior: 0 or negative
- No change in destination balance
- Excessive increase
- Negative or inconsistent calculations

In [None]:
# 3. Error value for origin account (ideal normal transaction = 0)
df['errorBalanceOrig'] = df['oldbalanceOrg'] - df['amount'] - df['newbalanceOrig']
df[['errorBalanceOrig', 'oldbalanceOrg', 'amount', 'newbalanceOrig', 'isFraud']].head()

Unnamed: 0,errorBalanceOrig,oldbalanceOrg,amount,newbalanceOrig,isFraud
0,0.0,170136.0,9839.64,160296.36,0
1,0.0,21249.0,1864.28,19384.72,0
2,0.0,181.0,181.0,0.0,1
3,0.0,181.0,181.0,0.0,1
4,0.0,41554.0,11668.14,29885.86,0


If the equation is correct: errorBalanceOrig = 0

If the equation is incorrect: errorBalanceOrig ≠ 0

Fraudulent transactions often break this rule because:
- balances don’t update
- accounts are fake
- money amounts do not match origin balance

In [81]:
# 4. Error value for destination account
df['errorBalanceDest'] = df['newbalanceDest'] + df['amount'] - df['oldbalanceDest']

df[['errorBalanceDest', 'newbalanceDest', 'amount', 'oldbalanceDest', 'isFraud']].head()

Unnamed: 0,errorBalanceDest,newbalanceDest,amount,oldbalanceDest,isFraud
0,9839.64,0.0,9839.64,0.0,0
1,1864.28,0.0,1864.28,0.0,0
2,181.0,0.0,181.0,0.0,1
3,-21001.0,0.0,181.0,21182.0,1
4,11668.14,0.0,11668.14,0.0,0


- Correct update: errorBalanceDest = 0
- Incorrect update: errorBalanceDest ≠ 0

Fraud accounts (destination side) often:
- Never reflect the deposit
- Have zero balance behavior
- Have sudden large jumps
- Or remain unchanged

In [82]:
# 5. Ratio of amount vs available balance (high ratio = suspicious)
df['ratio'] = df['amount'] / (df['oldbalanceOrg'] + 1)  # +1 prevents division by zero

df[['ratio', 'amount', 'oldbalanceOrg', 'isFraud']].head()

Unnamed: 0,ratio,amount,oldbalanceOrg,isFraud
0,0.057834,9839.64,170136.0,0
1,0.087731,1864.28,21249.0,0
2,0.994505,181.0,181.0,1
3,0.994505,181.0,181.0,1
4,0.280788,11668.14,41554.0,0


This feature measures how large the transaction is relative to the sender’s balance.

- High ratio (~1 or above): Sender is transferring almost all their money → suspicious, often fraud.

- Low ratio: Small portion of balance → normal behavior.

The +1 avoids division by zero errors.

6. Feature Selection Using Chi-Square Test

After completing data cleaning and feature engineering, the next step is to determine
which features are statistically most important for predicting fraud.  
For this purpose, we use the **Chi-Square (χ²) test**, a widely used method for feature
selection in classification tasks.

In [None]:
#Separating Features (X) and Target (y)
X = df.drop('isFraud', axis=1)
y = df['isFraud']

In [None]:
#Scaling the Features
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

#Applying Chi-Square Test (SelectKBest)
chi2_selector = SelectKBest(score_func=chi2, k=10)
chi2_selector.fit(X_scaled, y)

#Displaying Feature Importance Scores
chi2_scores = pd.DataFrame({
    'Feature': X.columns,
    'Chi2 Score': chi2_selector.scores_
}).sort_values(by='Chi2 Score', ascending=False)

chi2_scores

Unnamed: 0,Feature,Chi2 Score
10,type_TRANSFER,16917.030737
6,isFlaggedFraud,12379.217338
9,type_PAYMENT,2780.783232
1,amount,820.617059
0,step,714.605961
7,type_CASH_OUT,522.612423
11,balanceDiffOrig,150.551063
2,oldbalanceOrg,110.146644
3,newbalanceOrig,85.182381
8,type_DEBIT,53.550397


The Chi-Square table ranks ALL your features by importance — from highest score to lowest score.

These results show:

- Transfer transactions are heavily associated with fraud
- Flagged transactions also correlate strongly
- Payment amounts and time steps have moderate influence
- Other engineered features (balance differences, errors) also contribute

Feature selection

In [2]:
selected_features = [
    'amount',
    'oldbalanceOrg',
    'newbalanceDest',
    'errorBalanceOrig',
    'errorBalanceDest',
    'ratio',
    'type_TRANSFER',
    'type_CASH_OUT'
]


Chi-Square feature selection to identify the most statistically important variables related to fraud.

However, Chi-Square alone cannot capture logical fraud behaviors,
so I selected a hybrid set of features:

- High Chi2-scoring features (transfer types, amount) for statistical significance
- Logically important features (error balances, ratio) that indicate abnormal behavior

Together, these features give stronger predictive power for identifying fraud patterns.

7. Model Training, Prediction, and Evaluation

Build a machine learning model that can learn patterns from past transactions and classify new transactions as fraud (1) or
non-fraud (0). This process involves splitting the data, training the model, making predictions,
and evaluating how well the model performs.

Split the dataset into:

- 80% Training data → used to teach the model
- 20% Testing data → used to evaluate the model’s performance

In [None]:
#Selecting Features (X) and Target (y)
X = df[selected_features]
y = df['isFraud']

# Train/Test Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Logistic Regression Model
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

# Prediction
y_pred = model.predict(X_test)

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Count fraud vs non-fraud
total_normal = df[df['isFraud'] == 0].shape[0]
total_fraud = df[df['isFraud'] == 1].shape[0]

print("Number of non-fraud transactions :", total_normal)
print("Number of fraud transactions     :", total_fraud)

Accuracy: 0.9992157318840352
F1-score: 0.5959514170040486

Confusion Matrix:
 [[1270790     114]
 [    884     736]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270904
           1       0.87      0.45      0.60      1620

    accuracy                           1.00   1272524
   macro avg       0.93      0.73      0.80   1272524
weighted avg       1.00      1.00      1.00   1272524

Number of non-fraud transactions : 6354407
Number of fraud transactions     : 8213


Accuracy: 0.9992
But for imbalanced datasets, accuracy can be misleading because predicting everything as “non-fraud” still gives high accuracy.

F1-score: 0.5959
Balances precision and recall, especially important for fraud detection.


Confusion Matrix:

- 736 fraud cases correctly detected
- 884 fraud cases missed
- 114 normal cases incorrectly flagged as fraud

This shows:

- Excellent detection of normal transactions
- Moderate detection of fraud cases. (Because fraud is extremely rare.

This shows how imbalanced the dataset is:

- 6,354,407 non-fraud
- 8,213 fraud
Fraud = only 0.12% of the dataset.

This imbalance explains:
- High accuracy
- Lower fraud recall
- Difficulty detecting minority class

In [87]:
import pandas as pd

# Create 10 synthetic transactions for testing
test_samples = pd.DataFrame([
    # amount, oldOrg, newDest, errOrig, errDest, ratio, transfer, cashout
    [50000, 0, 0, -50000, 0, 50.0, 1, 0],        # Normal PAYMENT-like
    [49900, 49000, 0, -49900, 0, 1.01, 0, 1],    # TRANSFER almost entire balance
    [100, 1500, 1600, -100, 100, 0.06, 0, 0], # Normal small transaction
    [20000, 25000, 100000, -20000, 80000, 0.80, 1, 0], # Classic Fraud pattern
    [3000, 10000, 5000, -3000, 3000, 0.30, 1, 0], # Balanced TRANSFER
    [15000, 16000, -5000, -15000, -15000, 0.93, 1, 0], # High ratio, suspicious
    [80, 15000, 0, -80, 0, 0.005, 0, 0],      # Very normal micro-payment
    [50000, 51000, 0, -50000, 0, 0.98, 1, 1], # Extreme Fraud pattern
    [600, 600, 0, -600, 0, 1.0, 1, 0],        # Suspicious: draining account
    [19995, 20000, 0, -19995, 0, 0.999, 0, 1], # Normal TRANSFER
], columns=selected_features)

# Predict with logistic regression model
test_predictions = model.predict(test_samples)
test_probs = model.predict_proba(test_samples)[:, 1]

# Combine results
results = test_samples.copy()
results['predicted_fraud'] = test_predictions
results['fraud_probability'] = test_probs

results



Unnamed: 0,amount,oldbalanceOrg,newbalanceDest,errorBalanceOrig,errorBalanceDest,ratio,type_TRANSFER,type_CASH_OUT,predicted_fraud,fraud_probability
0,50000,0,0,-50000,0,50.0,1,0,0,0.00108
1,49900,49000,0,-49900,0,1.01,0,1,0,0.000502
2,100,1500,1600,-100,100,0.06,0,0,0,0.001322
3,20000,25000,100000,-20000,80000,0.8,1,0,0,0.000675
4,3000,10000,5000,-3000,3000,0.3,1,0,0,0.001225
5,15000,16000,-5000,-15000,-15000,0.93,1,0,0,0.001273
6,80,15000,0,-80,0,0.005,0,0,0,0.001253
7,50000,51000,0,-50000,0,0.98,1,1,0,0.000491
8,600,600,0,-600,0,1.0,1,0,0,0.001308
9,19995,20000,0,-19995,0,0.999,0,1,0,0.000634
