In [16]:
# Import necessary libraries for data analysis, machine learning, and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.patches as mpatches
import seaborn as sns
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

# For reproducibility
np.random.seed(42)

import joblib
from sklearn.metrics import confusion_matrix
from scipy.stats import chi2_contingency
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

import warnings
warnings.filterwarnings("ignore")

from google.colab import drive
drive.mount('/content/drive')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!ls /content/drive/MyDrive/

'AI FINANCE PROJECT.docx'
'AIML Dataset.csv'
 Book1-1.xlsx
'Brighton Jeremy Resume.docx'
'Colab Notebooks'
'Copy of AIML Dataset (1).csv'
'Copy of AIML Dataset (2).csv'
'Copy of AIML Dataset (3).csv'
'Copy of AIML Dataset.csv'
 EXCEL04.xlsx
'For building AI systems to detect suspicious financial transactions.docx'
'NIRU roadmap.docx'
'NIRU roadmap.gdoc'
'PERFORMANCE APPRAISAL TOOL FOR USE 2023 2024 FINAL.doc'
 SPEAR.docx
 SPEAR.gdoc


In [6]:
import pandas as pd

file_path = "/content/drive/MyDrive/AIML Dataset.csv"
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [7]:
# Columns to drop: IDs, Names, Rule-based flag
drop_cols = ['nameOrig', 'nameDest', 'isFlaggedFraud', 'step']  # step may be used later, remove only if not needed now

# Drop columns safely
df_clean = df.drop(columns=[c for c in drop_cols if c in df.columns], errors='ignore')
print(f"✅ Columns after dropping: {df_clean.columns.tolist()}")

# Check dataset shape after dropping
print(f"New dataset shape: {df_clean.shape}")

✅ Columns after dropping: ['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'isFraud']
New dataset shape: (6362620, 7)


In [8]:
# HANDLE MISSING VALUES & REMOVE DUPLICATES

#  Separate numeric and categorical columns
numeric_cols = df_clean.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols = [c for c in numeric_cols if c != 'isFraud']  # exclude target

categorical_cols = df_clean.select_dtypes(include=['object']).columns.tolist()

# Fill missing values
# Numeric → median
for col in numeric_cols:
    df_clean[col].fillna(df_clean[col].median(), inplace=True)

# Categorical → 'missing'
for col in categorical_cols:
    df_clean[col].fillna('missing', inplace=True)

# Remove duplicates
df_clean = df_clean.drop_duplicates()

# Final checks
print(f"Data cleaned. Shape after cleaning: {df_clean.shape}")
print("\n--- Missing values after cleaning ---")
print(df_clean.isnull().sum())
print(f"Number of duplicates after cleaning: {df_clean.duplicated().sum()}")

# 5Preview
display(df_clean.head())


Data cleaned. Shape after cleaning: (6264740, 7)

--- Missing values after cleaning ---
type              0
amount            0
oldbalanceOrg     0
newbalanceOrig    0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
dtype: int64
Number of duplicates after cleaning: 0


Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0
1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0
2,TRANSFER,181.0,181.0,0.0,0.0,0.0,1
3,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1
4,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0


In [9]:
# SEPARATE FRAUD AND NON-FRAUD
fraud_df = df[df['isFraud'] == 1]  # Keep all frauds
nonfraud_df = df[df['isFraud'] == 0]  # Non-fraud for sampling

print("Fraud rows:", len(fraud_df))
print("Non-fraud rows:", len(nonfraud_df))

Fraud rows: 8213
Non-fraud rows: 6354407


In [10]:
# CREATE AMOUNT BUCKETS FOR NON-FRAUD STRATIFICATION
# Example: 5 quantile-based buckets
nonfraud_df['amount_bucket'] = pd.qcut(nonfraud_df['amount'], q=5, labels=False)

# Now stratify by 'type' and 'amount_bucket'
strata_cols = ['type', 'amount_bucket']

In [11]:
# STRATIFIED SAMPLING OF NON-FRAUD
target_nonfraud_rows = 100_000 - len(fraud_df)  # total ~100k rows
sampled_nonfraud = nonfraud_df.groupby(strata_cols, group_keys=False).apply(
    lambda x: x.sample(
        n=int(target_nonfraud_rows * len(x)/len(nonfraud_df)),
        random_state=42
    )
)

print("Sampled non-fraud rows:", len(sampled_nonfraud))

Sampled non-fraud rows: 91777


In [12]:
# COMBINE FRAUD + SAMPLED NON-FRAUD
df_sampled = pd.concat([fraud_df, sampled_nonfraud], ignore_index=True)

print("Total sampled rows (~1M):", len(df_sampled))
print("Fraud ratio in sampled dataset:", df_sampled['isFraud'].mean())

Total sampled rows (~1M): 99990
Fraud ratio in sampled dataset: 0.08213821382138214


In [13]:
# OPTIONAL: Drop temporary columns
if 'amount_bucket' in df_sampled.columns:
    df_sampled.drop(columns=['amount_bucket'], inplace=True)

In [14]:
# Basic info
print("Dataset Information:")
df_sampled.info(memory_usage='deep')

# 2Quick stats
print("\nDataset Description:")
display(df_sampled.describe(include='all'))

#  Check class distribution
fraud_count = df_sampled['isFraud'].sum()
nonfraud_count = len(df_sampled) - fraud_count
total = len(df_sampled)
print(f"\nTotal transactions: {total}")
print(f"Fraudulent: {fraud_count} ({fraud_count/total:.4%})")
print(f"Non-Fraudulent: {nonfraud_count} ({nonfraud_count/total:.4%})")

# Optional: first 5 rows
print("\nPreview of sampled dataset:")
display(df_sampled.head())

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99990 entries, 0 to 99989
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            99990 non-null  int64  
 1   type            99990 non-null  object 
 2   amount          99990 non-null  float64
 3   nameOrig        99990 non-null  object 
 4   oldbalanceOrg   99990 non-null  float64
 5   newbalanceOrig  99990 non-null  float64
 6   nameDest        99990 non-null  object 
 7   oldbalanceDest  99990 non-null  float64
 8   newbalanceDest  99990 non-null  float64
 9   isFraud         99990 non-null  int64  
 10  isFlaggedFraud  99990 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 22.8 MB

Dataset Description:


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,99990.0,99990,99990.0,99990,99990.0,99990.0,99990,99990.0,99990.0,99990.0,99990.0
unique,,5,,99987,,,93251,,,,
top,,CASH_OUT,,C896313584,,,C991148487,,,,
freq,,36375,,2,,,5,,,,
mean,254.027273,,284986.5,,903425.7,804739.9,,1057731.0,1232540.0,0.082138,0.00016
std,153.290297,,975737.1,,2958978.0,2867433.0,,3575339.0,3893350.0,0.274577,0.012649
min,1.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0
25%,156.0,,14809.86,,0.0,0.0,,0.0,0.0,0.0,0.0
50%,252.0,,87019.52,,20390.0,0.0,,97028.63,202919.8,0.0,0.0
75%,350.0,,233436.7,,173745.6,112988.6,,875174.8,1111135.0,0.0,0.0



Total transactions: 99990
Fraudulent: 8213 (8.2138%)
Non-Fraudulent: 91777 (91.7862%)

Preview of sampled dataset:


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
1,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
2,1,TRANSFER,2806.0,C1420196421,2806.0,0.0,C972765878,0.0,0.0,1,0
3,1,CASH_OUT,2806.0,C2101527076,2806.0,0.0,C1007251739,26202.0,0.0,1,0
4,1,TRANSFER,20128.0,C137533655,20128.0,0.0,C1848415041,0.0,0.0,1,0


In [19]:
# Save the cleaned 100k dataset to Google Drive
df_sampled.to_csv('/content/drive/MyDrive/AIML_Dataset_100k.csv', index=False)
print("✅ 100k cleaned dataset saved successfully to Drive!")

✅ 100k cleaned dataset saved successfully to Drive!
