In [11]:
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import warnings
import os
warnings.filterwarnings('ignore')

In [13]:
# ===================== IMPORTS =====================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ===================== LOAD DATA =====================
# Exploratory pass over the transactions file: load, rename columns, clean the
# numeric fields, print class counts / summary stats, derive a few features,
# and draw four diagnostic plots.
# Change path if needed
file_path = "AIML Dataset3.csv"
df = pd.read_csv(file_path)

print("Shape of dataset:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

# ===================== RENAME COLUMNS =====================
# Raw header -> descriptive snake_case name.
col_map = {
    'step': 'time_step',
    'type': 'transaction_type',
    'amount': 'amount',
    'nameOrig': 'origin_account',
    'oldbalanceOrg': 'origin_old_balance',
    'newbalanceOrig': 'origin_new_balance',
    'nameDest': 'destination_account',
    'oldbalanceDest': 'destination_old_balance',
    'newbalanceDest': 'destination_new_balance',
    'isFraud': 'is_fraud',
    'isFlaggedFraud': 'is_flagged_fraud'
}
# Only headers that are actually present are renamed; rebinding (instead of
# inplace=True) keeps the cell safe to re-run.
df = df.rename(columns={k: v for k, v in col_map.items() if k in df.columns})

print("\nColumns after renaming:", df.columns.tolist())

# ===================== CLEAN NUMERIC COLUMNS =====================
numeric_cols = [
    "amount",
    "origin_old_balance", "origin_new_balance",
    "destination_old_balance", "destination_new_balance"
]

# Restrict to the columns that exist so the coercion loop and the dropna below
# agree (dropna raises KeyError for an unknown subset label).
present_numeric_cols = [col for col in numeric_cols if col in df.columns]

for col in present_numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")  # unparseable values become NaN

# Drop rows where key numeric values are missing
df = df.dropna(subset=present_numeric_cols)

# ===================== FRAUD COUNTS =====================
fraud_counts = df['is_fraud'].value_counts()
print("\nFraud vs Non-Fraud Counts:")
print(fraud_counts)

# .get avoids a KeyError when the file contains no fraud rows; the total check
# avoids a division by zero on an empty frame.
total_rows = fraud_counts.sum()
fraud_percentage = (fraud_counts.get(1, 0) / total_rows) * 100 if total_rows else 0.0
print("\nFraud percentage: {:.4f}%".format(fraud_percentage))

# ===================== DESCRIPTIVE STATS =====================
print("\nSummary Statistics (numeric columns):")
print(df.describe())

# ===================== FEATURE ENGINEERING =====================
# Amount leaving the origin account / arriving at the destination account.
df['balance_change_origin'] = df['origin_old_balance'] - df['origin_new_balance']
df['balance_change_dest'] = df['destination_new_balance'] - df['destination_old_balance']
# NOTE(review): assumes one time_step equals one hour — confirm against the data source.
df['transaction_hour'] = df['time_step'] % 24
# 1 when the amount is an exact multiple of 1000, else 0.
df['is_round_amount'] = (df['amount'] % 1000 == 0).astype(int)

# ===================== VISUALIZATIONS =====================

# Fraud vs Non-Fraud count
plt.figure(figsize=(6,4))
sns.countplot(x='is_fraud', data=df, palette=['#4CAF50','#F44336'])
plt.title("Fraud vs Non-Fraud Transactions")
plt.xlabel("Transaction Type (0 = Normal, 1 = Fraud)")
plt.ylabel("Count")
plt.show()

# Transaction amount distribution (log scale)
# Zero amounts have no logarithm, so they are left out of the log-scaled histogram.
positive_amounts = df[df['amount'] > 0]
plt.figure(figsize=(8,5))
sns.histplot(data=positive_amounts, x='amount', hue='is_fraud', bins=100, log_scale=(True, False))
plt.title("Transaction Amount Distribution (Log Scale)")
plt.show()

# Fraud rate by transaction type
if 'transaction_type' in df.columns:
    plt.figure(figsize=(8,5))
    fraud_rate = df.groupby('transaction_type')['is_fraud'].mean() * 100
    sns.barplot(x=fraud_rate.index, y=fraud_rate.values, palette="Reds")
    plt.title("Fraud Rate by Transaction Type (%)")
    plt.ylabel("Fraud Rate %")
    plt.show()

# Hourly fraud rate
# transaction_hour was derived unconditionally above, so no column guard is needed here.
plt.figure(figsize=(10,5))
hourly = df.groupby('transaction_hour')['is_fraud'].mean() * 100
sns.lineplot(x=hourly.index, y=hourly.values, marker='o')
plt.title("Fraud Rate by Hour of Day")
plt.xlabel("Hour of Day")
plt.ylabel("Fraud Rate (%)")
plt.show()



Shape of dataset: (16426, 11)

First 5 rows:
   step      type      amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0   187   CASH_IN    46291.77   C229926567     2098868.16      2145159.92   
1    42  CASH_OUT   149509.45   C156437847           0.00            0.00   
2    98  CASH_OUT     8055.06  C1544350298        8055.06            0.00   
3   550  TRANSFER   342309.91   C662184778      342309.91            0.00   
4   586  CASH_OUT  2581549.92   C648614053     2581549.92            0.00   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0   C851786608       532456.02       486164.25        0               0  
1  C1660179148       183524.95       333034.40        0               0  
2   C912405348            0.00         8055.06        1               0  
3  C1740503020            0.00            0.00        1               0  
4   C213455810            0.00      2581549.92        1               0  

Columns after renaming: ['time_step', 'transact

In [1]:
from sklearn.utils import resample
import pandas as pd
import numpy as np
import os

# Load dataset
# The path is defined once so the existence check and the read can never point
# at different files.
file_in = "AIML Dataset.csv"
if not os.path.exists(file_in):
    raise FileNotFoundError(f"File not found: {file_in}")

# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column with a stray non-numeric value gets
# one consistent dtype instead of mixed types (and the DtypeWarning that goes
# with them).
df = pd.read_csv(file_in, low_memory=False)
print("Loaded:", df.shape)
print("\nOriginal dtypes:")
print(df.dtypes)

# --- Define desired column types (adjust these if your real column names differ) ---
# Use: 'int', 'float', 'category', 'string' ('object' is handled like 'string'
# by the conversion loop below).
desired_types = {
    'step': 'int',
    'type': 'category',            # transaction type (categorical)
    'amount': 'float',
    'nameOrig': 'string',          # origin account id/name
    'oldbalanceOrg': 'float',
    'newbalanceOrig': 'float',
    'nameDest': 'string',          # destination account id/name
    'oldbalanceDest': 'float',
    'newbalanceDest': 'float',
    'isFraud': 'int',
    'isFlaggedFraud': 'int'
}

# If your dataset column names differ (eg. isFraud vs is_fraud), map them here:
# e.g., desired_types = {'step':'int', 'type':'category', 'amount':'float', 'is_fraud':'int', ...}

# --- Convert columns safely ---
# Work on a copy so the frame loaded above stays untouched.
df_copy = df.copy()
# column name -> number of values that were present but failed numeric parsing
conversion_errors = {}

for col, kind in desired_types.items():
    if col not in df_copy.columns:
        print(f"⚠️ Column '{col}' not found in dataset — skipping conversion for this column.")
        continue

    if kind in ('int', 'float'):
        # Count NaNs before and after coercion: only the difference represents
        # values that could not be parsed (already-missing values are not
        # conversion failures). Rows are dropped later, in one pass.
        missing_before = int(df_copy[col].isna().sum())
        df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')
        coerced = int(df_copy[col].isna().sum()) - missing_before
        if coerced > 0:
            conversion_errors[col] = coerced
    elif kind == 'category':
        # treat as string then convert to category (missing values become "UNKNOWN")
        df_copy[col] = df_copy[col].astype('string').fillna("UNKNOWN")
        df_copy[col] = df_copy[col].astype('category')
    elif kind in ('string', 'object'):
        df_copy[col] = df_copy[col].astype('string')
    else:
        print(f"⚠️ Unknown desired type '{kind}' for column '{col}'")

# Report conversion problems
if conversion_errors:
    print("\nConversion summary (columns where coercion produced NaNs):")
    for c, cnt in conversion_errors.items():
        print(f" - {c}: {cnt} values could not be converted and became NaN")

# Drop rows with any NaN in the numeric columns (to keep dataset clean). This
# removes both coercion failures and values that were missing to begin with.
numeric_cols = [c for c, t in desired_types.items() if t in ('int', 'float') and c in df_copy.columns]
rows_before = len(df_copy)
df_copy = df_copy.dropna(subset=numeric_cols)
rows_after = len(df_copy)
dropped = rows_before - rows_after
if dropped > 0:
    print(f"\nDropped {dropped} rows with missing or non-convertible numeric values (see summary above).")
else:
    print("\nNo rows dropped during numeric conversion.")

# Now downcast numeric columns to save memory
for c in numeric_cols:
    if desired_types[c] == 'int':
        # also converts float columns whose values are all whole numbers
        df_copy[c] = pd.to_numeric(df_copy[c], downcast='integer')
    else:  # float
        df_copy[c] = pd.to_numeric(df_copy[c], downcast='float')

# Ensure integer columns are ints (no fractional parts after coercion)
for c, t in desired_types.items():
    if t == 'int' and c in df_copy.columns:
        # If the column is still float after the downcast, cast it only when
        # every value is a whole number.
        if pd.api.types.is_float_dtype(df_copy[c]):
            if (df_copy[c] % 1 != 0).any():
                print(f"⚠️ Column '{c}' has non-integer values after conversion; keeping as float.")
            else:
                df_copy[c] = df_copy[c].astype('int64')

print("\nDtypes after conversion/downcast:")
print(df_copy.dtypes)
print("\nMemory usage (MB):", df_copy.memory_usage(deep=True).sum() / 1024**2)

# --- Balance dataset via undersampling ---
# Resolve the label column: either naming convention is accepted. Only df_copy
# is consulted because it is the frame that gets filtered below; a name found
# elsewhere would just fail on the next line.
target_col = next(
    (name for name in ('isFraud', 'is_fraud') if name in df_copy.columns),
    None,
)
if target_col is None:
    raise KeyError("Target column 'isFraud' (or 'is_fraud') not found in dataset after conversion.")

fraud_df = df_copy[df_copy[target_col] == 1]
nonfraud_df = df_copy[df_copy[target_col] == 0]

print("\nClass counts before balancing:")
print(df_copy[target_col].value_counts())

# If fraud is the minority, undersample non-fraud to match fraud count.
n_fraud = len(fraud_df)
n_nonfraud = len(nonfraud_df)

if n_fraud == 0:
    raise ValueError("No fraud samples found in dataset! Cannot balance by undersampling.")

if n_nonfraud <= n_fraud:
    # Nothing to undersample; the data is written out unchanged.
    print("Warning: Non-fraud class has less or equal samples than fraud. Skipping downsampling.")
    df_balanced = df_copy.copy()
else:
    # Draw n_fraud non-fraud rows without replacement (seeded for repeatability).
    nonfraud_downsampled = resample(
        nonfraud_df,
        replace=False,
        n_samples=n_fraud,
        random_state=42
    )
    df_balanced = pd.concat([fraud_df, nonfraud_downsampled], ignore_index=True)
    # Shuffle so the two classes are interleaved instead of stacked.
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print("\nAfter Balancing class counts:")
print(df_balanced[target_col].value_counts())

# Save to CSV
out_file = "AIML Dataset3.csv"
df_balanced.to_csv(out_file, index=False)
print(f"\n✅ Balanced dataset saved as '{out_file}' (rows: {len(df_balanced)})")


  df = pd.read_csv("AIML Dataset.csv")


Loaded: (6362620, 11)

Original dtypes:
step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest     object
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object

Conversion summary (columns where coercion produced NaNs):
 - oldbalanceDest: 1 values could not be converted and became NaN

Dropped 1 rows due to non-convertible numeric values (see summary above).

Dtypes after conversion/downcast:
step                       int16
type                    category
amount                   float64
nameOrig          string[python]
oldbalanceOrg            float64
newbalanceOrig           float64
nameDest          string[python]
oldbalanceDest           float64
newbalanceDest           float64
isFraud                     int8
isFlaggedFraud              int8
dtype: object

Memory usage (MB): 1043.455451965332

Class co

In [14]:
# Logistic Regression
# Trains and evaluates a class-weighted logistic regression on the CSV below,
# and persists the model's input column order.
# 1. Load dataset
file_path = "AIML Dataset3.csv"   # change path if needed
data = pd.read_csv(file_path)

# 2. Separate the label and drop the account-id and flag columns
X = data.drop(columns=["isFraud", "nameOrig", "nameDest", "isFlaggedFraud"])
y = data["isFraud"]

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
X.info()

# 3. Encode categorical column "type"
X = pd.get_dummies(X, columns=["type"], drop_first=True)

# Save the feature columns order. Stored as a plain list, the same format the
# Random Forest cell writes to this file, so loaders see one consistent type.
feature_columns = X.columns
joblib.dump(feature_columns.tolist(), "feature_columns.pkl")

# 4. Train-test split (stratified to keep the class ratio in both parts)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Scale numerical features (fit on the training part only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. Train Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
log_reg.fit(X_train_scaled, y_train)

# 7. Predictions & evaluation
y_pred = log_reg.predict(X_test_scaled)
y_prob = log_reg.predict_proba(X_test_scaled)[:, 1]  # probability of class 1

print("Classification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_prob))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16426 entries, 0 to 16425
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            16426 non-null  int64  
 1   type            16426 non-null  object 
 2   amount          16426 non-null  float64
 3   oldbalanceOrg   16426 non-null  float64
 4   newbalanceOrig  16426 non-null  float64
 5   oldbalanceDest  16426 non-null  float64
 6   newbalanceDest  16426 non-null  float64
dtypes: float64(5), int64(1), object(1)
memory usage: 898.4+ KB
None
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93      1643
           1       0.93      0.93      0.93      1643

    accuracy                           0.93      3286
   macro avg       0.93      0.93      0.93      3286
weighted avg       0.93      0.93      0.93      3286


Confusion Matrix:
 [[1533  110]
 [ 120 1523]]

ROC-AUC Score: 0.98

In [None]:
# Isolation Forest
# Fits an unsupervised IsolationForest on the scaled training features and
# scores it against the fraud labels.
# NOTE(review): the recorded run of this cell shows equal class support and a
# ROC-AUC of about 0.48; an anomaly detector assumes the positive class is
# rare, so consider fitting it on the un-balanced data instead — confirm intent.

# 1. Load dataset
file_path = "AIML Dataset3.csv"   # change path if needed
data = pd.read_csv(file_path)

# 2. Separate the label and drop the account-id and flag columns
X = data.drop(columns=["isFraud", "nameOrig", "nameDest", "isFlaggedFraud"])
y = data["isFraud"]

# 3. Encode categorical column "type"
X = pd.get_dummies(X, columns=["type"], drop_first=True)

# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Scale features
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. Train Isolation Forest
# contamination must be 'auto' or a float in (0, 0.5]; the observed fraud
# fraction is clamped into that range so the constructor cannot reject it.
fraud_fraction = float(y_train.mean())  # fraction of fraud cases in training set
if fraud_fraction > 0:
    contamination_rate = min(fraud_fraction, 0.5)
else:
    contamination_rate = "auto"
iso_forest = IsolationForest(
    contamination=contamination_rate, random_state=42
)
iso_forest.fit(X_train_scaled)

# 7. Predictions
y_pred_raw = iso_forest.predict(X_test_scaled)

# IsolationForest outputs: -1 = anomaly (fraud), 1 = normal
# Convert to 0/1 labels (fraud=1, normal=0) in one vectorized step
y_pred_if = (y_pred_raw == -1).astype(int)

# 8. Evaluation
print("Classification Report:\n", classification_report(y_test, y_pred_if))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_if))

# decision_function is lower for more anomalous points, so it is negated to
# make higher scores mean "more likely fraud" for ROC-AUC.
y_scores = -iso_forest.decision_function(X_test_scaled)
print("\nROC-AUC Score:", roc_auc_score(y_test, y_scores))


Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.47      0.47      1643
           1       0.47      0.47      0.47      1643

    accuracy                           0.47      3286
   macro avg       0.47      0.47      0.47      3286
weighted avg       0.47      0.47      0.47      3286


Confusion Matrix:
 [[775 868]
 [864 779]]

ROC-AUC Score: 0.476254598623645


In [15]:
# Random forest
# Trains and evaluates a class-weighted random forest on the CSV below, and
# persists the fitted scaler plus the input column order.

# 1. Load dataset
file_path = "AIML Dataset3.csv"   # change path if needed
data = pd.read_csv(file_path)

# 2. Separate the label and drop the account-id and flag columns
X = data.drop(columns=["isFraud", "nameOrig", "nameDest", "isFlaggedFraud"])
y = data["isFraud"]

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
X.info()

# 3. Encode categorical column "type"
X = pd.get_dummies(X, columns=["type"], drop_first=True)

# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Scale features. The forest below is fitted on the scaled matrix, so the
#    same scaler and column order must be applied to any data it scores later;
#    both are saved here for that purpose.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, "scaler.pkl")
joblib.dump(X_train.columns.tolist(), "feature_columns.pkl")


# 6. Train Random Forest model
rf_model = RandomForestClassifier(
    n_estimators=200,        # number of trees
    max_depth=None,          # let trees expand fully
    class_weight="balanced", # reweight classes inversely to their frequency
    random_state=42,
    n_jobs=-1                # use all CPU cores
)
rf_model.fit(X_train_scaled, y_train)

# 7. Predictions & evaluation
y_pred_rf = rf_model.predict(X_test_scaled)
y_prob_rf = rf_model.predict_proba(X_test_scaled)[:, 1]  # probability of class 1

print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_prob_rf))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16426 entries, 0 to 16425
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            16426 non-null  int64  
 1   type            16426 non-null  object 
 2   amount          16426 non-null  float64
 3   oldbalanceOrg   16426 non-null  float64
 4   newbalanceOrig  16426 non-null  float64
 5   oldbalanceDest  16426 non-null  float64
 6   newbalanceDest  16426 non-null  float64
dtypes: float64(5), int64(1), object(1)
memory usage: 898.4+ KB
None
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      1643
           1       0.99      1.00      0.99      1643

    accuracy                           0.99      3286
   macro avg       0.99      0.99      0.99      3286
weighted avg       0.99      0.99      0.99      3286


Confusion Matrix:
 [[1619   24]
 [   1 1642]]

ROC-AUC Score: 0.99

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train/test accuracy check for a random forest fitted on the raw file.
# NOTE(review): this cell rebinds `rf_model` and `scaler`; the save cell that
# follows persists whatever `rf_model` is bound to, while scaler.pkl is written
# by the earlier Random Forest cell — confirm which model is meant to be saved.

# 1. Load dataset
file_path = "AIML Dataset.csv"   # change path if needed
data = pd.read_csv(file_path)

# The raw file can hold non-numeric entries in numeric columns (the dtype
# listing from the balancing cell shows oldbalanceDest loaded as object), and
# StandardScaler cannot process those. Coerce to numbers and drop the rows that
# fail to parse.
numeric_features = [
    "step", "amount",
    "oldbalanceOrg", "newbalanceOrig",
    "oldbalanceDest", "newbalanceDest",
]
data[numeric_features] = data[numeric_features].apply(pd.to_numeric, errors="coerce")
data = data.dropna(subset=numeric_features)

# 2. Separate the label and drop the account-id and flag columns
X = data.drop(columns=["isFraud", "nameOrig", "nameDest", "isFlaggedFraud"])
y = data["isFraud"]

# 3. Encode categorical column "type"
X = pd.get_dummies(X, columns=["type"], drop_first=True)

# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Scale features (fit on the training part only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. Train Random Forest
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train_scaled, y_train)

# 7. Training and Testing Accuracy
# NOTE(review): plain accuracy says little when one class dominates — confirm
# this is the metric wanted for the raw, un-balanced file.
train_acc = accuracy_score(y_train, rf_model.predict(X_train_scaled))
test_acc = accuracy_score(y_test, rf_model.predict(X_test_scaled))

print(f"Training Accuracy: {train_acc:.4f}")
print(f"Testing Accuracy:  {test_acc:.4f}")


Training Accuracy: 1.0000
Testing Accuracy:  0.9933


In [12]:
import joblib

# Persist the three models trained in earlier cells:
# log_reg  -> Logistic Regression
# iso_forest -> Isolation Forest
# rf_model -> Random Forest

# Output file -> name of the notebook variable holding the fitted model.
model_files = {
    "logistic_regression_model.pkl": "log_reg",
    "isolation_forest_model.pkl": "iso_forest",
    "random_forest_model.pkl": "rf_model",
}

# Check every model exists before writing anything, so a missing training cell
# cannot leave a partial set of files on disk next to stale ones.
missing_models = [name for name in model_files.values() if name not in globals()]
if missing_models:
    raise NameError(
        "Cannot save models; not defined in this session: "
        + ", ".join(missing_models)
        + ". Run the corresponding training cells first."
    )

# Save models
for model_path, model_name in model_files.items():
    joblib.dump(globals()[model_name], model_path)

print("✅ Models saved successfully!")

# ----------------------------
# To load the models later:
# ----------------------------
# log_reg_loaded = joblib.load("logistic_regression_model.pkl")
# iso_forest_loaded = joblib.load("isolation_forest_model.pkl")
# rf_model_loaded = joblib.load("random_forest_model.pkl")


✅ Models saved successfully!
