In [12]:
#### Import Libraries
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import regularizers
import networkx as nx

In [None]:
##### Load & Preprocess Raw Data
# Placeholder path: point this at the raw transactions CSV before running.
RAW_DATA_PATH = r"your raw data csv"
# Read the file and discard every row containing a missing value in one chained expression.
df = pd.read_csv(RAW_DATA_PATH).dropna()

In [3]:
# Preview the first rows of the loaded frame to sanity-check columns and dtypes.
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,371,CASH_OUT,367336.05,sdv-pii-r8zd6,4514816.83,2108392.86,sdv-pii-q6998,1265486.06,2454140.46,0,0
1,368,TRANSFER,238.63,sdv-pii-xq6z3,430944.71,1865444.6,sdv-pii-n2ql8,107927.46,2021.16,0,0
2,141,CASH_OUT,254.93,sdv-pii-805w0,839593.53,8008353.88,sdv-pii-yo0z6,773352.22,20.79,0,0
3,191,CASH_IN,501547.39,sdv-pii-279tw,41226.4,28633.52,sdv-pii-9zlyl,6825363.55,16442078.24,0,0
4,169,TRANSFER,71832.0,sdv-pii-ksz58,248694.6,793617.86,sdv-pii-0ykbo,579313.76,829850.96,0,0


In [4]:
# Label encode categorical columns.
# A separate encoder is fitted for every object-dtype column and stored in
# `label_encoders`, keyed by column name. Re-fitting a single shared encoder
# would overwrite its `classes_` on each iteration, leaving only the last
# column's mapping available for `inverse_transform` or for encoding new data.
label_encoders = {}
label_enc = LabelEncoder()  # kept defined even when there are no object columns
for col in df.select_dtypes(include='object').columns:
    label_enc = LabelEncoder()
    df[col] = label_enc.fit_transform(df[col])
    label_encoders[col] = label_enc

In [5]:
# Feature-target split
y = df['isFraud']
# 'Unnamed: 0' is the row index that was saved into the CSV (see the preview
# output: 0, 1, 2, ...). It carries no signal, so it is removed from the
# features. errors='ignore' keeps this working for CSVs saved without an index,
# while a missing 'isFraud' column still raises as before.
X = df.drop(columns=['isFraud']).drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
# Standardize features: learn per-column statistics first, then apply them.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [26]:
# Output directory for the persisted unsupervised models (created if missing).
UNSUPERVISED_MODEL_DIR = "Trained_Models/Unsupervised"
os.makedirs(UNSUPERVISED_MODEL_DIR, exist_ok=True)

In [27]:
### Data Preprocess
def preprocess_data(df: pd.DataFrame, feature_cols: list, scaler_path: str = "Trained_Models/Unsupervised/scaler.pkl"):
    """Standardize the selected feature columns and persist the fitted scaler.

    Args:
        df: Frame containing at least the columns named in ``feature_cols``.
        feature_cols: Names of the columns to scale.
        scaler_path: File the fitted ``StandardScaler`` is written to with
            ``joblib.dump``. Its parent directory is created if missing.

    Returns:
        The result of ``StandardScaler.fit_transform`` on ``df[feature_cols]``.
    """
    # Create the directory the scaler is actually written to. Previously an
    # unrelated "models" directory was created instead, so any scaler_path
    # whose parent did not already exist failed at dump time.
    scaler_dir = os.path.dirname(scaler_path)
    if scaler_dir:
        os.makedirs(scaler_dir, exist_ok=True)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df[feature_cols])
    joblib.dump(scaler, scaler_path)
    return X_scaled

In [28]:
#### 1. Isolation Forest -----
def train_isolation_forest(X: np.ndarray, model_path: str = "Trained_Models/Unsupervised/isolation_forest.pkl"):
    """Fit an Isolation Forest on ``X``, write it to ``model_path`` and return it.

    The estimator is built with ``contamination=0.05`` and ``random_state=42``
    and serialized with ``joblib.dump``.
    """
    forest = IsolationForest(contamination=0.05, random_state=42).fit(X)
    joblib.dump(forest, model_path)
    return forest

In [29]:
#### 2. One-Class SVM -----
def train_one_class_svm(X: np.ndarray, model_path: str = "Trained_Models/Unsupervised/one_class_svm.pkl"):
    """Fit a One-Class SVM on ``X``, write it to ``model_path`` and return it.

    The estimator uses an RBF kernel with ``gamma=0.001`` and ``nu=0.05`` and
    is serialized with ``joblib.dump``.
    """
    svm = OneClassSVM(kernel='rbf', gamma=0.001, nu=0.05).fit(X)
    joblib.dump(svm, model_path)
    return svm

In [30]:
#### 3. Autoencoders ----- Deep Learning-based Anomaly Detection
def build_autoencoder(input_dim: int, hidden_dim: int = 14, bottleneck_dim: int = 7, l1_penalty: float = 1e-4) -> Model:
    """Build and compile a symmetric dense autoencoder.

    Architecture: input -> Dense(hidden_dim) -> Dense(bottleneck_dim)
    -> Dense(hidden_dim) -> Dense(input_dim, linear).

    Args:
        input_dim: Number of input features; also the width of the output layer.
        hidden_dim: Width of the first encoder layer and of the decoder layer.
        bottleneck_dim: Width of the innermost (code) layer. For the network to
            compress its input this should be smaller than ``input_dim``; the
            defaults (14 / 7) are only a bottleneck for inputs wider than 7.
        l1_penalty: L1 activity-regularization factor on the first encoder layer.

    Returns:
        The model compiled with the Adam optimizer and mean-squared-error loss.
    """
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(
        hidden_dim,
        activation='relu',
        activity_regularizer=regularizers.l1(l1_penalty),
    )(input_layer)
    encoded = Dense(bottleneck_dim, activation='relu')(encoded)
    decoded = Dense(hidden_dim, activation='relu')(encoded)
    # Linear output so reconstructions can take any real value (inputs are standardized).
    output_layer = Dense(input_dim, activation='linear')(decoded)

    autoencoder = Model(inputs=input_layer, outputs=output_layer)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')
    return autoencoder

def train_autoencoder(X: np.ndarray, model_path: str = "Trained_Models/Unsupervised/autoencoder_model.h5"):
    """Train an autoencoder to reconstruct ``X``, save it and return it.

    The network is built by ``build_autoencoder`` with one input per column of
    ``X``, fitted with ``X`` as both input and target (50 epochs, batch size
    32, shuffled), then written to ``model_path``.
    """
    n_features = X.shape[1]
    net = build_autoencoder(n_features)
    # Input and target are the same array: the model learns to reproduce its input.
    net.fit(X, X, shuffle=True, batch_size=32, epochs=50)
    net.save(model_path)
    return net

In [31]:
##### 4. Graph-Based Analysis --- Detecting fraudulent networks and relationships
def build_entity_graph(edges: list) -> nx.Graph:
    """Return an undirected graph containing every edge in ``edges``.

    Each item of ``edges`` is passed to ``Graph.add_edges_from``; nodes are
    created implicitly from the edge endpoints.
    """
    entity_graph = nx.Graph()
    entity_graph.add_edges_from(edges)
    return entity_graph

def detect_communities(graph: nx.Graph):
    """Return the communities of ``graph`` found by greedy modularity maximization.

    The result is a list of the node sets produced by networkx's
    ``greedy_modularity_communities``.
    """
    find_communities = nx.algorithms.community.greedy_modularity_communities
    return list(find_communities(graph))

In [32]:
#### Run the unsupervised models on a small toy dataset
# NOTE: this cell rebinds `df` and `X_scaled`, replacing the frame loaded from
# RAW_DATA_PATH and the array scaled from it in the earlier cells.
toy_columns = {
    'amount': [100, 200, 150, 12000, 300, 5000],
    'transaction_duration': [2, 3, 2.5, 50, 1.5, 40],
    'location_code': [101, 102, 101, 103, 102, 101],
}
df = pd.DataFrame(toy_columns)

feature_cols = list(toy_columns)
X_scaled = preprocess_data(df, feature_cols)

# Train each detector in turn; every trainer persists its model to its default path.
for banner, trainer in (
    ("** Training Isolation Forest...", train_isolation_forest),
    ("** Training One-Class SVM...", train_one_class_svm),
    ("** Training Autoencoder...", train_autoencoder),
):
    print(banner)
    trainer(X_scaled)

# Tiny user/merchant graph to demonstrate community detection.
edges = [
    ('user1', 'merchantA'),
    ('user2', 'merchantA'),
    ('user2', 'merchantB'),
    ('user3', 'merchantB'),
]
G = build_entity_graph(edges)
communities = detect_communities(G)
print(f"[✔] Detected Communities: {communities}")

** Training Isolation Forest...
** Training One-Class SVM...
** Training Autoencoder...
Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 1.0424
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 1.0355
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - loss: 1.0286
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 1.0219
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - loss: 1.0154
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - loss: 1.0090
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - loss: 1.0026
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - loss: 0.9948
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - loss: 0.9871
Epoch 10/50
[1m1/1[0m [32m━━━



[✔] Detected Communities: [frozenset({'merchantA', 'user2', 'user1'}), frozenset({'user3', 'merchantB'})]
