In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [63]:
# Load the dataset CSV from the Kaggle input directory into a DataFrame.
DATA_PATH = "/kaggle/input/data-ml/cleaned_aml.csv"
df = pd.read_csv(DATA_PATH)

In [67]:
# Report how many rows carry each value of the `is_laundering` label.
laundering_shape = df[df['is_laundering'] == 1].shape
non_laundering_shape = df[df['is_laundering'] == 0].shape
print(f"laundering = 1 shape: {laundering_shape}\n launder = 0 shape: {non_laundering_shape}")

laundering = 1 shape: (830, 13)
 launder = 0 shape: (916247, 13)


In [5]:
# Display the column index of the loaded frame (features plus the `is_laundering` label).
df.columns

Index(['amount', 'is_laundering', 'hour', 'dayofweek', 'change_currency',
       'change_location', 'payment_type_ACH', 'payment_type_Cash Deposit',
       'payment_type_Cash Withdrawal', 'payment_type_Cheque',
       'payment_type_Credit card', 'payment_type_Cross-border',
       'payment_type_Debit card'],
      dtype='object')

In [155]:
# Build a smaller working set: keep every row labelled 1 and draw a seeded
# random sample of 8300 rows labelled 0, then stack the two pieces.
# NOTE(review): this down-sampling is applied before the train/test split, so the
# held-out test set will have this sampled class ratio rather than the ratio in
# the full file — confirm that is intended when reading the metrics.
is_positive = df['is_laundering'] == 1
is_negative = df['is_laundering'] == 0

df_laundering_1 = df[is_positive]
df_laundering_0_sampled = df[is_negative].sample(n=8300, random_state=42)
df_balanced = pd.concat([df_laundering_1, df_laundering_0_sampled])

In [72]:
# Rebind `df` to the down-sampled frame; the feature/label cell that follows reads from it.
# NOTE(review): after this cell runs, `df` no longer refers to the frame loaded from CSV,
# so re-running earlier cells that inspect `df` will show the down-sampled data instead.
df = df_balanced

In [None]:
# Separate the label column from the remaining feature columns.
target_col = "is_laundering"
x = df.drop(columns=[target_col])
y = df[target_col]

In [153]:
# Sanity check: show the shapes of the feature matrix and the label vector.
x.shape, y.shape

((9130, 12), (9130,))

In [160]:
# Hold out 20% of the rows for testing. The split is seeded and stratified on
# the label so both partitions keep the same class proportions.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(x, y, **split_options)

In [146]:
# Oversample the minority class of the training split only (the test split is
# left untouched) with a seeded SMOTE and a requested ratio of 1.0.
# NOTE(review): the recorded output further down reports 5477 rows per class after
# SMOTE while the current split has 6640 negatives, which suggests this cell was
# last executed against an earlier split — re-run it after the split cell.
smote = SMOTE(random_state=42, sampling_strategy=1.0)
resampled = smote.fit_resample(X_train, y_train)
X_train_smote, y_train_smote = resampled

In [161]:
# Compare class counts in the training labels before and after resampling.
before_pos = int((y_train == 1).sum())
before_neg = int((y_train == 0).sum())
after_pos = int((y_train_smote == 1).sum())
after_neg = int((y_train_smote == 0).sum())

print(f"Before SMOTE: {before_pos} positives, {before_neg} negatives")
print(f"After SMOTE: {after_pos} positives, {after_neg} negatives")

Before SMOTE: 664 positives, 6640 negatives
After SMOTE: 5477 positives, 5477 negatives


In [148]:
def evaluation_metrics(y_true, y_pred):
    """Print precision, recall and F1 score for the positive class (label 1).

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels (0 or 1).
    y_pred : array-like
        Predicted binary labels (0 or 1), same length as ``y_true``.

    Returns
    -------
    None
        The three metrics are printed with three decimals; nothing is returned.
        A metric whose denominator is zero is reported as 0.
    """
    # Pin the label order so the matrix is always 2x2. Without `labels`, inputs
    # that contain a single class (e.g. a model predicting all zeros on an
    # all-negative batch) yield a 1x1 matrix and the 4-way unpack raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1 Score: {f1:.3f}")
        

In [162]:
# Random forest baseline on the original training split; class imbalance is
# handled through class_weight="balanced" rather than resampling.
rf_no_smote_params = {
    "n_estimators": 100,
    "random_state": 42,
    "n_jobs": -1,
    "class_weight": "balanced",
}
rf_model_no_smote = RandomForestClassifier(**rf_no_smote_params)
rf_model_no_smote.fit(X_train, y_train)

# Score the hard predictions on the held-out split.
y_pred = rf_model_no_smote.predict(X_test)
print("**RandomForest without SMOTE**")
evaluation_metrics(y_test, y_pred)

**RandomForest without SMOTE**
Precision: 0.525
Recall: 0.187
F1 Score: 0.276


In [163]:
# Same random forest configuration, this time fitted on the SMOTE-resampled
# training data and evaluated on the untouched test split.
rf_smote_params = {
    "n_estimators": 100,
    "random_state": 42,
    "n_jobs": -1,
    "class_weight": "balanced",
}
rf_model_smote = RandomForestClassifier(**rf_smote_params)
rf_model_smote.fit(X_train_smote, y_train_smote)

y_pred = rf_model_smote.predict(X_test)
print("**RandomForest with SMOTE**")
evaluation_metrics(y_test, y_pred)

**RandomForest with SMOTE**
Precision: 0.663
Recall: 0.783
F1 Score: 0.718


In [164]:
# XGBoost baseline trained on the original (un-resampled) training split.
# Class imbalance is handled by weighting positives with the
# negative-to-positive count ratio of the training labels (labels are 0/1).
n_pos_train = sum(y_train)
n_neg_train = len(y_train) - n_pos_train

xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.01,
    scale_pos_weight=n_neg_train / n_pos_train,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Hard labels feed precision/recall/F1; the label-1 probability column feeds ROC-AUC.
y_pred = xgb_model.predict(X_test)
y_pred_prob = xgb_model.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_pred_prob)
print("**XGBoost without SMOTE**")
print(f"Test AUC: {roc_auc}")

evaluation_metrics(y_test, y_pred)

**XGBoost without SMOTE**
Test AUC: 0.7821690375961677
Precision: 0.290
Recall: 0.578
F1 Score: 0.386


In [165]:
# XGBoost trained on the SMOTE-resampled training data. The positive-class
# weight is still derived from the (resampled) label counts, so it follows
# whatever class ratio the resampling produced.
n_pos_smote = sum(y_train_smote)
n_neg_smote = len(y_train_smote) - n_pos_smote

xgb_model_smote = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.01,
    scale_pos_weight=n_neg_smote / n_pos_smote,
    random_state=42,
)
xgb_model_smote.fit(X_train_smote, y_train_smote)

# Evaluate on the untouched test split: hard labels for precision/recall/F1,
# label-1 probabilities for ROC-AUC.
y_pred = xgb_model_smote.predict(X_test)
y_pred_prob = xgb_model_smote.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_pred_prob)
print("**XGBoost with SMOTE**")
print(f"Test AUC: {roc_auc}")
evaluation_metrics(y_test, y_pred)

**XGBoost with SMOTE**
Test AUC: 0.8016838438089708
Precision: 0.333
Recall: 0.512
F1 Score: 0.404


In [176]:
# Logistic-regression baseline on the original training split, using balanced
# class weights and a raised iteration cap.
logreg_params = {
    "max_iter": 1000,
    "class_weight": "balanced",
    "random_state": 42,
}
logreg = LogisticRegression(**logreg_params)

# Fit, then collect both hard labels and label-1 probabilities on the test split.
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
y_pred_prob = logreg.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_pred_prob)
print("**Logistic Regression without SMOTE**")
print(f"Test ROC-AUC: {roc_auc:.4f}")
evaluation_metrics(y_test, y_pred)

**Logistic Regression without SMOTE**
Test ROC-AUC: 0.7593
Precision: 0.290
Recall: 0.614
F1 Score: 0.394


In [177]:
# Logistic regression with the same settings, fitted on the SMOTE-resampled
# training data and scored on the untouched test split.
logreg_smote_params = {
    "max_iter": 1000,
    "class_weight": "balanced",
    "random_state": 42,
}
logreg_smote = LogisticRegression(**logreg_smote_params)

# Fit, then collect both hard labels and label-1 probabilities on the test split.
logreg_smote.fit(X_train_smote, y_train_smote)
y_pred = logreg_smote.predict(X_test)
y_pred_prob = logreg_smote.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_pred_prob)
print("**Logistic Regression with SMOTE**")
print(f"Test ROC-AUC: {roc_auc:.4f}")
evaluation_metrics(y_test, y_pred)

**Logistic Regression with SMOTE**
Test ROC-AUC: 0.7678
Precision: 0.289
Recall: 0.620
F1 Score: 0.395


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

def _as_float_tensor(values, as_column=False):
    """Wrap an array in a float32 tensor, optionally adding a trailing axis of size 1."""
    tensor = torch.tensor(values, dtype=torch.float32)
    return tensor.unsqueeze(1) if as_column else tensor


# Features become float32 tensors as-is; labels get an extra trailing dimension
# so they line up with the network's single-column output.
X_train_tensor = _as_float_tensor(X_train.values)
y_train_tensor = _as_float_tensor(y_train.values, as_column=True)

X_train_tensor_smote = _as_float_tensor(X_train_smote.values)
y_train_tensor_smote = _as_float_tensor(y_train_smote.values, as_column=True)

X_test_tensor = _as_float_tensor(X_test.values)
y_test_tensor = _as_float_tensor(y_test.values, as_column=True)

class SimpleNN(nn.Module):
    """Small feed-forward binary classifier.

    Architecture: Linear(input_dim, 64) -> ReLU -> Dropout(0.3)
    -> Linear(64, 32) -> ReLU -> Linear(32, 1) -> Sigmoid.
    The final sigmoid means the output is already a value in [0, 1].
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layers are listed in execution order and wrapped in one Sequential.
        layers = [
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid(),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stack and return one sigmoid output per row."""
        scores = self.network(x)
        return scores

In [189]:
# Seed PyTorch so weight initialisation and dropout masks are repeatable,
# matching the fixed random_state=42 used by the other models in this notebook.
torch.manual_seed(42)

model = SimpleNN(X_train.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# NOTE(review): unlike the other "without SMOTE" baselines (class_weight="balanced",
# scale_pos_weight), this loss is unweighted, so the imbalanced training labels are
# not compensated for here — confirm whether that asymmetry is intended.

# Training loop: full-batch, one optimizer step per epoch over the whole training tensor.
model.train()
for epoch in range(50):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

# Predict and evaluate: eval mode disables dropout; threshold the sigmoid output at 0.5.
model.eval()
with torch.no_grad():
    y_pred_prob = model(X_test_tensor).numpy()
    y_pred = (y_pred_prob >= 0.5).astype(int)
print("**Simple Neural Network without SMOTE**")
evaluation_metrics(y_test, y_pred)

**Simple Neural Network without SMOTE**
Precision: 0.750
Recall: 0.018
F1 Score: 0.035


In [192]:
# Seed PyTorch so weight initialisation and dropout masks are repeatable,
# matching the fixed random_state=42 used by the other models in this notebook.
torch.manual_seed(42)

model = SimpleNN(X_train_smote.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop: full-batch, one optimizer step per epoch over the SMOTE-resampled tensor.
model.train()
for epoch in range(50):
    optimizer.zero_grad()
    outputs = model(X_train_tensor_smote)
    loss = criterion(outputs, y_train_tensor_smote)
    loss.backward()
    optimizer.step()

# Predict and evaluate on the untouched test tensor: eval mode disables dropout;
# threshold the sigmoid output at 0.5.
model.eval()
with torch.no_grad():
    y_pred_prob = model(X_test_tensor).numpy()
    y_pred = (y_pred_prob >= 0.5).astype(int)


print("**Simple Neural Network with SMOTE**")
evaluation_metrics(y_test, y_pred)

**Simple Neural Network with SMOTE**
Precision: 0.285
Recall: 0.608
F1 Score: 0.388


In [193]:
import joblib

# Persist the random forest fitted on the SMOTE-resampled data to the Kaggle
# working directory; the call's return value is shown as the cell output.
MODEL_PATH = '/kaggle/working/best_random_forest_model.joblib'
joblib.dump(rf_model_smote, MODEL_PATH)

['/kaggle/working/best_random_forest_model.joblib']