## Unsupervised Models 

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import precision_recall_curve, auc, roc_curve, make_scorer
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

# Import your existing utility module
import credit_card_fraud_utils as ccf

In [None]:
from sklearn.ensemble import IsolationForest
# These metric helpers are used below but are not imported by the notebook's
# import cell, so bring them into scope here.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# NOTE(review): `test_train_split` and `df` are not defined in the visible
# cells -- confirm they come from an earlier cell (or should be `ccf.`-prefixed).
X_train, X_test, y_train, y_test = test_train_split(df)

# Baseline Isolation Forest. `contamination` is the share of points the model
# treats as outliers when it picks its decision threshold (presumably set to
# the dataset's fraud rate -- TODO confirm).
model = IsolationForest(contamination=0.001727, random_state=42, n_estimators=100, max_samples='auto')
model.fit(X_train)

# predict() yields -1 for outliers and 1 for inliers; remap to 1 = anomaly,
# 0 = normal so the labels line up with y_test.
y_pred = np.where(model.predict(X_test) == -1, 1, 0)

# Threshold-based metrics on the hard predictions.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# AUPRC needs a continuous ranking: a curve built from 0/1 predictions has a
# single operating point and does not measure ranking quality. score_samples()
# is lower for more abnormal points, so negate it to make "higher = more
# anomalous". Separate names keep the scalar precision/recall above intact.
y_scores = -model.score_samples(X_test)
pr_precision, pr_recall, _ = precision_recall_curve(y_test, y_scores)
auprc = auc(pr_recall, pr_precision)
print(f"AUPRC: {auprc:.4f}")

In [None]:
# score_samples() is lower for more abnormal points; negate it so that a
# higher value means "more anomalous", which is the orientation the curve
# functions expect for the positive (fraud) class.
y_scores = -model.score_samples(X_test)

# Hard 0/1 predictions (1 = anomaly), kept for the threshold-based metrics.
y_pred = np.where(model.predict(X_test) == -1, 1, 0)

# Curve coordinates from the continuous scores.
precision, recall, _ = precision_recall_curve(y_test, y_scores)
fpr, tpr, _ = roc_curve(y_test, y_scores)

# The two curves use different axes (recall/precision vs. FPR/TPR), so each
# gets its own correctly labelled panel instead of sharing one set of axes.
fig, (ax_pr, ax_roc) = plt.subplots(1, 2, figsize=(10, 5))

ax_pr.plot(recall, precision, label='Precision-Recall Curve')
ax_pr.set_xlabel('Recall (True Positive Rate)')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve')
ax_pr.legend()
ax_pr.grid(True)

ax_roc.plot(fpr, tpr, label='ROC Curve')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend()
ax_roc.grid(True)

fig.tight_layout()
plt.show()

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, matthews_corrcoef, roc_curve, precision_recall_curve

def calculate_recall_at_fpr(y_true, y_scores, k=0.005):
    """Return the best recall (TPR) reachable without exceeding an FPR of ``k``.

    Args:
        y_true: Ground-truth binary labels, passed straight to ``roc_curve``.
        y_scores: Continuous scores where higher means more likely positive.
        k: False-positive-rate budget (0.005 == 0.5%).

    Returns:
        The highest true positive rate among ROC points whose false positive
        rate is <= ``k``. If no point qualifies (e.g. ``k`` is negative or the
        FPR is undefined), the first ROC point's TPR is returned.
    """
    # Keep every threshold so no operating point inside the budget is dropped.
    fpr, tpr, _ = roc_curve(y_true, y_scores, drop_intermediate=False)
    # Picking the point *nearest* to k could select an FPR above the budget
    # and overstate recall; restrict to points that actually respect it.
    within_budget = fpr <= k
    if not within_budget.any():
        return tpr[0]
    return tpr[within_budget].max()

def calculate_precision_at_recall(y_true, y_scores, k=0.005):
    """Return the precision at the PR-curve point whose recall is closest to ``k``.

    Args:
        y_true: Ground-truth binary labels.
        y_scores: Continuous scores where higher means more likely positive.
        k: Target recall level.

    Returns:
        The precision value at the curve index minimising ``|recall - k|``
        (the first such index when several tie).
    """
    pr_precision, pr_recall, _ = precision_recall_curve(y_true, y_scores)
    distance_to_target = np.abs(pr_recall - k)
    nearest = distance_to_target.argmin()
    return pr_precision[nearest]

def calculate_ks_statistic(y_true, y_scores):
    """Return the Kolmogorov-Smirnov statistic derived from the ROC curve.

    Args:
        y_true: Ground-truth binary labels.
        y_scores: Continuous scores where higher means more likely positive.

    Returns:
        The largest absolute gap between the false positive rate and the true
        positive rate over all ROC thresholds.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_scores)
    gaps = np.abs(false_pos_rate - true_pos_rate)
    return gaps.max()

# Metrics that depend on the hard 0/1 predictions.
cm = confusion_matrix(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

# Metrics that depend on the continuous anomaly scores.
recall_at_k = calculate_recall_at_fpr(y_test, y_scores, k=0.005)
precision_at_k = calculate_precision_at_recall(y_test, y_scores, k=0.005)
ks = calculate_ks_statistic(y_test, y_scores)

# Report everything in one call; sep="\n" puts each metric on its own line.
print(
    f"Confusion Matrix:\n{cm}",
    f"Recall at 0.5% FPR: {recall_at_k:.4f}",
    f"Precision at 0.5% Recall: {precision_at_k:.4f}",
    f"Matthews Correlation Coefficient: {mcc:.4f}",
    f"Kolmogorov-Smirnov Statistic: {ks:.4f}",
    sep="\n",
)

In [None]:
# score_samples() is lower for more abnormal points. Negate it so higher means
# "more anomalous"; without the sign flip the ranking is inverted relative to
# the fraud label and both curves come out upside down.
y_scores = -model.score_samples(X_test)

# Curve coordinates from the continuous scores.
precision, recall, _ = precision_recall_curve(y_test, y_scores)
fpr, tpr, _ = roc_curve(y_test, y_scores)

# The two curves use different axes (recall/precision vs. FPR/TPR), so each
# gets its own correctly labelled panel instead of sharing one set of axes.
fig, (ax_pr, ax_roc) = plt.subplots(1, 2, figsize=(10, 5))

ax_pr.plot(recall, precision, label='Precision-Recall Curve')
ax_pr.set_xlabel('Recall (True Positive Rate)')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve')
ax_pr.legend()
ax_pr.grid(True)

ax_roc.plot(fpr, tpr, label='ROC Curve')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend()
ax_roc.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# Fit an Isolation Forest via the shared utility module, then collect its
# predictions and scores on the held-out split.
# NOTE(review): `config` is not defined in the visible cells -- confirm it is
# created earlier in the notebook.
print("Training Isolation Forest model...")
if_model = ccf.train_isolation_forest(
    X_train,
    config,
)
(if_preds, if_scores) = ccf.get_model_predictions(
    if_model,
    X_test,
)

In [None]:
# Fit a Local Outlier Factor model via the shared utility module, then collect
# its predictions and scores on the held-out split. The helper is told this is
# not an Isolation Forest through `is_isolation_forest=False`.
print("Training Local Outlier Factor model...")
lof_model = ccf.train_lof(
    X_train,
    config,
)
(lof_preds, lof_scores) = ccf.get_model_predictions(
    lof_model,
    X_test,
    is_isolation_forest=False,
)

In [None]:
# Score the Isolation Forest output and print the summary in a single call;
# sep="\n" places every item on its own line.
if_metrics = ccf.calculate_metrics(y_test, if_preds, if_scores)
print(
    "Isolation Forest Results:",
    f"Accuracy: {if_metrics['accuracy']:.4f}",
    "\nConfusion Matrix:",
    if_metrics['confusion_matrix'],
    "\nClassification Report:",
    if_metrics['classification_report'],
    sep="\n",
)

In [None]:
# Score the Local Outlier Factor output and print the summary in a single
# call; sep="\n" places every item on its own line.
lof_metrics = ccf.calculate_metrics(y_test, lof_preds, lof_scores)
print(
    "Local Outlier Factor Results:",
    f"Accuracy: {lof_metrics['accuracy']:.4f}",
    "\nConfusion Matrix:",
    lof_metrics['confusion_matrix'],
    "\nClassification Report:",
    lof_metrics['classification_report'],
    sep="\n",
)

In [None]:
# Draw the ROC curves for both detectors. The helper returns three values: a
# plot handle plus two values stored as auc_if / auc_lof for the final
# comparison cell.
(plt_roc, auc_if, auc_lof) = ccf.plot_roc_curves(
    y_test,
    if_scores,
    lof_scores,
)
plt_roc.show()

In [None]:
# Draw the precision-recall curves for both detectors and display the
# plot handle returned by the utility helper.
plt_pr = ccf.plot_pr_curves(
    y_test,
    if_scores,
    lof_scores,
)
plt_pr.show()

In [None]:
# Visualise the anomaly score distributions of both detectors. The helper also
# returns `eval_df`, which the score-comparison cell consumes.
(plt_scores, eval_df) = ccf.plot_anomaly_scores(
    if_scores,
    lof_scores,
    y_test,
)
plt_scores.show()

In [None]:
# Compare the two detectors' anomaly scores using the evaluation frame
# produced by the score-distribution cell.
plt_comparison = ccf.plot_score_comparison(
    eval_df,
)
plt_comparison.show()

In [None]:
# Build the side-by-side model comparison from the predictions, scores and the
# two values returned by the ROC plotting helper.
comparisons = ccf.compare_models(
    y_test,
    if_preds,
    lof_preds,
    if_scores,
    lof_scores,
    auc_if,
    auc_lof,
)
print("Model Comparison:")
# Last expression of the cell so the notebook renders it; highlights the
# maximum of each column (presumably a pandas DataFrame -- TODO confirm).
comparisons.style.highlight_max(axis=0)