# AI-Powered Fraud Detection System

This notebook implements a machine learning model to detect fraudulent transactions with minimal false positives. The workflow includes data exploration, preprocessing, model training, evaluation, and simulated deployment.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib

In [None]:
# Load and Explore the Dataset
# Both CSV files are read relative to the notebook's working directory.
train_df = pd.read_csv('train_hsbc_df.csv')
test_df = pd.read_csv('test_hsbc_df.csv')

# Size of each frame, then a peek at the first training rows.
print('Train Data Shape:', train_df.shape)
print('Test Data Shape:', test_df.shape)

print('Train Data Sample:')
train_preview = train_df.head()
display(train_preview)

print('Class Distribution:')
has_label = 'is_fraud' in train_df.columns
if not has_label:
    print('No fraud label found in train data.')
else:
    # Bar chart of label counts, followed by each class's share of the rows.
    ax = sns.countplot(x='is_fraud', data=train_df)
    ax.set_title('Fraudulent vs Non-Fraudulent Transactions')
    plt.show()
    class_share = train_df['is_fraud'].value_counts(normalize=True)
    print(class_share)

In [None]:
# Preprocess Data (Cleaning & Feature Engineering)
# NOTE(review): every statistic below (medians, category codes, scaler mean/std)
# is fitted on the whole of train_df, i.e. before the train/validation split
# performed in a later cell, so validation rows contribute to the fitted values.
# NOTE(review): the encoders and scaler created here are not saved alongside the
# model, so new raw data cannot be prepared the same way outside this notebook.

# Handle missing values
# Numeric medians are computed once from the training data and used to fill
# both frames; non-numeric gaps are left for the encoding step below.
train_medians = train_df.median(numeric_only=True)
train_df = train_df.fillna(train_medians)
test_df = test_df.fillna(train_medians)

# Encode categorical features
cat_cols = train_df.select_dtypes(include=['object']).columns
for col in cat_cols:
    le = LabelEncoder()
    # astype(str) turns missing values into the literal string 'nan', which is
    # then encoded like any other category.
    train_df[col] = le.fit_transform(train_df[col].astype(str))

    if col not in test_df.columns:
        # A text column that exists only in the training file (for example a
        # string-valued label) has nothing to encode on the test side.
        continue

    # LabelEncoder.transform raises ValueError on categories it did not see
    # during fit. Map through the fitted classes instead: known categories get
    # exactly the code the encoder assigned, unseen ones get the sentinel -1.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    test_df[col] = (
        test_df[col].astype(str).map(code_by_label).fillna(-1).astype('int64')
    )

# Scale numerical features
# The label is excluded so it keeps its original values; the integer codes
# produced by the encoding step above are scaled along with the other columns.
num_cols = train_df.select_dtypes(include=['float64', 'int64']).columns.drop('is_fraud', errors='ignore')
scaler = StandardScaler()
train_df[num_cols] = scaler.fit_transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

In [None]:
# Split Data into Training and Test Sets
# The label becomes its own Series; every remaining column is a feature.
y = train_df['is_fraud']
X = train_df.drop(columns=['is_fraud'])

# 80/20 hold-out, stratified on the label, with a fixed seed.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

print('Training set shape:', X_train.shape)
print('Validation set shape:', X_val.shape)

In [None]:
# Train Fraud Detection Model
# 100 trees with a fixed seed; class_weight='balanced' has scikit-learn weight
# each class inversely to its frequency in y_train.
rf_params = {'n_estimators': 100, 'random_state': 42, 'class_weight': 'balanced'}
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

print('Model training complete.')

In [None]:
# Evaluate Model Performance
# Class-1 probabilities feed the ROC/AUC; hard predictions feed the other metrics.
val_probs = rf.predict_proba(X_val)[:, 1]
val_preds = rf.predict(X_val)

precision, recall, f1 = (
    metric(y_val, val_preds)
    for metric in (precision_score, recall_score, f1_score)
)
auc = roc_auc_score(y_val, val_probs)
cm = confusion_matrix(y_val, val_preds)

for metric_label, metric_value in (
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1 Score:', f1),
    ('AUC-ROC:', auc),
    ('Confusion Matrix:\n', cm),
):
    print(metric_label, metric_value)

# ROC curve on the validation set, drawn on an explicit Axes.
fpr, tpr, _ = roc_curve(y_val, val_probs)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC Curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()

In [None]:
# Tune Model to Minimize False Positives
# Sweep candidate decision thresholds over the validation probabilities and
# tabulate the trade-off, so that `threshold` below can be chosen from data
# instead of being left at the default.
y_val_array = np.asarray(y_val)
sweep_rows = []
for candidate in np.round(np.arange(0.05, 1.0, 0.05), 2):
    candidate_preds = (val_probs > candidate).astype(int)
    flagged = candidate_preds == 1
    sweep_rows.append({
        'threshold': candidate,
        # zero_division=0 reports 0 (without a warning) for thresholds at
        # which no transaction is flagged at all.
        'precision': precision_score(y_val, candidate_preds, zero_division=0),
        'recall': recall_score(y_val, candidate_preds, zero_division=0),
        'f1': f1_score(y_val, candidate_preds, zero_division=0),
        # Counted with 1 as the positive label, matching the default
        # pos_label used by the metric calls above.
        'false_positives': int(np.sum(flagged & (y_val_array != 1))),
        'false_negatives': int(np.sum(~flagged & (y_val_array == 1))),
    })
threshold_sweep = pd.DataFrame(sweep_rows)
display(threshold_sweep)

# Adjust decision threshold
# Raising the threshold can only reduce (or keep) the number of flagged
# transactions, so false positives fall at the cost of recall.
threshold = 0.5  # Default
custom_preds = (val_probs > threshold).astype(int)

custom_precision = precision_score(y_val, custom_preds)
custom_recall = recall_score(y_val, custom_preds)
custom_f1 = f1_score(y_val, custom_preds)
custom_cm = confusion_matrix(y_val, custom_preds)

print(f'Custom Threshold: {threshold}')
print('Precision:', custom_precision)
print('Recall:', custom_recall)
print('F1 Score:', custom_f1)
print('Confusion Matrix:\n', custom_cm)

# Pick `threshold` from the sweep table above (for example, the lowest-false-positive
# row whose recall is still acceptable) and re-run this cell to see its full metrics.

In [None]:
# Export Model for Deployment
# Serialize the fitted forest into the working directory with joblib.
joblib.dump(value=rf, filename='fraud_detection_model.joblib')
print('Model saved as fraud_detection_model.joblib')

In [None]:
# Deploy Model (Simulated Prediction on New Data)
# Reload the artifact written by the export cell of this notebook.
loaded_model = joblib.load('fraud_detection_model.joblib')

# Score exactly the columns the model was fitted on, in the fitted order.
# Passing test_df unchanged fails (or mis-aligns features) whenever the test
# file carries an extra column such as the label, or orders its columns
# differently from the training frame.
fitted_features = getattr(loaded_model, 'feature_names_in_', None)
if fitted_features is None:
    # Estimator did not record feature names (e.g. an older scikit-learn);
    # fall back to dropping the label column if the test file has one.
    X_test = test_df.drop(columns=['is_fraud'], errors='ignore')
else:
    # Raises KeyError naming any feature that is missing from the test file.
    X_test = test_df[list(fitted_features)]

test_preds = loaded_model.predict(X_test)
test_probs = loaded_model.predict_proba(X_test)[:, 1]

# Export predictions to CSV
# The row index of test_df is used as the transaction identifier.
output = pd.DataFrame({'transaction_id': test_df.index, 'is_fraud_pred': test_preds, 'fraud_probability': test_probs})
output.to_csv('fraud_predictions.csv', index=False)
print('Predictions saved to fraud_predictions.csv')

# Conclusion & Next Steps

- The RandomForest model was trained to detect fraudulent transactions with a focus on minimizing false positives.
- Key metrics (precision, recall, F1, AUC-ROC) were evaluated to ensure robust performance.
- Feature importance (e.g., via `rf.feature_importances_`) can be visualized and analyzed to understand which features drive predictions; this notebook does not yet include that analysis.
- The model and predictions are exported for deployment and further analysis.

**Next Steps:**
- Further tune model hyperparameters and thresholds for optimal trade-off.
- Explore additional feature engineering and advanced models (e.g., XGBoost).
- Integrate the model into a real-time transaction monitoring system.