In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("C:\\Users\\Hasan\\Desktop\\data science folder\\Fraud_Data_Final.csv")

# Convert timestamps
df["purchase_time"] = pd.to_datetime(df["purchase_time"])

# Check class distribution
print(df["class"].value_counts(normalize=True) * 100)


class
0    90.635423
1     9.364577
Name: proportion, dtype: float64


In [2]:
def rule_based_fraud_detection(df):
    df["rule_fraud"] = 0  # Default: Not fraud

    # Use 'purchase_value' 
    df.loc[df["purchase_value"] > 10000, "rule_fraud"] = 1

    # Rule 2: Transactions made between 12 AM - 4 AM
    df["hour"] = df["purchase_time"].dt.hour
    df.loc[df["hour"].between(0, 4), "rule_fraud"] = 1

    # Rule 3: Multiple transactions within 10 minutes by the same user
    df["time_diff"] = df.groupby("user_id")["purchase_time"].diff().dt.seconds
    df.loc[df["time_diff"] < 600, "rule_fraud"] = 1

    return df

df = rule_based_fraud_detection(df)
print(df[["purchase_value", "rule_fraud"]].head())  # Check the rule-based fraud column


   purchase_value  rule_fraud
0              34           1
1              16           1
2              15           0
3              44           0
4              39           0


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score
import shap
import matplotlib.pyplot as plt

# ✅ Convert timestamps
df["purchase_time"] = pd.to_datetime(df["purchase_time"])
df["signup_time"] = pd.to_datetime(df["signup_time"])

# ✅ Drop unnecessary columns
df = df.drop(columns=["signup_time", "device_id"])

# ✅ Feature Engineering: Extract useful time-based features
df["purchase_hour"] = df["purchase_time"].dt.hour
df["purchase_day"] = df["purchase_time"].dt.day
df["purchase_weekday"] = df["purchase_time"].dt.weekday

# ✅ Remove original timestamp columns (they are now encoded in time-based features)
df = df.drop(columns=["purchase_time"])

# ✅ Convert categorical features to numerical using One-Hot Encoding
df = pd.get_dummies(df, columns=["source", "browser", "sex"], drop_first=True)

# ✅ Check if all columns are numeric
print(df.dtypes)


  from .autonotebook import tqdm as notebook_tqdm


user_id               int64
purchase_value        int64
age                   int64
ip_address            int64
class                 int64
rule_fraud            int64
hour                  int32
time_diff           float64
purchase_hour         int32
purchase_day          int32
purchase_weekday      int32
source_Direct          bool
source_SEO             bool
browser_FireFox        bool
browser_IE             bool
browser_Opera          bool
browser_Safari         bool
sex_M                  bool
dtype: object


In [4]:
def rule_based_fraud_detection(df):
    df["rule_fraud"] = 0  # Default: Not fraud

    # ✅ Rule 1: Transactions with high value (> $10,000)
    df.loc[df["purchase_value"] > 10000, "rule_fraud"] = 1

    # ✅ Rule 2: Transactions at unusual times (Midnight - 4 AM)
    df.loc[df["purchase_hour"].between(0, 4), "rule_fraud"] = 1

    # ✅ Rule 3: Multiple transactions by the same user within 10 minutes
    df["time_diff"] = df.groupby("user_id")["purchase_hour"].diff()
    df.loc[df["time_diff"] < 0.167, "rule_fraud"] = 1  # 10 minutes = 0.167 hours

    return df

# ✅ Apply the rule-based system
df = rule_based_fraud_detection(df)

# ✅ Check rule-based fraud cases
print(df[["rule_fraud"]].value_counts())


rule_fraud
0             119852
1              31260
Name: count, dtype: int64


In [5]:
# ✅ Define Features (`X`) and Target (`y`)
X = df.drop(columns=["class", "rule_fraud"])  # Use all features except labels
y = df["class"]  # Fraud (1) or Non-Fraud (0)

# ✅ Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# ✅ Make Predictions
y_pred = model.predict(X_test)

# ✅ Evaluate Model Performance
print("🔹 Machine Learning Model Performance:")
print(classification_report(y_test, y_pred))


🔹 Machine Learning Model Performance:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       1.00      0.54      0.70      2850

    accuracy                           0.96     30223
   macro avg       0.98      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223



In [6]:
# ✅ Predict fraud probability using the trained ML model
df["ml_fraud_prob"] = model.predict_proba(X)[:, 1]

# ✅ Hybrid Fraud Detection: If either Rule-Based OR ML detects fraud, flag it
df["hybrid_fraud"] = ((df["rule_fraud"] == 1) | (df["ml_fraud_prob"] > 0.8)).astype(int)

# ✅ Compare results
print(df[["class", "rule_fraud", "ml_fraud_prob", "hybrid_fraud"]].head(10))


   class  rule_fraud  ml_fraud_prob  hybrid_fraud
0      0           1           0.08             1
1      0           1           0.02             1
2      1           0           0.97             1
3      0           0           0.00             0
4      0           0           0.03             0
5      0           0           0.01             0
6      0           1           0.02             1
7      0           0           0.02             0
8      0           0           0.00             0
9      0           0           0.02             0


In [9]:
def evaluate_model(y_true, y_pred, name):
    print(f"\n🔹 {name} Model Performance:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred):.4f}")

# ✅ Evaluate models
evaluate_model(df["class"], df["rule_fraud"], "Rule-Based System")
evaluate_model(df["class"], (df["ml_fraud_prob"] > 0.8).astype(int), "ML-Based System")
evaluate_model(df["class"], df["hybrid_fraud"], "Hybrid System")



🔹 Rule-Based System Model Performance:
Accuracy: 0.7367
Precision: 0.0900
Recall: 0.1987

🔹 ML-Based System Model Performance:
Accuracy: 0.9553
Precision: 1.0000
Recall: 0.5227

🔹 Hybrid System Model Performance:
Accuracy: 0.7764
Precision: 0.2363
Recall: 0.6221


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score
from sklearn.impute import SimpleImputer

# ✅ Define Features (`X`) and Target (`y`)
drop_cols = [col for col in ["class", "rule_fraud"] if col in df.columns]
X = df.drop(columns=drop_cols)
y = df["class"]

# ✅ Handle missing values using SimpleImputer (mean imputation)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)  # Impute missing values in X

# ✅ Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42, stratify=y)

# ✅ Define Machine Learning Models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced"),
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    "SVM": SVC(probability=True, kernel="linear", class_weight="balanced")
}

# ✅ Train & Evaluate Each Model
results = {}

for name, model in models.items():
    print(f"\n🔹 Training {name}...")
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Predict using the test set
    
    # Store model results
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=1),
        "Recall": recall_score(y_test, y_pred, zero_division=1),
        "F1-Score": 2 * (precision_score(y_test, y_pred) * recall_score(y_test, y_pred)) / (precision_score(y_test, y_pred) + recall_score(y_test, y_pred) + 1e-9)
    }
    
    print(classification_report(y_test, y_pred))

# ✅ Convert Results to DataFrame for Comparison
results_df = pd.DataFrame(results).T
print("\n🔹 Model Performance Summary:")
print(results_df)

# ✅ Select Best Model (Highest Recall)
best_model_name = results_df["Recall"].idxmax()
best_model = models[best_model_name]
print(f"\n✅ Best Model Selected: {best_model_name}")

# ✅ Predict Fraud Probability using Best ML Model
df["ml_fraud_prob"] = best_model.predict_proba(X_imputed)[:, 1]

# ✅ Hybrid Fraud Detection: Rule-Based OR ML Model
fraud_threshold = 0.6  # Adjust if needed
df["hybrid_fraud"] = ((df["rule_fraud"] == 1) | (df["ml_fraud_prob"] > fraud_threshold)).astype(int)

# ✅ Compare Results
print(df[["class", "rule_fraud", "ml_fraud_prob", "hybrid_fraud"]].head(10))





🔹 Training Random Forest...
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     27393
           1       1.00      0.91      0.95      2830

    accuracy                           0.99     30223
   macro avg       0.99      0.95      0.97     30223
weighted avg       0.99      0.99      0.99     30223


🔹 Training Logistic Regression...
              precision    recall  f1-score   support

           0       0.91      0.49      0.63     27393
           1       0.09      0.52      0.16      2830

    accuracy                           0.49     30223
   macro avg       0.50      0.50      0.40     30223
weighted avg       0.83      0.49      0.59     30223


🔹 Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.99      1.00      1.00     27393
           1       1.00      0.91      0.95      2830

    accuracy                           0.99     30223
   macro avg       0.99      0.95      0.97     30223
weighted avg       0.99      0.99      0.99     30223


🔹 Training SVM...
