<a href="https://colab.research.google.com/github/imoleayomideajay/Rule_based_fraud_detection/blob/main/Rule_based_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import timedelta
from collections import deque
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample


In [27]:
# Step 1: Generate synthetic data
fake = Faker()
np.random.seed(42)

n_customers = 1000
n_transactions = 10000
customer_ids = [f"CUST{str(i).zfill(5)}" for i in range(n_customers)]
transaction_types = ['POS', 'Transfer', 'ATM', 'USSD', 'Mobile App']

transactions = []
for _ in range(n_transactions):
    cust_id = random.choice(customer_ids)
    trans_id = f"TXN{fake.unique.random_number(digits=10)}"
    amount = round(np.random.exponential(scale=5000), 2)
    trans_type = random.choice(transaction_types)
    timestamp = fake.date_time_between(start_date='-1y', end_date='now')
    location = fake.city()
    device_id = fake.uuid4()
    balance = round(amount + np.random.uniform(1000, 100000), 2)
    is_fraud = 1 if np.random.rand() < 0.01 else 0

    transactions.append([
        trans_id, cust_id, amount, trans_type, timestamp, location,
        device_id, balance, is_fraud
    ])

df = pd.DataFrame(transactions, columns=[
    'TransactionID', 'CustomerID', 'Amount', 'TransactionType',
    'Timestamp', 'Location', 'DeviceID', 'AccountBalance', 'IsFraud'
])


In [28]:
df.head()

Unnamed: 0,TransactionID,CustomerID,Amount,TransactionType,Timestamp,Location,DeviceID,AccountBalance,IsFraud
0,TXN3110344135,CUST00733,2346.34,USSD,2024-08-04 02:29:00.690581,East Brianmouth,f68ac6a3-48cf-4f73-8b67-0473f45fe497,97467.06,0
1,TXN6916530613,CUST00726,4564.71,ATM,2024-06-19 01:21:56.031549,Port Dianachester,b1724e74-29a2-4333-8d7f-1a08f7cfa92f,21010.56,0
2,TXN6847026703,CUST00004,299.19,POS,2024-08-03 10:31:23.852105,Port Lindsayberg,612ca5bb-b1ae-4220-a48a-fd5296e9dd21,87050.63,0
3,TXN1337312660,CUST00798,6156.25,ATM,2025-03-27 16:46:20.801561,Stevensonchester,87182439-a664-4940-be4d-b065b1efd28b,9194.11,0
4,TXN9338504752,CUST00849,8932.15,USSD,2024-10-30 00:27:35.629492,Mayerchester,a622214d-38e3-4372-8257-a135369f3566,30953.72,0


In [29]:


# Step 2: Rule-Based Detection
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['Hour'] = df['Timestamp'].dt.hour
df['R1_HighAmount'] = df['Amount'] > 100000
df['R2_OddHour'] = df['Hour'].between(0, 4)

df = df.sort_values(by=['CustomerID', 'Timestamp'])
df['R3_HighFreq'] = False
for cust_id, group in df.groupby('CustomerID'):
    times = group['Timestamp'].tolist()
    idxs = group.index.tolist()
    window = deque()
    for i, t in enumerate(times):
        while window and (t - window[0][1]) > timedelta(hours=1):
            window.popleft()
        window.append((idxs[i], t))
        if len(window) > 5:
            df.loc[idxs[i], 'R3_HighFreq'] = True

df['R4_LowBalance'] = (df['AccountBalance'] - df['Amount']) < 500
df['RulesTriggered'] = df[['R1_HighAmount', 'R2_OddHour', 'R3_HighFreq', 'R4_LowBalance']].sum(axis=1)
df['LikelyFraud'] = df['RulesTriggered'] >= 2

# Step 3: Prepare features for modeling
df['TransactionTypeEncoded'] = LabelEncoder().fit_transform(df['TransactionType'])

features = [
    'Amount', 'Hour', 'AccountBalance', 'TransactionTypeEncoded',
    'R1_HighAmount', 'R2_OddHour', 'R3_HighFreq', 'R4_LowBalance'
]

df_ml = df.copy()



In [30]:
df_ml.head()

Unnamed: 0,TransactionID,CustomerID,Amount,TransactionType,Timestamp,Location,DeviceID,AccountBalance,IsFraud,Hour,R1_HighAmount,R2_OddHour,R3_HighFreq,R4_LowBalance,RulesTriggered,LikelyFraud,TransactionTypeEncoded
7186,TXN5727488700,CUST00000,9552.28,Mobile App,2024-05-22 10:49:11.346729,South Jacob,56259929-b74b-4621-8b85-81fdebbfe5c0,48279.28,0,10,False,False,False,False,0,False,1
1008,TXN8505669638,CUST00000,291.25,ATM,2024-08-08 20:55:21.324662,Gregoryfort,d25a5068-9adb-4a5e-9dd5-f540fae94814,38309.43,0,20,False,False,False,False,0,False,0
4178,TXN3798426736,CUST00000,809.25,USSD,2024-08-10 17:32:54.867990,Lake Matthewfurt,0c602957-540e-4494-a75b-2e7554d4762d,59246.71,0,17,False,False,False,False,0,False,4
8009,TXN6522869783,CUST00000,600.76,Mobile App,2024-08-17 23:42:57.310166,Port Johnfurt,9dbf94fe-ab0c-4a27-adee-eaa770702ed2,22164.6,0,23,False,False,False,False,0,False,1
8193,TXN9409113544,CUST00000,213.88,POS,2024-11-10 12:52:53.341137,West Jamesbury,7cf41751-5f19-4313-b842-a80cd3722d9b,51114.35,0,12,False,False,False,False,0,False,2


In [31]:
# Step 4: Create 48% fraud dataset
target_fraud_ratio = 0.48
total_records = 10000
n_fraud = int(total_records * target_fraud_ratio)
n_non_fraud = total_records - n_fraud

fraud_samples = df_ml[df_ml['IsFraud'] == 1]
non_fraud_samples = df_ml[df_ml['IsFraud'] == 0]

fraud_upsampled = resample(
    fraud_samples, replace=True, n_samples=n_fraud, random_state=42
)

non_fraud_downsampled = resample(
    non_fraud_samples, replace=False, n_samples=n_non_fraud, random_state=42
)

df_48_fraud = pd.concat([fraud_upsampled, non_fraud_downsampled])
df_48_fraud = df_48_fraud.sample(frac=1, random_state=42).reset_index(drop=True)




In [32]:
# Step 5: Train Random Forest on 48% fraud data
X = df_48_fraud[features]
y = df_48_fraud['IsFraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

report = classification_report(y_test, y_pred)
roc = roc_auc_score(y_test, y_prob)

print("Classification Report:\n", report)
print("ROC-AUC Score:", roc)

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1514
           1       1.00      1.00      1.00      1486

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000

ROC-AUC Score: 1.0


In [33]:
import joblib

# Save the trained model
joblib.dump(rf, 'fraud_detection_model.pkl')

# Save the encoder for TransactionType
joblib.dump(LabelEncoder().fit(df['TransactionType']), 'transaction_type_encoder.pkl')


['transaction_type_encoder.pkl']

In [34]:
# fraud_dashboard.py
import streamlit as st
import pandas as pd
import joblib

model = joblib.load('fraud_detection_model.pkl')
encoder = joblib.load('transaction_type_encoder.pkl')

st.title("💳 Real-Time Fraud Detection")

with st.form("input_form"):
    amount = st.number_input("Transaction Amount", value=1000.0)
    account_balance = st.number_input("Account Balance", value=5000.0)
    trans_type = st.selectbox("Transaction Type", ['POS', 'Transfer', 'ATM', 'USSD', 'Mobile App'])
    timestamp = st.text_input("Timestamp (YYYY-MM-DD HH:MM:SS)", value="2025-04-16 12:00:00")
    r1 = st.checkbox("High Amount?", value=False)
    r2 = st.checkbox("Odd Hour?", value=False)
    r3 = st.checkbox("High Frequency?", value=False)
    r4 = st.checkbox("Low Balance After?", value=False)

    submit = st.form_submit_button("Run Prediction")

if submit:
    df = pd.DataFrame([{
        'Amount': amount,
        'Hour': pd.to_datetime(timestamp).hour,
        'AccountBalance': account_balance,
        'TransactionType': trans_type,
        'R1_HighAmount': r1,
        'R2_OddHour': r2,
        'R3_HighFreq': r3,
        'R4_LowBalance': r4,
        'Timestamp': timestamp
    }])
    df['TransactionTypeEncoded'] = encoder.transform(df['TransactionType'])

    features = ['Amount', 'Hour', 'AccountBalance', 'TransactionTypeEncoded',
                'R1_HighAmount', 'R2_OddHour', 'R3_HighFreq', 'R4_LowBalance']
    prediction = model.predict(df[features])[0]
    prob = model.predict_proba(df[features])[0][1]

    st.success(f"Prediction: {'Fraud' if prediction == 1 else 'Not Fraud'}")
    st.info(f"Fraud Probability: {prob:.4f}")




In [None]:
!streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.225.49.229:8501[0m
[0m


In [19]:
from flask import Flask, request, jsonify
import joblib
import pandas as pd

# Load model and encoder
model = joblib.load('fraud_detection_model.pkl')
encoder = joblib.load('transaction_type_encoder.pkl')

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    data = request.json
    df = pd.DataFrame([data])

    # Feature engineering
    df['TransactionTypeEncoded'] = encoder.transform(df['TransactionType'])
    df['Hour'] = pd.to_datetime(df['Timestamp']).dt.hour

    features = ['Amount', 'Hour', 'AccountBalance', 'TransactionTypeEncoded',
                'R1_HighAmount', 'R2_OddHour', 'R3_HighFreq', 'R4_LowBalance']
    prediction = model.predict(df[features])[0]
    probability = model.predict_proba(df[features])[0][1]

    return jsonify({'prediction': int(prediction), 'probability': float(probability)})

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
