<a href="https://colab.research.google.com/github/imoleayomideajay/Rule_based_fraud_detection/blob/main/Rule_based_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from datetime import timedelta
from collections import deque


In [11]:
df = pd.read_csv('/content/sample_data/Synthetic_Transactions.csv')

In [12]:
df.head(n=5)

Unnamed: 0,TransactionID,CustomerID,Amount,TransactionType,Timestamp,Location,DeviceID,AccountBalance,IsFraud,R1_HighAmount,Hour,R2_OddHour,R3_HighFreq,R4_LowBalance,RulesTriggered,LikelyFraud,TransactionTypeEncoded
0,TXN413902284,CUST00787,109.32,ATM,2024-09-02 01:09:34,Priceview,a20757f5-28bf-419c-b3a2-e89053f6abe8,48985.03,0,False,1,True,False,False,1,False,0
1,TXN7617106309,CUST00591,6491.05,ATM,2025-01-01 19:17:35,Justinmouth,ea951e5b-c18b-42ef-aa9d-e36618a21242,44916.13,1,False,19,False,False,False,0,False,0
2,TXN8226402936,CUST00038,4478.38,ATM,2025-01-28 20:06:42,West Geoffreystad,f4d46d0b-25cd-446b-bffb-f16774655fc0,6731.51,1,False,20,False,False,False,0,False,0
3,TXN4028780039,CUST00568,5554.65,ATM,2024-09-02 00:49:24,Rodriguezchester,4681f2f1-9cbb-413c-944a-76640c04118b,90442.8,1,False,0,True,False,False,1,False,0
4,TXN6487442006,CUST00030,1608.2,Mobile App,2024-12-04 10:56:41,Kellerside,0f58f95d-8361-48d8-8ddc-416e21cfafee,16649.25,1,False,10,False,False,False,0,False,1


In [13]:

# Assuming display_dataframe_to_user is a custom function, define it here.
# Replace this with the actual implementation of your display function
def display_dataframe_to_user(name, dataframe):
    """Displays the dataframe to the user."""
    print(f"Dataframe: {name}")
    # Assuming you have a way to display dataframes, use it here
    print(dataframe)

# Reset df
df_rules = df.copy()
df_rules['Timestamp'] = pd.to_datetime(df_rules['Timestamp'])
df_rules = df_rules.sort_values(by=['CustomerID', 'Timestamp'])

# Rule 1: High transaction amount
df_rules['R1_HighAmount'] = df_rules['Amount'] > 100000

# Rule 2: Odd hours
df_rules['Hour'] = df_rules['Timestamp'].dt.hour
df_rules['R2_OddHour'] = df_rules['Hour'].between(0, 4)

# Rule 3: More than 5 transactions within 1 hour per customer
df_rules['R3_HighFreq'] = False
grouped = df_rules.groupby('CustomerID')

for cust_id, group in grouped:
    times = group['Timestamp'].tolist()
    idxs = group.index.tolist()
    window = deque()

    for i, t in enumerate(times):
        # Remove timestamps older than 1 hour from current
        while window and (t - window[0][1]) > timedelta(hours=1):
            window.popleft()
        window.append((idxs[i], t))

        if len(window) > 5:
            df_rules.loc[idxs[i], 'R3_HighFreq'] = True

# Rule 4: Low balance after transaction
df_rules['R4_LowBalance'] = (df_rules['AccountBalance'] - df_rules['Amount']) < 500

# Combine rules
df_rules['RulesTriggered'] = df_rules[['R1_HighAmount', 'R2_OddHour', 'R3_HighFreq', 'R4_LowBalance']].sum(axis=1)
df_rules['LikelyFraud'] = df_rules['RulesTriggered'] >= 2

# Display relevant columns
cols_to_show = [
    'TransactionID', 'CustomerID', 'Amount', 'TransactionType', 'Timestamp',
    'AccountBalance', 'R1_HighAmount', 'R2_OddHour', 'R3_HighFreq',
    'R4_LowBalance', 'RulesTriggered', 'LikelyFraud'
]

# Call the display function, which is now defined
display_dataframe_to_user(name="Rules-Based Detection (Fixed)", dataframe=df_rules[cols_to_show])

Dataframe: Rules-Based Detection (Fixed)
      TransactionID CustomerID    Amount TransactionType           Timestamp  \
1018  TXN7929474601  CUST00000   6490.80      Mobile App 2024-04-17 13:39:00   
1258  TXN5112638806  CUST00000      1.55             ATM 2024-09-03 19:27:04   
1809  TXN2237655697  CUST00000   5072.90            USSD 2025-01-20 02:42:46   
752   TXN1244777677  CUST00001   2082.59            USSD 2024-06-12 11:04:46   
246   TXN1215011469  CUST00001   7964.34      Mobile App 2024-07-10 00:32:25   
...             ...        ...       ...             ...                 ...   
8049  TXN2032816652  CUST00999   3462.44            USSD 2024-09-16 02:31:10   
9549   TXN802972959  CUST00999   6171.73        Transfer 2024-11-12 12:41:29   
2850   TXN963563401  CUST00999   5121.68            USSD 2025-02-20 12:04:33   
4804  TXN7090143861  CUST00999  11791.27            USSD 2025-03-27 04:49:17   
2122  TXN4549048286  CUST00999  11389.89      Mobile App 2025-04-06 09:43:55   

In [16]:
pip install faker

Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.1.0


In [17]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import timedelta
from collections import deque
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample

# Step 1: Generate synthetic data
fake = Faker()
np.random.seed(42)

n_customers = 1000
n_transactions = 10000
customer_ids = [f"CUST{str(i).zfill(5)}" for i in range(n_customers)]
transaction_types = ['POS', 'Transfer', 'ATM', 'USSD', 'Mobile App']

transactions = []
for _ in range(n_transactions):
    cust_id = random.choice(customer_ids)
    trans_id = f"TXN{fake.unique.random_number(digits=10)}"
    amount = round(np.random.exponential(scale=5000), 2)
    trans_type = random.choice(transaction_types)
    timestamp = fake.date_time_between(start_date='-1y', end_date='now')
    location = fake.city()
    device_id = fake.uuid4()
    balance = round(amount + np.random.uniform(1000, 100000), 2)
    is_fraud = 1 if np.random.rand() < 0.01 else 0

    transactions.append([
        trans_id, cust_id, amount, trans_type, timestamp, location,
        device_id, balance, is_fraud
    ])

df = pd.DataFrame(transactions, columns=[
    'TransactionID', 'CustomerID', 'Amount', 'TransactionType',
    'Timestamp', 'Location', 'DeviceID', 'AccountBalance', 'IsFraud'
])

# Step 2: Rule-Based Detection
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['Hour'] = df['Timestamp'].dt.hour
df['R1_HighAmount'] = df['Amount'] > 100000
df['R2_OddHour'] = df['Hour'].between(0, 4)

df = df.sort_values(by=['CustomerID', 'Timestamp'])
df['R3_HighFreq'] = False
for cust_id, group in df.groupby('CustomerID'):
    times = group['Timestamp'].tolist()
    idxs = group.index.tolist()
    window = deque()
    for i, t in enumerate(times):
        while window and (t - window[0][1]) > timedelta(hours=1):
            window.popleft()
        window.append((idxs[i], t))
        if len(window) > 5:
            df.loc[idxs[i], 'R3_HighFreq'] = True

df['R4_LowBalance'] = (df['AccountBalance'] - df['Amount']) < 500
df['RulesTriggered'] = df[['R1_HighAmount', 'R2_OddHour', 'R3_HighFreq', 'R4_LowBalance']].sum(axis=1)
df['LikelyFraud'] = df['RulesTriggered'] >= 2

# Step 3: Prepare features for modeling
df['TransactionTypeEncoded'] = LabelEncoder().fit_transform(df['TransactionType'])

features = [
    'Amount', 'Hour', 'AccountBalance', 'TransactionTypeEncoded',
    'R1_HighAmount', 'R2_OddHour', 'R3_HighFreq', 'R4_LowBalance'
]

df_ml = df.copy()

# Step 4: Create 48% fraud dataset
target_fraud_ratio = 0.48
total_records = 10000
n_fraud = int(total_records * target_fraud_ratio)
n_non_fraud = total_records - n_fraud

fraud_samples = df_ml[df_ml['IsFraud'] == 1]
non_fraud_samples = df_ml[df_ml['IsFraud'] == 0]

fraud_upsampled = resample(
    fraud_samples, replace=True, n_samples=n_fraud, random_state=42
)

non_fraud_downsampled = resample(
    non_fraud_samples, replace=False, n_samples=n_non_fraud, random_state=42
)

df_48_fraud = pd.concat([fraud_upsampled, non_fraud_downsampled])
df_48_fraud = df_48_fraud.sample(frac=1, random_state=42).reset_index(drop=True)

# Step 5: Train Random Forest on 48% fraud data
X = df_48_fraud[features]
y = df_48_fraud['IsFraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

report = classification_report(y_test, y_pred)
roc = roc_auc_score(y_test, y_prob)

print("Classification Report:\n", report)
print("ROC-AUC Score:", roc)


Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1514
           1       1.00      1.00      1.00      1486

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000

ROC-AUC Score: 1.0


In [18]:
import joblib

# Save the trained model
joblib.dump(rf, 'fraud_detection_model.pkl')

# Save the encoder for TransactionType
joblib.dump(LabelEncoder().fit(df['TransactionType']), 'transaction_type_encoder.pkl')


['transaction_type_encoder.pkl']

In [22]:
# fraud_dashboard.py
import streamlit as st
import pandas as pd
import joblib

model = joblib.load('fraud_detection_model.pkl')
encoder = joblib.load('transaction_type_encoder.pkl')

st.title("💳 Real-Time Fraud Detection")

with st.form("input_form"):
    amount = st.number_input("Transaction Amount", value=1000.0)
    account_balance = st.number_input("Account Balance", value=5000.0)
    trans_type = st.selectbox("Transaction Type", ['POS', 'Transfer', 'ATM', 'USSD', 'Mobile App'])
    timestamp = st.text_input("Timestamp (YYYY-MM-DD HH:MM:SS)", value="2025-04-16 12:00:00")
    r1 = st.checkbox("High Amount?", value=False)
    r2 = st.checkbox("Odd Hour?", value=False)
    r3 = st.checkbox("High Frequency?", value=False)
    r4 = st.checkbox("Low Balance After?", value=False)

    submit = st.form_submit_button("Run Prediction")

if submit:
    df = pd.DataFrame([{
        'Amount': amount,
        'Hour': pd.to_datetime(timestamp).hour,
        'AccountBalance': account_balance,
        'TransactionType': trans_type,
        'R1_HighAmount': r1,
        'R2_OddHour': r2,
        'R3_HighFreq': r3,
        'R4_LowBalance': r4,
        'Timestamp': timestamp
    }])
    df['TransactionTypeEncoded'] = encoder.transform(df['TransactionType'])

    features = ['Amount', 'Hour', 'AccountBalance', 'TransactionTypeEncoded',
                'R1_HighAmount', 'R2_OddHour', 'R3_HighFreq', 'R4_LowBalance']
    prediction = model.predict(df[features])[0]
    prob = model.predict_proba(df[features])[0][1]

    st.success(f"Prediction: {'Fraud' if prediction == 1 else 'Not Fraud'}")
    st.info(f"Fraud Probability: {prob:.4f}")


2025-04-16 09:02:12.097 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-04-16 09:02:12.130 Session state does not function when running a script without `streamlit run`


In [24]:
!streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.225.49.229:8501[0m
[0m
[34m  Stopping...[0m
^C


In [19]:
from flask import Flask, request, jsonify
import joblib
import pandas as pd

# Load model and encoder
model = joblib.load('fraud_detection_model.pkl')
encoder = joblib.load('transaction_type_encoder.pkl')

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    data = request.json
    df = pd.DataFrame([data])

    # Feature engineering
    df['TransactionTypeEncoded'] = encoder.transform(df['TransactionType'])
    df['Hour'] = pd.to_datetime(df['Timestamp']).dt.hour

    features = ['Amount', 'Hour', 'AccountBalance', 'TransactionTypeEncoded',
                'R1_HighAmount', 'R2_OddHour', 'R3_HighFreq', 'R4_LowBalance']
    prediction = model.predict(df[features])[0]
    probability = model.predict_proba(df[features])[0][1]

    return jsonify({'prediction': int(prediction), 'probability': float(probability)})

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
