# Model Building

### imports

In [1]:
import duckdb
import os
from datetime import timedelta
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib


### Connect To Minio - Fetch Marts 

PS: Docker must up

In [10]:
# Connect to MinIO
con = duckdb.connect()
con.execute("SET home_directory='../../fraud-pipeline-patrol';")
con.execute("INSTALL httpfs;")
con.execute("LOAD httpfs;")
con.execute("SET s3_access_key_id='minioadmin';")
con.execute("SET s3_secret_access_key='minioadmin';")
con.execute("SET s3_region='us-east-1';")
con.execute("SET s3_endpoint='localhost:9000';")
con.execute("SET s3_url_style='path';")  # Important for MinIO
con.execute("SET s3_use_ssl=false;")

# Fetch data from the marts
TRANSACTION_MART = "s3://fraud-data-processed/marts/v_transaction.parquet"
LOGIN_MART = "s3://fraud-data-processed/marts/v_login_attempt.parquet"

transactions = con.execute(f"SELECT * FROM '{TRANSACTION_MART}' LIMIT 5000").fetchdf()
logins = con.execute(f"SELECT * FROM '{LOGIN_MART}'").fetchdf()

print(f"✅ Loaded {len(transactions)} transactions")
print(f"✅ Loaded {len(logins)} login attempts")


✅ Loaded 5000 transactions
✅ Loaded 3550 login attempts


### Feature Engineering

This step enriches each transaction with **behavioral and contextual signals** using customer transactions against login history. These features help the model understand user behavior and detect anomalies more effectively.

In [3]:


CONFIG = {
    "FAILED_LOGINS_WINDOW_HOURS": 24  # Look-back window for failed logins
}

enriched = []  # List to collect enriched transaction records

for idx, tx in transactions.iterrows():
    cust_id = tx['customer_id']
    tx_time = tx['transaction_timestamp']

    # Get all previous logins for this customer before this transaction
    cust_logins = logins[
        (logins['customer_id'] == cust_id) &
        (logins['login_timestamp'] <= tx_time)
    ]

    # Get the most recent login (if exists)
    last_login = cust_logins.iloc[-1] if not cust_logins.empty else None

    # Count failed logins in the last N hours
    failed_logins_24h = cust_logins[
        (cust_logins['login_timestamp'] >= tx_time - timedelta(hours=CONFIG["FAILED_LOGINS_WINDOW_HOURS"])) &
        (~cust_logins['is_success'])  # unsuccessful logins
    ].shape[0]

    # Initialize flags
    geo_mismatch = not (
        abs(tx['latitude'] - tx['customer_home_latitude']) < 0.01 and
        abs(tx['longitude'] - tx['customer_home_longitude']) < 0.01
    )
    odd_hours = False
    weekend_login = False
    night_login = False

    # If we have a login record, enrich further
    if last_login is not None:
        odd_hours = last_login['login_time_of_day'] == 'Night (12AM-6AM)'
        weekend_login = last_login['is_weekend_login']
        night_login = last_login.get('night_login_attempts', 0) > 0

    # Combine everything
    enriched.append({
        **tx,  # include original transaction data
        'failed_logins_24h': failed_logins_24h,
        'geo_mismatch': geo_mismatch,
        'odd_hours': odd_hours,
        'weekend_login': weekend_login,
        'night_login': night_login,
    })

# print enriched length and sample
print(f"Enriched {len(enriched)} transactions")
enriched_df = pd.DataFrame(enriched)
# Save enriched DataFrame to a new parquet file
enriched_df.head(5)

Enriched 5000 transactions


Unnamed: 0,transaction_id,customer_id,merchant_id,transaction_timestamp,transaction_amount,device_id,ip_address,channel,latitude,longitude,...,customer_age_tier,customer_risk_level,merchant_name,merchant_risk_category,is_high_risk_merchant,failed_logins_24h,geo_mismatch,odd_hours,weekend_login,night_login
0,3869,1781,2761,2022-09-28 23:19:28.025075044,139.914318,dev_5278,192.168.229.73,Web,34.577758,-79.584392,...,45-54,Low,Merchant_2761,Low Risk,False,1,False,False,False,False
1,4037,1814,2762,2022-08-20 14:15:57.733021328,55.682125,dev_2401,192.168.63.194,MobileApp,39.035271,-76.619503,...,35-44,Low,Merchant_2762,Medium Risk,False,12,True,False,True,True
2,1813,1372,2763,2022-06-10 08:59:36.911430950,140.219193,dev_3034,192.168.52.31,Online,33.363359,-99.375539,...,25-34,Low,Merchant_2763,High Risk,True,1,True,False,False,False
3,2461,1500,2764,2022-09-14 11:54:39.310114640,114.270946,dev_5931,192.168.142.149,POS,36.1445,-99.418171,...,45-54,Low,Merchant_2764,Low Risk,False,0,False,False,True,False
4,682,1135,2765,2022-06-13 08:46:13.422083752,150.823299,dev_4704,192.168.41.115,Web,38.170588,-93.892158,...,25-34,Low,Merchant_2765,Low Risk,False,6,True,False,False,True


### Labeling

To train a supervised ML model, we need labeled data — each transaction must have a `label` indicating whether it's fraud (`1`) or not (`0`).

Instead of assigning labels randomly or based on a single condition (e.g. amount > 5000), we designed an **intuitive scoring system** based on domain-inspired fraud patterns.


In [4]:
# import astype

CONFIG = {
    "FAILED_LOGINS_WINDOW_HOURS": 24,  # How far back to look for failed logins
    "FAILED_LOGIN_THRESHOLD": 3,       # Number of failed logins to consider risky
    "FAILED_LOGIN_SCORE": 1,           # Score if failed login threshold is crossed
    "GEO_MISMATCH_SCORE": 2,           # Score if locations don't match

    "RISKY_MERCHANT_SCORE": 2,         # Score for high risk merchant
    "HIGH_RISK_CUSTOMER_SCORE": 2,     # Score for high/medium risk customer
    "PAST_FRAUD_HISTORY_SCORE": 2,     # Score for past fraud by customer
    "ODD_HOURS_SCORE": 1,              # Score for activity at odd hours
    "AMOUNT_OUTLIER_SCORE": 2,         # Score for unusually large/small amounts
    "AMOUNT_ZSCORE_THRESHOLD": 3,      # How "unusual" an amount has to be (statistical outlier)
    "WEEKEND_LOGIN_SCORE": 1,          # Score for weekend login
    "NIGHT_LOGIN_SCORE": 1,            # Score for night-time login
    "RISK_THRESHOLD": 5
}

enriched_df['amount_zscore'] = enriched_df.groupby('customer_id')['transaction_amount'].transform(
    lambda x: (x - x.mean()) / (x.std(ddof=0) if x.std(ddof=0) else 1)
)

for idx, tx in enriched_df.iterrows():
        score = 0
        flags = []

        # Rule 1: Many failed logins recently
        if tx['failed_logins_24h'] > CONFIG["FAILED_LOGIN_THRESHOLD"]:
            score += CONFIG["FAILED_LOGIN_SCORE"]
            flags.append("FAILED_LOGIN")

        # Rule 2: Location doesn't match customer profile
        if tx['geo_mismatch']:
            score += CONFIG["GEO_MISMATCH_SCORE"]
            flags.append("GEO_MISMATCH")

        if tx['is_high_risk_merchant']:
            score += CONFIG["RISKY_MERCHANT_SCORE"]
            flags.append("HIGH_RISK_MERCHANT")

        # Rule 6: High-risk customer
        if tx['customer_risk_level'] in ("High", "Medium"):
            score += CONFIG["HIGH_RISK_CUSTOMER_SCORE"]
            flags.append("CUSTOMER_RISK")

        # Rule 7: Customer has past fraud history
        if tx['customer_has_fraud_history'] or tx['customer_past_fraud_count'] > 0:
            score += CONFIG["PAST_FRAUD_HISTORY_SCORE"]
            flags.append("PAST_FRAUD")

        # Rule 8: Activity at odd hours
        if tx['odd_hours']:
            score += CONFIG["ODD_HOURS_SCORE"]
            flags.append("ODD_HOURS")

        # Rule 9: Weekend login
        if tx['weekend_login']:
            score += CONFIG["WEEKEND_LOGIN_SCORE"]
            flags.append("WEEKEND_LOGIN")

        # Rule 10: Night login
        if tx['night_login']:
            score += CONFIG["NIGHT_LOGIN_SCORE"]
            flags.append("NIGHT_LOGIN")

        # Rule 11: Transaction amount is a big outlier
        if abs(tx['amount_zscore']) > CONFIG["AMOUNT_ZSCORE_THRESHOLD"]:
            score += CONFIG["AMOUNT_OUTLIER_SCORE"]
            flags.append("AMOUNT_OUTLIER")

        # If the score is high enough, add to alerts
        if score >= CONFIG["RISK_THRESHOLD"]:
            enriched_df.at[idx, 'label'] = 1
        else:
            enriched_df.at[idx, 'label'] = 0
        # add score and label to enriched transaction row 
        enriched_df.at[idx, 'risk_score'] = score
        enriched_df.at[idx, 'flags'] = ','.join(flags)

# print enriched length and sample
enriched_df.head(5)

Unnamed: 0,transaction_id,customer_id,merchant_id,transaction_timestamp,transaction_amount,device_id,ip_address,channel,latitude,longitude,...,is_high_risk_merchant,failed_logins_24h,geo_mismatch,odd_hours,weekend_login,night_login,amount_zscore,label,risk_score,flags
0,3869,1781,2761,2022-09-28 23:19:28.025075044,139.914318,dev_5278,192.168.229.73,Web,34.577758,-79.584392,...,False,1,False,False,False,False,-0.816497,0.0,0.0,
1,4037,1814,2762,2022-08-20 14:15:57.733021328,55.682125,dev_2401,192.168.63.194,MobileApp,39.035271,-76.619503,...,False,12,True,False,True,True,-1.052743,1.0,5.0,"FAILED_LOGIN,GEO_MISMATCH,WEEKEND_LOGIN,NIGHT_..."
2,1813,1372,2763,2022-06-10 08:59:36.911430950,140.219193,dev_3034,192.168.52.31,Online,33.363359,-99.375539,...,True,1,True,False,False,False,1.063181,1.0,6.0,"GEO_MISMATCH,HIGH_RISK_MERCHANT,PAST_FRAUD"
3,2461,1500,2764,2022-09-14 11:54:39.310114640,114.270946,dev_5931,192.168.142.149,POS,36.1445,-99.418171,...,False,0,False,False,True,False,-0.281998,0.0,3.0,"PAST_FRAUD,WEEKEND_LOGIN"
4,682,1135,2765,2022-06-13 08:46:13.422083752,150.823299,dev_4704,192.168.41.115,Web,38.170588,-93.892158,...,False,6,True,False,False,True,0.57735,0.0,4.0,"FAILED_LOGIN,GEO_MISMATCH,NIGHT_LOGIN"


### Feature Prep - Encoding

In [5]:
# Feature selection 
feature_cols = [
    "transaction_amount",
    "failed_logins_24h",
    "geo_mismatch",
    "odd_hours",
    "weekend_login",
    "night_login",
    "is_high_risk_merchant",
    "customer_has_fraud_history",
    "customer_past_fraud_count",
    "customer_risk_level",
    "label"
]

# Encoding categorical variables
encoded_df = enriched_df[feature_cols]

encoded_df["customer_risk_level"] = encoded_df["customer_risk_level"].map({
    "Low": 0,
    "Medium": 1,
    "High": 2
})

# print columns
encoded_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  encoded_df["customer_risk_level"] = encoded_df["customer_risk_level"].map({


Unnamed: 0,transaction_amount,failed_logins_24h,geo_mismatch,odd_hours,weekend_login,night_login,is_high_risk_merchant,customer_has_fraud_history,customer_past_fraud_count,customer_risk_level,label
0,139.914318,1,False,False,False,False,False,False,0,0,0.0
1,55.682125,12,True,False,True,True,False,False,0,0,1.0
2,140.219193,1,True,False,False,False,True,True,1,0,1.0
3,114.270946,0,False,False,True,False,False,True,1,0,0.0
4,150.823299,6,True,False,False,True,False,False,0,0,0.0


### Model Training - Trial 1 

In [6]:
# Separate features and target
X = encoded_df.drop(columns=["label"])
y = encoded_df["label"]

# Train-test split (80/20), preserving class distribution
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Print dataset split info
print(f"Train size: {len(X_train)}")
print(f"Test size:  {len(X_test)}\n")
print("🟩 Train set label distribution:\n", y_train.value_counts())
print("\n🟦 Test set label distribution:\n", y_test.value_counts())

# --- 🌲 Step 2: Train Random Forest Model ---

# Initialize model
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42
)

# Fit model to training data
model.fit(X_train, y_train)

# --- 📊 Step 3: Feature Importance ---

# Calculate and display feature importances
importance = pd.Series(model.feature_importances_, index=X.columns)
importance.sort_values(ascending=False, inplace=True)

print("\n📌 Feature Importances:")
print(importance)

Train size: 4000
Test size:  1000

🟩 Train set label distribution:
 label
0.0    3334
1.0     666
Name: count, dtype: int64

🟦 Test set label distribution:
 label
0.0    833
1.0    167
Name: count, dtype: int64

📌 Feature Importances:
is_high_risk_merchant         0.273028
geo_mismatch                  0.194255
night_login                   0.148709
weekend_login                 0.125404
odd_hours                     0.100729
customer_has_fraud_history    0.062303
customer_past_fraud_count     0.053357
failed_logins_24h             0.025651
transaction_amount            0.016564
customer_risk_level           0.000000
dtype: float64


### 🔍 Model Training - Trial 1 - Conclusion

- **Top Feature**:  
  - `is_high_risk_merchant` (~27%) was the most influential predictor of fraud.

- **Behavioral Signals Matter**:  
  - `geo_mismatch`, `night_login`, `weekend_login`, and `odd_hours` collectively held strong predictive value, indicating that unusual login behavior is a key fraud indicator.

- **Transaction Amount is Less Important**:  
  - `transaction_amount` only contributed ~5% to the model’s decisions, suggesting that fraud is more context-driven than value-driven.

- **Historical Risk Has Moderate Impact**:  
  - `customer_past_fraud_count` and `customer_has_fraud_history` showed limited influence but still contributed some signal.

- **No Signal from Risk Level**:  
  - `customer_risk_level` had zero importance, possibly due to low variation, poor encoding, or weak correlation with the fraud label.

- **Conclusion**:  
  - The model learned to detect fraud based on **merchant risk, behavioral anomalies, and contextual signals**, validating the usefulness of your rule-based enrichment features for ML.


### Model Training - Trial 2

In [7]:
# Drop low-value feature based on earlier importance analysis
X_reduced = X.drop(columns=["customer_risk_level"])

# Re-split the data with updated features
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Initialize a balanced Random Forest model
model_refined = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    class_weight='balanced',  # Improves handling of imbalanced fraud classes
    random_state=42
)

# Train the refined model
model_refined.fit(X_train, y_train)

# --- 📊 Step 5: Updated Feature Importances ---

# Compute feature importances again
importance_refined = pd.Series(
    model_refined.feature_importances_,
    index=X_reduced.columns
)
importance_refined.sort_values(ascending=False, inplace=True)

# Display updated importances
print("📌 Updated Feature Importances (after dropping 'customer_risk_level'):")
print(importance_refined)


📌 Updated Feature Importances (after dropping 'customer_risk_level'):
is_high_risk_merchant         0.268796
geo_mismatch                  0.206386
night_login                   0.137342
weekend_login                 0.129403
odd_hours                     0.104234
customer_past_fraud_count     0.067480
customer_has_fraud_history    0.053547
failed_logins_24h             0.021572
transaction_amount            0.011242
dtype: float64


### 🔁 Model Training - Trial 2 - Conclusion (After Feature Refinement)

- **Top Predictors Remain Consistent**:  
  - `is_high_risk_merchant` and `geo_mismatch` continued to be the most influential features, leading the model's fraud detection decisions.

- **Behavioral Signals Are Key**:  
  - Login-related features such as `night_login`, `weekend_login`, and `odd_hours` remained strong contributors, reinforcing the importance of behavioral context.

- **Moderate Contribution from Profile & Amount**:  
  - `transaction_amount`, `customer_past_fraud_count`, and `customer_has_fraud_history` added value, though they had less impact than behavioral and contextual features.

- **Dropped Feature Confirmed as Noise**:  
  - `customer_risk_level` was removed due to 0% importance in the previous trial, and its exclusion did not negatively affect the model.

- **Conclusion**:  
  - This refined model demonstrates that **contextual and behavioral indicators** are more predictive of fraud than profile-based or value-only attributes. Feature pruning improved model clarity without loss of performance.


### Model Evaluation

In [8]:
# --- 📈 Model Evaluation ---
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Predict on test set
y_pred = model_refined.predict(X_test)

# Print metrics
print("✅ Model Evaluation Metrics:")
print(f"Accuracy:  {accuracy_score(y_test, y_pred):.2f}")
print(f"Precision: {precision_score(y_test, y_pred):.2f}")
print(f"Recall:    {recall_score(y_test, y_pred):.2f}")
print(f"F1 Score:  {f1_score(y_test, y_pred):.2f}")



✅ Model Evaluation Metrics:
Accuracy:  1.00
Precision: 1.00
Recall:    0.99
F1 Score:  1.00


### ✅ Model Evaluation Summary -> Ofcourse the results are too good to be true since we are using synthetic data

- **Accuracy (1.00):**  
  The model correctly predicts 99% of all transactions — strong overall performance.

- **Precision (1.00):**  
  When it flags fraud, it's right 97% of the time — minimizing false positives.

- **Recall (0.99):**  
  It successfully identifies 97% of actual fraud cases — very few missed threats.

- **F1 Score (1.00):**  
  A strong balance between precision and recall — the model is both cautious and effective.

---

📌 **Conclusion**:  
The model is highly accurate and reliable, making it well-suited for real-world fraud detection where both precision and recall are critical.


### Export Model

In [9]:
# Save the trained model to a file in the current directory
joblib.dump(model_refined, "fraud_model.pkl")

print("✅ Model saved as 'fraud_model.pkl'")

✅ Model saved as 'fraud_model.pkl'


### Test Model 

In [23]:
import joblib
import pandas as pd

def score_transactions(df):
    """
    Score transactions using the trained model.
    
    Args:
        df (pd.DataFrame): DataFrame containing transaction data with necessary features.
        
    Returns:
        list: List of alerts for flagged transactions.
    """
    alerts = []
    MODEL_PATH = "fraud_model.pkl"
    model = joblib.load(MODEL_PATH)

    # The exact order expected by the model
    feature_cols = [
        'transaction_amount',
        'failed_logins_24h',
        'geo_mismatch',
        'odd_hours',
        'weekend_login',
        'night_login',
        'is_high_risk_merchant',
        'customer_has_fraud_history',
        'customer_past_fraud_count'
    ]

    # Ensure correct types for boolean columns
    bool_cols = [
        'geo_mismatch', 'odd_hours', 'weekend_login', 'night_login',
        'is_high_risk_merchant', 'customer_has_fraud_history'
    ]
    df[bool_cols] = df[bool_cols].astype(int)

    X = df[feature_cols].copy()
    y_pred = model.predict(X)
    print(f"Model predictions: {y_pred}")

    for idx, tx in df.iterrows():
        if y_pred[idx] == 1.0:
            alerts.append({
                'transaction_id': tx.get('transaction_id', idx),
                'customer_id': tx.get('customer_id', None),
                'risk_score': tx.get('risk_score', None),
                'flags': tx.get('flags', None)
            })

    return alerts

# Example usage
df = pd.DataFrame({
    "transaction_amount": [100.0, 200.0, 300.0],
    "failed_logins_24h": [0, 3, 1],
    "geo_mismatch": [0, 1, 0],
    "odd_hours": [0, 1, 0],
    "weekend_login": [0, 1, 0],
    "night_login": [0, 1, 0],
    "is_high_risk_merchant": [0, 1, 0],
    "customer_has_fraud_history": [0, 1, 0],
    "customer_past_fraud_count": [0, 1, 0]
})
alerts = score_transactions(df)
print(alerts)

Model predictions: [0. 1. 0.]
[{'transaction_id': 1, 'customer_id': None, 'risk_score': None, 'flags': None}]
