In [11]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


In [12]:

pip install scikit-learn pandas joblib




In [13]:
import os

os.makedirs("lib/transaction", exist_ok=True)

features_code = r'''
import math
import pandas as pd

# words that indicate refund / verification related language
REFUND_WORDS = ["refund", "cashback", "return"]
VERIFY_WORDS = ["verify", "verification", "test transaction", "test payment"]

def shannon_entropy(s: str) -> float:
    """Compute character-level entropy of a string."""
    if not s:
        return 0.0
    freq = {}
    for ch in s:
        freq[ch] = freq.get(ch, 0) + 1
    probs = [c / len(s) for c in freq.values()]
    return -sum(p * math.log2(p) for p in probs)

def text_contains_any(text: str, words) -> int:
    text_l = text.lower()
    return int(any(w in text_l for w in words))

def extract_basic_fields(record: dict):
    """Normalize raw input into amount, upi_id, raw_text."""
    raw_amount = record.get("amount")
    if raw_amount is None:
        amount = 0.0
    else:
        try:
            amount = float(str(raw_amount).replace("‚Çπ", "").strip())
        except Exception:
            amount = 0.0

    upi_id = record.get("upi_id") or ""
    raw_text = record.get("raw_text") or ""
    return amount, upi_id, raw_text

def extract_features_from_record(record: dict) -> dict:
    """
    Input:
      record = { "amount": ..., "upi_id": ..., "raw_text": ... }
    Output:
      dict of features for ML model.
    """
    amount, upi_id, raw_text = extract_basic_fields(record)

    # 1) is_round_amount ‚Üí 10, 20, 100 etc.
    is_round_amount = int(amount % 10 == 0) if amount > 0 else 0

    # 2) refund_like_words ‚Üí refund OR verify-ish wording
    has_refund = text_contains_any(raw_text, REFUND_WORDS)
    has_verify = text_contains_any(raw_text, VERIFY_WORDS)
    refund_like_words = int(has_refund or has_verify)

    # 3) ‚Çπ1 / ‚Çπ10 pattern
    is_one_or_ten_pattern = int(amount in [1, 10])

    # 4) upi_entropy
    upi_entropy_val = shannon_entropy(upi_id)

    return {
        "amount": amount,
        "is_round_amount": is_round_amount,
        "refund_like_words": refund_like_words,
        "is_one_or_ten_pattern": is_one_or_ten_pattern,
        "upi_entropy": upi_entropy_val,
    }

def records_to_feature_df(records):
    """Convert list[dict] ‚Üí pandas.DataFrame of features."""
    feat_list = [extract_features_from_record(r) for r in records]
    return pd.DataFrame(feat_list)
'''

with open("lib/transaction/features.py", "w") as f:
    f.write(features_code)

print("‚úÖ Created lib/transaction/features.py")


‚úÖ Created lib/transaction/features.py


In [14]:
import pandas as pd
import random
import os
import numpy as np

# =========================================
# CONFIG
# =========================================
NUM_ROWS = 10000   # üî• 10K samples
SAVE_DIR = "data/transaction"
SAVE_PATH = f"{SAVE_DIR}/dataset_transaction.csv"

os.makedirs(SAVE_DIR, exist_ok=True)
random.seed(42)
np.random.seed(42)

# =========================================
# SAMPLE PATTERNS FOR SYNTHETIC GENERATION
# =========================================

refund_texts = [
    "Please send ‚Çπ1 to verify your refund",
    "Refund pending, complete ‚Çπ1 verification",
    "Refund blocked, send ‚Çπ1 to activate",
    "To process refund, send ‚Çπ10 immediately",
    "Refund verification required for your transaction",
    "Your refund will fail unless you send ‚Çπ1 now",
    "Refund is on hold, verify with ‚Çπ1",
]

verification_texts = [
    "Pay ‚Çπ10 to verify FASTag KYC",
    "Send ‚Çπ1 test transaction to activate account",
    "Complete safety verification by paying ‚Çπ5",
    "Pay ‚Çπ2 to confirm your booking verification",
    "‚Çπ1 test payment required to proceed",
    "Verification incomplete, please send ‚Çπ10",
    "Confirm your account by paying ‚Çπ2",
]

kyc_texts = [
    "Your KYC has expired, send ‚Çπ1 to update",
    "KYC blocked, pay ‚Çπ10 to unlock account",
    "Your Aadhaar KYC needs verification",
    "Bank KYC failed, complete with ‚Çπ1",
    "Update your KYC immediately to avoid blocking",
    "KYC verification pending, send ‚Çπ2 now",
]

imp_texts = [
    "I am Amazon support, please send ‚Çπ1 for verification",
    "I am bank officer, pay ‚Çπ10 to activate your card",
    "This is Paytm support, complete ‚Çπ1 test payment",
    "We detected fraud, send ‚Çπ2 to secure your account",
    "Instagram account recovery, pay ‚Çπ5 immediately",
    "This is official helpline, send ‚Çπ1 to continue",
]

safe_texts = [
    "Payment to ABC Store",
    "Payment to Grocery Store",
    "Payment to Flipkart Seller",
    "Payment to Myntra Merchant",
    "Paid for household items",
    "Electricity bill payment",
    "Mobile recharge payment",
    "Restaurant bill payment",
]

upi_ids_scam = [
    "refundverify@oksbi", "testupi@upi", "fakehelp@okaxis",
    "kycupdate@upi", "supportcheck@oksbi", "helpdesk01@upi",
    "secureupdate@upi", "kycblock@oksbi", "alertsupport@upi"
]

upi_ids_safe = [
    "legitstore@okaxis", "grocerystore@upi", "merchantpay@oksbi",
    "flipkartstore@okicici", "myntra@upi", "electricityboard@upi",
    "rechargecenter@upi", "restaurant@oksbi"
]

labels = ["refund_scam", "verification_fraud", "kyc_scam", "impersonation", "safe"]

def generate_row(label):
    # Some random jitter in amount to avoid all being exactly 1/10 for fraud
    if label == "refund_scam":
        base = random.choice([1, 10])
        amount = base + random.choice([0, 0, 0, 1, -1])  # mostly exact, sometimes +/-1
        text = random.choice(refund_texts)
        upi = random.choice(upi_ids_scam)

    elif label == "verification_fraud":
        base = random.choice([1, 2, 5, 10])
        amount = base + random.choice([0, 0, 1, -1])
        text = random.choice(verification_texts)
        upi = random.choice(upi_ids_scam)

    elif label == "kyc_scam":
        base = random.choice([1, 2, 10])
        amount = base + random.choice([0, 0, 1, -1])
        text = random.choice(kyc_texts)
        upi = random.choice(upi_ids_scam)

    elif label == "impersonation":
        base = random.choice([1, 2, 5, 10])
        amount = base + random.choice([0, 0, 1, -1])
        text = random.choice(imp_texts)
        upi = random.choice(upi_ids_scam)

    else:  # safe
        amount = random.choice([99, 149, 199, 249, 299, 349, 399, 499, 799, 999])
        text = random.choice(safe_texts)
        upi = random.choice(upi_ids_safe)

    # Ensure amount is at least 1
    if amount <= 0:
        amount = 1

    return [amount, upi, text, label]

# =========================================
# GENERATE 10K DATASET
# =========================================
rows = []
for _ in range(NUM_ROWS):
    label = random.choice(labels)
    rows.append(generate_row(label))

df = pd.DataFrame(rows, columns=["amount", "upi_id", "raw_text", "label"])

df.to_csv(SAVE_PATH, index=False)
print(f"‚úÖ Generated {len(df)} rows and saved to: {SAVE_PATH}")
df.head()
df['label'].value_counts()


‚úÖ Generated 10000 rows and saved to: data/transaction/dataset_transaction.csv


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
impersonation,2063
kyc_scam,2018
refund_scam,2000
verification_fraud,1992
safe,1927


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Split into train & test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Create RandomForest (simple but strong)
clf = RandomForestClassifier(
    n_estimators=200,
    n_jobs=-1,
    random_state=42
)

clf.fit(X_train, y_train)

print("‚úÖ Model trained")
print("Train accuracy:", clf.score(X_train, y_train))
print("Test accuracy:", clf.score(X_test, y_test))

# Detailed report
y_pred = clf.predict(X_test)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))


‚úÖ Model trained
Train accuracy: 0.825
Test accuracy: 0.6

Classification Report:

                    precision    recall  f1-score   support

     impersonation       0.00      0.00      0.00         4
          kyc_scam       0.60      0.75      0.67         4
       refund_scam       0.50      0.50      0.50         4
              safe       1.00      1.00      1.00         5
verification_fraud       0.40      0.67      0.50         3

          accuracy                           0.60        20
         macro avg       0.50      0.58      0.53        20
      weighted avg       0.53      0.60      0.56        20


Confusion Matrix:

[[0 2 1 0 1]
 [0 3 0 0 1]
 [1 0 2 0 1]
 [0 0 0 5 0]
 [0 0 1 0 2]]


In [16]:
import joblib
import os

os.makedirs("models/transaction", exist_ok=True)
model_path = "models/transaction/transaction_model.pkl"

joblib.dump(clf, model_path)
print(f"‚úÖ Saved model to: {model_path}")


‚úÖ Saved model to: models/transaction/transaction_model.pkl


In [17]:
import pandas as pd
import joblib
from lib.transaction.features import extract_features_from_record

# Load saved model
clf = joblib.load("models/transaction/transaction_model.pkl")

# Example: suspicious refund-style ‚Çπ1 message
test_record_scam = {
    "amount": 1,
    "upi_id": "refundverify@oksbi",
    "raw_text": "Please send ‚Çπ1 to verify your refund",
}

# Example: safe purchase
test_record_safe = {
    "amount": 499,
    "upi_id": "legitstore@okaxis",
    "raw_text": "Payment to ABC Store",
}

def get_transaction_risk(record):
    feats = extract_features_from_record(record)
    X_one = pd.DataFrame([feats])
    proba = clf.predict_proba(X_one)[0]
    classes = list(clf.classes_)

    # define risk as: 1 - P(safe)
    if "safe" in classes:
        safe_index = classes.index("safe")
        risk = 1.0 - proba[safe_index]
    else:
        # fallback: max prob of any fraud class
        risk = float(max(proba))

    pred_label = clf.predict(X_one)[0]
    return {
        "pred_label": pred_label,
        "risk": float(risk),
        "proba_per_class": dict(zip(classes, map(float, proba)))
    }

print("‚ö†Ô∏è Scam-like example:")
print(get_transaction_risk(test_record_scam))

print("\n‚úÖ Safe example:")
print(get_transaction_risk(test_record_safe))



‚ö†Ô∏è Scam-like example:
{'pred_label': 'refund_scam', 'risk': 1.0, 'proba_per_class': {'impersonation': 0.3357936507936509, 'kyc_scam': 0.01725, 'refund_scam': 0.6177896825396828, 'safe': 0.0, 'verification_fraud': 0.02916666666666667}}

‚úÖ Safe example:
{'pred_label': 'safe', 'risk': 0.0, 'proba_per_class': {'impersonation': 0.0, 'kyc_scam': 0.0, 'refund_scam': 0.0, 'safe': 1.0, 'verification_fraud': 0.0}}
