In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Seed BOTH random sources used by this notebook. The categorical fields,
# the failure draw, the status code, the error reason and the timestamp
# offset are all drawn from the stdlib `random` module; only the latency
# noise comes from NumPy. Seeding NumPy alone therefore leaves most of the
# dataset different on every run.
# NOTE: timestamps are anchored to datetime.now(), so their absolute values
# still shift between runs even with both generators seeded.
np.random.seed(42)
random.seed(42)

# Names of the simulated verification APIs; one is picked per log row.
apis = [
    "PAN_OCR", "AADHAAR_OCR", "FACE_MATCH", "BANK_VERIFY", "DL_OCR",
    "PASSPORT_OCR", "MOBILE_VERIFY", "EMAIL_VERIFY", "ADDRESS_MATCH",
    "FRAUD_SCORE", "LIVENESS_CHECK", "KYC_COMPOSITE"
]

# Categorical dimensions attached to every generated request.
versions = ["v1", "v2", "v3"]
regions = ["Metro", "Tier1", "Tier2", "Rural"]
documents = ["PAN", "Aadhaar", "Passport", "DrivingLicense"]
clients = ["FinTech", "E-Commerce", "Bank", "Telecom"]

# Possible failure reasons. The trailing None stands for "no error"; the
# row generator slices it off (error_reasons[:-1]) when choosing a reason
# for a failed call, so it must stay the LAST element.
error_reasons = [
    "blurry_image", "timeout", "invalid_document",
    "fraud_suspected", "service_down", None
]

# Build 120,000 synthetic API-call records spread over the 30 days that
# precede the moment this cell runs.
rows = []
start = datetime.now() - timedelta(days=30)

# Loop-invariant lookup tables, built once instead of on every iteration.
# Mean latency per region, in the same unit as the latency_ms column.
region_base_latency = {
    "Metro": 200,
    "Tier1": 300,
    "Tier2": 450,
    "Rural": 650,
}
high_failure_regions = ("Tier2", "Rural")
ocr_apis = ("PAN_OCR", "AADHAAR_OCR")
failure_codes = [400, 401, 500, 504]
# Drop the trailing None so a failed call always gets a concrete reason.
failure_reasons = error_reasons[:-1]

for _ in range(120000):
    # The order of the random draws below is significant: changing it would
    # change the data produced for a given generator state.
    api = random.choice(apis)
    version = random.choice(versions)
    region = random.choice(regions)
    doc = random.choice(documents)
    client = random.choice(clients)

    # Gaussian noise (std 80) around the regional mean, truncated to an
    # integer and floored at 50.
    base_latency = region_base_latency[region]
    latency = max(50, int(np.random.normal(base_latency, 80)))

    # Baseline 8% failure rate, raised for Tier2/Rural regions and for the
    # PAN/Aadhaar OCR endpoints (the two bumps stack).
    failure_prob = 0.08
    if region in high_failure_regions:
        failure_prob += 0.07
    if api in ocr_apis:
        failure_prob += 0.05

    failed = random.random() < failure_prob
    status = "fail" if failed else "success"

    if failed:
        # Code and reason are drawn independently of each other.
        code = random.choice(failure_codes)
        reason = random.choice(failure_reasons)
    else:
        code, reason = 200, None

    # Uniform minute offset within the window; 43200 minutes == 30 days.
    ts = start + timedelta(minutes=random.randint(0, 43200))

    rows.append([
        ts, api, version, region, doc, client,
        status, code, latency, reason
    ])

# Column names, in the same positional order as the values appended to
# each entry of `rows`.
log_columns = [
    "timestamp",
    "api_name",
    "api_version",
    "region",
    "document_type",
    "client_type",
    "status",
    "status_code",
    "latency_ms",
    "error_reason",
]
df = pd.DataFrame(data=rows, columns=log_columns)

# Write the raw logs without the positional index column. The path is
# relative to the notebook's working directory.
df.to_csv("../data/raw_api_logs.csv", index=False)

# Last expression of the cell: shows the first rows as the cell output.
df.head()



Unnamed: 0,timestamp,api_name,api_version,region,document_type,client_type,status,status_code,latency_ms,error_reason
0,2026-01-12 12:20:01.531176,AADHAAR_OCR,v3,Rural,Passport,Telecom,success,200,689,
1,2026-01-02 05:27:01.531176,FACE_MATCH,v3,Tier1,DrivingLicense,FinTech,fail,400,288,service_down
2,2025-12-21 06:49:01.531176,MOBILE_VERIFY,v3,Tier2,PAN,FinTech,fail,500,501,fraud_suspected
3,2026-01-01 05:55:01.531176,BANK_VERIFY,v3,Rural,PAN,FinTech,success,200,771,
4,2026-01-09 18:57:01.531176,PASSPORT_OCR,v2,Tier1,Passport,FinTech,success,200,281,
