In [22]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import timedelta

fake = Faker('en_IN')
Faker.seed(42)
np.random.seed(42)

indian_locations = [
    ("Mumbai", "Maharashtra", 19.0760, 72.8777), ("Delhi", "Delhi", 28.6139, 77.2090),
    ("Bengaluru", "Karnataka", 12.9716, 77.5946), ("Ahmedabad", "Gujarat", 23.0225, 72.5714),
    ("Kolkata", "West Bengal", 22.5726, 88.3639), ("Chennai", "Tamil Nadu", 13.0827, 80.2707),
    ("Hyderabad", "Telangana", 17.3850, 78.4867), ("Jaipur", "Rajasthan", 26.9124, 75.7873),
    ("Pune", "Maharashtra", 18.5204, 73.8567), ("Surat", "Gujarat", 21.1702, 72.8311),
    ("Lucknow", "Uttar Pradesh", 26.8467, 80.9462), ("Bhopal", "Madhya Pradesh", 23.2599, 77.4126),
    ("Patna", "Bihar", 25.5941, 85.1376), ("Indore", "Madhya Pradesh", 22.7196, 75.8577),
    ("Nagpur", "Maharashtra", 21.1458, 79.0882)
]

transaction_types = ["Send Money", "Receive Money", "Merchant Payment", "Bill Payment"]
sites = ["Amazon", "Flipkart", "PhonePe", "Swiggy", "IRCTC", "Zomato", "Google Play", "Ola", "Uber", "Paytm"]
device_types = ["Mobile", "Tablet", "Desktop"]
banks = ["SBI", "HDFC", "ICICI", "Axis", "Kotak", "PNB", "BOB", "Canara"]

# Weighted city distribution (simulate real traffic)
city_weights = [0.15 if city[0] in ["Mumbai", "Delhi", "Bengaluru", "Kolkata", "Chennai"] else 0.05 for city in indian_locations]

# Weighted transaction types
transaction_type_weights = [0.4, 0.2, 0.2, 0.2]  # Send Money is most frequent

# Weighted popular sites
site_weights = [0.2 if s in ["Amazon", "Flipkart"] else 0.15 if s in ["PhonePe", "Paytm"] else 0.05 for s in sites]

# Weighted banks
bank_weights = [0.25 if b in ["SBI", "ICICI"] else 0.1 for b in banks]

n_transactions = 1000
n_users = 50
user_ids = [fake.random_int(10000000, 99999999) for _ in range(n_users)]

# Initialize per-user data
user_data = {}
for uid in user_ids:
    income = np.random.normal(50_000, 30_000)
    income = max(income, 5000)
    user_data[uid] = {
        "income": income,
        "min_transaction": np.random.uniform(1, 500),
        "max_transaction": np.random.uniform(20_000, 5_00_000),
        "balance": np.random.uniform(10_000, 10_00_000),
        "last_device": None,
        "used_sites": set()
    }

# Generate transactions
transactions = []
start_date = pd.to_datetime("2023-01-01")

for i in range(n_transactions):
    uid = random.choice(user_ids)
    udata = user_data[uid]
    txn_value = np.random.uniform(udata['min_transaction'], udata['max_transaction'])
    txn_time = start_date + timedelta(minutes=random.randint(0, 525600))
    location = random.choices(indian_locations, weights=city_weights, k=1)[0]
    site = random.choices(sites, weights=site_weights)[0]
    lat_jitter, lon_jitter = np.random.uniform(-0.05, 0.05), np.random.uniform(-0.05, 0.05)
    latitude = round(location[2] + lat_jitter, 6)
    longitude = round(location[3] + lon_jitter, 6)

    balance = max(0, udata['balance'] - txn_value if random.random() < 0.6 else udata['balance'] + txn_value)
    device = random.choices(device_types, weights=[0.8, 0.1, 0.1])[0]

    # New flags
    device_change_flag = 1 if udata['last_device'] and device != udata['last_device'] else 0
    unusual_hour = 1 if txn_time.hour >= 0 and txn_time.hour < 5 else 0
    new_site_flag = 1 if site not in udata['used_sites'] else 0

    udata['last_device'] = device
    udata['used_sites'].add(site)

    txn = {
        "transaction_id": f"TXN{i:06d}",
        "account_number": uid,
        "transaction_time": txn_time,
        "transaction_value": round(txn_value, 2),
        "income": round(udata['income'], 2),
        "transaction_type": random.choice(transaction_types),
        "site": site,
        "balance_after_transaction": round(balance, 2),
        "city": location[0],
        "state": location[1],
        "device_type": device,
        "bank": random.choice(banks),
        "latitude": latitude,
        "longitude": longitude,
        "device_change_flag": device_change_flag,
        "unusual_hour": unusual_hour,
        "new_site_flag": new_site_flag
    }

    # Rule-based fraud label
    is_fraud = 0
    if txn["transaction_value"] > 2 * udata['income']:
        is_fraud = 1
    elif device == "Desktop" and txn["transaction_type"] == "Send Money" and random.random() < 0.3:
        is_fraud = 1
    elif random.random() < 0.002:
        is_fraud = 1

    txn["is_fraud"] = is_fraud
    transactions.append(txn)

df = pd.DataFrame(transactions)

# Adjust fraud distribution to exactly 0.1% (i.e., 100 frauds in 1 lakh rows)
df["is_fraud"] = 0
fraud_indices = np.random.choice(df.index, size=500, replace=False)
df.loc[fraud_indices, "is_fraud"] = 1

# Save and preview
df.to_csv("Minimsed_Dataset(1)_enhanced_upi_dataset.csv", index=False)
print(df.head())
print("\n✅ Enhanced dataset saved as 'enhanced_upi_dataset.csv' with", len(df), "rows.")

  transaction_id  account_number    transaction_time  transaction_value  \
0      TXN000000        71662963 2023-01-22 08:55:00            1118.97   
1      TXN000001        23756669 2023-11-09 14:18:00          215616.89   
2      TXN000002        39587039 2023-03-18 01:28:00           92566.13   
3      TXN000003        66306997 2023-09-26 11:02:00          104370.25   
4      TXN000004        22981052 2023-11-26 18:16:00          199677.40   

     income  transaction_type      site  balance_after_transaction     city  \
0  57796.48  Merchant Payment    Amazon                  727577.15    Delhi   
1  43226.71        Send Money  Flipkart                       0.00    Delhi   
2   5000.00  Merchant Payment     Paytm                       0.00    Delhi   
3  52752.82  Merchant Payment  Flipkart                  520649.35  Kolkata   
4  20283.91     Receive Money    Amazon                  481111.32   Jaipur   

         state device_type    bank   latitude  longitude  device_change_fl

In [23]:
import pandas as pd

# Load your dataset
df = pd.read_csv("/Users/jalajtrivedi/iit/pytorchDL/Minimsed_Dataset(1)_enhanced_upi_dataset.csv")  # or use your DataFrame directly

# Target column is 'is_fraud'
df['is_fraud'] = df['is_fraud'].astype(int)  # Make sure it's binary (0/1)

In [24]:
#Feature engineering
df['transaction_time'] = pd.to_datetime(df['transaction_time'])
df['transaction_hour'] = df['transaction_time'].dt.hour
df['transaction_day'] = df['transaction_time'].dt.day
df['transaction_dayofweek'] = df['transaction_time'].dt.dayofweek
df['is_weekend'] = df['transaction_dayofweek'].apply(lambda x: 1 if x >= 5 else 0)

In [25]:
columns_to_drop = ['longitude', 'latitude', 'state','transaction_time',
       'transaction_day', 'transaction_dayofweek']

df = df.drop(columns_to_drop, axis=1)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   transaction_id             1000 non-null   object 
 1   account_number             1000 non-null   int64  
 2   transaction_value          1000 non-null   float64
 3   income                     1000 non-null   float64
 4   transaction_type           1000 non-null   object 
 5   site                       1000 non-null   object 
 6   balance_after_transaction  1000 non-null   float64
 7   city                       1000 non-null   object 
 8   device_type                1000 non-null   object 
 9   bank                       1000 non-null   object 
 10  device_change_flag         1000 non-null   int64  
 11  unusual_hour               1000 non-null   int64  
 12  new_site_flag              1000 non-null   int64  
 13  is_fraud                   1000 non-null   int64 

In [27]:
import pandas as pd
from pycaret.classification import setup, compare_models, predict_model
from sklearn.model_selection import train_test_split

In [28]:
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.replace(' ', '_')

In [29]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['is_fraud'])

In [30]:
clf_setup = setup(
    data=train_df,
    target='is_fraud',
    session_id=123,
    fix_imbalance=True,
    normalize=True,
    feature_selection=True,
    verbose=False
)

best_model = compare_models(sort='F1')  # you can use sort='Recall' if catching fraud is priority

[LightGBM] [Info] Number of positive: 280, number of negative: 280
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000827 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 619
[LightGBM] [Info] Number of data points in the train set: 560, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.4911,0.5195,0.7536,0.495,0.5946,-0.0179,-0.0321,0.124
knn,K Neighbors Classifier,0.5482,0.5298,0.5536,0.5486,0.5502,0.0964,0.0968,0.082
lr,Logistic Regression,0.4536,0.4494,0.4929,0.4524,0.4686,-0.0929,-0.094,0.381
ridge,Ridge Classifier,0.4536,0.4501,0.4929,0.4524,0.4686,-0.0929,-0.094,0.085
lda,Linear Discriminant Analysis,0.4536,0.4501,0.4929,0.4524,0.4686,-0.0929,-0.094,0.086
nb,Naive Bayes,0.4429,0.4204,0.4893,0.4466,0.4644,-0.1143,-0.1161,0.086
svm,SVM - Linear Kernel,0.4554,0.4617,0.3857,0.444,0.3974,-0.0893,-0.1017,0.081
dt,Decision Tree Classifier,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.096
rf,Random Forest Classifier,0.5,0.5162,0.0,0.0,0.0,0.0,0.0,0.101
qda,Quadratic Discriminant Analysis,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.082


In [31]:
best_model = compare_models(sort='F1')  # or sort='Recall' if you care more about catching fraud

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.4911,0.5195,0.7536,0.495,0.5946,-0.0179,-0.0321,0.127
knn,K Neighbors Classifier,0.5482,0.5298,0.5536,0.5486,0.5502,0.0964,0.0968,0.1
lr,Logistic Regression,0.4536,0.4494,0.4929,0.4524,0.4686,-0.0929,-0.094,0.089
ridge,Ridge Classifier,0.4536,0.4501,0.4929,0.4524,0.4686,-0.0929,-0.094,0.116
lda,Linear Discriminant Analysis,0.4536,0.4501,0.4929,0.4524,0.4686,-0.0929,-0.094,0.094
nb,Naive Bayes,0.4429,0.4204,0.4893,0.4466,0.4644,-0.1143,-0.1161,0.104
svm,SVM - Linear Kernel,0.4554,0.4617,0.3857,0.444,0.3974,-0.0893,-0.1017,0.1
dt,Decision Tree Classifier,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.091
rf,Random Forest Classifier,0.5,0.5162,0.0,0.0,0.0,0.0,0.0,0.13
qda,Quadratic Discriminant Analysis,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.114


In [19]:
test_predictions = predict_model(best_model, data=test_df)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.45,0.405,0.3,0.4286,0.3529,-0.1,-0.1048


In [32]:
print(test_predictions.columns)

Index(['transaction_id', 'account_number', 'transaction_value', 'income',
       'transaction_type', 'site', 'balance_after_transaction', 'city',
       'device_type', 'bank', 'device_change_flag', 'unusual_hour',
       'new_site_flag', 'transaction_hour', 'is_weekend', 'is_fraud',
       'prediction_label', 'prediction_score'],
      dtype='object')


In [33]:
# Replace spaces in test_df as well (in case you missed it)
for col in test_df.select_dtypes(include='object').columns:
    test_df[col] = test_df[col].str.replace(' ', '_')

# Predict on the test set
test_predictions = predict_model(best_model, data=test_df)

# View predictions
test_predictions[['is_fraud', 'prediction_label', 'prediction_score']].head()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.47,0.5198,0.74,0.4805,0.5827,-0.06,-0.0713


Unnamed: 0,is_fraud,prediction_label,prediction_score
815,1,1,0.55
597,0,1,0.54
444,0,0,0.52
112,0,0,0.51
224,1,1,0.52
