# In [5]:
import pandas as pd
import numpy as np
from math import radians, sin, cos, asin, sqrt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB

# Turn off pandas' chained-assignment warning globally for this script.
pd.set_option('mode.chained_assignment', None)

# Locations of the CSV files used for fitting and for the final evaluation.
TRAIN_PATH = '/content/fraudTrain.csv'
TEST_PATH = '/content/fraudTest.csv'

def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle (haversine) distance between two points.

    All four arguments are angles in degrees. They may be scalars or
    array-likes (NumPy arrays, pandas Series); array inputs are combined
    element-wise by NumPy.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Returns:
        The distance on a sphere of radius 6371.0 (i.e. kilometres for the
        Earth's mean radius), with the same shape as the inputs.
    """
    R = 6371.0  # sphere radius; the result is expressed in the same unit
    lat1_rad, lon1_rad = np.radians(lat1), np.radians(lon1)
    lat2_rad, lon2_rad = np.radians(lat2), np.radians(lon2)
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    a = (np.sin(dlat / 2)**2
         + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2)**2)
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

def transform_with_fallback(le, series):
    """Encode ``series`` with ``le``, mapping unknown labels to 'Other'.

    Args:
        le: A fitted encoder exposing ``classes_`` and ``transform``.
        series: Iterable of labels to encode.

    Returns:
        Whatever ``le.transform`` returns for the substituted labels.

    Note:
        ``le`` is modified in place: 'Other' is appended to ``le.classes_``
        whenever it is not already one of the known classes.
    """
    known = set(le.classes_)
    # Replace every label the encoder has not seen with the fallback token.
    substituted = [label if label in known else 'Other' for label in series]
    if 'Other' not in known:
        # Register the fallback token on the encoder before transforming.
        le.classes_ = np.append(le.classes_, 'Other')
    return le.transform(substituted)

def preprocess(file_path, fit_scaler=True, scaler=None, label_encoders=None):
    """Load a transactions CSV and build the scaled feature matrix.

    Derived features: ``unix_time`` (seconds since the Unix epoch of the
    transaction time), ``age`` (whole years between ``dob`` and the
    transaction), ``distance`` (haversine distance between the card holder
    and merchant coordinates) and 10-bin buckets of the four coordinates.
    Categorical columns are label-encoded and numeric columns are
    median-imputed and standardised.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV.
        fit_scaler: When True, fit a new ``StandardScaler`` and new
            ``LabelEncoder`` objects on this file. When False, reuse the
            ones passed in.
        scaler: Fitted scaler; required when ``fit_scaler`` is False.
        label_encoders: Dict mapping categorical column name to a fitted
            encoder; required when ``fit_scaler`` is False.

    Returns:
        Tuple ``(X, y, scaler, label_encoders)``.

    Raises:
        ValueError: If the file has no ``is_fraud`` column, or if
            ``fit_scaler`` is False and ``scaler`` / ``label_encoders`` is
            missing.

    NOTE(review): the coordinate bin edges (``pd.cut`` with an integer
    ``bins``) and the imputation medians are derived from the file being
    processed, even when ``fit_scaler`` is False, so they are not guaranteed
    to match the ones used on the training file.
    """
    if not fit_scaler and (scaler is None or label_encoders is None):
        raise ValueError(
            'scaler and label_encoders must be provided when fit_scaler=False.'
        )

    cat_cols = ['merchant', 'category', 'gender', 'job']
    num_cols = [
        'amount', 'city_pop', 'unix_time', 'age', 'distance',
        'lat_bucket', 'long_bucket', 'merch_lat_bucket', 'merch_long_bucket'
    ]
    features = cat_cols + num_cols

    df = pd.read_csv(file_path)
    # Fail fast, before any of the feature engineering below.
    if 'is_fraud' not in df.columns:
        raise ValueError('Missing is_fraud in data.')

    if 'amt' in df.columns and 'amount' not in df.columns:
        df = df.rename(columns={'amt': 'amount'})

    # Unparseable values become NaT and flow through as NaN below.
    trans_time = pd.to_datetime(df['trans_date_trans_time'], errors='coerce')
    dob = pd.to_datetime(df['dob'], errors='coerce')

    # Epoch seconds via timedelta arithmetic: independent of the datetime
    # resolution, and NaT becomes NaN (imputed later) instead of a huge
    # negative integer.
    epoch = pd.Timestamp(0, unit='s', tz=trans_time.dt.tz)
    df['unix_time'] = (trans_time - epoch) // pd.Timedelta(seconds=1)

    # Age in completed years: subtract one if the birthday has not yet
    # occurred in the transaction's calendar year.
    before_birthday = (trans_time.dt.month < dob.dt.month) | (
        (trans_time.dt.month == dob.dt.month)
        & (trans_time.dt.day < dob.dt.day)
    )
    df['age'] = trans_time.dt.year - dob.dt.year - before_birthday.astype(int)

    # haversine_distance is NumPy-based, so whole columns can be passed.
    df['distance'] = haversine_distance(
        df['lat'], df['long'], df['merch_lat'], df['merch_long']
    )

    # Bin lat/lon into equal-width buckets (integer bucket index).
    n_bins = 10
    for col in ('lat', 'long', 'merch_lat', 'merch_long'):
        df[f'{col}_bucket'] = pd.cut(df[col], bins=n_bins, labels=False)

    # Ensure the optional columns exist.
    for col in cat_cols:
        if col not in df.columns:
            df[col] = 'Missing'
    if 'city_pop' not in df.columns:
        df['city_pop'] = 0

    # Selecting the feature columns discards every other column
    # (identifiers, raw coordinates, dob, ...).
    X = df[features].copy()
    y = df['is_fraud'].copy()

    if fit_scaler:
        label_encoders = {}
    for col in cat_cols:
        labels = X[col].astype(str)
        if fit_scaler:
            encoder = LabelEncoder()
            X[col] = encoder.fit_transform(labels)
            label_encoders[col] = encoder
        else:
            # Labels unseen at fit time are mapped to 'Other'.
            X[col] = transform_with_fallback(label_encoders[col], labels)

    # Convert numeric columns to float first so the scaled values can be
    # assigned back without a dtype-mismatch warning.
    X[num_cols] = X[num_cols].astype(float)
    X[num_cols] = X[num_cols].fillna(X[num_cols].median())

    if fit_scaler:
        scaler = StandardScaler()
        X[num_cols] = scaler.fit_transform(X[num_cols])
    else:
        X[num_cols] = scaler.transform(X[num_cols])

    return X, y, scaler, label_encoders

def _classification_scores(y_true, y_pred):
    """Return (accuracy, precision, recall, F1) for the given predictions."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, zero_division=0),
        recall_score(y_true, y_pred, zero_division=0),
        f1_score(y_true, y_pred, zero_division=0),
    )


def _print_scores(heading, acc, prec, rec, f1):
    """Print the four metrics under ``heading``, four decimals each."""
    print(heading)
    print(f'Accuracy:  {acc:.4f}')
    print(f'Precision: {prec:.4f}')
    print(f'Recall:    {rec:.4f}')
    print(f'F1 Score:  {f1:.4f}')


# Fit the preprocessing on the training file and hold out 20% for validation.
X_all, y_all, scaler, label_encoders = preprocess(TRAIN_PATH, fit_scaler=True)
X_train, X_val, y_train, y_val = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42
)

model = GaussianNB()
model.fit(X_train, y_train)

# Held-out validation split.
pred_val = model.predict(X_val)
val_acc, val_prec, val_rec, val_f1 = _classification_scores(y_val, pred_val)
_print_scores('Validation Results:', val_acc, val_prec, val_rec, val_f1)

# Separate test file, transformed with the scaler/encoders fitted above.
X_test, y_test, _, _ = preprocess(
    TEST_PATH, fit_scaler=False, scaler=scaler, label_encoders=label_encoders
)
pred_test = model.predict(X_test)
test_acc, test_prec, test_rec, test_f1 = _classification_scores(y_test, pred_test)
_print_scores('\nTest Results:', test_acc, test_prec, test_rec, test_f1)


# Output:
# Validation Results:
# Accuracy:  0.9876
# Precision: 0.2326
# Recall:    0.4882
# F1 Score:  0.3151
#
# Test Results:
# Accuracy:  0.9886
# Precision: 0.1617
# Recall:    0.4681
# F1 Score:  0.2404
