In [None]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Columns removed before modelling: raw datetimes, identifiers, free-text
# personal/address fields and coordinates.
_UNUSED_COLUMNS = (
    'trans_date_trans_time', 'dob',
    'Unnamed: 0', 'cc_num', 'trans_num', 'street',
    'first', 'last', 'city', 'state', 'zip',
    'lat', 'long', 'merch_lat', 'merch_long',
)


def _encode_column(values, encoder):
    """
    Maps a text column to the integer codes of an already fitted LabelEncoder.

    Values the encoder did not see while fitting are mapped to -1 instead of
    raising, so a later file may contain categories the first file lacked.
    """
    codes = {label: code for code, label in enumerate(encoder.classes_)}
    return values.astype(str).map(codes).fillna(-1).astype('int64')


def load_preprocess_data(file_path, preprocessors=None):
    """
    Loads a fraud CSV, removes/encodes columns so that ONLY numeric features remain.

    Args:
        file_path: Location of the CSV, passed straight to ``pd.read_csv``.
        preprocessors: Optional dict carrying fitted preprocessing state
            between calls.
            * ``None`` (default): encoders and scaler are fitted on this file
              and then discarded (the original single-file behaviour).
            * empty dict: encoders and scaler are fitted on this file and
              stored in the dict (keys ``columns``, ``encoders``,
              ``numeric_cols``, ``scaler``).
            * dict filled by an earlier call: the stored encoders and scaler
              are applied unchanged, so this file is transformed exactly like
              the file they were fitted on.

    Returns:
        (X, y): the preprocessed feature DataFrame and the ``is_fraud`` Series.

    Raises:
        ValueError: If the file has no ``is_fraud`` column, or if fitted
            ``preprocessors`` are supplied and the file lacks a feature column
            that was present when they were fitted.
    """
    df = pd.read_csv(file_path)

    if 'is_fraud' not in df.columns:
        raise ValueError("No 'is_fraud' column found in the dataset!")

    y = df['is_fraud']
    # errors='ignore' skips any listed column this particular file lacks.
    X = df.drop(columns=['is_fraud', *_UNUSED_COLUMNS], errors='ignore')

    reuse = bool(preprocessors)

    if reuse:
        expected = preprocessors['columns']
        missing = [col for col in expected if col not in X.columns]
        if missing:
            raise ValueError(
                f"Dataset is missing feature columns seen during fitting: {missing}"
            )
        # Same features in the same order as at fit time; extras are dropped.
        X = X[expected].copy()
        encoders = preprocessors['encoders']
        numeric_cols = preprocessors['numeric_cols']
        scaler = preprocessors['scaler']
    else:
        # One encoder per remaining text column, fitted on this file.
        encoders = {
            col: LabelEncoder().fit(X[col].astype(str))
            for col in X.columns
            if X[col].dtype == 'object'
        }

    for col, encoder in encoders.items():
        X[col] = _encode_column(X[col], encoder)

    if not reuse:
        # Every numeric column (including the freshly encoded ones) is scaled.
        numeric_cols = list(X.select_dtypes(include=[np.number]).columns)
        scaler = StandardScaler().fit(X[numeric_cols])

    X[numeric_cols] = scaler.transform(X[numeric_cols])

    if preprocessors is not None and not reuse:
        preprocessors.update(
            columns=list(X.columns),
            encoders=encoders,
            numeric_cols=numeric_cols,
            scaler=scaler,
        )

    return X, y


def _report_metrics(title, y_true, y_pred):
    """Prints accuracy, precision, recall and F1 for one set of predictions."""
    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 reports 0 instead of warning when a denominator is zero.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f"\n=== {title} ===")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")


def naive_bayes_example(train_path='fraudTrain.csv', test_path='fraudTest.csv'):
    """
    1) Loads & preprocesses train and test data
    2) Splits the train data into (train/val)
    3) Trains a Naive Bayes model
    4) Evaluates on val set and final test set

    The encoders and scaler are fitted on the training file only and reused
    for the test file, so the model sees test features on the same scale and
    with the same category codes it was trained on.

    Args:
        train_path: CSV used for fitting the preprocessing, training and
            validation.
        test_path: CSV used for the final evaluation.
    """
    # Filled by the first load_preprocess_data call, reused by the second.
    preprocessors = {}

    print("\n=== Loading and Preprocessing Training Data ===")
    X, y = load_preprocess_data(train_path, preprocessors)

    # NOTE: the scaler was fitted on the whole training file before this
    # split, so the validation rows contributed to its mean/variance.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Training Naive Bayes
    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)

    # Validating
    _report_metrics("Validation Results (Naive Bayes)", y_val, nb_model.predict(X_val))

    print("\n=== Loading and Preprocessing Test Data ===")
    X_test, y_test = load_preprocess_data(test_path, preprocessors)

    _report_metrics("Test Results (Naive Bayes)", y_test, nb_model.predict(X_test))


# Script entry point: run the demo with its default train/test CSV paths.
if __name__ == "__main__":
    naive_bayes_example()




=== Loading and Preprocessing Training Data ===

=== Validation Results (Naive Bayes) ===
Accuracy:  0.9875
Precision: 0.2310
Recall:    0.4868
F1 Score:  0.3134

=== Loading and Preprocessing Test Data ===

=== Test Results (Naive Bayes) ===
Accuracy:  0.9886
Precision: 0.1620
Recall:    0.4699
F1 Score:  0.2410
