In [1]:
import pandas as pd
import numpy as np
import sys
import os

# Make the project's 'src' folder importable so the local modules imported
# below (features_stats, model_gnb, model_logreg) can be found.
# The path is resolved relative to the current working directory.
SRC_DIR = os.path.abspath(os.path.join('..', 'src'))

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Import the project's helper functions (feature engineering and model training/evaluation)
from features_stats import add_row_statistics
from model_gnb import train_gnb, evaluate_gnb
from model_logreg import train_logreg, evaluate_logreg
from sklearn.model_selection import train_test_split

In [2]:
# --- Load the training table -------------------------------------------------
# The path is relative to the notebook's working directory.
df = pd.read_csv('../data/train_magic.csv')
print(f"Data Loaded Successfully: {df.shape}")

# --- Separate the target (y) from the features (X) ---------------------------
target_col = 'target'
# Columns that are not model inputs: 'ID_code' and the target itself.
# (Defined here for reference; this cell does not use it.)
drop_cols = ['ID_code', target_col]

# Restrict the starting feature set to the 200 original variables
# (var_0 ... var_199); every other column in the file is left out of X.
original_features = [
    f'var_{i}'
    for i in range(200)
]

y = df[target_col]
X = df[original_features]

print("Features and Target separated.")

Data Loaded Successfully: (200000, 402)
Features and Target separated.


In [3]:
# Compute the row-wise statistics on a fresh copy of X, so the original
# feature frame stays available unchanged for the cells that use it later.
# NOTE(review): whether add_row_statistics mutates its input is not visible
# here; passing a copy keeps X safe either way.
X_stats = add_row_statistics(X.copy(), original_features)

print("New statistical features created.")
print(f"Old shape: {X.shape} -> New shape: {X_stats.shape}")

# Preview the first rows of the extended frame.
X_stats.head()

Generating row-wise statistics...
New statistical features created.
Old shape: (200000, 200) -> New shape: (200000, 207)


Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_197,var_198,var_199,sum,mean,std,min,max,skew,kurt
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,...,8.5635,12.7803,-1.0914,1456.3182,7.281591,9.33154,-21.4494,43.1127,0.10158,1.331023
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,8.7889,18.356,1.9518,1415.3636,7.076818,10.33613,-47.3797,40.5632,-0.351734,4.110215
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,8.2675,14.7222,0.3965,1240.8966,6.204483,8.753387,-22.4038,33.882,-0.056957,0.546438
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,8.245,...,10.2922,17.9697,-8.9996,1288.2319,6.441159,9.594064,-35.1659,38.1015,-0.480116,2.630499
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,9.5031,17.9974,-8.8104,1354.231,6.771155,11.287122,-65.4863,41.1037,-1.463426,9.787399


In [4]:
print("--- Running Gaussian Naive Bayes ---")

# Step 1: hold out 20% of the rows for validation (the other 80% train).
# The fixed random_state makes the split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Step 2: fit the model on the training portion of the original features.
gnb_model = train_gnb(X_train, y_train)

# Step 3: evaluate on the held-out rows; the helper hands back two values,
# unpacked here as the predictions and the score.
print("Evaluating on Validation Set:")
(gnb_preds, gnb_score) = evaluate_gnb(
    gnb_model,
    X_val,
    y_val,
)

--- Running Gaussian Naive Bayes ---
Training Gaussian Naive Bayes...
Evaluating on Validation Set:
Gaussian NB AUC Score: 0.88809


In [5]:
print("--- Running Logistic Regression ---")

# Step 1: split the frame that carries the extra row-statistics columns.
# Same test_size and random_state as the Gaussian NB split.
X_train_s, X_val_s, y_train_s, y_val_s = train_test_split(
    X_stats,
    y,
    random_state=42,
    test_size=0.2,
)

# Step 2: fit the model. The helper returns the model together with a scaler.
# NOTE(review): presumably the scaler is fitted on the training rows only;
# confirm in model_logreg.
(lr_model, scaler) = train_logreg(X_train_s, y_train_s)

# Step 3: evaluate on the held-out rows, passing the same scaler back in.
print("Evaluating on Validation Set:")
(lr_preds, lr_score) = evaluate_logreg(
    lr_model,
    scaler,
    X_val_s,
    y_val_s,
)

--- Running Logistic Regression ---
Training Logistic Regression...
Evaluating on Validation Set:
Logistic Regression AUC Score: 0.86780
