# 03 — Model Training
## Nifty 50 Return Direction Forecasting
This notebook trains and compares three ML models on a time-based 
80/20 train-test split to predict next day return direction.

**Models:** Logistic Regression, Random Forest, XGBoost  
**Baseline:** Naive always-Up strategy (52.8% accuracy)  
**Best Model:** Logistic Regression (53.9% accuracy)

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score

# Load featured data
df = pd.read_csv('../data/nifty50_features.csv', index_col=0)
df.index = pd.to_datetime(df.index)

# Updated features list
features = ['Return', 'Return_lag1', 'Return_lag2', 'Return_lag3',
            'Volatility_10', 'RSI', 'MA_Cross', 
            'MA5_20_ratio', 'MA20_50_ratio']

X = df[features]
y = df['Target']

# Time based split
split = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

print(f"Train size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

Train size: 2158
Test size: 540


In [2]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 1. Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_scaled, y_train)
lr_pred = lr.predict(X_test_scaled)

# 2. Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# 3. XGBoost
xgb = XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

# Naive baseline — always predict Up
baseline_pred = np.ones(len(y_test))

# Results comparison
print("=" * 45)
print(f"{'Model':<25} {'Accuracy':>10} {'ROC-AUC':>10}")
print("=" * 45)
print(f"{'Naive Baseline':<25} {accuracy_score(y_test, baseline_pred):>10.3f} {'N/A':>10}")
print(f"{'Logistic Regression':<25} {accuracy_score(y_test, lr_pred):>10.3f} {roc_auc_score(y_test, lr_pred):>10.3f}")
print(f"{'Random Forest':<25} {accuracy_score(y_test, rf_pred):>10.3f} {roc_auc_score(y_test, rf_pred):>10.3f}")
print(f"{'XGBoost':<25} {accuracy_score(y_test, xgb_pred):>10.3f} {roc_auc_score(y_test, xgb_pred):>10.3f}")
print("=" * 45)

Model                       Accuracy    ROC-AUC
Naive Baseline                 0.528        N/A
Logistic Regression            0.539      0.517
Random Forest                  0.506      0.500
XGBoost                        0.517      0.513


In [3]:
import joblib
import os

os.makedirs('../outputs/models', exist_ok=True)

joblib.dump(lr, '../outputs/models/logistic_regression.pkl')
joblib.dump(rf, '../outputs/models/random_forest.pkl')
joblib.dump(xgb, '../outputs/models/xgboost.pkl')
joblib.dump(scaler, '../outputs/models/scaler.pkl')

print("Models saved successfully!")

Models saved successfully!
