In [None]:
# Feature engineering: build model features and targets from the raw NIFTY spot/futures data


import pandas as pd
import numpy as np

# ================== CONFIGURATION ==================
# Raw input file and destination of the engineered feature table.
main_file = "/content/drive/MyDrive/colab/Copy of nifty_spot_fut_data.csv"
output_file = "/content/drive/MyDrive/colab/nifty_all_features_with_targets_new.csv"

# ================== LOAD & PARSE DATA ==================
# Read the raw file, parse the date and time-of-day columns, then order rows chronologically.
df = (
    pd.read_csv(main_file)
    .assign(
        date=lambda raw: pd.to_datetime(raw['date'], format='%Y-%m-%d'),
        time=lambda raw: pd.to_datetime(raw['time'], format='%H:%M:%S').dt.time,
    )
    .sort_values(['date', 'time'])
)

# ================== FEATURE ENGINEERING ==================
# Combined ATM implied volatility: call IV plus put IV.
df['atm_iv'] = df['atm_c_iv'] + df['atm_p_iv']
grouped = df.groupby('date', group_keys=False)

# Running intraday high/low of spot, and how far the current price sits from each.
spot_by_day = grouped['spot_ltp']
df['curr_day_high'] = spot_by_day.cummax()
df['curr_day_low'] = spot_by_day.cummin()
df['curr_day_high_diff'] = df['curr_day_high'] - df['spot_ltp']
df['curr_day_low_diff'] = df['spot_ltp'] - df['curr_day_low']

# Highs/lows of the preceding 5, 3 and 1 trading days.
# Everything is built from the series shifted by one day, so the current day never
# contributes to its own look-back values.
daily_agg = df.groupby('date')['spot_ltp'].agg(daily_low='min', daily_high='max')
prev_high = daily_agg['daily_high'].shift(1)
prev_low = daily_agg['daily_low'].shift(1)
for n_days in (5, 3):
    daily_agg[f'{n_days}d_high'] = prev_high.rolling(n_days, min_periods=1).max()
    daily_agg[f'{n_days}d_low'] = prev_low.rolling(n_days, min_periods=1).min()
daily_agg['1d_high'] = prev_high
daily_agg['1d_low'] = prev_low

# Broadcast the per-day values onto every intraday row of that day.
lookback_cols = [f'{span}_{side}' for span in ('5d', '3d', '1d') for side in ('high', 'low')]
df = df.merge(daily_agg[lookback_cols], left_on='date', right_index=True, how='left')

# Signed distance of the current spot price from each look-back level.
for level_col in lookback_cols:
    df[f'diff_{level_col}'] = df['spot_ltp'] - df[level_col]

# Rolling window high/low and related features.
# Windows are counted in rows within a single trading day.
# NOTE(review): the "s" suffix suggests one row per second — confirm against the data.
windows = [60, 180, 300, 600, 1500, 3600]


def _per_day(column, func):
    """Apply `func` to `column` one trading day at a time; result is aligned to the row index."""
    return grouped[column].transform(func)


for w in windows:
    df[f'high_{w}s'] = _per_day('spot_ltp', lambda s: s.rolling(w, min_periods=1).max())
    df[f'low_{w}s'] = _per_day('spot_ltp', lambda s: s.rolling(w, min_periods=1).min())
    for side in ('high', 'low'):
        df[f'diff_{side}_{w}s'] = df['spot_ltp'] - df[f'{side}_{w}s']

# Volatility, returns, stability for spot, future, and IV.
# (column prefix, source column) pairs, processed in this order for every window.
price_sources = (('spot', 'spot_ltp'), ('fut', 'fut_1_ltp'), ('iv', 'atm_iv'))
for w in windows:
    for prefix, source_col in price_sources:
        # w-row percentage change
        df[f'{prefix}_ret{w}'] = _per_day(source_col, lambda s: s.pct_change(w, fill_method=None))
        # rolling standard deviation over a full w-row window
        df[f'{prefix}_rv{w}'] = _per_day(source_col, lambda s: s.rolling(w).std())
        # rolling mean divided by rolling standard deviation
        df[f'{prefix}_stability_{w}'] = _per_day(
            source_col, lambda s: s.rolling(w).mean() / s.rolling(w).std())
    # change in futures volume versus w rows earlier
    df[f'fut_vol_chg{w}'] = _per_day('fut_1_vol', lambda s: s.diff(w))

# Volume/OI, spread, skew
df['voi_ratio'] = df['fut_1_vol'].div(df['fut_1_oi'])       # futures volume per unit of open interest
df['basis_spread'] = df['fut_1_ltp'].sub(df['spot_ltp'])    # futures price minus spot price
df['iv_skew'] = df['atm_c_iv'].sub(df['atm_p_iv'])          # call IV minus put IV

# Gap features: today's first spot price versus the previous day's last spot price.
daily_spot = df.groupby('date')['spot_ltp']
prev_close = daily_spot.last().shift(1).rename('prev_day_close')
curr_open = daily_spot.first().rename('curr_day_open')
gap_features = pd.concat([prev_close, curr_open], axis=1)

gap_size = gap_features['curr_day_open'] - gap_features['prev_day_close']
gap_features['gap'] = gap_size
# +1 for a gap up, -1 for a gap down, 0 for no gap or an undefined gap.
gap_features['gap_direction'] = np.sign(gap_size).fillna(0).astype(int)
gap_features['gap_pct'] = gap_size / gap_features['prev_day_close']

# Attach the per-day gap values to every intraday row of that day.
df = df.merge(gap_features, left_on='date', right_index=True, how='left')

# ================== TARGET FEATURES ==================
# Compute row-to-row spot returns, restarting at the first row of every day
df['spot_ret'] = df.groupby('date')['spot_ltp'].transform(lambda x: x.pct_change(fill_method=None))

# Re-group after spot_ret addition
grouped = df.groupby('date', group_keys=False)


def _future_rise_label(frame, column, horizon):
    """Label each row 1.0 if `column` is higher `horizon` rows later in the same day, else 0.0.

    The look-ahead is taken per date, so a row is never compared with a value from the
    following day. Rows whose current or future value is missing (including the last
    `horizon` rows of each day) get NaN rather than a misleading 0.
    """
    current = frame[column]
    future = frame.groupby('date')[column].shift(-horizon)
    label = (future > current).astype(float)
    return label.where(future.notna() & current.notna())


# Calculate targets
target_cols = []
for w in [60, 180, 300, 600, 1500, 3600]:
    # Rolling standard deviation of spot returns within the day.
    df[f'realized_vol_{w}'] = grouped['spot_ret'].transform(lambda x: x.rolling(w, min_periods=1).std())
    df[f'realized_vol_target_{w}'] = _future_rise_label(df, f'realized_vol_{w}', w)
    df[f'spot_dir_target_{w}'] = _future_rise_label(df, 'spot_ltp', w)
    df[f'iv_target_{w}'] = _future_rise_label(df, 'atm_iv', w)
    target_cols += [f'realized_vol_target_{w}', f'spot_dir_target_{w}', f'iv_target_{w}']

# Rows without a defined label for every horizon cannot be used for supervised training:
# drop them here, then store the remaining labels as plain 0/1 integers.
df = df.dropna(subset=target_cols).astype({col: int for col in target_cols})

# ================== FINAL CLEANUP & SAVE ==================
# Treat +/-inf as missing, then keep only rows where every column has a value.
df = df.replace([np.inf, -np.inf], np.nan).dropna()

df.to_csv(output_file, index=False)
print(f"✅ All features and targets computed and saved to '{output_file}'")


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): other cells in this notebook read and write files under
# /content/drive/MyDrive, so this cell presumably has to run before them — confirm cell order.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Path to your original file
file_path = '/content/drive/MyDrive/colab/nifty_all_features_with_targets_new.csv'

# Columns to keep, assembled group by group. The resulting order is the column order
# of the filtered output file.
all_windows = (60, 180, 300, 600, 1500, 3600)
window_feature_templates = (
    'spot_ret{w}', 'spot_rv{w}', 'spot_stability_{w}',
    'fut_ret{w}', 'fut_rv{w}', 'fut_stability_{w}',
    'iv_ret{w}', 'iv_rv{w}', 'iv_stability_{w}',
    'fut_vol_chg{w}',
)
window_target_templates = (
    'realized_vol_{w}', 'realized_vol_target_{w}', 'spot_dir_target_{w}', 'iv_target_{w}',
)

columns_to_keep = (
    # identifiers and intraday high/low distances
    ['date', 'time', 'dte', 'curr_day_high_diff', 'curr_day_low_diff']
    # distances from the previous 5/3/1-day highs and lows
    + [f'diff_{n}d_{side}' for n in (5, 3, 1) for side in ('high', 'low')]
    # distances from rolling highs and lows (only the three shortest windows are kept)
    + [f'diff_{side}_{w}s' for w in (60, 180, 300) for side in ('high', 'low')]
    # return / volatility / stability / volume-change features for every window
    + [template.format(w=w) for w in all_windows for template in window_feature_templates]
    # window-independent features
    + ['voi_ratio', 'basis_spread', 'iv_skew',
       'prev_day_close', 'curr_day_open', 'gap', 'gap_direction', 'gap_pct',
       'spot_ret']
    # realized volatility and the three targets for every window
    + [template.format(w=w) for w in all_windows for template in window_target_templates]
)

# Load the CSV
df = pd.read_csv(file_path)

# Keep only the requested columns that actually exist in the file (missing ones are ignored)
present_columns = [col for col in columns_to_keep if col in df.columns]
filtered_df = df[present_columns]

# Save to a new CSV
output_path = '/content/drive/MyDrive/colab/nifty_filtered_features_new.csv'
filtered_df.to_csv(output_path, index=False)

print(f"Filtered CSV saved to: {output_path}")


In [None]:
# Train on ~3 months, test on the following 2 months (each trial group samples 10 random features)

import pandas as pd
import numpy as np
import itertools
import xgboost as xgb
import random
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings("ignore")

# ========== CONFIGURATION ==========
# Input feature table and destination of the per-trial metrics.
DATA_PATH ='/content/drive/MyDrive/Copy of nifty_filtered_features.csv'
OUTPUT_CSV = "/content/drive/MyDrive/Colab Notebooks/rv60_180_300_1d_datedsplit.csv"

# Multi-day look-back feature groups; each experiment uses exactly one of them.
day_groups = {
     '5d': ['diff_5d_high', 'diff_5d_low'],
     '3d': ['diff_3d_high', 'diff_3d_low'],
    '1d': ['diff_1d_high', 'diff_1d_low'],
}

# Intraday rolling-window feature groups; experiments use every non-empty combination.
window_groups = {
    '60': [
        'diff_high_60s', 'diff_low_60s', 'spot_ret60', 'spot_rv60', 'spot_stability_60',
        'fut_ret60', 'fut_rv60', 'fut_stability_60', 'iv_ret60', 'iv_rv60', 'iv_stability_60', 'fut_vol_chg60'
    ],
    '180': [
        'diff_high_180s', 'diff_low_180s', 'spot_ret180', 'spot_rv180', 'spot_stability_180',
        'fut_ret180', 'fut_rv180', 'fut_stability_180', 'iv_ret180', 'iv_rv180', 'iv_stability_180', 'fut_vol_chg180'
    ],
    '300': [
        'diff_high_300s', 'diff_low_300s', 'spot_ret300', 'spot_rv300', 'spot_stability_300',
        'fut_ret300', 'fut_rv300', 'fut_stability_300', 'iv_ret300', 'iv_rv300', 'iv_stability_300', 'fut_vol_chg300'
    ]
}

# Features that are always candidates, whatever day/window group is selected.
other_features = [
    'curr_day_high_diff', 'curr_day_low_diff', 'voi_ratio', 'basis_spread', 'iv_skew',
    'prev_day_close', 'curr_day_open', 'gap', 'gap_direction', 'gap_pct', 'spot_ret'
]

# Binary targets; one model is trained per target.
target_features = [
    'iv_target_60','iv_target_180', 'iv_target_300',
     'realized_vol_target_60', 'realized_vol_target_180', 'realized_vol_target_300',

    'spot_dir_target_60', 'spot_dir_target_180', 'spot_dir_target_300']

# --- Hyperparameter Grid ---
MAX_DEPTHS = [2, 4]
SUBSAMPLES = [0.7]
COLSAMPLE_BYTREE = [0.7]
MIN_CHILD_WEIGHT = [1, 3]
GAMMA = [0.1]
REG_ALPHA = [0.01]
REG_LAMBDA = [0.1]
COLSAMPLE_BYLEVEL = [0.6]
COLSAMPLE_BYNODE = [0.6]
MAX_DELTA_STEP = [1]


def _gpu_params():
    """Return the GPU-training parameters that match the installed XGBoost.

    XGBoost 2.0 introduced `device='cuda'` (combined with `tree_method='hist'`) and
    deprecated `tree_method='gpu_hist'` / `predictor='gpu_predictor'`. Older releases
    only understand the `gpu_hist` spelling, so the choice is made by major version.
    """
    try:
        major = int(xgb.__version__.split('.')[0])
    except (AttributeError, ValueError):
        major = 2  # unparseable version string: assume the current API
    if major >= 2:
        return {'tree_method': 'hist', 'device': 'cuda'}
    return {'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor'}


GPU_PARAMS = _gpu_params()

# Tuned parameter names, in the same order as the value lists passed to product() below.
_TUNED_PARAM_NAMES = (
    'max_depth', 'subsample', 'colsample_bytree', 'min_child_weight', 'gamma',
    'reg_alpha', 'reg_lambda', 'colsample_bylevel', 'colsample_bynode', 'max_delta_step',
)

# One dict per point of the Cartesian product of the value lists, plus the fixed settings.
HYPERPARAM_GRID = [
    {
        **dict(zip(_TUNED_PARAM_NAMES, combo)),
        'booster': 'gbtree',
        **GPU_PARAMS,
        'nthread': 6,
        'verbosity': 2,
        'validate_parameters': False
    }
    for combo in itertools.product(
        MAX_DEPTHS, SUBSAMPLES, COLSAMPLE_BYTREE, MIN_CHILD_WEIGHT, GAMMA,
        REG_ALPHA, REG_LAMBDA, COLSAMPLE_BYLEVEL, COLSAMPLE_BYNODE, MAX_DELTA_STEP,
    )
]

def create_xgb_model(params):
    """Build an `xgb.XGBClassifier`, passing every entry of `params` as a keyword argument."""
    classifier = xgb.XGBClassifier(**params)
    return classifier

def run_modeling(seed=42):
    """Run the random feature-subset / hyperparameter experiments on a fixed date split.

    For every day group, every non-empty combination of window groups and every target,
    three trial groups are run. Each trial group draws up to 10 candidate features at
    random, and each of its three trials draws one hyperparameter set from
    HYPERPARAM_GRID. Models are trained on 2025-01-01..2025-03-28 and scored on
    2025-04-01..2025-05-30. All metrics are written to OUTPUT_CSV at the end.

    Args:
        seed: Seed for the feature / hyperparameter draws, so a run can be repeated
            exactly. Pass None to seed from system entropy instead.

    Returns:
        pandas.DataFrame with one row of settings and metrics per completed trial.
    """
    # Private generator: reproducible draws without touching the global `random` state.
    rng = random.Random(seed)

    df = pd.read_csv(DATA_PATH)
    df['date'] = pd.to_datetime(df['date'])
    # Stable sort: rows keep their file order within each date.
    df = df.sort_values(by='date', kind='stable')

    # --- Custom Date Split ---
    # The split depends on neither target nor features, so it is computed once, not per trial.
    train_base = df.loc[(df['date'] >= '2025-01-01') & (df['date'] <= '2025-03-28')]
    test_base = df.loc[(df['date'] >= '2025-04-01') & (df['date'] <= '2025-05-30')]

    # Per target: boolean row selectors for rows of each split that have a label.
    target_rows = {
        target: (train_base[target].notna(), test_base[target].notna())
        for target in target_features
        if target in df.columns
    }

    window_keys = list(window_groups.keys())
    window_combos = []
    for r in range(1, len(window_keys) + 1):
        window_combos += list(itertools.combinations(window_keys, r))

    results = []

    for day_label, day_features in day_groups.items():
        print(f"\n=== Processing Day Group: {day_label} ===")

        for window_combo in window_combos:
            window_labels = '-'.join(window_combo)
            print(f"  > Window Combination: {window_labels}")

            window_features = []
            for w in window_combo:
                window_features += window_groups[w]

            base_features = day_features + window_features + other_features
            available_features = [f for f in base_features if f in df.columns]
            if not available_features:
                print("⚠️ Skipping: none of the candidate features are present in the data.")
                continue

            for target in target_features:
                print(f"    --- Now modeling TARGET VARIABLE: {target} ---")

                if target not in target_rows:
                    print(f"⚠️ Skipping {target}: column not found in the data.")
                    continue

                train_ok, test_ok = target_rows[target]
                if train_ok.sum() < 100 or test_ok.sum() < 50:
                    print("⚠️ Skipping due to insufficient data in date-based split.")
                    continue

                y_train = train_base.loc[train_ok, target]
                y_test = test_base.loc[test_ok, target]
                train_dates = train_base.loc[train_ok, 'date']
                test_dates = test_base.loc[test_ok, 'date']
                split_bounds = {
                    'train_start': train_dates.min().strftime('%Y-%m-%d'),
                    'train_end': train_dates.max().strftime('%Y-%m-%d'),
                    'test_start': test_dates.min().strftime('%Y-%m-%d'),
                    'test_end': test_dates.max().strftime('%Y-%m-%d'),
                }

                for trial_group in range(3):  # 3 groups of trials
                    if len(available_features) >= 10:
                        sampled_features = rng.sample(available_features, 10)
                    else:
                        sampled_features = available_features

                    # The feature subset is fixed for the whole group, so slice it once.
                    X_train = train_base.loc[train_ok, sampled_features]
                    X_test = test_base.loc[test_ok, sampled_features]

                    for trial in range(3):  # 3 trials per group
                        params = rng.choice(HYPERPARAM_GRID)

                        model = create_xgb_model(params)
                        start_time = time.time()
                        # No eval_set is passed: without early stopping it would only add
                        # per-round evaluation cost on the test split.
                        model.fit(X_train, y_train)
                        elapsed = time.time() - start_time
                        y_pred = model.predict(X_test)

                        metrics = {
                            'day_group': day_label,
                            'window_combo': window_labels,
                            'target': target,
                            'trial_group': trial_group + 1,
                            'trial_in_group': trial + 1,
                            'features_used': ', '.join(sampled_features),
                            **split_bounds,
                            'max_depth': params['max_depth'],
                            'subsample': params['subsample'],
                            'colsample_bytree': params['colsample_bytree'],
                            'min_child_weight': params['min_child_weight'],
                            'gamma': params['gamma'],
                            'reg_alpha': params['reg_alpha'],
                            'reg_lambda': params['reg_lambda'],
                            'colsample_bylevel': params['colsample_bylevel'],
                            'colsample_bynode': params['colsample_bynode'],
                            'max_delta_step': params['max_delta_step'],
                            'accuracy': accuracy_score(y_test, y_pred),
                            'precision': precision_score(y_test, y_pred, zero_division=0),
                            'recall': recall_score(y_test, y_pred, zero_division=0),
                            'f1': f1_score(y_test, y_pred, zero_division=0),
                            'time_sec': elapsed
                        }
                        results.append(metrics)

                        print(f"      ✅ Trial {trial+1}/3 | Acc: {metrics['accuracy']:.3f} | F1: {metrics['f1']:.3f} | Time: {elapsed:.2f}s")

    results_df = pd.DataFrame(results)
    results_df.to_csv(OUTPUT_CSV, index=False)
    print(f"\n🎯 All experiments complete! Results saved to {OUTPUT_CSV}")
    return results_df

if __name__ == "__main__":
    final_results = run_modeling()


In [None]:
# Sliding-window time-series cross-validation (2 months of training, 1 month of testing per fold)


import pandas as pd
import numpy as np
import xgboost as xgb
import time
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings("ignore")

# ================== CONFIGURATION ==================
DATA_PATH = '/content/drive/MyDrive/Copy of nifty_filtered_features.csv'
OUTPUT_CSV = "nifty_date_based_folds.csv"

# Model inputs, identical for every target and fold.
INPUT_FEATURES = [
    'voi_ratio',
    'iv_skew',
    'gap',
    'curr_day_low_diff',
    'gap_direction',
    'diff_low_180s',
    'spot_ret300',
]

# Three target families, each at the 60 / 180 / 300 horizons.
TARGETS = [
    f'{family}_{horizon}'
    for family in ('iv_target', 'realized_vol_target', 'spot_dir_target')
    for horizon in (60, 180, 300)
]

# One fixed XGBoost configuration shared by every model in this cell.
FIXED_PARAMS = dict(
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=6,
    gamma=0.2,
    reg_alpha=0.1,
    reg_lambda=0.3,
    colsample_bylevel=0.7,
    colsample_bynode=0.8,
    max_delta_step=2,
    booster='gbtree',
    nthread=6,
    verbosity=2,
    validate_parameters=False,
    learning_rate=0.8,
    n_estimators=700,
)

# Fold info: (train_start, train_end, test_start, test_end); all bounds are inclusive.
FOLDS = [
    ("2025-01-01", "2025-02-28", "2025-03-03", "2025-03-28"),  # K1
    ("2025-02-03", "2025-03-28", "2025-04-01", "2025-04-30"),  # K2
    ("2025-03-03", "2025-04-30", "2025-05-01", "2025-05-30"),  # K3
]

def run_modeling():
    """Train and score one XGBoost classifier for every (target, fold) pair.

    Each fold's train/test windows come from FOLDS. Every model uses INPUT_FEATURES and
    FIXED_PARAMS. One result row is appended to OUTPUT_CSV as soon as a fold finishes;
    the file is created with a header row only if it does not exist yet.

    Returns:
        pandas.DataFrame holding the result rows produced by this call.
    """
    data = pd.read_csv(DATA_PATH)
    data['date'] = pd.to_datetime(data['date'])  # datetime dtype, needed for the range filters
    collected = []

    # First run only: create the results file containing just the header.
    if not os.path.exists(OUTPUT_CSV):
        header = (
            ['target', 'fold', 'train_start', 'train_end', 'test_start', 'test_end',
             'train_rows', 'test_rows', 'features_used']
            + list(FIXED_PARAMS.keys())
            + ['accuracy', 'precision', 'recall', 'f1', 'training_time_sec']
        )
        pd.DataFrame(columns=header).to_csv(OUTPUT_CSV, index=False)

    run_started = time.time()

    for target in TARGETS:
        print(f"\n🔍 Processing target: {target}")
        labelled = data.dropna(subset=[target]).copy()

        if len(labelled) < 100:
            print(f"⚠️ Skipping {target}: Not enough data")
            continue

        for fold_num, fold in enumerate(FOLDS, start=1):
            train_start, train_end, test_start, test_end = fold

            # Inclusive date windows for this fold.
            dates = labelled['date']
            train_part = labelled.loc[(dates >= train_start) & (dates <= train_end)]
            test_part = labelled.loc[(dates >= test_start) & (dates <= test_end)]

            if min(len(train_part), len(test_part)) < 50:
                print(f"⚠️ Fold K{fold_num} skipped due to insufficient data")
                continue

            X_train, y_train = train_part[INPUT_FEATURES], train_part[target]
            X_test, y_test = test_part[INPUT_FEATURES], test_part[target]

            # Fit; the timer covers model construction plus training.
            fit_started = time.time()
            model = xgb.XGBClassifier(**FIXED_PARAMS)
            model.fit(X_train, y_train)
            training_time = time.time() - fit_started

            # Score on the held-out window.
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, zero_division=0)
            recall = recall_score(y_test, y_pred, zero_division=0)
            f1 = f1_score(y_test, y_pred, zero_division=0)

            # Key order matches the header written above.
            result = {
                'target': target,
                'fold': f"K{fold_num}",
                'train_start': train_start,
                'train_end': train_end,
                'test_start': test_start,
                'test_end': test_end,
                'train_rows': len(y_train),
                'test_rows': len(y_test),
                'features_used': ', '.join(INPUT_FEATURES),
            }
            result.update(FIXED_PARAMS)
            result.update(
                accuracy=accuracy,
                precision=precision,
                recall=recall,
                f1=f1,
                training_time_sec=training_time,
            )

            print(f"\n✅ Fold K{fold_num} for {target}:")
            print(f"Train: {train_start} → {train_end} ({len(y_train)} rows)")
            print(f"Test:  {test_start} → {test_end} ({len(y_test)} rows)")
            print(f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")
            print("------------")

            # Append immediately so finished folds are on disk even if a later one fails.
            pd.DataFrame([result]).to_csv(OUTPUT_CSV, mode='a', header=False, index=False)
            collected.append(result)

    total_time = time.time() - run_started
    print(f"\n🎯 All folds processed in {total_time/60:.1f} minutes")
    print(f"📁 Results saved to: {OUTPUT_CSV}")
    return pd.DataFrame(collected)

if __name__ == "__main__":
    final_results = run_modeling()





In [None]:
import pandas as pd
import numpy as np
import time
import warnings
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
warnings.filterwarnings("ignore")

# ================== CONFIG ==================
DATA_PATH = '/content/drive/MyDrive/Copy of nifty_filtered_features.csv'
TARGET = 'realized_vol_target_300'  # you can loop over multiple targets
DATE_COL = 'date'

# BEST features (optimized)
FEATURES = [
    'voi_ratio',
    'iv_skew',
    'gap_pct',
    'curr_day_high_diff',
    'curr_day_low_diff',
    'diff_low_180s',
    'diff_high_180s',
    'spot_ret300',
    'spot_rv180',
    'iv_ret60',
    'iv_rv180',
    'iv_stability_180',
    'fut_vol_chg180',
    'fut_ret180',
    'spot_stability_180',
]

# ================== PIPELINE START ==================
def load_data():
    """Read DATA_PATH, parse DATE_COL as datetimes and drop rows that have no TARGET value."""
    frame = pd.read_csv(DATA_PATH)
    frame[DATE_COL] = pd.to_datetime(frame[DATE_COL])
    return frame.dropna(subset=[TARGET])

def preprocess(df, train_start, train_end, test_start, test_end,
               train_n=1000000, test_n=250000, random_state=42):
    """Split `df` by date and draw a random sample from each part.

    Args:
        df: Frame containing DATE_COL, TARGET and every column listed in FEATURES.
        train_start, train_end: Inclusive bounds of the training window.
        test_start, test_end: Inclusive bounds of the test window.
        train_n: Maximum number of training rows to sample.
        test_n: Maximum number of test rows to sample.
        random_state: Seed used for both draws.

    Returns:
        Tuple (X_train, y_train, X_test, y_test).

    Raises:
        ValueError: If either date window contains no rows.
    """
    # Filter by date
    train_pool = df[(df[DATE_COL] >= train_start) & (df[DATE_COL] <= train_end)]
    test_pool = df[(df[DATE_COL] >= test_start) & (df[DATE_COL] <= test_end)]

    if train_pool.empty or test_pool.empty:
        raise ValueError(
            f"No rows in date window: train {train_start}..{train_end} has {len(train_pool)}, "
            f"test {test_start}..{test_end} has {len(test_pool)}"
        )

    # Sampling without replacement cannot return more rows than exist, so cap the request
    # at the pool size; a smaller-than-requested window is then used in full.
    train_df = train_pool.sample(n=min(train_n, len(train_pool)), random_state=random_state)
    test_df = test_pool.sample(n=min(test_n, len(test_pool)), random_state=random_state)

    # Features and target
    X_train = train_df[FEATURES]
    y_train = train_df[TARGET]
    X_test = test_df[FEATURES]
    y_test = test_df[TARGET]

    return X_train, y_train, X_test, y_test

def build_pipeline():
    """Build the modelling pipeline: scaling, pairwise interaction terms, then a soft-voting
    ensemble of an XGBoost and a LightGBM classifier.

    Returns:
        An unfitted sklearn Pipeline with steps 'scale', 'poly' and 'model'.
    """
    scaler = StandardScaler()
    # degree=2 with interaction_only=True: original features plus all pairwise products,
    # with no squared terms and no constant column.
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

    # `use_label_encoder` is not passed: XGBoost deprecated the argument and later removed it.
    xgb_model = XGBClassifier(
        n_estimators=1500,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.85,
        colsample_bytree=0.8,
        reg_alpha=0.05,
        reg_lambda=1.0,
        gamma=0.2,
        eval_metric='logloss',
        n_jobs=-1
    )

    lgb_model = LGBMClassifier(
        n_estimators=1500,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.85,
        # LightGBM applies row subsampling only when the bagging frequency is non-zero;
        # with the default of 0 the `subsample` setting above is ignored.
        subsample_freq=1,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=1.0,
        class_weight='balanced',
        n_jobs=-1
    )

    # Combine into soft voting ensemble (averages the two models' predicted probabilities)
    ensemble = VotingClassifier(
        estimators=[('xgb', xgb_model), ('lgb', lgb_model)],
        voting='soft',
        n_jobs=-1
    )

    # Full pipeline
    pipeline = Pipeline([
        ('scale', scaler),
        ('poly', poly),
        ('model', ensemble)
    ])

    return pipeline

def evaluate(y_true, y_pred):
    """Score predictions against the true labels.

    Returns:
        Tuple (accuracy, precision, recall, f1). Precision, recall and F1 are computed
        with zero_division=0.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1 = (
        scorer(y_true, y_pred, zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )
    return accuracy, precision, recall, f1

def run_final():
    """Train the ensemble on 2025-01-01..2025-04-30 and score it on 2025-05-01..2025-05-30.

    Returns:
        Tuple (accuracy, precision, recall, f1) measured on the test sample.
    """
    data = load_data()
    split = preprocess(data, "2025-01-01", "2025-04-30", "2025-05-01", "2025-05-30")
    X_train, y_train, X_test, y_test = split

    model = build_pipeline()

    print("🚀 Training ensemble model...")
    fit_started = time.time()
    model.fit(X_train, y_train)
    print(f"✅ Model trained in {(time.time() - fit_started):.1f} seconds")

    print("📊 Evaluating...")
    predictions = model.predict(X_test)
    acc, prec, rec, f1 = evaluate(y_test, predictions)

    print(f"\n🎯 Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}")
    return acc, prec, rec, f1

# ================== EXECUTE ==================
if __name__ == "__main__":
    run_final()


In [None]:
import pandas as pd
# Load the raw spot/futures CSV and display its column names as the cell output.
df=pd.read_csv("/content/drive/MyDrive/colab/Copy of nifty_spot_fut_data.csv")
df.columns

Index(['date', 'time', 'dte', 'spot_ltp', 'fut_1_ltp', 'fut_1_vol', 'fut_1_oi',
       'atm_c_ltp', 'atm_c_iv', 'atm_p_ltp', 'atm_p_iv'],
      dtype='object')