In [1]:
import pandas as pd
import numpy as np
import pickle
from xgboost import XGBRegressor
from tqdm import tqdm

# --- Load and Preprocess Data (DO NOT MODIFY AFTER THIS) ---
ndf = pd.read_csv("../Downloads/st_exportd.csv").iloc[:,1:]
for i in range(len(ndf)):
    val = ndf.iloc[i,-3]
    if isinstance(val, str):
        ndf.iloc[i,-3] = val.split("GMT")[0].strip()
    else:
        ndf.iloc[i,-3] = np.nan
ndf['timestamp'] = pd.to_datetime(ndf['timestamp'], errors='coerce')
ndf = ndf.sort_values('timestamp').reset_index(drop=True)

In [2]:
import datetime

In [3]:
# --- Display Results (FULL OUTPUT) ---
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Auto-detect terminal width
pd.set_option('display.max_colwidth', None)  # Show full content of each column

In [4]:
len(ndf)

130095

In [6]:
ndf[ndf["timestamp"]>"2025-12-14 23:50:50"].head(20)

Unnamed: 0,klineacc,spread,spreadper,x,vwap,deviation,ratio,term,sigma,e,h,asset,timestamp,gap,gaplimit
128265,52076180,-3233650.0,-140.252884,-0.788344,412.997925,-0.025685,2.402529,2.28836,0.024213,-5.832067,-0.999983,ZEC,2025-12-14 23:54:43,1m,300
128266,14879683,-591209.2,-20.994366,-0.812794,1.607381,-0.012867,1.209944,0.64774,0.008788,-1.147488,-0.81692,SUI,2025-12-14 23:54:43,1m,300
128267,15534632,-3552608.0,-56.770786,-0.684814,1.999079,-0.002891,1.567708,0.335241,0.002365,-0.642537,-0.566625,XRP,2025-12-14 23:54:43,1m,300
128268,16561084,-870372.1,-16.734903,-0.798046,889.205933,-0.003223,1.167349,0.376235,0.002979,-0.475105,-0.442314,BNB,2025-12-14 23:54:43,1m,300
128269,26446768,-9036728.0,-45.395485,-0.643425,131.196747,-0.00371,1.453955,0.224308,0.002828,-0.427826,-0.403503,SOL,2025-12-14 23:54:43,1m,300
128270,144317952,259221.3,1.801218,0.90013,89253.21875,-0.002929,0.981988,0.243918,0.002616,-0.268134,-0.261888,BTC,2025-12-14 23:54:43,1m,300
128271,126314528,-460486.9,-5.545259,-0.887925,3086.93457,-0.001362,1.055453,0.192757,0.003132,-0.088465,-0.088235,ETH,2025-12-14 23:54:43,1m,300
128272,124928640,563754.1,6.76299,0.883447,3087.11499,-0.000666,0.93237,0.184973,0.003019,-0.03803,-0.038011,ETH,2025-12-15 00:02:08,3m,100
128273,143547616,1572963.0,10.213843,0.8612,89242.09375,-0.002282,0.897862,0.242821,0.002472,-0.201254,-0.198581,BTC,2025-12-15 00:02:08,3m,100
128274,16258513,-1193558.0,-24.061993,-0.777867,889.1922,-0.002555,1.24062,0.333172,0.002894,-0.364921,-0.349541,BNB,2025-12-15 00:02:08,3m,100


# predict h

# predict e

In [7]:
import pandas as pd
import numpy as np
import pickle
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm

# --- Split Data (DO NOT MODIFY AFTER THIS) ---
train_df = ndf.iloc[:128272, :].copy()
actual_x = ndf.iloc[128272:, :].copy()

# --- Define Gap Groups and Target ---
short_gaps = ['1m', '3m', '5m', '15m', '30m', '1h']
long_gaps = ['2h', '4h', '6h', '8h', '12h']
target = 'e'  # Predict 'e' instead of 'h'
features = ['klineacc', 'spread', 'vwap', 'deviation', 'ratio', 'term', 'sigma']

# --- Helper Functions ---
def get_last_train_timestamp(df, asset, gap):
    """Return the last timestamp for a given asset and gap in train_df."""
    subset = df[(df['asset'] == asset) & (df['gap'] == gap)]
    return subset['timestamp'].max()

def shortlist_actual_x(actual_x, asset, gap, end_duration):
    """Shortlist actual_x entries for evaluation."""
    mask = (
        (actual_x['asset'] == asset) &
        (actual_x['gap'] == gap) &
        (actual_x['timestamp'] <= end_duration)
    )
    return actual_x[mask]

# --- Training ---
models = {}
unique_assets = set()
for key in train_df['asset'].unique():
    unique_assets.add(key)

for asset in tqdm(unique_assets, desc="Training assets"):
    for gap in short_gaps + long_gaps:
        # Filter train_df for this asset and gap
        train_subset = train_df[(train_df['asset'] == asset) & (train_df['gap'] == gap)]
        if len(train_subset) == 0:
            continue

        # Prepare data
        X_train = train_subset[features]
        y_train = train_subset[target]

        # Train XGBoost model
        model = XGBRegressor(
            objective='reg:squarederror',
            n_estimators=200,
            max_depth=6,
            learning_rate=0.05,
            reg_lambda=1,  # L2 regularization to prevent overfitting
            random_state=42
        )
        model.fit(X_train, y_train)

        # Save model
        model_key = f"{asset}_{gap}"
        models[model_key] = model

# --- Save All Models to Disk ---
with open('trained_models_e.pkl', 'wb') as f:
    pickle.dump(models, f)

print("Training complete. Models saved to 'trained_models_e.pkl'.")

Training assets: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 74/74 [01:04<00:00,  1.14it/s]


Training complete. Models saved to 'trained_models_e.pkl'.


In [8]:
# --- Evaluation ---
results = []
predicted_results = []  # Store predicted vs actual 'e' and 'h'

# Load models
with open('trained_models_e.pkl', 'rb') as f:
    models = pickle.load(f)

print(unique_assets,len(unique_assets))

for asset in tqdm(unique_assets, desc="Evaluating assets"):
    for gap in short_gaps + long_gaps:
        model_key = f"{asset}_{gap}"
        if model_key not in models:
            continue

        # Get last train timestamp
        last_train_ts = get_last_train_timestamp(train_df, asset, gap)
        if pd.isna(last_train_ts):
            continue

        # Calculate end_duration
        if gap in short_gaps:
            duration_minutes = 300
        else:
            duration_minutes = 7200
        end_duration = last_train_ts + pd.Timedelta(minutes=duration_minutes)

        # Shortlist actual_x
        eval_subset = shortlist_actual_x(actual_x, asset, gap, end_duration)
        if len(eval_subset) == 0:
            continue

        # Prepare data
        X_eval = eval_subset[features]
        y_eval_e = eval_subset[target]  # Actual 'e'
        y_eval_h = eval_subset['h']    # Actual 'h' (for directional accuracy)

        # Predict 'e'
        model = models[model_key]
        y_pred_e = model.predict(X_eval)
        y_pred_h = np.tanh(y_pred_e)    # Compute 'h' from predicted 'e'

        # Calculate metrics
        mae_e = mean_absolute_error(y_eval_e, y_pred_e)
        mae_h = mean_absolute_error(y_eval_h, y_pred_h)
        directional_acc = np.mean((np.sign(y_eval_h) == np.sign(y_pred_h)).astype(float))

        # Store evaluation results
        results.append({
            'asset': asset,
            'gap': gap,
            'last_train_timestamp': last_train_ts,
            'end_duration': end_duration,
            'n_eval_samples': len(eval_subset),
            'MAE_e': mae_e,
            'MAE_h': mae_h,
            'Directional_Accuracy': directional_acc
        })

        # Store predicted vs actual 'e' and 'h'
        eval_subset = eval_subset.reset_index(drop=True)
        for idx in range(len(eval_subset)):
            predicted_results.append({
                'asset': asset,
                'gap': gap,
                'timestamp': eval_subset.loc[idx, 'timestamp'],
                'actual_e': eval_subset.loc[idx, target],
                'predicted_e': y_pred_e[idx],
                'actual_h': eval_subset.loc[idx, 'h'],
                'predicted_h': y_pred_h[idx],
                'absolute_error_e': abs(eval_subset.loc[idx, target] - y_pred_e[idx]),
                'absolute_error_h': abs(eval_subset.loc[idx, 'h'] - y_pred_h[idx]),
                'last_train_timestamp': last_train_ts
            })

# --- Save Results ---
results_df = pd.DataFrame(results)
predicted_df = pd.DataFrame(predicted_results)

# Find the entry with the biggest error for 'e' and 'h'
biggest_error_e_df = predicted_df.loc[predicted_df.groupby(['asset', 'gap'])['absolute_error_e'].idxmax()]
biggest_error_h_df = predicted_df.loc[predicted_df.groupby(['asset', 'gap'])['absolute_error_h'].idxmax()]

# Save to CSV
results_df.to_csv('evaluation_results_e.csv', index=False)
predicted_df.to_csv('predicted_vs_actual_e_h.csv', index=False)
biggest_error_e_df.to_csv('biggest_error_e_entries.csv', index=False)
biggest_error_h_df.to_csv('biggest_error_h_entries.csv', index=False)

print("Evaluation complete. Results saved to:")
print("- evaluation_results_e.csv (summary)")
print("- predicted_vs_actual_e_h.csv (all predicted vs actual 'e' and 'h' values)")
print("- biggest_error_e_entries.csv (entries with the biggest error for 'e')")
print("- biggest_error_h_entries.csv (entries with the biggest error for 'h')")

{'BERA', 'NMR', 'FIL', 'KSM', 'LINK', 'BCH', 'TRUMP', 'BTC', 'SOL', 'CAKE', 'PYR', 'ATOM', 'VANA', 'TRB', 'MORPHO', 'WBTC', 'PAXG', 'OG', 'AAVE', 'ASR', 'ETH', 'MMT', 'EUL', 'XNO', 'VIRTUAL', 'AR', '0G', 'ENS', 'COMP', 'ZEC', 'PENDLE', 'APT', 'ENSO', 'UNI', 'WBETH', 'ORDI', 'ZRO', 'BNB', 'LTC', 'NEAR', 'TON', 'ZEN', 'BNSOL', 'BANANA', 'LPT', 'NEO', 'ORCA', 'SSV', 'FARM', 'RENDER', 'TAO', 'RAY', 'AXS', 'ETC', 'XRP', 'QNT', 'ICP', 'EGLD', 'SANTOS', 'MLN', 'INJ', 'ASTER', 'GIGGLE', 'METIS', 'ILV', 'DOT', 'AVAX', 'ALCX', 'AUCTION', 'DCR', 'TWT', 'MOVR', 'DASH', 'SUI'} 74


Evaluating assets: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 74/74 [00:11<00:00,  6.69it/s]

Evaluation complete. Results saved to:
- evaluation_results_e.csv (summary)
- predicted_vs_actual_e_h.csv (all predicted vs actual 'e' and 'h' values)
- biggest_error_e_entries.csv (entries with the biggest error for 'e')
- biggest_error_h_entries.csv (entries with the biggest error for 'h')





In [9]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm

# --- Load and Preprocess Data (DO NOT MODIFY) ---
# (Assume ndf, train_df, actual_x are already loaded and preprocessed)

# --- Load Trained Models ---
with open('trained_models_e.pkl', 'rb') as f:
    models = pickle.load(f)

# --- Define Gap Groups and Target ---
short_gaps = ['1m', '3m', '5m', '15m', '30m', '1h']
long_gaps = ['2h', '4h', '6h', '8h', '12h']
target = 'e'  # Predict 'e' and compute 'h'
features = ['klineacc', 'spread', 'vwap', 'deviation', 'ratio', 'term', 'sigma']

# --- Helper Functions ---
def get_last_train_timestamp(df, asset, gap):
    """Return the last timestamp for a given asset and gap in train_df."""
    subset = df[(df['asset'] == asset) & (df['gap'] == gap)]
    return subset['timestamp'].max()

def shortlist_actual_x(actual_x, asset, gap, end_duration):
    """Shortlist actual_x entries for evaluation."""
    mask = (
        (actual_x['asset'] == asset) &
        (actual_x['gap'] == gap) &
        (actual_x['timestamp'] <= end_duration)
    )
    return actual_x[mask]

# --- Evaluation ---
results = []
predicted_results = []  # Store ALL predicted vs actual 'e' and 'h' for EVERY ROW

# Extract unique assets from model keys
unique_assets = set()
for key in models.keys():
    asset = key.split('_')[0]
    unique_assets.add(asset)

for asset in tqdm(unique_assets, desc="Evaluating assets"):
    for gap in short_gaps + long_gaps:
        model_key = f"{asset}_{gap}"
        if model_key not in models:
            continue

        # Get last train timestamp
        last_train_ts = get_last_train_timestamp(train_df, asset, gap)
        if pd.isna(last_train_ts):
            continue

        # Calculate end_duration
        if gap in short_gaps:
            duration_minutes = 300
        else:
            duration_minutes = 7200
        end_duration = last_train_ts + pd.Timedelta(minutes=duration_minutes)

        # Shortlist actual_x
        eval_subset = shortlist_actual_x(actual_x, asset, gap, end_duration)
        if len(eval_subset) == 0:
            continue

        # Prepare data
        X_eval = eval_subset[features]
        y_eval_e = eval_subset[target]  # Actual 'e'
        y_eval_h = eval_subset['h']    # Actual 'h'

        # Predict 'e' and compute 'h'
        model = models[model_key]
        y_pred_e = model.predict(X_eval)
        y_pred_h = np.tanh(y_pred_e)  # Compute 'h' from predicted 'e'

        # Calculate metrics (for summary)
        mae_e = mean_absolute_error(y_eval_e, y_pred_e)
        mae_h = mean_absolute_error(y_eval_h, y_pred_h)
        directional_acc = np.mean((np.sign(y_eval_h) == np.sign(y_pred_h)).astype(float))

        # Store evaluation results (summary)
        results.append({
            'asset': asset,
            'gap': gap,
            'last_train_timestamp': last_train_ts,
            'end_duration': end_duration,
            'n_eval_samples': len(eval_subset),
            'MAE_e': mae_e,
            'MAE_h': mae_h,
            'Directional_Accuracy': directional_acc
        })

        # Store predicted vs actual 'e' and 'h' for EVERY ROW in eval_subset
        eval_subset = eval_subset.reset_index(drop=True)
        for idx in range(len(eval_subset)):
            predicted_results.append({
                'asset': asset,
                'gap': gap,
                'timestamp': eval_subset.loc[idx, 'timestamp'],
                'actual_e': eval_subset.loc[idx, target],
                'predicted_e': y_pred_e[idx],
                'actual_h': eval_subset.loc[idx, 'h'],
                'predicted_h': y_pred_h[idx],
                'absolute_error_e': abs(eval_subset.loc[idx, target] - y_pred_e[idx]),
                'absolute_error_h': abs(eval_subset.loc[idx, 'h'] - y_pred_h[idx]),
                'last_train_timestamp': last_train_ts
            })

# --- Save Results ---
results_df = pd.DataFrame(results)
predicted_df = pd.DataFrame(predicted_results)

# Save to CSV
results_df.to_csv('evaluation_results_e.csv', index=False)
predicted_df.to_csv('predicted_vs_actual_e_h_detailed.csv', index=False)  # ALL ROWS

print("Evaluation complete. Results saved to:")
print("- evaluation_results_e.csv (summary metrics)")
print("- predicted_vs_actual_e_h_detailed.csv (EVERY predicted vs actual 'e' and 'h' for all rows)")

Evaluating assets: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 74/74 [00:10<00:00,  6.74it/s]

Evaluation complete. Results saved to:
- evaluation_results_e.csv (summary metrics)
- predicted_vs_actual_e_h_detailed.csv (EVERY predicted vs actual 'e' and 'h' for all rows)





In [10]:
f=['BTC','ETH','SOL','ZEC','BNB', 'XRP','PAXG','BCH','PENDLE','AAVE','TAO']

In [11]:
results_df.sort_values(['asset', 'gap'])[results_df["asset"].isin(f)]

  results_df.sort_values(['asset', 'gap'])[results_df["asset"].isin(f)]


Unnamed: 0,asset,gap,last_train_timestamp,end_duration,n_eval_samples,MAE_e,MAE_h,Directional_Accuracy
51,AAVE,2h,2025-12-14 18:56:31,2025-12-19 18:56:31,4,0.55495,0.147817,1.0
52,AAVE,4h,2025-12-14 17:44:31,2025-12-19 17:44:31,2,0.035361,0.028291,1.0
53,AAVE,6h,2025-12-14 17:18:18,2025-12-19 17:18:18,1,0.321493,0.300182,1.0
54,AAVE,8h,2025-12-14 17:04:00,2025-12-19 17:04:00,1,0.025446,0.025356,1.0
8,BCH,2h,2025-12-14 18:56:31,2025-12-19 18:56:31,4,0.011327,0.011264,1.0
9,BCH,4h,2025-12-14 17:44:31,2025-12-19 17:44:31,2,0.029963,0.029616,1.0
10,BCH,6h,2025-12-14 17:18:18,2025-12-19 17:18:18,1,0.055309,0.054817,1.0
11,BCH,8h,2025-12-14 17:04:00,2025-12-19 17:04:00,1,0.087368,0.086373,1.0
97,BNB,15m,2025-12-14 22:49:12,2025-12-15 03:49:12,2,0.239263,0.072689,1.0
99,BNB,1h,2025-12-14 21:21:46,2025-12-15 02:21:46,1,0.349402,0.031387,1.0


In [12]:
predicted_df.sort_values(['asset', 'gap'])[predicted_df["asset"].isin(f)]

  predicted_df.sort_values(['asset', 'gap'])[predicted_df["asset"].isin(f)]


Unnamed: 0,asset,gap,timestamp,actual_e,predicted_e,actual_h,predicted_h,absolute_error_e,absolute_error_h,last_train_timestamp
189,AAVE,2h,2025-12-15 01:00:15,-0.730798,-1.961099,-0.623553,-0.961174,1.230301,0.33762,2025-12-14 18:56:31
190,AAVE,2h,2025-12-15 07:04:07,-1.167457,-2.042664,-0.823455,-0.966921,0.875207,0.143466,2025-12-14 18:56:31
191,AAVE,2h,2025-12-15 13:07:59,-0.26891,-0.199312,-0.26261,-0.196714,0.069598,0.065896,2025-12-14 18:56:31
192,AAVE,2h,2025-12-15 19:11:30,-0.072715,-0.117409,-0.072587,-0.116872,0.044694,0.044285,2025-12-14 18:56:31
193,AAVE,4h,2025-12-15 05:48:25,-1.058845,-1.081123,-0.785221,-0.793615,0.022278,0.008394,2025-12-14 17:44:31
194,AAVE,4h,2025-12-15 17:52:27,-0.095575,-0.047132,-0.095285,-0.047097,0.048444,0.048189,2025-12-14 17:44:31
195,AAVE,6h,2025-12-15 11:22:10,-0.410102,-0.088609,-0.388559,-0.088378,0.321493,0.300182,2025-12-14 17:18:18
196,AAVE,8h,2025-12-15 17:08:01,-0.04656,-0.072006,-0.046526,-0.071882,0.025446,0.025356,2025-12-14 17:04:00
16,BCH,2h,2025-12-15 01:00:15,-0.030119,-0.030026,-0.03011,-0.030017,9.3e-05,9.3e-05,2025-12-14 18:56:31
17,BCH,2h,2025-12-15 07:04:07,-0.139323,-0.131648,-0.138429,-0.130893,0.007675,0.007536,2025-12-14 18:56:31


In [13]:
biggest_error_e_df.sort_values(['asset', 'gap'])[biggest_error_e_df["asset"].isin(f)]

Unnamed: 0,asset,gap,timestamp,actual_e,predicted_e,actual_h,predicted_h,absolute_error_e,absolute_error_h,last_train_timestamp
189,AAVE,2h,2025-12-15 01:00:15,-0.730798,-1.961099,-0.623553,-0.961174,1.230301,0.33762,2025-12-14 18:56:31
194,AAVE,4h,2025-12-15 17:52:27,-0.095575,-0.047132,-0.095285,-0.047097,0.048444,0.048189,2025-12-14 17:44:31
195,AAVE,6h,2025-12-15 11:22:10,-0.410102,-0.088609,-0.388559,-0.088378,0.321493,0.300182,2025-12-14 17:18:18
196,AAVE,8h,2025-12-15 17:08:01,-0.04656,-0.072006,-0.046526,-0.071882,0.025446,0.025356,2025-12-14 17:04:00
19,BCH,2h,2025-12-15 19:11:30,-0.072754,-0.046614,-0.072626,-0.04658,0.026141,0.026046,2025-12-14 18:56:31
20,BCH,4h,2025-12-15 05:48:25,-0.139463,-0.091148,-0.138566,-0.090897,0.048314,0.047669,2025-12-14 17:44:31
22,BCH,6h,2025-12-15 11:22:10,-0.065651,-0.12096,-0.065557,-0.120373,0.055309,0.054817,2025-12-14 17:18:18
23,BCH,8h,2025-12-15 17:08:01,-0.060504,-0.147871,-0.06043,-0.146803,0.087368,0.086373,2025-12-14 17:04:00
414,BNB,15m,2025-12-15 03:26:49,-1.12836,-1.563635,-0.810457,-0.916007,0.435275,0.10555,2025-12-14 22:49:12
416,BNB,1h,2025-12-15 00:25:49,-1.709303,-2.058705,-0.936562,-0.967949,0.349402,0.031387,2025-12-14 21:21:46


In [14]:
biggest_error_h_df.sort_values(['absolute_error_h'])[biggest_error_h_df["asset"].isin(f)]

  biggest_error_h_df.sort_values(['absolute_error_h'])[biggest_error_h_df["asset"].isin(f)]


Unnamed: 0,asset,gap,timestamp,actual_e,predicted_e,actual_h,predicted_h,absolute_error_e,absolute_error_h,last_train_timestamp
85,BTC,30m,2025-12-15 00:25:44,-0.156703,-0.151186,-0.155432,-0.150044,0.005517,0.005388,2025-12-14 21:21:44
86,BTC,1h,2025-12-15 00:25:49,-0.507384,-0.483445,-0.467905,-0.448998,0.023939,0.018906,2025-12-14 21:21:46
415,BNB,30m,2025-12-15 00:25:44,-0.36242,-0.385164,-0.347344,-0.367184,0.022744,0.01984,2025-12-14 21:21:44
423,BNB,6h,2025-12-15 11:22:10,0.2105,0.187422,0.207445,0.185258,0.023078,0.022187,2025-12-14 17:18:18
196,AAVE,8h,2025-12-15 17:08:01,-0.04656,-0.072006,-0.046526,-0.071882,0.025446,0.025356,2025-12-14 17:04:00
19,BCH,2h,2025-12-15 19:11:30,-0.072754,-0.046614,-0.072626,-0.04658,0.026141,0.026046,2025-12-14 18:56:31
91,BTC,4h,2025-12-15 05:48:25,-0.293999,-0.323664,-0.285811,-0.312816,0.029665,0.027005,2025-12-14 17:44:31
526,XRP,8h,2025-12-15 17:08:01,-0.138583,-0.108035,-0.137703,-0.107617,0.030548,0.030086,2025-12-14 17:04:00
416,BNB,1h,2025-12-15 00:25:49,-1.709303,-2.058705,-0.936562,-0.967949,0.349402,0.031387,2025-12-14 21:21:46
424,BNB,8h,2025-12-15 17:08:01,0.146489,0.178929,0.14545,0.177044,0.03244,0.031594,2025-12-14 17:04:00
