In [None]:
from datetime import datetime
import warnings

import lightgbm as lgb
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split,TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

warnings.filterwarnings("ignore")
%matplotlib inline
# Mode flag: True to train; make False for prediction.
# NOTE(review): the name `train` is rebound by a function definition later in
# this file, so this flag does not appear to be read anywhere in view.
train= True      # make False for prediction

def clean(df):
    """Return a cleaned, timestamp-indexed copy of an order-book frame.

    Steps, in order:
      1. parse the ``timestamp`` column to datetimes and make it the index;
      2. drop every row that contains a missing value;
      3. keep only rows whose ``mid_price`` lies between the 2nd and 98th
         percentile (inclusive) of the remaining data.

    The input frame is not modified; all work happens on a copy, so the
    caller's ``timestamp`` column keeps its original dtype.

    Args:
        df: DataFrame with at least ``timestamp`` and ``mid_price`` columns.

    Returns:
        A new DataFrame indexed by the parsed ``timestamp``.

    Raises:
        KeyError: if ``timestamp`` or ``mid_price`` is missing.
    """
    # `assign` returns a new frame, so the caller's DataFrame is left untouched.
    cleaned = (
        df.assign(timestamp=pd.to_datetime(df['timestamp']))
        .set_index('timestamp')
        .dropna()
    )

    # Trim the extreme 2% tails of mid_price as outliers.
    low, high = cleaned['mid_price'].quantile([0.02, 0.98])
    return cleaned[cleaned['mid_price'].between(low, high)]

def realized_volatility(series, n):
    """Return the exponentially weighted standard deviation of ``series``.

    Args:
        series: pandas Series to measure.
        n: span of the exponential window (passed as ``span=n``).

    Returns:
        A Series of the same length holding the running EW standard deviation.
    """
    weighted_window = series.ewm(span=n)
    return weighted_window.std()

def features(df):
    """Build the model feature matrix from a five-level order-book frame.

    Reads ``bid_price{1..5}``, ``ask_price{1..5}``, ``bid_volume{1..5}``,
    ``ask_volume{1..5}`` and ``mid_price`` from ``df`` and returns a new
    DataFrame on the same index containing, in this column order:

      * ``wap1..wap5``: cross-volume-weighted price per book level;
      * ``mid_return`` and ``spread_bps``;
      * EW mean / EW std of every ``wap`` for each span;
      * first differences of the waps and of the top two ask/bid prices
        (columns are named ``log_return*`` but are plain ``diff()`` values);
      * EW std of each of those difference columns for each span;
      * price-gap, depth and order-flow-pressure features;
      * per-level volume imbalance and ``toxic_flow``.

    Args:
        df: order-book DataFrame with the columns listed above.

    Returns:
        DataFrame of features indexed like ``df``.
    """
    spans = [10, 20, 50, 100, 200, 300, 450, 600]
    levels = range(1, 6)
    X = pd.DataFrame(index=df.index)

    # Weighted average price per level: each side's price is weighted by the
    # opposite side's volume.
    for lvl in levels:
        bid_px = df[f'bid_price{lvl}']
        ask_sz = df[f'ask_volume{lvl}']
        ask_px = df[f'ask_price{lvl}']
        bid_sz = df[f'bid_volume{lvl}']
        X[f'wap{lvl}'] = (bid_px * ask_sz + ask_px * bid_sz) / (bid_sz + ask_sz)

    X['mid_return'] = df['mid_price'].diff()
    top_of_book_gap = df['ask_price1'] - df['bid_price1']
    X['spread_bps'] = top_of_book_gap / df['mid_price'] * 10000

    # Exponentially weighted mean and std of each wap, span-major order.
    for span in spans:
        for lvl in levels:
            window = X[f'wap{lvl}'].ewm(span=span)
            X[f'wap{lvl}_rol{span}'] = window.mean()
            X[f'rv_wap{lvl}_rol{span}'] = window.std()

    # First differences (kept under their historical `log_return*` names).
    for lvl in levels:
        X[f'log_return{lvl}'] = X[f'wap{lvl}'].diff()
    for side in ('ask', 'bid'):
        for lvl in (1, 2):
            X[f'log_return_{side}{lvl}'] = df[f'{side}_price{lvl}'].diff()

    # Snapshot the difference columns first so the columns added inside the
    # loop are not themselves picked up.
    diff_columns = [name for name in X.columns if 'log' in name]
    for name in diff_columns:
        for span in spans:
            X[f'rv_{name}_{span}'] = realized_volatility(X[name], span)

    X['wa_bal1'] = np.abs(X['wap1'] - X['wap2'])

    # Price gaps between adjacent levels on each side of the book.
    for side in ('bid', 'ask'):
        for lvl in (1, 2):
            X[f'{side}_spread{lvl}'] = (
                df[f'{side}_price{lvl}'] - df[f'{side}_price{lvl + 1}']
            )

    def _depth(side):
        """Sum the five volume levels of one side, left to right."""
        running = df[f'{side}_volume1']
        for lvl in range(2, 6):
            running = running + df[f'{side}_volume{lvl}']
        return running

    X['total_ask_volume'] = _depth('ask')
    X['total_bid_volume'] = _depth('bid')
    ask_depth = X['total_ask_volume']
    bid_depth = X['total_bid_volume']
    X['total_volume'] = ask_depth + bid_depth
    X['volume_imbalance'] = ask_depth - bid_depth
    X['vol_ratio'] = bid_depth / ask_depth
    # The +1 in the denominator guards against an empty book.
    X['order_flow_pressure'] = (bid_depth - ask_depth) / (X['total_volume'] + 1)

    for span in spans:
        X[f'ofp_rol_{span}'] = X['order_flow_pressure'].ewm(span=span).mean()
        X[f'ofp_rol_diff_{span}'] = X[f'ofp_rol_{span}'].diff()
        X[f'rv_mid_return_{span}'] = X['mid_return'].ewm(span=span).std()

    # Per-level imbalance, and its negated product with that level's diff.
    for lvl in levels:
        bid_sz = df[f'bid_volume{lvl}']
        ask_sz = df[f'ask_volume{lvl}']
        X[f'vol_imb_{lvl}'] = (bid_sz - ask_sz) / (bid_sz + ask_sz)
        X[f'toxic_flow{lvl}'] = -(X[f'vol_imb_{lvl}'] * X[f'log_return{lvl}'])

    return X
# Custom evaluation metric for validating the model
def pearson_metric(y_true, y_pred):
    """Return the Pearson correlation coefficient of targets vs. predictions.

    Args:
        y_true: observed target values.
        y_pred: predicted values, same length as ``y_true``.

    Returns:
        The correlation statistic (first element of ``pearsonr``'s result).
    """
    result = pearsonr(y_true, y_pred)
    correlation = result[0]
    return correlation
# Wrap pearson_metric as a scikit-learn scorer; a higher correlation counts
# as a better score during model selection.
pearson_scorer = make_scorer(pearson_metric, greater_is_better=True) 

def train(X, y, tscv):
    """Tune a LightGBM regressor with a randomized hyper-parameter search.

    Five random configurations are drawn from the search space below and
    scored with ``pearson_scorer`` on the folds produced by ``tscv``.

    Args:
        X: feature matrix passed to ``RandomizedSearchCV.fit``.
        y: target values aligned with ``X``.
        tscv: cross-validation splitter (e.g. a ``TimeSeriesSplit``).

    Returns:
        The search's ``best_estimator_``.
    """
    base_model = lgb.LGBMRegressor(
        # device='GPU',    use GPU if gpu available
        objective='regression',
        verbosity=-1,
        n_jobs=-1,
        # random_state=42
    )

    cv_search_space = {
        'n_estimators': np.arange(100, 3000, 100),
        'max_depth': np.arange(2, 20, 2),
        'num_leaves': np.arange(20, 100, 20),
        'min_child_samples': np.arange(10, 1000, 50),
        'max_bin': np.arange(10, 200, 5),
        'reg_alpha': np.linspace(0.01, 5.0, 5),
        'reg_lambda': np.linspace(0.01, 5.0, 5),
        'colsample_bytree': np.linspace(0.1, 0.5, 5),
        'min_child_weight': np.linspace(0.01, 10, 2),
        # NOTE(review): LightGBM only applies `subsample` (bagging_fraction)
        # when `subsample_freq` is non-zero, and it is left at its default
        # here, so this entry likely has no effect. Confirm before relying on it.
        'subsample': np.linspace(0.01, 1, 2),
    }

    rand_cv = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=cv_search_space,
        scoring=pearson_scorer,
        n_iter=5,
        verbose=3,
        cv=tscv,
        n_jobs=1,
    )

    rand_cv.fit(X, y)  # No early_stopping_rounds here
    # The scorer is a Pearson correlation, so label the score accordingly.
    print(f"Best Pearson correlation: {rand_cv.best_score_:.4f}")
    print(f"Best params: {rand_cv.best_params_}")
    return rand_cv.best_estimator_


# Backward-compatible alias so the function can also be called as
# `training(...)`; the name `train` may be rebound to data in a notebook session.
training = train

# --- Driver: load data, fit the model, write the submission file ---------

# The training frame gets its own name (`train_df`) so that the `train`
# function defined above is not shadowed and stays callable.
train_df = pd.read_csv("/kaggle/input/gq-implied-volatility-forecasting/train/ETH.csv")
test = pd.read_csv("/kaggle/input/gq-implied-volatility-forecasting/test/ETH.csv")
sub = pd.read_csv("/kaggle/input/gq-implied-volatility-forecasting/submission.csv")

train_df = clean(train_df)
X = features(train_df)
y = train_df['label']
tscv = TimeSeriesSplit(n_splits=5)
model = train(X, y, tscv)

# The test frame is deliberately not passed through clean(): no rows may be
# dropped, since one prediction is assigned per row of the submission frame.
test = features(test)
if set(test.columns) == set(X.columns):
    print('model ready to predict')
    # Align the test columns to the training column order before predicting.
    sub['labels'] = model.predict(test[X.columns])
    # Only write the file (and report success) once predictions exist.
    sub.to_csv("sub.csv", index=False)
    print('file is ready to be submitted')
else:
    print('check features for error')