Reference


https://www.kaggle.com/artgor/eda-feature-engineering-and-everything

https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard

https://www.kaggle.com/christofhenkel/market-data-nn-baseline

   ### Getting data and importing libraries

In [None]:
import gc

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
#from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import GradientBoostingClassifier
#from sklearn.kernel_ridge import KernelRidge
#from sklearn.svm import NuSVR
#from sklearn.pipeline import make_pipeline
#from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
#import xgboost as xgb
import lightgbm as lgb

In [None]:
# official way to get the data
from kaggle.competitions import twosigmanews
env = twosigmanews.make_env()
print('Done!')

In [None]:
(market_train_df, news_train_df) = env.get_training_data()

In [None]:
pd.set_option("display.max_rows",10)
market_train_df

## Market data

We have a really interesting dataset which contains stock prices for many companies over a decade!

For now let's have a look at the data itself and not think about the competition. We can see long-term trends, appearing and declining companies and many other things.

In [None]:
print(f'{market_train_df.shape[0]} samples and {market_train_df.shape[1]} features in the training market dataset.')

In [None]:

# Price move per row: close price minus open price.
market_train_df['price_diff'] = market_train_df['close'] - market_train_df['open']
# Standard deviation and minimum of that move for every distinct 'time' value;
# reset_index() turns 'time' back into a regular column.
grouped = market_train_df.groupby('time').agg({'price_diff': ['std', 'min']}).reset_index()
market_train_df


In [None]:
# Mean, over all 'time' groups, of the per-group standard deviation of close - open.
print(f"Average standard deviation of price change within a day is {grouped['price_diff']['std'].mean():.4f}.")

We can see huge price fluctuations when the market crashed. Just think about it... **But this is wrong!** There was no huge crash in January 2010... Let's dive into the data!

### Possible data errors

At first let's simply sort data by the difference between open and close prices.

In [None]:
market_train_df.sort_values('price_diff')[:10]

In [None]:
# Absolute close/open ratio per row; the following cells threshold it to count suspiciously large moves.
market_train_df['close_to_open'] =  np.abs(market_train_df['close'] / market_train_df['open'])

In [None]:
print(f"In {(market_train_df['close_to_open'] >= 1.2).sum()} lines price increased by 20% or more.")
print(f"In {(market_train_df['close_to_open'] <= 0.8).sum()} lines price decreased by 20% or more.")

Well, this isn't much considering we have more than 4 million lines, and a lot of these cases are due to price falls during the market crash. We'll just need to deal with the outliers.

In [None]:
# A close/open ratio of 2 is a +100% move; a ratio of 0.5 is a -50% move (the price halved).
print(f"In {(market_train_df['close_to_open'] >= 2).sum()} lines price increased by 100% or more.")
print(f"In {(market_train_df['close_to_open'] <= 0.5).sum()} lines price decreased by 50% or more.")

As a quick fix, I'll replace the outliers in these lines with the mean open or close price of the company.

In [None]:

# Per-company mean open/close prices; these are the replacement values for outliers.
market_train_df['assetName_mean_open'] = market_train_df.groupby('assetName')['open'].transform('mean')
market_train_df['assetName_mean_close'] = market_train_df.groupby('assetName')['close'].transform('mean')

# Rows where the price at least doubled or at least halved between open and close.
is_outlier = (market_train_df['close_to_open'] >= 2) | (market_train_df['close_to_open'] <= 0.5)

# If the open price is further from the company's mean open than the close price
# is from the mean close, the open price is replaced; otherwise the close price is.
open_is_suspect = (
    (market_train_df['assetName_mean_open'] - market_train_df['open']).abs()
    > (market_train_df['assetName_mean_close'] - market_train_df['close']).abs()
)
replace_open = is_outlier & open_is_suspect
replace_close = is_outlier & ~open_is_suspect

# Assign through boolean masks and column names: this does not depend on the
# frame having a default RangeIndex or on 'close'/'open' sitting at positions 4/5,
# and it avoids a Python-level iterrows() loop.
market_train_df.loc[replace_open, 'open'] = market_train_df.loc[replace_open, 'assetName_mean_open']
market_train_df.loc[replace_close, 'close'] = market_train_df.loc[replace_close, 'assetName_mean_close']


In [None]:
# Drop, in place, the helper columns created by the outlier analysis so they do not end up as model features.
market_train_df.drop(columns=['price_diff', 'close_to_open', 'assetName_mean_open', 'assetName_mean_close'], inplace=True)

## News data

reference
https://www.kaggle.com/jsaguiar/baseline-with-news

## 2. Preprocessing News
We are going to remove some columns for now and apply label encoding to a few others:

In [None]:
#news_train_df.columns

In [None]:
#news_num_cols = ['marketCommentary',
#                   'sentenceCount', 'wordCount',
#                   'firstMentionSentence', 'relevance', 'sentimentClass',
#                   'sentimentNegative', 'sentimentNeutral', 'sentimentPositive',
#                   'sentimentWordCount']

In [None]:
#news_train_df.head(3)

In [None]:
'''
def preprocess_news(news_train):
    drop_list = [
        'audiences', 'subjects', 'assetName',
        'headline', 'firstCreated', 'sourceTimestamp','headlineTag', 'provider', 'sourceId',
        'noveltyCount12H', 'noveltyCount24H',
        'noveltyCount3D', 'noveltyCount5D', 'noveltyCount7D', 'volumeCounts12H',
        'volumeCounts24H', 'volumeCounts3D', 'volumeCounts5D',
        'volumeCounts7D'
    ]
    news_train.drop(drop_list, axis=1, inplace=True)
    
    news_train[news_num_cols] = news_train[news_num_cols].fillna(news_train[news_num_cols].mean())
    # Factorize categorical columns
    #for col in ['headlineTag', 'provider', 'sourceId']:
    #    news_train[col], uniques = pd.factorize(news_train[col])
    #    del uniques
    
    # Remove {} and '' from assetCodes column
    news_train['assetCodes'] = news_train['assetCodes'].apply(lambda x: x[1:-1].replace("'", ""))
    return news_train

news_train_df = preprocess_news(news_train_df)
'''

In [None]:
#news_train_df.head(3)

In [None]:
'''
def unstack_asset_codes(news_train):
    codes = []
    indexes = []
    for i, values in news_train['assetCodes'].iteritems():
        explode = values.split(", ")
        codes.extend(explode)
        repeat_index = [int(i)]*len(explode)
        indexes.extend(repeat_index)
    index_df = pd.DataFrame({'news_index': indexes, 'assetCode': codes})
    del codes, indexes
    gc.collect()
    return index_df

index_df = unstack_asset_codes(news_train_df)
index_df.head()
'''

In [None]:
'''
def merge_news_on_index(news_train, index_df):
    news_train['news_index'] = news_train.index.copy()

    # Merge news on unstacked assets
    news_unstack = index_df.merge(news_train, how='left', on='news_index')
    news_unstack.drop(['news_index', 'assetCodes'], axis=1, inplace=True)
    return news_unstack

news_unstack = merge_news_on_index(news_train_df, index_df)
del news_train_df, index_df
gc.collect()
'''

In [None]:
'''
def group_news(news_frame):
    news_frame['date'] = news_frame.time.dt.date  # Add date column
    
    aggregations = ['mean']
    gp = news_frame.groupby(['assetCode', 'date']).agg(aggregations)
    gp.columns = pd.Index(["{}_{}".format(e[0], e[1]) for e in gp.columns.tolist()])
    gp.reset_index(inplace=True)
    # Set datatype to float32
    float_cols = {c: 'float32' for c in gp.columns if c not in ['assetCode', 'date']}
    return gp.astype(float_cols)

news_agg = group_news(news_unstack)
del news_unstack; gc.collect()
'''

In [None]:
#market_train_df['date'] = market_train_df.time.dt.date
#market_train_df = market_train_df.merge(news_agg, how='left', on=['assetCode', 'date'])
#gc.collect()


In [None]:
#from datetime import datetime, timedelta
#start_date = datetime(2009, 1, 1, 0, 0, 0).date()
#market_train_df = market_train_df[market_train_df.time.dt.date >= start_date].reset_index(drop=True)


In [None]:
#gc.collect()

In [None]:
#market_train_df.sentimentNegative_mean.isnull()

## Modelling

It's time to build a model!
I think that in this case we should build a binary classifier - we will simply predict whether the target goes up or down.
1. NN
2. LGBM
3. CATBOOST
4. ENSEMBLE(STACKING)

In [None]:
# Categorical (cat_cols) and numerical (num_cols) feature columns used by the models.
cat_cols = ['assetCode']
num_cols = ['volume', 'close', 'open', 'returnsClosePrevRaw1', 'returnsOpenPrevRaw1', 'returnsClosePrevMktres1',
            'returnsOpenPrevMktres1', 'returnsClosePrevRaw10', 'returnsOpenPrevRaw10', 
            'returnsClosePrevMktres10','returnsOpenPrevMktres10']

# Disabled: extending num_cols with the news columns.
# NOTE(review): list.append returns None, so re-enabling these two lines as
# written would set num_cols to None.
#new_num_cols = num_cols.append(news_num_cols)
#num_cols = new_num_cols
from sklearn.model_selection import train_test_split

# Random 75/25 split of the row index labels, reproducible through the fixed random_state.
train_indices, val_indices = train_test_split(market_train_df.index.values,test_size=0.25, random_state=23)

In [None]:
def encode(encoder, x):
    """Return the integer code stored for ``x`` in ``encoder``.

    Parameters
    ----------
    encoder : dict
        Mapping from label to integer code.
    x : hashable
        Label to look up.

    Returns
    -------
    int
        ``encoder[x]`` when ``x`` is a known label, otherwise ``len(encoder)``,
        which acts as one shared code for all unknown labels.
    """
    # dict.get replaces the try/except lookup and no longer shadows the builtin ``id``.
    return encoder.get(x, len(encoder))

# One label -> integer mapping per categorical column, filled in by the loop below.
encoders = [dict() for _ in cat_cols]

for i, cat in enumerate(cat_cols):
    print('encoding %s ...' % cat, end=' ')
    # Codes are assigned from the training rows only, in order of first appearance.
    known_labels = market_train_df.loc[train_indices, cat].astype(str).unique()
    mapping = {label: code for code, label in enumerate(known_labels)}
    encoders[i] = mapping
    # Labels that never occur in the training rows receive the fallback code from encode().
    market_train_df[cat] = market_train_df[cat].astype(str).apply(lambda label: encode(mapping, label))
    print('Done')

# One extra slot per column for assets that were not seen in the training rows.
embed_sizes = [1 + len(mapping) for mapping in encoders]

In [None]:
from sklearn.preprocessing import StandardScaler

# Missing numerical values are replaced with 0 before scaling.
market_train_df[num_cols] = market_train_df[num_cols].fillna(0)
print('scaling numerical columns')

# Fit the scaling statistics on the training rows only (the label encoders are
# also built from train_indices), so validation rows do not leak into the
# preprocessing; then apply the fitted scaler to every row.
scaler = StandardScaler()
scaler.fit(market_train_df.loc[train_indices, num_cols])
market_train_df[num_cols] = scaler.transform(market_train_df[num_cols])

In [None]:
market_train_df

In [None]:
def get_input(market_train, indices):
    """Extract model inputs, targets and scoring helpers for the rows labelled ``indices``.

    Parameters
    ----------
    market_train : pandas.DataFrame
        Frame holding the ``num_cols`` and ``cat_cols`` columns plus
        ``returnsOpenNextMktres10``, ``universe`` and ``time``.
    indices : array-like
        Index labels (used with ``.loc``) of the rows to extract.

    Returns
    -------
    X : dict
        ``'num'`` maps to the array of numerical features; every name in
        ``cat_cols`` maps to an ``(n_rows, 1)`` array of that column's values.
    y : numpy.ndarray
        Boolean target, True where ``returnsOpenNextMktres10 >= 0``.
    r : numpy.ndarray
        The ``returnsOpenNextMktres10`` values themselves.
    u : pandas.Series
        The ``universe`` column.
    d : pandas.Series
        The date part of ``time``.
    """
    X = {'num': market_train.loc[indices, num_cols].values}
    for cat in cat_cols:
        # Select only the current column. The original selected the whole
        # cat_cols list, which hands every categorical input all categorical
        # columns as soon as there is more than one. [cat] keeps the (n, 1) shape.
        X[cat] = market_train.loc[indices, [cat]].values
    y = (market_train.loc[indices, 'returnsOpenNextMktres10'] >= 0).values
    r = market_train.loc[indices, 'returnsOpenNextMktres10'].values
    u = market_train.loc[indices, 'universe']
    d = market_train.loc[indices, 'time'].dt.date
    return X, y, r, u, d

# r, u and d are used to calculate the scoring metric
X_train,y_train,r_train,u_train,d_train = get_input(market_train_df, train_indices)
X_valid,y_valid,r_valid,u_valid,d_valid = get_input(market_train_df, val_indices)

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Concatenate, Flatten, BatchNormalization, Dropout
from keras.losses import binary_crossentropy, mse

class NN_base:
    """Keras binary classifier over numerical features with an optional embedded categorical branch.

    The numerical branch and, when ``include_cat`` is true, the categorical
    branch are concatenated, passed through a dense layer and reduced to one
    sigmoid unit. The model is compiled with the Adam optimizer and binary
    cross-entropy loss. The constructor reads the module-level ``cat_cols``
    and ``embed_sizes``.
    """

    def __init__(self, include_cat=True, n_features=11):
        """Build and compile the network, stored as ``self.model``.

        Parameters
        ----------
        include_cat : bool
            When true, add one input per name in ``cat_cols`` and an embedding
            branch; when false, the model takes only the ``'num'`` input.
        n_features : int
            Width of the ``'num'`` input.
        """
        if include_cat:
            categorical_inputs = [Input(shape=[1], name=cat) for cat in cat_cols]
            categorical_embeddings = [
                Embedding(embed_sizes[i], 10)(cat_input)
                for i, cat_input in enumerate(categorical_inputs)
            ]

            # NOTE(review): only the first embedding feeds the network; with more
            # than one categorical column the others would need to be concatenated.
            categorical_logits = Flatten()(categorical_embeddings[0])
            categorical_logits = Dense(32, activation='relu')(categorical_logits)
            categorical_logits = Dropout(0.5)(categorical_logits)
            categorical_logits = BatchNormalization()(categorical_logits)
            categorical_logits = Dense(32, activation='relu')(categorical_logits)

        numerical_inputs = Input(shape=(n_features,), name='num')
        numerical_logits = BatchNormalization()(numerical_inputs)
        numerical_logits = Dense(128, activation='relu')(numerical_logits)
        numerical_logits = Dropout(0.5)(numerical_logits)
        numerical_logits = BatchNormalization()(numerical_logits)
        numerical_logits = Dense(128, activation='relu')(numerical_logits)
        numerical_logits = Dense(64, activation='relu')(numerical_logits)

        if include_cat:
            logits = Concatenate()([numerical_logits, categorical_logits])
        else:
            logits = numerical_logits
        logits = Dense(64, activation='relu')(logits)
        out = Dense(1, activation='sigmoid')(logits)

        if include_cat:
            self.model = Model(inputs=categorical_inputs + [numerical_inputs], outputs=out)
        else:
            self.model = Model(inputs=[numerical_inputs], outputs=out)
        self.model.compile(optimizer='adam', loss=binary_crossentropy)

    def fit(self, X_train, y_train, epochs=3, validation_data_=None):
        """Train the model and return the Keras ``fit`` result.

        Parameters
        ----------
        X_train : dict or array
            Model inputs, passed straight to ``self.model.fit``.
        y_train : array-like
            Targets; cast to ``int`` before training.
        epochs : int
            Maximum number of epochs.
        validation_data_ : tuple or None
            Forwarded as ``validation_data``.

        The model with the best monitored loss is written to ``model.hdf5``.
        """
        from keras.callbacks import EarlyStopping, ModelCheckpoint

        # 'val_loss' only exists when validation data is supplied. Without it the
        # default monitor would leave both callbacks inactive, so fall back to
        # the training loss in that case.
        monitor = 'loss' if validation_data_ is None else 'val_loss'
        check_point = ModelCheckpoint('model.hdf5', monitor=monitor, verbose=True, save_best_only=True)
        early_stop = EarlyStopping(monitor=monitor, patience=5, verbose=True)
        return self.model.fit(X_train, y_train.astype(int),
                              validation_data=validation_data_,
                              epochs=epochs,
                              batch_size=10000,
                              verbose=True,
                              callbacks=[early_stop, check_point])

    def predict(self, X_test):
        """Return ``self.model.predict(X_test)`` (the sigmoid outputs)."""
        return self.model.predict(X_test)

    def summary(self):
        """Print the Keras layer summary of the underlying model."""
        self.model.summary()



In [None]:
#model_lgb_tmp = lgb.LGBMClassifier(objective='binary',learning_rate=0.05, bagging_fraction = 0.8,
#                                bagging_freq = 5, n_estimators=10,boosting_type = 'dart',
#                                num_leaves = 2452, min_child_samples = 212, reg_lambda=0.01)

In [None]:
#model_lgb_tmp.fit(X_train['num'],y_train)

In [None]:
#model_xgb_tmp = xgb.XGBClassifier(colsample_bytree=0.4603, gamma=0.0468, 
#                             learning_rate=0.05, max_depth=6, 
#                             min_child_weight=1.7817, n_estimators=10,
#                             reg_alpha=0.4640, reg_lambda=0.8571,
#                             subsample=0.5213, silent=1,
#                             random_state =7, nthread = -1)

In [None]:
#model_xgb_tmp.fit(X_train['num'],y_train)

In [None]:
'''
from sklearn.metrics import accuracy_score
confidence_valid_lgb_only = model_lgb_tmp.predict(X_valid['num'])[:]*2 -1
print(accuracy_score(confidence_valid_lgb_only>0,y_valid))
'''

In [None]:
'''
from sklearn.metrics import accuracy_score
confidence_valid_numNN = NN_num.predict(X_valid['num'])[:,0]*2 -1
print(accuracy_score(confidence_valid_lgb_only>0,y_valid))
'''

In [None]:
'''
# calculation of actual metric that is used to calculate final score
r_valid = r_valid.clip(-1,1) # get rid of outliers. Where do they come from??
x_t_i = confidence_valid_numNN * r_valid * u_valid
data = {'day' : d_valid, 'x_t_i' : x_t_i}
df = pd.DataFrame(data)
x_t = df.groupby('day').sum().values.flatten()
mean = np.mean(x_t)
std = np.std(x_t)
score_valid = mean / std
print(score_valid)
'''

In [None]:
#import gc
#gc.collect()

In [None]:
#from catboost import CatBoostClassifier
#t_cat = CatBoostClassifier(thread_count=4, n_estimators=500, max_depth=8, eta=0.1, loss_function='Logloss' , verbose=10)
#t_cat.fit(X_train['num'],y_train)

In [None]:
#confidence_valid=t_cat.predict(X_valid['num'])[:]*2-1


In [None]:
#confidence_valid

In [None]:
'''
from sklearn.metrics import accuracy_score
print(accuracy_score(confidence_valid>0,y_valid))

# calculation of actual metric that is used to calculate final score
r_valid = r_valid.clip(-1,1) # get rid of outliers. Where do they come from??
x_t_i = confidence_valid * r_valid * u_valid
data = {'day' : d_valid, 'x_t_i' : x_t_i}
df = pd.DataFrame(data)
x_t = df.groupby('day').sum().values.flatten()
mean = np.mean(x_t)
std = np.std(x_t)
score_valid = mean / std
print(score_valid)
'''

In [None]:
'''
from catboost import CatBoostClassifier
# these are tuned params I found
x_1 = [0.19000424246380565, 2452, 212, 328, 202]#328
x_2 = [0.19016805202090095, 2583, 213, 312, 220]#312

params_1 = {
        'task': 'train',
        'boosting_type': 'dart',
        'objective': 'binary',
        'learning_rate': x_1[0],
        'num_leaves': x_1[1],
        'min_data_in_leaf': x_1[2],
        'num_iteration': x_1[3],
        'max_bin': x_1[4],
        'verbose': 1
    }

params_2 = {
        'task': 'train',
        'boosting_type': 'dart',
        'objective': 'binary',
        'learning_rate': x_2[0],
        'num_leaves': x_2[1],
        'min_data_in_leaf': x_2[2],
        'num_iteration': x_2[3],
        'max_bin': x_2[4],
        'verbose': 1
    }

N_fold = 6
X_num = X_train['num']
X_cat = X_train['assetCode']
kfold = KFold(n_splits=N_fold, shuffle=True, random_state=156)
test_data = lgb.Dataset(X_valid['num'], label=y_valid.astype(int), free_raw_data=False)
#lgb_datasets = []
#for train_index, holdout_index in kfold.split(X_num, y_train):
#    lgb_datasets.append(lgb.Dataset(X_num[train_index], label=y_train[train_index].astype(int),free_raw_data=False ))

NN_list=[]
out_of_fold_predictions_NN_1 = np.zeros((X_num.shape[0],1))
for train_index, holdout_index in kfold.split(X_num, y_train):
    instance = NN_base()
    X_t = {'num': X_num[train_index], 'assetCode': X_cat[train_index]}
    X_h = {'num': X_num[holdout_index], 'assetCode': X_cat[holdout_index]}
    instance.fit(X_t, y_train[train_index],epochs=10)
    NN_list.append(instance)
    y_pred=instance.predict(X_num[holdout_index])[:,0]
    out_of_fold_predictions_NN_1[holdout_index, 0] = y_pred


ctb_list=[]
out_of_fold_predictions_ctb_1 = np.zeros((X_num.shape[0],1))
for train_index, holdout_index in kfold.split(X_num, y_train):
    instance = CatBoostClassifier(thread_count=4, n_estimators=400, max_depth=8, eta=0.1, loss_function='Logloss' , verbose=10)
    instance.fit(X_num[train_index], y_train[train_index])
    ctb_list.append(instance)
    y_pred=instance.predict_proba(X_num[holdout_index])[:,1]
    out_of_fold_predictions_ctb_1[holdout_index, 0] = y_pred    
    
lgb_list_1=[]
out_of_fold_predictions_lgb_1 = np.zeros((X_num.shape[0], 1))
#for i in range(N_fold):
for train_index, holdout_index in kfold.split(X_num, y_train):
    train_data = lgb.Dataset(X_num[train_index], label=y_train[train_index].astype(int),free_raw_data=False )
    lgb_list_1.append(lgb.train(params_1,train_data,num_boost_round=100, valid_sets=test_data,early_stopping_rounds=5))
    y_pred = lgb_list_1[i].predict(X_num[holdout_index])
    out_of_fold_predictions_lgb_1[holdout_index, 0] = y_pred
                                   
lgb_list_2=[]
out_of_fold_predictions_lgb_2 = np.zeros((X_num.shape[0], 1))
#for i in range(N_fold):
for train_index, holdout_index in kfold.split(X_num, y_train):
    train_data = lgb.Dataset(X_num[train_index], label=y_train[train_index].astype(int),free_raw_data=False )    
    lgb_list_2.append(lgb.train(params_1,train_data,num_boost_round=100, valid_sets=test_data,early_stopping_rounds=5))
    y_pred = lgb_list_2[i].predict(X_num[holdout_index])
    out_of_fold_predictions_lgb_2[holdout_index, 0] = y_pred

base_learners_NN = [NN_list]
base_learners_ctb = [ctb_list]
base_learners_ = [lgb_list_1, lgb_list_2]
raw_features = X_num                                    
meta_features = np.concatenate((out_of_fold_predictions_NN_1,out_of_fold_predictions_ctb_1, out_of_fold_predictions_lgb_1, out_of_fold_predictions_lgb_2), axis=1)
meta_features = np.concatenate((raw_features , meta_features), axis=1)

'''


In [None]:
'''
meta_features_num_valid = np.column_stack([
            np.column_stack([model.predict(X_valid['num']) for model in base_learners]).mean(axis=1)
            for base_learners in base_learners_ ])

meta_features_num_valid_ctb = np.column_stack([
            np.column_stack([model.predict_proba(X_valid['num']) for model in base_learners]).mean(axis=1)
            for base_learners in base_learners_ctb ])

meta_features_num_valid = np.concatenate((meta_features_num_valid_ctb, meta_features_num_valid), axis=1)

raw_features_valid = X_valid['num']
meta_features_valid = np.concatenate((raw_features_valid, meta_features_num_valid), axis=1)
meta_features_valid = {'num': meta_features_valid, 'assetCode': X_valid['assetCode']}
'''

In [None]:
'''


meta_features = {'num': meta_features, 'assetCode': X_cat}  
NN_meta = NN_base(n_features=15)
history=NN_meta.fit(meta_features,y_train, validation_data_=(meta_features_valid,y_valid.astype(int)) )


cat_meta = CatBoostClassifier(thread_count=4, n_estimators=200, max_depth=8, eta=0.1, loss_function='Logloss' , verbose=10)
cat_meta.fit(meta_features['num'],y_train)
'''


In [None]:
'''
import matplotlib.pyplot as plt

loss = history.history['loss']
val_loss = history.history['val_loss']

plt.plot(loss)
plt.plot(val_loss)

plt.legend(['loss','val_loss'])
plt.show()

'''


In [None]:
'''
from sklearn.manifold import TSNE

tsne = TSNE(n_components=3,learning_rate=100)
transformed = tsne.fit_transform(raw_features)
'''

In [None]:
'''
meta_features_num_valid = np.column_stack([
            np.column_stack([model.predict(X_valid['num']) for model in base_learners]).mean(axis=1)
            for base_learners in base_learners_ ])

meta_features_num_valid_ctb = np.column_stack([
            np.column_stack([model.predict_proba(X_valid['num']) for model in base_learners]).mean(axis=1)
            for base_learners in base_learners_ctb ])

meta_features_num_valid = np.concatenate((meta_features_num_valid_ctb, meta_features_num_valid), axis=1)

raw_features_valid = X_valid['num']
meta_features_valid = np.concatenate((raw_features_valid, meta_features_num_valid), axis=1)
meta_features_valid = {'num': meta_features_valid, 'assetCode': X_valid['assetCode']}

confidence_valid = NN_meta.predict(meta_features_valid)[:,0]*2 -1

'''

In [None]:
'''
from sklearn.metrics import accuracy_score
print(accuracy_score(confidence_valid>0,y_valid))

# calculation of actual metric that is used to calculate final score
r_valid = r_valid.clip(-1,1) # get rid of outliers. Where do they come from??
x_t_i = confidence_valid * r_valid * u_valid
data = {'day' : d_valid, 'x_t_i' : x_t_i}
df = pd.DataFrame(data)
x_t = df.groupby('day').sum().values.flatten()
mean = np.mean(x_t)
std = np.std(x_t)
score_valid = mean / std
print(score_valid)
'''

In [None]:
'''
import time
import copy

class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5, along=False):
        
        self.along = along
        
        tmp_num = []
        tmp_cat = []
        for ty, model in base_models:
            if ty == 'num':
                tmp_num.append(model)
            elif ty == 'cat':
                tmp_cat.append(model)
            else:
                continue
                
        if tmp_num:
            self.base_models_num = tuple(tmp_num)
            self.is_num_models = True
        else:
            self.base_models_num = tuple([])
            self.is_num_models = False
            
            
        if tmp_cat:
            self.base_models_cat = tuple(tmp_cat)
            self.is_cat_models = True
        else:
            self.base_models_cat = tuple([])
            self.is_cat_models = False   
            
        self.meta_model_type = meta_model[0]
        self.meta_model = meta_model[1]
        self.n_folds = n_folds
        
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_num_ = [list() for x in self.base_models_num]
        self.base_models_cat_ = [list() for x in self.base_models_cat]
        if self.meta_model_type == 'object':
            self.meta_model_ = copy.deepcopy(self.meta_model)
        else:
            self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        X_num = X['num']
        X_cat = X['assetCode']
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X['num'].shape[0], len(self.base_models_num)))
        for i, model in enumerate(self.base_models_num):
            for train_index, holdout_index in kfold.split(X_num, y):
                ts = time.time()
                instance = clone(model)
                self.base_models_num_[i].append(instance)
                instance.fit(X_num[train_index], y[train_index])
                y_pred = instance.predict(X_num[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                print("{} model... complete at {}".format(i,(time.time()-ts)))
        
        out_of_fold_predictions_c = np.zeros((X['num'].shape[0], len(self.base_models_cat)))
        for i, model in enumerate(self.base_models_cat):
            for train_index, holdout_index in kfold.split(X_cat, y):
                ts = time.time()
                instance = copy.deepcopy(model)
                self.base_models_cat_[i].append(instance)
                
                X_t = {'assetCode' : X_cat[train_index], 'num': X_num[train_index]}
                X_h = {'assetCode' : X_cat[holdout_index], 'num': X_num[holdout_index]}
                instance.fit(X_t, y[train_index])
                y_pred = (instance.predict(X_h) > 0.5 )
                out_of_fold_predictions_c[holdout_index, i] = y_pred.flatten()
                print("{} model... complete at {}".format(i,(time.time()-ts)))        
        
        if self.is_cat_models == True:
            out_of_fold_predictions = np.concatenate((out_of_fold_predictions, out_of_fold_predictions_c), axis=1)
        
        if self.along == True:
            meta_features = np.concatenate((X_num,out_of_fold_predictions), axis=1)
        else:
            meta_features = out_of_fold_predictions
            
        
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(meta_features, y)
        return self
    
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        X_num = X['num']
        X_cat = X['assetCode']
        
        meta_features_num = np.column_stack([
            np.column_stack([model.predict(X_num) for model in base_models_num]).mean(axis=1)
            for base_models_num in self.base_models_num_ ])
        
        X_t = {'num': X_num, 'assetCode': X_cat}
        if self.base_models_cat:
            meta_features_cat = np.column_stack([
                np.column_stack([model.predict(X_t) for model in base_models_cat]).mean(axis=1)
                for base_models_cat in self.base_models_cat_ ])
            meta_features = np.concatenate((meta_features_num, meta_features_cat), axis=1)
        else:
            meta_features = meta_features_num
            
        if self.along == True:
            meta_features = np.concatenate((X_num, meta_features), axis=1)
            
        return self.meta_model_.predict(meta_features)
    
    def get_meta_features(self,X):
        X_num = X['num']
        X_cat = X['assetCode']
        
        meta_features_num = np.column_stack([
            np.column_stack([model.predict(X_num) for model in base_models_num]).mean(axis=1)
            for base_models_num in self.base_models_num_ ])
        
        X_t = {'num': X_num, 'assetCode': X_cat}
        if self.base_models_cat:
            meta_features_cat = np.column_stack([
                np.column_stack([model.predict(X_t) for model in base_models_cat]).mean(axis=1)
                for base_models_cat in self.base_models_cat_ ])
            meta_features = np.concatenate((meta_features_num, meta_features_cat), axis=1)
        else:
            meta_features = meta_features_num
            
        return meta_features
    
'''

In [None]:

#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.linear_model import LogisticRegression

#NN = NN_base()

#KN_2 = KNeighborsClassifier(n_neighbors=2)
#KN_4 = KNeighborsClassifier(n_neighbors=4)
#KN_8 = KNeighborsClassifier(n_neighbors=8)
#lr = LogisticRegression()


#GBoost = GradientBoostingClassifier(n_estimators=10, learning_rate=0.05,
#                                   max_depth=6,min_samples_leaf=15, min_samples_split=10,random_state =5, verbose=2)
#model_lgb_ = lgb.LGBMClassifier(objective='binary',learning_rate=0.05, bagging_fraction = 0.8,
#                                bagging_freq = 5, n_estimators=10,boosting_type = 'dart',
#                                num_leaves = 2452, min_child_samples = 212, reg_lambda=0.01)

#model_xgb_ = xgb.XGBClassifier(colsample_bytree=0.4603, gamma=0.0468, 
#                             learning_rate=0.05, max_depth=6, 
#                             min_child_weight=1.7817, n_estimators=10,
#                             reg_alpha=0.4640, reg_lambda=0.8571,
#                             subsample=0.5213, silent=1,
#                             random_state =7, nthread = -1)


In [None]:
#model_lgb_meta = lgb.LGBMClassifier(objective='binary',learning_rate=0.05, n_estimators=10, bagging_fraction = 0.8,
#                             bagging_freq = 5, boosting_type = 'dart')


In [None]:
#NN_meta = NN_base(include_cat=False, n_features=13)

In [None]:
#stacked_averaged_models = StackingAveragedModels(base_models = (('num',model_lgb_),('num',model_xgb_)),
#                                                 meta_model = ('object',NN_meta), along=True)

In [None]:
#stacked_averaged_models.fit(X_train,y_train)

In [None]:
#confidence_valid = stacked_averaged_models.predict(X_valid)[:]*2 -1
#print(accuracy_score(confidence_valid>0,y_valid))

In [None]:
#from sklearn.metrics import accuracy_score
#print(accuracy_score(confidence_valid>0,y_valid))

In [None]:
'''
# calculation of actual metric that is used to calculate final score
r_valid = r_valid.clip(-1,1) # get rid of outliers. Where do they come from??
x_t_i = confidence_valid * r_valid * u_valid
data = {'day' : d_valid, 'x_t_i' : x_t_i}
df = pd.DataFrame(data)
x_t = df.groupby('day').sum().values.flatten()
mean = np.mean(x_t)
std = np.std(x_t)
score_valid = mean / std
print(score_valid)
'''

In [None]:
'''
days = env.get_prediction_days()
import time

n_days = 0
prep_time = 0
prediction_time = 0
packaging_time = 0
for (market_obs_df, news_obs_df, predictions_template_df) in days:
    n_days +=1
    if n_days % 50 == 0:
        print(n_days,end=' ')
    
    t = time.time()
    assetCode = market_obs_df['assetCode']

    #market_obs_df['price_diff'] = market_obs_df['close'] - market_obs_df['open']
    #market_obs_df['close_to_open'] =  np.abs(market_obs_df['close'] / market_obs_df['open'])
    #market_obs_df['assetName_mean_open'] = market_obs_df.groupby('assetName')['open'].transform('mean')
    #market_obs_df['assetName_mean_close'] = market_obs_df.groupby('assetName')['close'].transform('mean')
    market_obs_df[num_cols] = market_obs_df[num_cols].fillna(0)
    market_obs_df[num_cols] = scaler.transform(market_obs_df[num_cols])
    #market_obs_df = market_obs_df.loc[:, num_cols].fillna(0).values
    X = {'num': market_obs_df[num_cols].values}
    for i,cat in enumerate(cat_cols):
        market_obs_df[cat+'_encoded'] = market_obs_df[cat].astype(str).apply(lambda x: encode(encoders[i],x))
        X[cat] = market_obs_df[cat+'_encoded'].values
    
    print(X)
    prep_time += time.time() - t
    
    t = time.time()
    #lp = stacked_averaged_models.predict(X)
    lp = NN_tmp.predict(X)[:,0]
    #lp = model_lgb_.predict(X['num'])[:]
    #lp = model_xgb_.predict(X['num'])[:]
    prediction_time += time.time() -t
    
    t = time.time()
    confidence = 2 * lp -1
    preds = pd.DataFrame({'assetCode':assetCode,'confidence':confidence})
    predictions_template_df = predictions_template_df.merge(preds,how='left').drop('confidenceValue',axis=1).fillna(0).rename(columns={'confidence':'confidenceValue'})
    env.predict(predictions_template_df)
    packaging_time += time.time() - t
    
env.write_submission_file()

'''


In [None]:
# Disabled cell: stacked-ensemble submission loop, kept as an inert string literal.
# The alternative-prediction snippet inside it is now '#'-commented: it used to be
# a nested triple-quoted block, which closed this string early and left indented
# live code behind (an IndentationError).
'''
days = env.get_prediction_days()
import time

n_days = 0
prep_time = 0
prediction_time = 0
packaging_time = 0
predicted_confidences = np.array([])
for (market_obs_df, news_obs_df, predictions_template_df) in days:
    n_days +=1
    print(n_days,end=' ')
    
    t = time.time()

    market_obs_df['assetCode_encoded'] = market_obs_df[cat].astype(str).apply(lambda x: encode(encoders[0], x))

    market_obs_df[num_cols] = market_obs_df[num_cols].fillna(0)
    market_obs_df[num_cols] = scaler.transform(market_obs_df[num_cols])
    X_num_test = market_obs_df[num_cols]
    #X_test = {'num':X_num_test}
    X_cat_test = market_obs_df['assetCode_encoded']

    meta_features_num_test = np.column_stack([
                np.column_stack([model.predict(X_num_test) for model in base_learners]).mean(axis=1)
                for base_learners in base_learners_ ])
    
    meta_features_num_test_ctb = np.column_stack([
                np.column_stack([model.predict_proba(X_num_test) for model in base_learners]).mean(axis=1)
                for base_learners in base_learners_ctb ])
    
    X_test = {'num': X_num_test, 'assetCode': X_cat_test}
    meta_features_num_test_NN = np.column_stack([
                np.column_stack([model.predict(X_test) for model in base_learners]).mean(axis=1)
                for base_learners in base_learners_NN ])
    
    meta_features_num_test = np.concatenate((meta_features_num_test_NN, meta_features_num_test_ctb, meta_features_num_test), axis=1)    
    
    #print(meta_features_num_test.shape)
    raw_features_test = X_num_test
    meta_features_test = np.concatenate((raw_features_test, meta_features_num_test), axis=1)
    meta_features_test = {'num': meta_features_test, 'assetCode': X_cat_test}

    
    prep_time += time.time() - t
    
    t = time.time()
    #market_prediction = t_cat.predict_proba(X_num_test)[:,1]*2 -1
    #market_prediction_ctb = ctb_meta.predict_proba(meta_features_test)[:,1]*2 -1
    #market_prediction = (market_prediction_ctb + market_predcition_NN)/2
    market_prediction = NN_meta.predict(meta_features_test)[:,0]*2 -1
    #market_prediction = NN_tmp.predict(X_test)[:,0]*2 -1
    #market_prediction = meta_features_num_test.mean(axis=1)*2 -1
    predicted_confidences = np.concatenate((predicted_confidences, market_prediction))
    prediction_time += time.time() -t
    
    t = time.time()
    preds = pd.DataFrame({'assetCode':market_obs_df['assetCode'],'confidence':market_prediction})
    # insert predictions to template
    predictions_template_df = predictions_template_df.merge(preds,how='left').drop('confidenceValue',axis=1).fillna(0).rename(columns={'confidence':'confidenceValue'})
    env.predict(predictions_template_df)
    packaging_time += time.time() - t

env.write_submission_file()
total = prep_time + prediction_time + packaging_time
print(f'Preparing Data: {prep_time:.2f}s')
print(f'Making Predictions: {prediction_time:.2f}s')
print(f'Packing: {packaging_time:.2f}s')
print(f'Total: {total:.2f}s')
'''