# ESKEEETIT

In [None]:
##Import all the things
#Triple Threat---
import gc
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None #shut down that annoying slice warning
from pandas.api.types import is_numeric_dtype
import matplotlib.pyplot as plt
from datetime import datetime, timedelta


from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

#Bayesian Optimization
# from skopt import gp_minimize

#You know it, you love it
from lightgbm import LGBMClassifier

#For featurizing
from multiprocessing import Pool



In [None]:
#Datetime functions bc I committed early to integer datetime and I don't want to refactor
def date_to_int(dt_time):
    """Encode a date-like object as a ``YYYYMMDD`` integer.

    ``dt_time`` only needs ``year``, ``month`` and ``day`` attributes.
    """
    year_month = dt_time.year * 100 + dt_time.month
    return year_month * 100 + dt_time.day
def int_to_date(x):
    """Decode a ``YYYYMMDD`` integer into a ``datetime`` at midnight.

    ``x`` may be a Python int, a NumPy integer or a digit string.  The
    argument itself is parsed; no module-level state is consulted.

    Raises:
        ValueError: if ``x`` is not a valid ``YYYYMMDD`` value.
    """
    # int() normalises NumPy scalars and integral floats before formatting.
    return datetime.strptime(str(int(x)), '%Y%m%d')
    

In [None]:
#Get the training data
# `env` is the competition environment handle; later cells call
# env.get_training_data(), env.get_prediction_days(), env.predict() and
# env.write_submission_file() on it.
from kaggle.competitions import twosigmanews
env = twosigmanews.make_env()


## **`get_training_data`** function (built-in)

Returns the training data DataFrames as a tuple of:
* `market_train_df`: DataFrame with market training data
* `news_train_df`: DataFrame with news training data

These DataFrames contain all market and news data from February 2007 to December 2016. 

In [None]:
# Unpack the (market, news) pair of training DataFrames from the environment.
(env_market_train, env_news_train) = env.get_training_data()

## **`DataPrepper`** class
Takes environment dataframes:
* env_market_train
* env_news_train

Performs data cleaning:
* Replace Price Outliers
* Convert Datetime
* Remove data prior to train cutoff

Performs feature engineering:
* Create Daily Return
* Daily Change
* Price/Volume
* All news data aggregated to the relevant trading day

Returns merged dataframe for training


In [None]:
class DataPrepper():
    """Clean the raw market/news frames and merge them into one frame.

    Dates are handled as ``YYYYMMDD`` integers.  ``prepare_market`` must be
    called before ``prepare_news``: it records ``self.tradingdays``, which
    ``prepare_news`` uses to re-date news rows.
    """

    def __init__(self):
        # Rows whose YYYYMMDD date is below this value are dropped.
        self.train_cutoff = 20101231

    def make_price_diff(self, market_train):
        """EDA helper: add close/open ratio and absolute open-close gap.

        NOTE(review): this reads capitalised ``'Close'``/``'Open'`` columns,
        unlike the lowercase names used by the other methods -- confirm which
        frame it is meant for before relying on it.
        """
        market_train['closeOverOpen'] = market_train['Close']/market_train['Open']
        market_train['priceDiff'] = np.abs(market_train['Close'] - market_train['Open'])
        return market_train

    def _replace_price_outliers(self, market_train):
        """Overwrite implausible open/close prices and add ``dailychange``.

        ``dailychange`` is close/open computed *before* any replacement.
        Where it is below 0.33 the open is replaced by the close; where it is
        above 2 the close is replaced by the open.  The frame is modified in
        place and also returned.
        """
        ratio = market_train['close'] / market_train['open']
        market_train['dailychange'] = ratio
        # Whole-column assignment instead of chained indexing
        # (df['open'][mask] = ...): chained writes are not guaranteed to
        # reach the frame and are ignored under pandas copy-on-write.
        market_train['open'] = market_train['open'].mask(ratio < .33, market_train['close'])
        market_train['close'] = market_train['close'].mask(ratio > 2, market_train['open'])
        return market_train

    def prepare_market(self, market_train):
        """Clean market rows, convert ``time`` to int and apply the cutoff.

        Side effect: ``self.tradingdays`` is replaced by the distinct dates
        present in the returned frame.
        """
        market_train = self._replace_price_outliers(market_train)
        market_train['time'] = market_train['time'].dt.strftime("%Y%m%d").astype(int)
        market_train = market_train[market_train.time >= self.train_cutoff]
        market_train['pricevolume'] = market_train['volume']/market_train['close']
        self.tradingdays = market_train['time'].unique()
        return market_train

    def prepare_news(self, news_train):
        """Convert news ``time`` to int, apply the cutoff and re-date rows.

        Each row's date becomes the first trading day on or after it (0 when
        there is none).  Also adds ``coverage`` =
        sentimentWordCount / wordCount.
        """
        news_train['time'] = news_train['time'].dt.strftime("%Y%m%d").astype(int)
        news_train = news_train[news_train.time >= self.train_cutoff]
        # Vectorised lookup: a per-row Python callback that scans the
        # trading-day array is far slower on a large news frame.
        news_train['time'] = self._map_trading_days(news_train['time'].values)
        news_train['coverage'] = news_train['sentimentWordCount']/news_train['wordCount']
        return news_train

    def _map_trading_days(self, news_dates):
        """Vectorised form of ``_map_trading_day`` for an array of dates."""
        days = np.sort(np.asarray(self.tradingdays))
        news_dates = np.asarray(news_dates)
        if days.size == 0:
            # No trading days known: nothing can be mapped.
            return np.zeros(news_dates.shape, dtype=int)
        # side='left' gives the index of the first trading day >= the date,
        # which is the date itself when it is a trading day.
        pos = np.searchsorted(days, news_dates, side='left')
        has_next = pos < days.size
        # Clamp the index so the lookup stays in bounds; clamped entries are
        # replaced by 0 through ``has_next``.
        return np.where(has_next, days[np.minimum(pos, days.size - 1)], 0)

    def _map_trading_day(self, news_date):
        """Return the first trading day on or after ``news_date``, else 0.

        Trading days are compared in ascending order, which matches their
        order of appearance when the market frame is chronological.
        """
        return self._map_trading_days(np.asarray([news_date]))[0]

    def merge_data(self, market_df, news_df):
        """Left-join per-(time, assetName) mean news features onto market rows.

        Only numeric news columns are averaged.  Missing values in the merged
        frame (e.g. assets without news that day) are filled with 0.
        """
        # numeric_only=True states the intent explicitly: text/datetime news
        # columns cannot be averaged and make agg(np.mean) raise on current
        # pandas.
        newsgroup = (news_df.groupby(['time', 'assetName'], sort=False)
                            .mean(numeric_only=True)
                            .reset_index())
        merged = pd.merge(market_df, newsgroup, how='left', on=['time', 'assetName'], copy=False)
        merged.fillna(value=0, inplace=True)
        return merged

In [None]:
#Prepare the market and news data
# Order matters: prepare_market records prepper.tradingdays, which
# prepare_news needs when it re-dates news rows onto trading days.
prepper = DataPrepper()
market_train = prepper.prepare_market(env_market_train)
news_train = prepper.prepare_news(env_news_train)
# Left-join the aggregated news features onto the market rows.
train_df = prepper.merge_data(market_train, news_train)


In [None]:
#Make Room
# Drop the names of the raw and intermediate frames; only train_df is used
# from here on.
del env_market_train
del env_news_train
del market_train
del news_train
gc.collect()

In [None]:
# Extra garbage-collection pass (repeats the call that ends the previous cell).
gc.collect()

# Time To Train
# `Featurizer` class

Packages the lag-feature functions shared by the training data and the test data for the model (LightGBM).
Initializes with:
* `assetId` = the column by which to group assets - string
* `n_lag` = rolling window lengths, in days - list of ints
* `shift_size` = how many rows each series is shifted before the rolling windows are applied (default = 1)
* `return_features` = the columns for which lagged features are generated - list of strings



In [None]:
class Featurizer():
    """Add per-asset rolling mean/max/min lag features to a frame.

    Args:
        assetId: column used to group rows into one series per asset.
        n_lag: rolling window lengths; defaults to ``[3, 7, 14]``.
        shift_size: rows each series is shifted by before rolling, so a
            window only covers earlier rows.
        return_features: columns to build lag features for; defaults to the
            six return/price columns in ``DEFAULT_RETURN_FEATURES``.
    """

    DEFAULT_N_LAG = (3, 7, 14)
    DEFAULT_RETURN_FEATURES = ('returnsClosePrevMktres10', 'returnsClosePrevRaw10',
                               'returnsOpenPrevMktres1', 'returnsOpenPrevRaw1',
                               'open', 'close')

    def __init__(self, assetId='assetCode',
                       n_lag=None,
                       shift_size=1,
                       return_features=None
                ):
        self.assetId = assetId
        # Fresh lists per instance: list defaults in the signature would be
        # shared (and mutable) across every Featurizer.
        self.n_lag = list(self.DEFAULT_N_LAG if n_lag is None else n_lag)
        self.shift_size = shift_size
        self.return_features = list(self.DEFAULT_RETURN_FEATURES
                                    if return_features is None else return_features)

    def transform(self, df):
        """Return ``df`` with lag features joined on and gaps imputed."""
        new_df = self.generate_lag_features(df)
        df = pd.merge(df, new_df, how='left', on=['time', self.assetId])
        df = self.mis_impute(df)
        return df

    def create_lag(self, df_code):
        """Add ``<col>_lag_<window>_{mean,max,min}`` columns for one asset.

        ``df_code`` holds the rows of a single asset, in the order the
        windows should run over.  A copy is returned with NaN (windows that
        are not yet full) replaced by -1; the input is left untouched.
        """
        df_code = df_code.copy()
        for col in self.return_features:
            # The shift is the same for every window, so do it once per column.
            shifted = df_code[col].shift(self.shift_size)
            for window in self.n_lag:
                rolled = shifted.rolling(window=window)
                df_code['%s_lag_%s_mean'%(col,window)] = rolled.mean()
                df_code['%s_lag_%s_max'%(col,window)] = rolled.max()
                df_code['%s_lag_%s_min'%(col,window)] = rolled.min()
        return df_code.fillna(-1)

    def generate_lag_features(self, df):
        """Compute lag features for every asset using 4 worker processes.

        Returns a frame with ``time``, the asset id column and the lag
        columns only (the source feature columns are dropped so the result
        can be merged back without duplicate columns).
        """
        # Keep only what the workers need before splitting by asset.
        subset = df.loc[:, ['time', self.assetId] + self.return_features]
        per_asset = [group for _, group in subset.groupby(self.assetId)]

        # The context manager releases the workers even if a task raises;
        # map() has already collected every result by the time it exits.
        with Pool(4) as pool:
            lagged = pool.map(self.create_lag, per_asset)

        new_df = pd.concat(lagged)
        return new_df.drop(self.return_features, axis=1)

    def mis_impute(self, df):
        """Fill missing values column by column, in place.

        ``object`` columns get the string ``"other"``; int32/int64/float32/
        float64 columns get their own mean; other dtypes are left alone.
        """
        mean_filled = ("int32", "float32", "int64", "float64")
        for name in df.columns:
            dtype = df[name].dtype
            if dtype == "object":
                df[name] = df[name].fillna("other")
            elif any(dtype == candidate for candidate in mean_filled):
                df[name] = df[name].fillna(df[name].mean())
        return df
    


In [None]:
# class AssetEncoder():
#     def __init__(self):
#         self.assetId = 'assetCode'
#         self.le = LabelEncoder()

#     def fit(self, X):
#         labels = X[self.assetId].values
#         self.labels = np.append(labels, 'unknown')
#         self.encoder = dict(zip(self.labels,np.arange(len(self.labels))))

#     def transform(self, X):
#         X[self.assetId] = X[self.assetId].apply(self._map_known_labels)
#         X['assetCodeT'] = X[self.assetId].apply(self._map_known_labels)
#         return X

#     def inverse_transform(self, X):
#         X[self.assetId] = self.le.inverse_transform(X[self.assetId])
#         return X

#     def _map_known_labels(self,x):
#         if x in self.labels:
#             return x
#         else:
#             return 'unknown'
        
#     def _encode(self,x):
#         return self.encoder[x]

In [None]:
# make X, y
# pop() removes the target column from train_df in place and returns it.
target = train_df.pop('returnsOpenNextMktres10').values
# X is another name for the same frame as train_df, not a copy.
X = train_df

In [None]:
#Make a shorter version of our training set. I'll append new observation
#data here as it comes in
# Keep an independent copy of the rows dated after 2016-12-01 (the target
# column has already been popped from train_df).
pred_df = train_df.loc[train_df.time>20161201].copy()
# Only the name is removed here; the frame stays alive through X.
del train_df
gc.collect()

In [None]:
#Make lag features
# The same featurizer instance is reused in the prediction loop, so new days
# get the same lag columns as the training matrix.
featurizer = Featurizer()
X = featurizer.transform(X)

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage


def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place to smaller dtypes.

    * Signed and unsigned integer columns become the smallest of
      int8/int16/int32/int64 whose range strictly contains the column's
      min and max; if none does, the column is left unchanged.
    * Boolean columns are left unchanged (they already use one byte).
    * Other numeric columns become float32 when their range fits, otherwise
      float64.  float16 is deliberately never used.
    * Non-numeric columns are not touched.

    Prints the memory footprint before and after, and returns ``df``.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype
        if not is_numeric_dtype(col_type):
            continue

        type_name = col_type.name
        if type_name == 'bool':
            # is_numeric_dtype() accepts bool; casting it to float would
            # quadruple its size.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if type_name.startswith(('int', 'uint')):
            if pd.isna(c_min) or pd.isna(c_max):
                # Empty column: no range to fit.
                continue
            # Plain Python ints make the bound checks exact for every width,
            # including uint64.
            lowest, highest = int(c_min), int(c_max)
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                if lowest > bounds.min and highest < bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            f32 = np.finfo(np.float32)
            if c_min > f32.min and c_max < f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Out of float32 range, or min/max is NaN (comparisons False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting footprint.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
# Downcast X's numeric columns, then reclaim the memory of the replaced columns.
X = reduce_mem_usage(X)
gc.collect()

In [None]:
# Columns excluded from the model input.
drop_cols = ['assetCode','assetName','marketCommentary', 'time']
# X_features fixes the column set and order; the prediction loop selects the
# same list so the model sees identical inputs at predict time.
X_features = [c for c in X.columns.values if c not in drop_cols]
X = X.loc[:,X_features]
gc.collect()

In [None]:
# Scaling of X values

# scaler = StandardScaler()
# scaler.fit(X)
# X = scaler.transform(X)
# gc.collect()

In [None]:
# Get Train,Val set. Using a random split to expose classifier to different regimes.
# A time-series K-fold would be the more conventional choice, but (per the
# original author's note) the random split performed better on unseen data.
# random_state=0 keeps the split reproducible.
X_train, X_val, target_train, target_val = train_test_split(X, target, random_state=0)
del X, target
gc.collect()
#make binary y
# Label is 1 when the target return is strictly positive, else 0.
y_train = np.where(target_train>0, 1, 0).astype(int)
y_val = np.where(target_val>0, 1, 0).astype(int)



## `BayesianOptimizerLGBM` class

 Recommended to run this locally since it takes a lot of resources and time.

In [None]:
# class BayesianOptimizerLGBM():
#     def __init__(self,spaces):
#         self.spaces = spaces

#     def fit(self, X_train, y_train, X_val, y_val):
#         self.X_train = X_train
#         self.y_train = y_train
#         self.X_val = X_val
#         self.y_val = y_val 
#         self.res = gp_minimize(self._optimize, self.spaces, acq_func="EI",n_calls=30)

#     def _optimize(self, x):

#         gbm = LGBMClassifier(learning_rate=x[0],
#                             num_leaves=x[1],
#                             min_data_in_leaf=x[2],
#                             num_iteration=x[3],
#                             max_bin=x[4],
#                             verbose=1, 
#                             n_jobs=-1)
#         gbm.fit(self.X_train, self.y_train, eval_set=(self.X_val, self.y_val),
#                 eval_metric=['binary_logloss'], verbose=True, early_stopping_rounds=5)
#         y_pred = gbm.predict_proba(self.X_val)
#         score = log_loss(self.y_val, y_pred)
#         print("score" , score)
#         return score

In [None]:
# spaces = [
# (0.10, 0.25), #learning_rate
# (1000, 10000), #num_leaves
# (200, 400), #min_data_in_leaf
# (300, 500), #num_iteration
# (200, 400) #max_bin
# ]

# opt = BayesianOptimizerLGBM(spaces)
# opt.fit(X_train, y_train, X_val, y_val)
# print("optimal params", opt.res.x)
# params = opt.res.x


## `build_lgbm` function

 Builds and fits my LightGBM model

In [None]:
def build_lgbm(X_train, y_train, X_val, y_val, boosting_type, params):
    """Create an ``LGBMClassifier``, fit it and return the fitted model.

    ``params`` is indexed positionally:
    ``[learning_rate, num_leaves, min_data_in_leaf, num_iteration, max_bin]``.
    The validation pair is passed as ``eval_set`` with the binary log-loss
    metric and ``early_stopping_rounds=10``.
    """
    settings = {
        'boosting_type': boosting_type,
        'learning_rate': params[0],
        'num_leaves': params[1],
        'min_data_in_leaf': params[2],
        'num_iteration': params[3],
        'max_bin': params[4],
        'verbose': 1,
        'n_jobs': -1,
    }
    classifier = LGBMClassifier(**settings)
    classifier.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        eval_metric=['binary_logloss'],
        verbose=True,
        early_stopping_rounds=10,
    )
    return classifier
    

## Fit the Model

In [None]:
#Params from local skopt gp_minimize
# Positional layout read by build_lgbm:
# [learning_rate, num_leaves, min_data_in_leaf, num_iteration, max_bin]
params_1 = [0.10192437737356348, 1011, 399, 500, 242]
params_2 = [0.14975024553335256, 279, 388, 300, 394]

In [None]:
# Fit the gradient-boosted ('gbdt') model; the 'dart' variant below is disabled.
model_1 = build_lgbm(X_train, y_train, X_val, y_val, 'gbdt', params_1)
# model_2 = build_lgbm(X_train, y_train, X_val, y_val, 'dart', params_2)

In [None]:
#Confidence Test
# This scaler is reused in the prediction loop to rescale each day's
# predicted probabilities.
minmax = MinMaxScaler()

# preds = model.predict_proba(X_val)[:,1]
# confidence_test = (preds-preds.min())/(preds.max()-preds.min())
# confidence_test = confidence_test*2-1
# print(max(confidence_test),min(confidence_test))

# print(minmax.fit_transform(preds)*2-1, confidence_test)

# # calculation of actual metric that is used to calculate final score
# r_test = target_val.clip(-1,1) # get rid of outliers. Where do they come from??
# x_t_i = confidence_test * r_test * u_test
# data = {'day' : d_test, 'x_t_i' : x_t_i}
# df = pd.DataFrame(data)
# x_t = df.groupby('day').sum().values.flatten()
# mean = np.mean(x_t)
# std = np.std(x_t)
# score_test = mean / std
# print(score_test)

## `get_prediction_days` function (Built-In)

Generator which loops through each "prediction day" (trading day) and provides all market and news observations which occurred since the last data you've received.  Once you call **`predict`** to make your future predictions, you can continue on to the next prediction day.

Yields:
* While there are more prediction day(s) and `predict` was called successfully since the last yield, yields a tuple of:
    * `market_observations_df`: DataFrame with market observations for the next prediction day.
    * `news_observations_df`: DataFrame with news observations for the next prediction day.
    * `predictions_template_df`: DataFrame with `assetCode` and `confidenceValue` columns, prefilled with `confidenceValue = 0`, to be filled in and passed back to the `predict` function.
* If `predict` has not been called since the last yield, yields `None`.

In [None]:
# You can only iterate through a result from `get_prediction_days()` once
# so be careful not to lose it once you start iterating.
# `days` is consumed by the main prediction loop below.
days = env.get_prediction_days()

In [None]:
# Make room
# The training/validation splits are not needed once model_1 is fitted.
del X_train
del X_val
del y_train
del y_val
gc.collect()

## Main Loop
Let's loop through all the days and make our predictions.  The `days` generator (returned from `get_prediction_days`) will simply stop returning values once you've reached the end.

In [None]:

# For every prediction day: clean and merge the new observations, add them to
# the rolling history, rebuild lag features, predict, and submit.
for (market_obs_df, news_obs_df, predictions_template_df) in days:
    print(f'___________________________________')
    print(f'Getting new data and preparing it')
    # prepare_market runs first: it resets prepper.tradingdays to this batch's
    # dates, which prepare_news then uses to re-date the news rows.
    market_obs_df = prepper.prepare_market(market_obs_df)
    news_obs_df = prepper.prepare_news(news_obs_df)
    pred_day = prepper.merge_data(market_obs_df, news_obs_df)
    print(f'Got it. ready to make some predictions')
    date = pred_day.time.values[0]

    #make a historical df
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # sort=True keeps the column alignment the original call requested.
    pred_df = pd.concat([pred_df, pred_day], sort=True)
    lag_date = date_to_int(int_to_date(date) - timedelta(days=30))
    # Only the last 30 calendar days are ever used for lag features, so the
    # history is trimmed in place instead of growing on every iteration.
    pred_df = pred_df.loc[pred_df.time >= lag_date]

    #Get historical features to predict on:
    print(f'Getting features for the new day {date}')
    short_pred_df = featurizer.transform(pred_df)

    #make the X Matrix
    print("setting X for sucess")
    short_pred_df = short_pred_df.loc[short_pred_df.time == date,:]
    # Left-join onto the template so rows follow the template's asset order
    # and every requested asset is present.
    ordered_df = pd.merge(predictions_template_df, short_pred_df, how='left', on='assetCode')
    X = ordered_df.loc[:, X_features]

    print("making predictions")
    raw_pred = model_1.predict_proba(X)
    # Rescale the positive-class probabilities to [0, 1] within the day, map
    # them to [-1, 1], then clip just inside the allowed confidence range.
    pred_scaled = minmax.fit_transform(raw_pred)[:,1]
    mod_pred = pred_scaled*2-1
    predictions_template_df.confidenceValue = np.clip(mod_pred, -0.99, 0.99)
    print("Submitting predictions!")
    env.predict(predictions_template_df)
    gc.collect()


### **`predict`** function (Built-In)
Stores your predictions for the current prediction day.  Expects the same format as you saw in `predictions_template_df` returned from `get_prediction_days`.

Args:
* `predictions_df`: DataFrame which must have the following columns:
    * `assetCode`: The market asset.
    * `confidenceValue`: Your confidence whether the asset will increase or decrease in 10 trading days.  All values must be in the range `[-1.0, 1.0]`.

The `predictions_df` you send **must** contain the exact set of rows which were given to you in the `predictions_template_df` returned from `get_prediction_days`.  The `predict` function does not validate this, but if you are missing any `assetCode`s or add any extraneous `assetCode`s, then your submission will fail.

## **`write_submission_file`** function (Built-in)

Writes your predictions to a CSV file (`submission.csv`) in the current working directory.

In [None]:
# Write the collected predictions to submission.csv in the working directory.
env.write_submission_file()

In [None]:
# We've got a submission file!
# List every entry in the working directory whose name contains '.csv'.
import os
print([filename for filename in os.listdir('.') if '.csv' in filename])

As indicated by the helper message, calling `write_submission_file` on its own does **not** make a submission to the competition.  It merely tells the module to write the `submission.csv` file as part of the Kernel's output.  To make a submission to the competition, you'll have to **Commit** your Kernel and find the generated `submission.csv` file in that Kernel Version's Output tab (note this is _outside_ of the Kernel Editor), then click "Submit to Competition".  When we re-run your Kernel during Stage Two, we will run the Kernel Version (generated when you hit "Commit") linked to your chosen Submission.