# My Tester


This is a simulation of the kernel with real data. Many of my engineered features are lagged. This poses a problem given the set up of this competition. The prediction days are available one at a time. The predictions must be made on one day before moving on to the next day. Therefore, it is impossible to process the test data for lags. In this notebook, we cycle a simulated test set. A base set is used set back 6 months (needed for squeeze indicator). Daily test data is appended to the base so that lags can be derived. The base is then updated to be used for the next day.

In [46]:
import sys
paths = ['../scripts',
        '/Users/jacob/Desktop/docs/ML/_a_python/_1_code/notebooks/quickpipeline',
        '/Users/jacob/Desktop/docs/ML/_a_python/_1_code/notebooks/pd_feature_union']

for path in paths:
    sys.path.append(path)

from market_imports import *
from market_code import *
from market_trans import *
from quickpipe_mod import * 
from pandas_feature_union import *

data_path = '/Users/jacob/Desktop/docs/kaggle/two_sigma/_g_data/data'
df_market = pd.read_csv(data_path + '/marketdata_sample.csv') 
df_news = pd.read_csv(data_path + '/news_sample.csv')


import warnings
warnings.filterwarnings(action='ignore')

import datetime as dt



## Data Prep

In [48]:
# TRYING TO PREVENT DATE TYPE CONVERSION
# THIS SHOULD BE UPDATED FROM PANDA DATAREADER
csv = pd.read_csv(data_path + '/sigma_data.csv', index_col='Date', parse_dates=True, 
                 infer_datetime_format=True)

csv.shape

(57384, 9)

In [49]:
# COMPUTE RESIDUAL FOR RESPONSE
import statsmodels.tsa.api as tsa
from statsmodels.tsa.seasonal import seasonal_decompose 

x = csv[['returns_open_raw10_next']]
# Freq will depend on asset number - this needs to be researched
result = seasonal_decompose(x, model='additive', freq=50)

csv['returns_res10_next'] = result.resid
# Lose the first and last day
csv = csv[-csv['returns_res10_next'].isnull()]

check = csv.columns[:4].tolist()
csv.drop_duplicates(check, inplace=True)

# ELC is insanely high. The next highest price is around 300
csv[csv.Close ==csv.Close.max()]
csv[csv.Close > 1000]
csv = csv[csv.asset != 'ELC']

In [50]:
csv.shape

(35818, 10)

## Feature Engineer  

Ideas: momentum, Beta, indicators

In [51]:
# data.query("asset == 'AAPL'")

In [52]:
# test.query("asset == 'AAPL'")[['Close']].diff().head()

vectorize  

https://stackoverflow.com/questions/42869495/numpy-version-of-exponential-weighted-moving-average-equivalent-to-pandas-ewm

In [53]:
df = csv.copy()
df.head()

Unnamed: 0_level_0,asset,Open,Close,Volume,returns_close_raw,returns_open_raw,returns_close_raw10,returns_open_raw10,returns_open_raw10_next,returns_res10_next
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2014-01-16,QCOM,74.470001,74.720001,10263200.0,0.002814,0.013519,0.018914,0.011615,0.02433,-0.015252
2014-01-16,KEP,16.66,16.719999,205300.0,0.009012,0.0,0.017497,0.00965,0.025533,-0.010187
2014-01-16,NWS,17.059999,17.040001,506000.0,-0.006435,-0.011076,-0.048678,-0.044707,0.087544,0.050676
2014-01-16,BHP,66.57,66.889999,5735900.0,0.029432,0.033296,-0.001494,-0.012391,0.051633,0.013469
2014-01-16,NGG,70.043671,69.934494,212800.0,0.001875,0.00453,-0.004361,-0.009308,-0.002802,-0.042414


In [54]:
df = csv.copy()
df.reset_index(inplace=True)
# SEPARATE TEST DATA EARLY TO SIMULATE KERNEL
# Temporarily convert to integers for validation and test sets

df['date_'] = df.Date.dt.strftime('%Y%m%d').astype(int)
base = df['date_'].unique().tolist()
base = df[ df['date_'].isin( dates[ :int(len(dates)*.90)] ) ]

data_test = df[ df['date_'].isin( dates[ int(len(dates)*.90):] ) ]
test_dates = data_test['date_'].unique().tolist()

base.drop(['date_'], axis=1, inplace=True)
# data_test.drop(['date_'], axis=1, inplace=True)

# data.reset_index(drop=True, inplace=True)
# data_test.reset_index(drop=True, inplace=True)

# the incoming test data will not have a response

data_test.drop(['returns_open_raw10_next', 
                'returns_res10_next'], axis=1, inplace=True)



In [55]:
base.head()

Unnamed: 0,Date,asset,Open,Close,Volume,returns_close_raw,returns_open_raw,returns_close_raw10,returns_open_raw10,returns_open_raw10_next,returns_res10_next
7032,2014-08-14,SNE,17.889999,18.07,2018300.0,0.00722,0.002799,-0.019727,-0.003348,-0.050678,-0.030193
7033,2014-08-14,SIEB,2.39,2.25,6200.0,-0.060363,0.0,-0.156054,-0.09569,0.073786,0.102621
7034,2014-08-14,ADBE,69.889999,70.050003,1318700.0,0.003719,0.011078,0.011486,-0.020114,-0.018992,0.005233
7035,2014-08-14,RAD,6.27,6.22,9883400.0,-0.004812,0.011227,-0.072844,-0.075247,-0.006359,0.020666
7036,2014-08-14,THC,57.779999,58.639999,1178700.0,0.016333,0.013416,0.105474,0.0764,-0.041528,-0.025563


The test set is tricky. Keep both data forms. You'll need the int form
to interate and the timestamp for the delta and feature engineering

In [56]:
# SETTING UP THE TEST TRIAL

data_test.set_index(['date_', 'asset'], inplace=True)
data_test.sort_index(inplace=True)

In [None]:
# THIS COULD SIMULATE THE KERNEL WELL ENOUGH
days = []
for i in range(len(test_dates)):
    dframe = data_test.loc[test_dates[i]]
    days.append(dframe)
    

# So, we preproc first. Then, as the daily data comes in, we append it to 
# a six month block of raw data - without the target. We then preproc the 
# lags

# so, need a 6 month block prior to the data starting




# All the date values are the same in the subset
test = days[0]
test.reset_index(inplace=True)
t = test.Date[0] 
time = (t - dt.timedelta(days=130))
base = data[data.Date > time]

base.drop(['returns_open_raw10_next', 
            'returns_res10_next'], axis=1, inplace=True)

update = pd.concat([base, test], axis=0)

In [57]:
# data is used as the base set
data = base.copy()

data >> \
        exTractTime(col='Date', 
                    atts=['weekday', 'month', 'year', 'day', 'quarter'],
                    mthds=['month_name']) >> \
        toCatFeat(feats=['year', 'quarter']) >> \
        macdFeats() >> \
        bbSqueeze() >> \
        rsiFeats()
        
data.tail()


Unnamed: 0,Date,asset,Open,Close,Volume,returns_close_raw,returns_open_raw,returns_close_raw10,returns_open_raw10,returns_open_raw10_next,returns_res10_next,weekday,month,year,day,quarter,month_name,year_obj,quarter_obj,macd,sig,macd_cross,macd_sharp_rise,macd_div,squeeze,sq_min,low_vol,rsi
29645,2016-06-16,GE,28.98077,29.461538,70322800.0,0.001633,-0.011217,0.019444,0.004323,-0.017106,-0.025358,3,6,2016,16,2,June,2016_,2_,0.053228,-0.029973,True,False,False,0.044039,0.019288,False,0.670461
29646,2016-06-16,SID,2.06,2.09,1939900.0,0.024214,0.019608,0.084839,0.168821,-0.161062,-0.168786,3,6,2016,16,2,June,2016_,2_,-0.137575,-0.174101,True,False,False,0.293405,0.273883,False,0.501159
29647,2016-06-16,THC,27.48,27.440001,1234400.0,-0.016625,-0.013734,-0.078135,-0.067536,-0.006167,-0.017061,3,6,2016,16,2,June,2016_,2_,-0.45628,-0.302988,False,False,False,0.127197,0.052738,False,0.317041
29648,2016-06-16,VIAb,42.18,45.049999,10593500.0,0.065353,0.018424,-0.000888,-0.056684,0.040644,0.038157,3,6,2016,16,2,June,2016_,2_,0.575544,0.725818,False,False,False,0.202467,0.065608,False,0.65738
29649,2016-06-16,SNY,37.709999,38.400002,2454400.0,0.006008,-0.020993,-0.069652,-0.079247,-0.088998,-0.085013,3,6,2016,16,2,June,2016_,2_,-0.546393,-0.263585,False,False,False,0.117085,0.059338,False,0.244273


In [58]:
(
    data[[col for col in data.columns if data[col].isnull().sum() > 0]].
    isnull().
    sum()
)




squeeze     931
sq_min     7105
rsi          53
dtype: int64

In [59]:
X = data.copy()

In [60]:
# WHAT STAYS? JUST DO TWO SEPARATE FUNCTS FOR NOW

# UNIVERSE AND DATE
X.dropna(inplace=True)
X.drop(['sq_min'], axis=1, inplace=True)
# only use universe for real data simulation
X['universe'] = universe_feat(X) # N/A kernel
X['date_'] = X.Date.dt.strftime("%Y%m%d").astype(int) 
# No longer using timestamp index -> reset. Not for kernel
# X.reset_index(drop=True, inplace=True) # N/A kernel

# Create objects for ranges, etc
dates = X['date_'].unique()
target = ['returns_res10_next']
label = ['label']  
X[label[0]] = (X[target[0]] > 0).astype(int)
preproc_feats = trainFeats(X, drops=['date_', 'Date', 'asset', 'universe', 
                                    'returns_open_raw10_next', 'returns_res10_next',
                                    'label'])
test_pipe = preprocPipe(X, binFeat='Volume')
X_p = test_pipe.fit_transform(X[preproc_feats])
X_p.head()

pipeline created


Unnamed: 0,Open,Close,Volume,returns_close_raw,returns_open_raw,returns_close_raw10,returns_open_raw10,weekday,month,year,day,quarter,macd,sig,squeeze,rsi,Volume_bin_category_0.0,Volume_bin_category_1.0,Volume_bin_category_2.0,Volume_bin_category_3.0,Volume_bin_category_4.0,macd_cross,macd_sharp_rise,macd_div,low_vol,year_obj,month_name_April,month_name_August,month_name_December,month_name_February,month_name_January,month_name_July,month_name_June,month_name_March,month_name_May,month_name_November,month_name_October,month_name_September,quarter_obj_1_,quarter_obj_2_,quarter_obj_3_,quarter_obj_4_
14117,1.051392,1.046395,-0.32025,-0.068454,0.074017,-0.018653,0.012783,1.437414,-0.984445,-0.747832,-0.304797,-1.311743,-0.002104,0.05237,-1.531575,0.028858,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
14118,0.329223,0.341447,3.512957,0.061647,-0.1022,-0.331402,-0.470945,1.437414,-0.984445,-0.747832,-0.304797,-1.311743,-0.210881,-0.173236,-0.347323,-1.254631,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
14119,-0.344865,-0.347664,-0.337037,-0.018682,0.123968,-0.13565,-0.112872,1.437414,-0.984445,-0.747832,-0.304797,-1.311743,0.056198,0.112004,-0.610459,0.024522,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
14120,-0.041852,-0.033808,-0.367416,0.034895,-0.038707,0.183195,0.0751,1.437414,-0.984445,-0.747832,-0.304797,-1.311743,0.137845,0.138099,-0.304535,0.962495,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
14121,1.119247,1.111255,-0.374873,-0.088062,0.105874,0.113187,0.112293,1.437414,-0.984445,-0.747832,-0.304797,-1.311743,0.461382,0.57424,-1.062011,0.493283,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0


In [61]:
# TRAIN THE MODEL
feats_train = X_p.columns.tolist()

# Non training feats
feats = [col for col in X.columns if col not in preproc_feats]

X_ = pd.concat([X[feats] ,X_p],axis=1)

In [62]:
train_range = range(len(dates))[:int(0.90*len(dates))]
val_range = range(len(dates))[int(0.90*len(dates)):]

# test_range = range(len(dates))[int(0.9*len(dates)):] # N/A kernel

X_train = X_[feats_train].loc[X_['date_'].isin(dates[train_range])].values



y_train = X_[label].loc[X_['date_'].isin(dates[train_range])].values
# GETS INTERESTING -> INPUTS DATA INTO INSTANCE
lgb_train = lgb.Dataset(X_train, y_train[:,0], feature_name=feats_train)
print(X_train.shape, y_train.shape)


# validation data - numpys
X_val = X_[feats_train].loc[X_['date_'].isin(dates[val_range])].values
y_val = X_[label].loc[X_['date_'].isin(dates[val_range])].values
lgb_val = lgb.Dataset(X_val, y_val[:,0], feature_name=feats_train),
print(X_val.shape, y_val.shape)


# test data --> N/A kernel
# X_test = X_[feats_train].loc[X_['date_'].isin(dates[test_range])].values
# y_test = X_[label].loc[X_['date_'].isin(dates[test_range])].values

# print(X_test.shape, y_test.shape)

param = {"objective" : "binary",
          "metric" : "binary_logloss",
          "verbosity" : -1,
        'random_state': 81}

# TONS OF HYPERPARAMENTERS. THIS FIT FUNC USES THE VAL SET TO TRAIN AND 
# SELECT THE BEST ITERATION

model = lgb.train(param, lgb_train, valid_sets=lgb_val, 
                  early_stopping_rounds=10) 
                  

(13967, 42) (13967, 1)
(1546, 42) (1546, 1)
[1]	valid_0's binary_logloss: 0.69424
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's binary_logloss: 0.693772
[3]	valid_0's binary_logloss: 0.693684
[4]	valid_0's binary_logloss: 0.693103
[5]	valid_0's binary_logloss: 0.693721
[6]	valid_0's binary_logloss: 0.693591
[7]	valid_0's binary_logloss: 0.692092
[8]	valid_0's binary_logloss: 0.692892
[9]	valid_0's binary_logloss: 0.691646
[10]	valid_0's binary_logloss: 0.691531
[11]	valid_0's binary_logloss: 0.691493
[12]	valid_0's binary_logloss: 0.69165
[13]	valid_0's binary_logloss: 0.692508
[14]	valid_0's binary_logloss: 0.692945
[15]	valid_0's binary_logloss: 0.694153
[16]	valid_0's binary_logloss: 0.694018
[17]	valid_0's binary_logloss: 0.695212
[18]	valid_0's binary_logloss: 0.695016
[19]	valid_0's binary_logloss: 0.695557
[20]	valid_0's binary_logloss: 0.695477
[21]	valid_0's binary_logloss: 0.696421
Early stopping, best iteration is:
[11]	valid_0's binary_logloss:

In [None]:
# Kernel simulation
# Define the ba

__The base set is established given the last day of the data set. The test days resume on the next day.__

In [455]:
# Kernel simulation
# Define the base ahead of time
t = base.tail(1)['Date'].tolist()[0]
# 250 covers the 6 months, since delta counts every day, not just trading days
time = (t - dt.timedelta(days=250))
print(time)
print(t)

base_ = base[base.Date > time]
# this could, and probably is, a different size every time

base_.drop(['returns_open_raw10_next', 
            'returns_res10_next'], axis=1, inplace=True)
print(base_.shape)

2015-10-10 00:00:00
2016-06-16 00:00:00
(8344, 9)


In [456]:
env_sim = []
days = []
for i in range(len(test_dates)):
    dframe = data_test.loc[test_dates[i]]
    days.append(dframe)
    

    # First day comes in...
    test = days[i]

    print(test.shape)
    
    # Simulate template: not sure about this one yet
    test.reset_index(inplace=True)
    t = test.Date[0] 

    import random
    vec = test.asset.tolist()
    print(len(vec))
    print(int(len(vec)*.95))
    
    # be sure to use sample for WOR
    template = pd.DataFrame({ 'assets' :random.sample(vec, k=int(len(vec)*.95)),
                            'confidenceValue': 0})
    print(template.shape)
    unv = template.assets.tolist()
    print(len(unv))
    df_unv = test[test.asset.isin(unv)]

    print(df_unv.shape)



    base_ = pd.concat([base_, df_unv], axis=0)

    X_test = base_.copy()

    X_test >> \
            exTractTime(col='Date', 
                        atts=['weekday', 'month', 'year', 'day', 'quarter'],
                        mthds=['month_name']) >> \
            toCatFeat(feats=['year', 'quarter']) >> \
            macdFeats() >> \
            bbSqueeze() >> \
            rsiFeats()

    print(X_test.shape)

    X_test = X_test.iloc[-df_unv.shape[0]:, :]
    print(X_test.shape)

    # No longer able to drop nulls. Use fill instead
    X_test.drop(['sq_min'], axis=1, inplace=True)
    preproc_feats = trainFeats(X_test, drops=['Date', 'asset'])
    test_pipe = preprocPipe(X_test, binFeat='Volume')
    X_p = test_pipe.fit_transform(X_test[preproc_feats]).fillna(0)

    preds = model.predict(X_p, num_iteration=model.best_iteration) * 2 - 1
    print(preds.shape)
    df_pred = pd.DataFrame({'asset':X_test['asset'],'conf':preds})
    pred_dict = df_pred.set_index('asset').to_dict()['conf']
    template['confidenceValue'] = template.assets.map(pred_dict)
    template.head()

    env_sim.append(template)
    del template

(48, 8)
48
45
(45, 2)
45
(45, 9)
(8389, 26)
(45, 26)
pipeline created
(45,)
(48, 8)
48
45
(45, 2)
45
(45, 9)
(8434, 26)
(45, 26)
pipeline created
(45,)
(48, 8)
48
45
(45, 2)
45
(45, 9)
(8479, 26)
(45, 26)
pipeline created
(45,)
(48, 8)
48
45
(45, 2)
45
(45, 9)
(8524, 26)
(45, 26)
pipeline created
(45,)
(48, 8)
48
45
(45, 2)
45
(45, 9)
(8569, 26)
(45, 26)
pipeline created
(45,)
(49, 8)
49
46
(46, 2)
46
(46, 9)
(8615, 26)
(46, 26)
pipeline created
(46,)
(48, 8)
48
45
(45, 2)
45
(45, 9)
(8660, 26)
(45, 26)
pipeline created
(45,)
(48, 8)
48
45
(45, 2)
45
(45, 9)
(8705, 26)
(45, 26)
pipeline created
(45,)
(48, 8)
48
45
(45, 2)
45
(45, 9)
(8750, 26)
(45, 26)
pipeline created
(45,)
(49, 8)
49
46
(46, 2)
46
(46, 9)
(8796, 26)
(46, 26)
pipeline created
(46,)
(48, 8)
48
45
(45, 2)
45
(45, 9)
(8841, 26)
(45, 26)
pipeline created
(45,)
(1, 8)
1
0
(0, 2)
0
(0, 9)
(8841, 26)
(8841, 26)
pipeline created
(8841,)
(48, 8)
48
45
(45, 2)
45
(45, 9)
(8886, 26)
(45, 26)
pipeline created
(45,)
(48, 8)
48
45


In [461]:
env_sim[7].head()

Unnamed: 0,assets,confidenceValue
0,CHL,0.12546
1,C,-0.215846
2,DIS,0.027907
3,SAN,-0.214779
4,AUO,0.077045


In [463]:
len(env_sim)


54

In [None]:
# FULL TEST SET DATASET VERSION
preds = model.predict(X_test, num_iteration=model.best_iteration) 
date_vec = df['date'].loc[df['date'].isin(dates[test_range])]
u = df['universe'].loc[df['date'].isin(dates[test_range])]
actual = df[target].fillna(0).loc[df['date'].isin(dates[test_range])].values[:, 0]


custom_metric(date_vec, preds, actual, u)

In [None]:
lgb.plot_importance(model, )

In [None]:
importances = model.feature_importance()
importances

In [None]:
indices = np.argsort(importances)
top = indices[::-1][:20]

In [None]:
top

In [None]:
importances[14] / sum(importances)

In [None]:
importances = model.feature_importance()
indices = np.argsort(importances)
top = indices[::-1][:20]
for i in range(len(top)):
#     print(top[i])
    name = df_[feats].columns[top[i]]
    w = importances[top[i]] / sum(importances)
#     print(name)
    print('{} Feature: {}, weight: {}'.format(i+1, name, round(w, 3)))

In [None]:
# Can probably drop the vol bins...