In [246]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from tqdm import tqdm_notebook

In [247]:
# Load the training set and put each company's records in ID order so that
# "the next record" is simply the following row within the same company.
df = pd.read_csv('data/train.csv')
df = df.rename({'Company ': 'Company'}, axis=1)  # header carries a trailing space
df = df.sort_values(['Company', 'ID'])
# reset_index() (without drop) keeps the old row labels as a leading 'index'
# column; the positional slice below relies on that extra column.
df = df.reset_index()

# Every column after 'Company' except the last one is treated as a feature.
# NOTE(review): the last column is presumably the 'Price' target - confirm.
features = df.columns[4:-1].tolist()
next_records_count = 5

# For each feature add <feature>_1 .. <feature>_N holding the value taken from
# the i-th following record of the same company (NaN when there is no such
# record). A per-company shift does this in one vectorised pass per offset
# instead of a Python loop over every row / feature / offset combination.
by_company = df.groupby('Company', sort=False)
shifted_frames = [
    by_company[features].shift(-i).add_suffix('_{0}'.format(i))
    for i in range(1, next_records_count + 1)
]

# Keep the column layout <feature>_1 .. <feature>_N grouped per feature, since
# later cells slice columns by position.
next_columns = [
    '{0}_{1}'.format(feature, i)
    for feature in features
    for i in range(1, next_records_count + 1)
]
df = pd.concat([df] + shifted_frames, axis=1)[df.columns.tolist() + next_columns]

# Rows lacking a full window of following records (or with any other missing
# value) are discarded.
df = df.dropna()
df.head()

HBox(children=(IntProgress(value=0, description='Filling additional features', max=11997, style=ProgressStyle(…

Unnamed: 0,index,ID,Date,Company,SMA,EMA,WMA,DEMA,TEMA,TRIMA,...,PHASE_1,PHASE_2,PHASE_3,PHASE_4,PHASE_5,QUADRATURE_1,QUADRATURE_2,QUADRATURE_3,QUADRATURE_4,QUADRATURE_5
54,162,163,03-04-2148,ABC,26.936,26.916,27.1042,27.5128,27.2546,26.9737,...,0.1255,0.3447,0.6608,0.4866,0.0214,0.1485,0.5227,0.0459,-1.1171,-1.6814
55,165,166,06-04-2148,ABC,27.042,26.9313,27.1158,27.432,27.1422,27.074,...,0.3447,0.6608,0.4866,0.0214,-0.4524,0.5227,0.0459,-1.1171,-1.6814,-2.1545
56,168,169,07-04-2148,ABC,26.986,26.7274,26.8918,26.9703,26.5223,27.093,...,0.6608,0.4866,0.0214,-0.4524,-1.2657,0.0459,-1.1171,-1.6814,-2.1545,-2.1983
57,171,172,08-04-2148,ABC,26.749,26.4588,26.5762,26.4377,25.8552,27.0033,...,0.4866,0.0214,-0.4524,-1.2657,-1.9242,-1.1171,-1.6814,-2.1545,-2.1983,-0.4979
58,174,175,09-04-2148,ABC,26.58,26.2045,26.2691,25.9792,25.3354,26.8447,...,0.0214,-0.4524,-1.2657,-1.9242,-1.7462,-1.6814,-2.1545,-2.1983,-0.4979,1.9877


In [248]:
# Correlation of every column from position 3 onward with 'Price', strongest
# first. That slice starts at the string column 'Company'; recent pandas
# versions raise on non-numeric data in corr() instead of silently skipping it,
# so restrict the frame to numeric/bool columns explicitly.
numeric_columns = df[df.columns[3:]].select_dtypes(include=['number', 'bool'])
corr = numeric_columns.corr().sort_values('Price', ascending=False)['Price']
corr

Price                 1.000000
TEMA_2                0.999733
DEMA_3                0.999661
DEMA_2                0.999602
TEMA_3                0.999590
WMA_4                 0.999578
TRIMA_5               0.999557
WMA_5                 0.999513
WMA_3                 0.999498
DEMA_4                0.999480
TEMA_1                0.999444
SMA_5                 0.999429
Real Middle Band_5    0.999429
EMA_4                 0.999384
EMA_5                 0.999375
TRIMA_4               0.999297
EMA_3                 0.999280
Real Middle Band_4    0.999259
SMA_4                 0.999259
WMA_2                 0.999243
DEMA_1                0.999216
TEMA_4                0.999181
DEMA_5                0.999145
MIDPOINT_5            0.999112
MIDPRICE_5            0.999096
T3_5                  0.999095
MIDPRICE_4            0.999053
MIDPOINT_4            0.999041
EMA_2                 0.999026
KAMA_5                0.998982
                        ...   
MAC_Hist_5           -0.004746
BOP_3   

In [265]:
# Regressors: every column whose correlation with Price lies strictly between
# 0.98 and 1.0. The target itself is excluded by name as well, because relying
# only on its self-correlation comparing as exactly 1.0 is fragile under
# floating-point rounding and would let 'Price' predict itself.
model_vars = [
    name
    for name, value in corr.items()
    if name != 'Price' and 0.98 < value < 1.0
]

# Ordinary least squares on the selected columns; no constant term is added.
model = sm.OLS(df['Price'], df[model_vars]).fit()
model.summary()

0,1,2,3
Dep. Variable:,Price,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,1131000.0
Date:,"Thu, 04 Jul 2019",Prob (F-statistic):,0.0
Time:,04:16:32,Log-Likelihood:,-23068.0
No. Observations:,11820,AIC:,46300.0
Df Residuals:,11736,BIC:,46920.0
Df Model:,84,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
TEMA_2,-25.8741,222.594,-0.116,0.907,-462.195,410.447
DEMA_3,26.0234,15.639,1.664,0.096,-4.632,56.679
DEMA_2,22.8483,19.061,1.199,0.231,-14.514,60.211
TEMA_3,-95.4573,215.523,-0.443,0.658,-517.919,327.005
WMA_4,96.2771,490.087,0.196,0.844,-864.375,1056.929
TRIMA_5,-225.1470,495.076,-0.455,0.649,-1195.578,745.284
WMA_5,609.5775,458.960,1.328,0.184,-290.060,1509.215
WMA_3,-205.3959,503.283,-0.408,0.683,-1191.913,781.121
DEMA_4,-17.7895,18.700,-0.951,0.341,-54.445,18.866

0,1,2,3
Omnibus:,5367.966,Durbin-Watson:,1.951
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3027941.694
Skew:,0.793,Prob(JB):,0.0
Kurtosis:,81.394,Cond. No.,1.52e+17


In [266]:
# Kotlin property names, positionally aligned with `features`
# (full_names[k] is the property used for features[k]).
full_names = [
    'simpleMovingAverage', 'exponentialMovingAverage', 'waysAndMeansAdvances',
    'doubleExponentialMovingAverage', 'tripleExponentialMovingAverage',
    'triangularMovingAverage', 'kaufmansAdaptiveMovingAverage',
    'fAdaptiveMovingAverage', 'mesaAdaptiveMovingAverage', 't3MovingAverage',
    'movingAverageConvergenceDivergence',
    'movingAverageConvergenceDivergenceHistogram',
    'movingAverageConvergenceDivergenceSignal', 'movingAverageConvergence',
    'movingAverageConvergenceHistogram', 'movingAverageConvergenceSignal',
    'slowStochasticOscillatorSecondLine', 'slowStochasticOscillatorMainLine',
    'fastStochasticOscillatorSecondLine', 'fastStochasticOscillatorMainLine',
    'relativeStrengthIndex', 'fastStochasticSecondLine',
    'fastStochasticMainLine', 'willr', 'averageDirectionalIndex',
    'averageDirectionalIndexRating', 'absolutePriceOscillator',
    'percentagePriceOscillator', 'momentumIndicator', 'balanceOfPower',
    'commodityChannelIndex', 'chandeMomentumOscillator', 'rateOfChange',
    'rateOfChangeRating', 'aroonOscillatorDown', 'aroonOscillatorUp',
    'aroonOscillator', 'moneyFlowIndex',
    'tripleSmoothedExponentialMovingAverage',
    'ultimateOscillatorDefinitionAndStrategies', 'directionalMovementIndex',
    'minusDirectionalIndex', 'plusDirectionalIndex',
    'minusDirectionalMovementIndex', 'plusDirectionalMovementIndex',
    'relativeStrengthIndexLowerBand', 'relativeStrengthIndexMiddleBand',
    'relativeStrengthIndexHighBand', 'midPoint', 'midPrice',
    'parabolicStopAndReverse', 'trange', 'averageTrueRange',
    'normalizedAverageTrueRange', 'chaikinAccumulationDistributionLine',
    'accumulationDistributionOscillatorSC', 'onBalanceVolume', 'htTrendline',
    'leadSine', 'sine', 'trendMode', 'dcPeriod', 'htDcPhase', 'phase',
    'quadrature',
]

coefficients = model.params
term_names = list(coefficients.index)

# A term named '<feature>_<k>' that is not itself a base feature refers to the
# k-th following record; remember its (base feature, k) pair.
lagged_terms = {}
for term in term_names:
    if '_' in term and term not in features:
        base, offset = term.rsplit('_', 1)
        lagged_terms[term] = (base, int(offset))

indexes = [offset for _, offset in lagged_terms.values()]

# Guard clause of the generated code: without enough following records the
# generated function falls back to a single indicator.
if indexes:
    print('if (nextRecords.size < {0}) {{'.format(max(indexes)))
    print('    return record.tripleExponentialMovingAverage')
    print('}')
    print()

print('return listOf(')

last_position = len(term_names) - 1
for position, term in enumerate(term_names):
    weight = coefficients[term]
    separator = '' if position == last_position else ','

    if term in lagged_terms:
        base, offset = lagged_terms[term]
        # nextRecords is zero-based while the column suffix starts at 1.
        accessor = 'nextRecords[{0}].{1}'.format(offset - 1, full_names[features.index(base)])
    else:
        accessor = 'record.{0}'.format(full_names[features.index(term)])

    print('    {0} * {1}{2}'.format(accessor, weight, separator))

print(').sum()')

if (nextRecords.size < 5) {
    return record.tripleExponentialMovingAverage
}

return listOf(
    nextRecords[1].tripleExponentialMovingAverage * -25.874081922721075,
    nextRecords[2].doubleExponentialMovingAverage * 26.023394328161,
    nextRecords[1].doubleExponentialMovingAverage * 22.848290202324645,
    nextRecords[2].tripleExponentialMovingAverage * -95.45727981253958,
    nextRecords[3].waysAndMeansAdvances * 96.27714983847545,
    nextRecords[4].triangularMovingAverage * -225.1470452669446,
    nextRecords[4].waysAndMeansAdvances * 609.5774916782393,
    nextRecords[2].waysAndMeansAdvances * -205.39592365463704,
    nextRecords[3].doubleExponentialMovingAverage * -17.789450884784262,
    nextRecords[0].tripleExponentialMovingAverage * 104.35103886770264,
    nextRecords[4].simpleMovingAverage * 1139.0693028095557,
    nextRecords[4].relativeStrengthIndexMiddleBand * 1139.0693028445194,
    nextRecords[3].exponentialMovingAverage * 161.01774618517402,
    nextRecords[4].expon