In [94]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import datetime
warnings.filterwarnings("ignore")

In [54]:
returns = pd.read_pickle("../Data/returns.pkl")
returns.head()

Unnamed: 0_level_0,905270,921795,904261,905261,916328,923024,936365,902355,912215,929813,...,9660J1,69568X,543755,77463M,29235J,131745,69487D,68157P,9110RA,292703
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-02,,,,,,,,,,,...,,,,,,,,,,
1990-01-03,-0.835165,-0.475059,5.0,-0.810537,0.361421,0.0,-3.030303,0.0,-0.431034,-1.315789,...,,,,,,,,,,
1990-01-04,-0.797872,-1.272872,2.380952,-1.634321,-0.18006,3.454545,-1.5625,0.530612,0.225108,0.0,...,,,,,,,,,,
1990-01-05,-2.546917,1.531023,-0.372093,-1.661475,-1.07973,1.054482,-3.174603,0.487211,-0.656531,-1.333333,...,,,,,,,,,,
1990-01-08,2.017423,-0.873016,0.0,2.534319,0.0,0.0,0.0,1.010101,0.434783,3.405405,...,,,,,,,,,,


In [60]:
returns = returns.iloc[1:]

In [61]:
def get_investable(stock, t_start, t_end):
    y = returns[[stock]].loc[t_start:t_end, :]
    y.columns = ['Outcome']
    y = y.sort_index(ascending = False)
    
    X = returns.loc[(t_start + pd.DateOffset(-1)):(t_end + pd.DateOffset(-1)), :]
    X = X.sort_index(ascending = False)
    
    df_full = pd.concat([X, y], axis = 1)
    df_full = df_full.sort_index(ascending = False)
    df_full['Outcome'] = df_full['Outcome'].shift(1)
    df_full = df_full.iloc[1:, :]
    
    investable_universe = []

    for col in df_full.columns:
        if ~df_full[col].isna().any():
            investable_universe.append(col)
            
    df_investable = df_full[investable_universe]
    
    return df_investable

In [62]:
# Start, split, end dates of stock
split_ratio = 0.9

def get_start_split_end(stock):
    df = returns[[stock]].dropna()
    split = int(split_ratio * len(df.index))
    return df.index[0], df.index[split], df.index[-1]

In [63]:
# Creates linear model for an individual stock
from sklearn.linear_model import LinearRegression

def linear_predict(stock):
    # train test split
    start, split, end = get_start_split_end(stock)
    investable = get_investable(stock, start, end)
    test = investable.loc[:split]
    train = investable.loc[split:]
    
    model = LinearRegression()
    model.fit(X=train.iloc[:, :-1], y=train["Outcome"])
    
    return model

In [64]:
# Predicts stock price at a given date
def pred_date(stock, date):
    start, split, end = get_start_split_end(stock)
    features = get_investable(stock, start, end).columns[:-1]
    feature_vals = returns.loc[date, features]
    if feature_vals.isna().any():
        print("Missing feature values for " + stock + " at " + date.strftime("%Y-%m-%d"))
        return np.nan
    model = linear_predict(stock)
    prediction = model.predict([feature_vals])[0]
    return prediction

In [65]:
stock = '905270'
linear_model = linear_predict(stock)

date = pd.to_datetime('2020-02-24')

pred_date(stock, date)

Missing feature values for 905270 at 2020-02-24


nan

In [66]:
def pred_date_all(date):
    df = pd.DataFrame()
    for stock in returns.columns:
        df[stock] = [pred_date(stock, date)]
    df["date"] = [date]
    df.set_index("date", inplace=True)
    return df

In [68]:
# sample of stock predictions at 2020-02-04
pred = pred_date_all(date)
pred

Missing feature values for 905270 at 2020-02-24
Missing feature values for 921795 at 2020-02-24
Missing feature values for 904261 at 2020-02-24
Missing feature values for 905261 at 2020-02-24
Missing feature values for 923024 at 2020-02-24
Missing feature values for 902355 at 2020-02-24
Missing feature values for 912215 at 2020-02-24
Missing feature values for 929813 at 2020-02-24
Missing feature values for 921246 at 2020-02-24
Missing feature values for 923116 at 2020-02-24
Missing feature values for 952192 at 2020-02-24
Missing feature values for 912145 at 2020-02-24
Missing feature values for 905339 at 2020-02-24
Missing feature values for 951022 at 2020-02-24
Missing feature values for 906190 at 2020-02-24
Missing feature values for 912201 at 2020-02-24
Missing feature values for 921249 at 2020-02-24
Missing feature values for 906151 at 2020-02-24
Missing feature values for 921917 at 2020-02-24
Missing feature values for 945383 at 2020-02-24
Missing feature values for 905420 at 202

Missing feature values for 921365 at 2020-02-24
Missing feature values for 912808 at 2020-02-24
Missing feature values for 921601 at 2020-02-24
Missing feature values for 944997 at 2020-02-24
Missing feature values for 905056 at 2020-02-24
Missing feature values for 929799 at 2020-02-24
Missing feature values for 906684 at 2020-02-24
Missing feature values for 912812 at 2020-02-24
Missing feature values for 912176 at 2020-02-24
Missing feature values for 923020 at 2020-02-24
Missing feature values for 921284 at 2020-02-24
Missing feature values for 906545 at 2020-02-24
Missing feature values for 904848 at 2020-02-24
Missing feature values for 921698 at 2020-02-24
Missing feature values for 907736 at 2020-02-24
Missing feature values for 912811 at 2020-02-24
Missing feature values for 906147 at 2020-02-24
Missing feature values for 904525 at 2020-02-24
Missing feature values for 905353 at 2020-02-24
Missing feature values for 938892 at 2020-02-24
Missing feature values for 921699 at 202

Missing feature values for 905025 at 2020-02-24
Missing feature values for 905637 at 2020-02-24
Missing feature values for 906265 at 2020-02-24
Missing feature values for 901700 at 2020-02-24
Missing feature values for 902272 at 2020-02-24
Missing feature values for 944345 at 2020-02-24
Missing feature values for 921154 at 2020-02-24
Missing feature values for 905368 at 2020-02-24
Missing feature values for 904870 at 2020-02-24
Missing feature values for 923580 at 2020-02-24
Missing feature values for 501541 at 2020-02-24
Missing feature values for 945421 at 2020-02-24
Missing feature values for 905005 at 2020-02-24
Missing feature values for 921264 at 2020-02-24
Missing feature values for 921991 at 2020-02-24
Missing feature values for 982423 at 2020-02-24
Missing feature values for 921883 at 2020-02-24
Missing feature values for 543944 at 2020-02-24
Missing feature values for 982863 at 2020-02-24
Missing feature values for 905436 at 2020-02-24
Missing feature values for 916695 at 202

Missing feature values for 777266 at 2020-02-24
Missing feature values for 328426 at 2020-02-24
Missing feature values for 292530 at 2020-02-24
Missing feature values for 680006 at 2020-02-24
Missing feature values for 898530 at 2020-02-24
Missing feature values for 357344 at 2020-02-24
Missing feature values for 286019 at 2020-02-24
Missing feature values for 894076 at 2020-02-24
Missing feature values for 292731 at 2020-02-24
Missing feature values for 901657 at 2020-02-24
Missing feature values for 264518 at 2020-02-24
Missing feature values for 904844 at 2020-02-24
Missing feature values for 683363 at 2020-02-24
Missing feature values for 916532 at 2020-02-24
Missing feature values for 878991 at 2020-02-24
Missing feature values for 874835 at 2020-02-24
Missing feature values for 944703 at 2020-02-24
Missing feature values for 921276 at 2020-02-24
Missing feature values for 514950 at 2020-02-24
Missing feature values for 255956 at 2020-02-24
Missing feature values for 902280 at 202

Missing feature values for 325453 at 2020-02-24
Missing feature values for 87659R at 2020-02-24
Missing feature values for 322810 at 2020-02-24
Missing feature values for 87524M at 2020-02-24
Missing feature values for 357366 at 2020-02-24
Missing feature values for 324857 at 2020-02-24
Missing feature values for 8871XR at 2020-02-24
Missing feature values for 670134 at 2020-02-24
Missing feature values for 88874X at 2020-02-24
Missing feature values for 879439 at 2020-02-24
Missing feature values for 298514 at 2020-02-24
Missing feature values for 54060C at 2020-02-24
Missing feature values for 95313K at 2020-02-24
Missing feature values for 9595FU at 2020-02-24
Missing feature values for 740800 at 2020-02-24
Missing feature values for 29739V at 2020-02-24
Missing feature values for 8729N1 at 2020-02-24
Missing feature values for 14846V at 2020-02-24
Missing feature values for 670314 at 2020-02-24
Missing feature values for 9113PQ at 2020-02-24
Missing feature values for 131402 at 202

Unnamed: 0_level_0,905270,921795,904261,905261,916328,923024,936365,902355,912215,929813,...,9660J1,69568X,543755,77463M,29235J,131745,69487D,68157P,9110RA,292703
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-02-24,,,,,0.35385,,0.743213,,,,...,,-0.649641,0.311411,-5.07516,0.090043,0.393879,-1.597388,-2.435683,-7.891228,0.674937


In [85]:
# bottom 20 are na
returns = returns.iloc[:-20]
returns

Unnamed: 0_level_0,905270,921795,904261,905261,916328,923024,936365,902355,912215,929813,...,9660J1,69568X,543755,77463M,29235J,131745,69487D,68157P,9110RA,292703
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-03,-0.835165,-0.475059,5.000000,-0.810537,0.361421,0.000000,-3.030303,0.000000,-0.431034,-1.315789,...,,,,,,,,,,
1990-01-04,-0.797872,-1.272872,2.380952,-1.634321,-0.180060,3.454545,-1.562500,0.530612,0.225108,0.000000,...,,,,,,,,,,
1990-01-05,-2.546917,1.531023,-0.372093,-1.661475,-1.079730,1.054482,-3.174603,0.487211,-0.656531,-1.333333,...,,,,,,,,,,
1990-01-08,2.017423,-0.873016,0.000000,2.534319,0.000000,0.000000,0.000000,1.010101,0.434783,3.405405,...,,,,,,,,,,
1990-01-09,-1.123596,-2.962370,-0.404606,-1.098524,-0.364707,-1.043478,1.639344,0.000000,0.432900,-3.293257,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-04-26,,,,,-0.592004,,3.202030,,,,...,,1.206485,1.914223,3.401809,2.295228,0.601738,1.716610,0.072600,1.295174,0.628043
2021-04-27,,,,,-0.717899,,-0.234165,,,,...,,-4.532647,-0.071327,-0.897770,-1.162940,2.625166,-0.790167,0.281089,4.939163,-0.615206
2021-04-28,,,,,-0.205423,,-1.396550,,,,...,,-1.467222,-0.713776,-14.137931,-1.942857,0.215866,-2.335300,-4.699352,0.815565,0.128570
2021-04-29,,,,,-0.362289,,-0.130921,,,,...,,-2.505746,0.239636,-2.389218,-1.697897,-1.733980,1.062170,6.648534,-3.050682,-0.277748


In [92]:
# filtering for currently listed
listed_stocks = returns.iloc[-1].dropna().index
listed_stocks

Index(['916328', '936365', '905271', '905113', '905802', '905425', '906156',
       '916305', '992816', '921093',
       ...
       '311917', '69568X', '543755', '77463M', '29235J', '131745', '69487D',
       '68157P', '9110RA', '292703'],
      dtype='object', length=505)

In [93]:
returns[listed_stocks]

Unnamed: 0_level_0,916328,936365,905271,905113,905802,905425,906156,916305,992816,921093,...,311917,69568X,543755,77463M,29235J,131745,69487D,68157P,9110RA,292703
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-03,0.361421,-3.030303,0.000000,1.487518,-1.289070,-0.377358,1.076063,-0.118535,0.661455,0.000000,...,,,,,,,,,,
1990-01-04,-0.180060,-1.562500,-1.263035,0.162656,-1.827262,-1.515152,-1.064608,-1.423505,0.328554,-0.543319,...,,,,,,,,,,
1990-01-05,-1.079730,-3.174603,1.279191,-0.975557,-1.064040,-2.307692,-1.074944,-3.008375,0.357249,-4.371475,...,,,,,,,,,,
1990-01-08,0.000000,0.000000,1.009554,0.000000,1.881935,-0.393701,-0.724416,-0.620337,0.652625,1.142519,...,,,,,,,,,,
1990-01-09,-0.364707,1.639344,-0.999463,-0.985168,-1.055618,-0.395257,-2.190247,-0.624280,-1.002063,0.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-04-26,-0.592004,3.202030,-0.041371,0.813517,1.278050,-0.114025,4.164068,0.401352,0.297796,0.959434,...,-1.496913,1.206485,1.914223,3.401809,2.295228,0.601738,1.716610,0.072600,1.295174,0.628043
2021-04-27,-0.717899,-0.234165,-0.134365,-0.589696,3.374908,-0.456621,0.013303,1.914580,-0.244952,3.067689,...,2.042323,-4.532647,-0.071327,-0.897770,-1.162940,2.625166,-0.790167,0.281089,4.939163,-0.615206
2021-04-28,-0.205423,-1.396550,-0.068981,0.280987,7.792761,-0.538991,1.449854,-0.474814,-0.602723,2.070527,...,-1.517679,-1.467222,-0.713776,-14.137931,-1.942857,0.215866,-2.335300,-4.699352,0.815565,0.128570
2021-04-29,-0.362289,-0.130921,0.935771,0.062267,1.053463,1.568085,1.370132,1.514209,-0.074862,0.729002,...,1.483301,-2.505746,0.239636,-2.389218,-1.697897,-1.733980,1.062170,6.648534,-3.050682,-0.277748


In [117]:
def pred_next_day():
    df = returns[listed_stocks]
    last_day = df.index[-1]
    next_day = last_day + datetime.timedelta(days=1)
    new_df = pd.DataFrame()
    for stock in listed_stocks:
        new_df[stock] = [pred_date(stock, last_day)]
    new_df["date"] = [next_day]
    new_df.set_index("date", inplace=True)
    return new_df

In [118]:
pred_tomorrow = pred_next_day()
pred_tomorrow

Unnamed: 0_level_0,916328,936365,905271,905113,905802,905425,906156,916305,992816,921093,...,311917,69568X,543755,77463M,29235J,131745,69487D,68157P,9110RA,292703
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-05-01,0.302648,-2.296877,-0.1897,-0.888425,-0.579806,0.031526,-0.426443,-1.033946,-0.561069,-0.146903,...,-1.380851,0.113121,-0.854038,6.041106,-0.904961,-1.283908,0.221403,-0.284755,2.469163,0.138124


In [138]:
def get_models():
    models = {}
    for stock in listed_stocks:
        start, split, end = get_start_split_end(stock)
        investable = get_investable(stock, start, end)
        model = LinearRegression()
        model.fit(X=investable.iloc[:, :-1], y=investable["Outcome"])
        models[stock] = model
    return models

In [139]:
models = get_models()

In [165]:
def get_features():
    features = {}
    for stock in listed_stocks:
        start, split, end = get_start_split_end(stock)
        feature_list = get_investable(stock, start, end).columns[:-1]
        features[stock] = feature_list
    return features

In [166]:
features = get_features()

In [181]:
def pred_next_days(days):
    df = returns[listed_stocks]
    for day in range(days):
        last_day = df.index[-1]
        new_df = pd.DataFrame()
        for stock in listed_stocks:
            model = models[stock]
            feature_list = features[stock]
            feature_vals = df.loc[last_day, feature_list]
            new_df[stock] = [model.predict([feature_vals])[0]]
        next_day = last_day + datetime.timedelta(days=1)
        new_df["date"] = [next_day]
        new_df.set_index("date", inplace=True)
        df = pd.concat([df, new_df])
    return df

In [184]:
next_30_days = pred_next_days(30)
next_30_days

Unnamed: 0_level_0,916328,936365,905271,905113,905802,905425,906156,916305,992816,921093,...,311917,69568X,543755,77463M,29235J,131745,69487D,68157P,9110RA,292703
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-03,0.361421,-3.030303,0.000000,1.487518,-1.289070,-0.377358,1.076063,-0.118535,0.661455,0.000000,...,,,,,,,,,,
1990-01-04,-0.180060,-1.562500,-1.263035,0.162656,-1.827262,-1.515152,-1.064608,-1.423505,0.328554,-0.543319,...,,,,,,,,,,
1990-01-05,-1.079730,-3.174603,1.279191,-0.975557,-1.064040,-2.307692,-1.074944,-3.008375,0.357249,-4.371475,...,,,,,,,,,,
1990-01-08,0.000000,0.000000,1.009554,0.000000,1.881935,-0.393701,-0.724416,-0.620337,0.652625,1.142519,...,,,,,,,,,,
1990-01-09,-0.364707,1.639344,-0.999463,-0.985168,-1.055618,-0.395257,-2.190247,-0.624280,-1.002063,0.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-26,0.056318,0.114489,0.056193,0.039600,0.047228,0.022743,0.060699,0.036289,0.114874,0.043449,...,0.127063,0.297819,0.101138,0.334805,0.14703,0.158502,0.14856,0.161704,0.286238,0.083639
2021-05-27,0.056318,0.114489,0.056193,0.039600,0.047228,0.022743,0.060699,0.036289,0.114874,0.043449,...,0.127063,0.297819,0.101138,0.334805,0.14703,0.158502,0.14856,0.161704,0.286238,0.083639
2021-05-28,0.056318,0.114489,0.056193,0.039600,0.047228,0.022743,0.060699,0.036289,0.114874,0.043449,...,0.127063,0.297819,0.101138,0.334805,0.14703,0.158502,0.14856,0.161704,0.286238,0.083639
2021-05-29,0.056318,0.114489,0.056193,0.039600,0.047228,0.022743,0.060699,0.036289,0.114874,0.043449,...,0.127063,0.297819,0.101138,0.334805,0.14703,0.158502,0.14856,0.161704,0.286238,0.083639


In [190]:
sorted_returns = next_30_days.iloc[-1].sort_values(ascending=False)
for i in range(len(sorted_returns)):
    print(str(i + 1) + ". " + sorted_returns.index[i] + ": " + str(sorted_returns[i]))

1. 77463M: 0.33480466796655484
2. 69568X: 0.29781897798185475
3. 9434UH: 0.29584347521715715
4. 9110RA: 0.28623837912965466
5. 8858FM: 0.2409652001878363
6. 9801CK: 0.20623121795004412
7. 9406MA: 0.20169943844295973
8. 15303X: 0.20133735289316668
9. 891399: 0.1995103537340624
10. 512785: 0.19438199414513774
11. 30241D: 0.18724618842975294
12. 694405: 0.18308891442317113
13. 13466Q: 0.17514021461409493
14. 683199: 0.16895398993222915
15. 8857RL: 0.166919456366307
16. 28484K: 0.16486805163798063
17. 68157P: 0.1617035765798421
18. 9110QF: 0.16062934977422486
19. 292538: 0.15971113677899768
20. 131745: 0.15850243045650197
21. 699628: 0.1573080765130379
22. 699786: 0.15619710714834156
23. 298957: 0.1558572645964023
24. 32451J: 0.1518633929989897
25. 9269TG: 0.15180206967192866
26. 923602: 0.14984242848082105
27. 32199R: 0.14858331975439681
28. 69487D: 0.14856034207541094
29. 9664FX: 0.14797271700214198
30. 29235J: 0.14702956407533335
31. 89261N: 0.1469857804357703
32. 50806E: 0.145565579469