In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the returns panel; the preview below shows a date index and one column per stock identifier.
# NOTE(review): values look like daily percentage returns — confirm units with the data owner.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
returns = pd.read_pickle("../Data/returns.pkl")
returns.head()

Unnamed: 0_level_0,905270,921795,904261,905261,916328,923024,936365,902355,912215,929813,...,9660J1,69568X,543755,77463M,29235J,131745,69487D,68157P,9110RA,292703
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-02,,,,,,,,,,,...,,,,,,,,,,
1990-01-03,-0.835165,-0.475059,5.0,-0.810537,0.361421,0.0,-3.030303,0.0,-0.431034,-1.315789,...,,,,,,,,,,
1990-01-04,-0.797872,-1.272872,2.380952,-1.634321,-0.18006,3.454545,-1.5625,0.530612,0.225108,0.0,...,,,,,,,,,,
1990-01-05,-2.546917,1.531023,-0.372093,-1.661475,-1.07973,1.054482,-3.174603,0.487211,-0.656531,-1.333333,...,,,,,,,,,,
1990-01-08,2.017423,-0.873016,0.0,2.534319,0.0,0.0,0.0,1.010101,0.434783,3.405405,...,,,,,,,,,,


In [4]:
# Discard the first row of the panel (the preview above shows it holding only NaN).
# Caution: this rebinds `returns`, so re-running the cell removes one more row each time.
returns = returns.iloc[1:]

In [5]:
def get_investable(stock, t_start, t_end):
    """Build the lagged feature/outcome table for ``stock``.

    Every column of the global ``returns`` frame is a candidate feature. The
    return of ``stock`` itself is attached as an ``'Outcome'`` column and moved
    one row back in time, so each row pairs the returns on one date with the
    outcome of the next later row. Columns with any missing value in the
    resulting window are discarded.

    Parameters
    ----------
    stock : column label of ``returns``
        The stock whose return is the prediction target.
    t_start, t_end : pd.Timestamp
        Inclusive label bounds of the outcome window. The feature window is the
        same range moved back by one calendar day (``pd.DateOffset(-1)``).

    Returns
    -------
    pd.DataFrame
        Rows ordered newest-first. Contains only the columns without missing
        values; ``'Outcome'`` is the last column when it survives that filter.
    """
    outcome = returns.loc[t_start:t_end, [stock]].rename(columns={stock: 'Outcome'})
    features = returns.loc[(t_start + pd.DateOffset(-1)):(t_end + pd.DateOffset(-1)), :]

    # A single newest-first sort after the join is sufficient; sorting the two
    # inputs beforehand does not change the final order.
    df_full = pd.concat([features, outcome], axis=1).sort_index(ascending=False)

    # In newest-first order, shift(1) pulls each outcome down onto the row of
    # the preceding date. The top (newest) row is left without an outcome and
    # is dropped.
    df_full['Outcome'] = df_full['Outcome'].shift(1)
    df_full = df_full.iloc[1:, :]

    # Keep only fully observed columns. The previous `~series.isna().any()`
    # test relied on `.any()` returning a NumPy bool: on a plain Python bool
    # `~` is bitwise inversion (-1 / -2, both truthy), which would keep every
    # column regardless of missing data.
    df_investable = df_full.dropna(axis=1, how='any')

    return df_investable

In [6]:
# First, split and last observation dates of a stock
split_ratio = 0.9  # fraction of a stock's observed dates that precede the split date

def get_start_split_end(stock):
    """Return ``(start, split, end)`` dates for ``stock``.

    Only dates on which ``stock`` has a non-missing value in the global
    ``returns`` frame are considered. ``start`` and ``end`` are the first and
    last of those dates; ``split`` is the date at position
    ``int(split_ratio * n)`` among them, where ``n`` is their count.
    """
    observed_dates = returns[[stock]].dropna().index
    split_pos = int(split_ratio * len(observed_dates))
    first_date = observed_dates[0]
    split_date = observed_dates[split_pos]
    last_date = observed_dates[-1]
    return first_date, split_date, last_date

In [41]:
# Creates linear model for an individual stock
from sklearn.linear_model import LinearRegression

def linear_predict(stock):
    """Fit a linear regression for ``stock`` on the older part of its history.

    The table from ``get_investable`` is ordered newest-first, so the label
    slice ``.loc[split:]`` keeps the rows dated ``split`` and earlier. Those
    rows are the training set; the newer rows are left out of the fit.

    Parameters
    ----------
    stock : column label of ``returns``

    Returns
    -------
    sklearn.linear_model.LinearRegression
        Model fitted with every non-``'Outcome'`` column as a feature and
        ``'Outcome'`` as the target.

    Raises
    ------
    KeyError
        If the ``'Outcome'`` column is absent from the investable table.
    """
    start, split, end = get_start_split_end(stock)
    investable = get_investable(stock, start, end)
    train = investable.loc[split:]

    # Select features by name rather than by position so the fit does not
    # depend on 'Outcome' being the last column.
    model = LinearRegression()
    model.fit(X=train.drop(columns="Outcome"), y=train["Outcome"])

    return model

In [47]:
# Predicts a stock's return from the feature values observed on a given date
def pred_date(stock, date):
    """Predict the outcome for ``stock`` from the feature row at ``date``.

    The features are the columns kept by ``get_investable`` for this stock
    (all but its last column). Their values at ``date`` are read from the
    global ``returns`` frame and passed to the model from ``linear_predict``.

    Parameters
    ----------
    stock : str
        Column label of ``returns``.
    date : pd.Timestamp
        Row label of ``returns`` to take the feature values from.

    Returns
    -------
    float
        The model's prediction, or ``np.nan`` (after printing a notice) when
        any feature value is missing on ``date``.
    """
    start, _, end = get_start_split_end(stock)
    features = get_investable(stock, start, end).columns[:-1]
    feature_vals = returns.loc[date, features]
    if feature_vals.isna().any():
        print("Missing feature values for " + stock + " at " + date.strftime("%Y-%m-%d"))
        return np.nan
    # NOTE(review): linear_predict rebuilds the same investable table
    # internally, so that work is done twice per call.
    model = linear_predict(stock)
    # Pass a one-row DataFrame rather than a list holding a Series, so the
    # column labels reach the model together with the values.
    feature_row = feature_vals.to_frame().T
    prediction = model.predict(feature_row)[0]
    return prediction

In [48]:
# Example: fit the model for one stock and predict from one date's features.
stock = '905270'
# NOTE(review): `linear_model` is not used by any later cell shown here; pred_date fits its own model.
linear_model = linear_predict(stock)

# Parsed to a Timestamp because pred_date calls date.strftime when reporting missing features.
date = pd.to_datetime('2020-02-24')

pred_date(stock, date)

Missing feature values for 905270 at 2020-02-24


nan

In [62]:
def pred_date_all(date, stocks=None):
    """Collect ``pred_date`` results for many stocks into a one-row frame.

    Parameters
    ----------
    date : pd.Timestamp
        Date handed to ``pred_date`` for every stock.
    stocks : iterable of column labels, optional
        Stocks to predict. Defaults to every column of the global ``returns``
        frame. Pass a subset (e.g. ``returns.columns[:20]``) for a quick sample.

    Returns
    -------
    pd.DataFrame
        One row indexed by ``date`` (index name ``"date"``) and one column per
        stock, holding the prediction or NaN.
    """
    if stocks is None:
        stocks = returns.columns
    # Gather all results first and build the frame once, instead of inserting
    # one column at a time into an empty DataFrame.
    predictions = {stock: pred_date(stock, date) for stock in stocks}
    return pd.DataFrame(predictions, index=pd.Index([date], name="date"))

In [61]:
# predictions at `date` (set to 2020-02-24 in the earlier cell)
# NOTE(review): pred_date_all as defined above loops over every column of `returns`; the saved
# output with only 20 stocks presumably came from an earlier version of the function — confirm.
pred_date_all(date)

Missing feature values for 905270 at 2020-02-24
Missing feature values for 921795 at 2020-02-24
Missing feature values for 904261 at 2020-02-24
Missing feature values for 905261 at 2020-02-24
Missing feature values for 923024 at 2020-02-24
Missing feature values for 902355 at 2020-02-24
Missing feature values for 912215 at 2020-02-24
Missing feature values for 929813 at 2020-02-24
Missing feature values for 921246 at 2020-02-24
Missing feature values for 923116 at 2020-02-24
Missing feature values for 952192 at 2020-02-24
Missing feature values for 912145 at 2020-02-24
Missing feature values for 905339 at 2020-02-24
Missing feature values for 951022 at 2020-02-24
Missing feature values for 906190 at 2020-02-24


Unnamed: 0_level_0,905270,921795,904261,905261,916328,923024,936365,902355,912215,929813,905271,921246,923116,952192,905113,912145,905339,951022,905802,906190
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2020-02-24,,,,,0.35385,,0.743213,,,,-0.187696,,,,0.742668,,,,0.833675,
