In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import datetime
warnings.filterwarnings("ignore")

In [21]:
returns = pd.read_pickle("../Data/returns.pkl")
returns

Unnamed: 0_level_0,905270,921795,904261,905261,916328,923024,936365,902355,912215,929813,...,9660J1,69568X,543755,77463M,29235J,131745,69487D,68157P,9110RA,292703
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-02,,,,,,,,,,,...,,,,,,,,,,
1990-01-03,-0.835165,-0.475059,5.000000,-0.810537,0.361421,0.000000,-3.030303,0.000000,-0.431034,-1.315789,...,,,,,,,,,,
1990-01-04,-0.797872,-1.272872,2.380952,-1.634321,-0.180060,3.454545,-1.562500,0.530612,0.225108,0.000000,...,,,,,,,,,,
1990-01-05,-2.546917,1.531023,-0.372093,-1.661475,-1.079730,1.054482,-3.174603,0.487211,-0.656531,-1.333333,...,,,,,,,,,,
1990-01-08,2.017423,-0.873016,0.000000,2.534319,0.000000,0.000000,0.000000,1.010101,0.434783,3.405405,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-24,,,,,0.136600,,0.349877,,,,...,0.376702,4.400221,1.590198,-0.991004,1.682900,3.364703,2.271817,0.634962,4.859794,1.204016
2021-05-25,,,,,0.946372,,0.542355,,,,...,0.230947,-0.288569,0.449063,-1.661106,0.439588,-0.548765,0.185928,0.727530,1.884279,-0.498203
2021-05-26,,,,,-1.393581,,0.616491,,,,...,1.296083,2.388001,-0.166049,3.255613,2.317760,4.702784,0.673960,2.336242,-0.429666,-1.520393
2021-05-27,,,,,-0.907923,,0.102119,,,,...,0.056867,1.893011,-0.063971,-0.510347,1.044185,-0.275482,1.979237,1.876893,1.247655,2.369727


In [22]:
returns = returns.iloc[1:]

In [57]:
# dropping columns with all NaN
drop_columns = []

for col in returns.columns:
    if returns[col].isnull().all() == True:
        drop_columns.append(col)
        
returns.drop(columns=drop_columns, inplace=True)

In [58]:
def get_investable(stock, t_start, t_end):
    y = returns[[stock]].loc[t_start:t_end, :]
    y.columns = ['Outcome']
    y = y.sort_index(ascending = False)
    
    X = returns.loc[(t_start + pd.DateOffset(-1)):(t_end + pd.DateOffset(-1)), :]
    X = X.sort_index(ascending = False)
    
    df_full = pd.concat([X, y], axis = 1)
    df_full = df_full.sort_index(ascending = False)
    df_full['Outcome'] = df_full['Outcome'].shift(1)
    df_full = df_full.iloc[1:, :]
    
    investable_universe = []

    for col in df_full.columns:
        if ~df_full[col].isna().any():
            investable_universe.append(col)
            
    df_investable = df_full[investable_universe]
    
    return df_investable

In [59]:
# Start, split, end dates of stock
split_ratio = 0.9

def get_start_split_end(stock):
    df = returns[[stock]].dropna()
    split = int(split_ratio * len(df.index))
    return df.index[0], df.index[split], df.index[-1]

In [60]:
# Creates linear model for an individual stock
from sklearn.linear_model import LinearRegression

def linear_predict(stock):
    # train test split
    start, split, end = get_start_split_end(stock)
    investable = get_investable(stock, start, end)
    test = investable.loc[:split]
    train = investable.loc[split:]
    
    model = LinearRegression()
    model.fit(X=train.iloc[:, :-1], y=train["Outcome"])
    
    return model

In [61]:
# Predicts stock price at a given date
def pred_date(stock, date):
    start, split, end = get_start_split_end(stock)
    features = get_investable(stock, start, end).columns[:-1]
    feature_vals = returns.loc[date, features]
    if feature_vals.isna().any():
        return np.nan
    model = linear_predict(stock)
    prediction = model.predict([feature_vals])[0]
    return prediction

In [62]:
def pred_date_all(date):
    df = pd.DataFrame()
    for stock in returns.columns:
        df[stock] = [pred_date(stock, date)]
    df["date"] = [date + datetime.timedelta(days=1)]
    df.set_index("date", inplace=True)
    return df

In [73]:
def pipeline(date, num_stocks):
    df = pred_date_all(date)
    sorted_vals = returns.iloc[0].dropna().sort_values(ascending=False)
    return sorted_vals.head(num_stocks), sorted_vals.tail(num_stocks)

In [74]:
date = pd.to_datetime('2020-02-24')

portfolio = pipeline(date, 5)
portfolio

(719630    14.285714
 929302    12.008639
 519803    10.000000
 912131    10.000000
 906394     7.347166
 Name: 1990-01-03 00:00:00, dtype: float64,
 912377    -6.363636
 907615    -6.448839
 922843    -7.975460
 719618   -10.892917
 756210   -16.319444
 Name: 1990-01-03 00:00:00, dtype: float64)

In [80]:
def average_returns(top5, bot5):
    return top5.mean() - bot5.mean()

In [81]:
average_returns(portfolio[0], portfolio[1])

20.32836326834488

In [88]:
actual = returns.loc[date, :]
sorted_actual = actual.dropna().sort_values(ascending=False)
sorted_actual.head(5), sorted_actual.tail(5)

(544623    5.514078
 921264    4.735272
 546697    4.591105
 905047    4.347826
 326996    2.859657
 Name: 2020-02-24 00:00:00, dtype: float64,
 14863U    -9.375477
 755695    -9.426721
 27020T   -10.540037
 916532   -14.782609
 88874X   -18.554688
 Name: 2020-02-24 00:00:00, dtype: float64)

In [89]:
average_returns(sorted_actual.head(5), sorted_actual.tail(5))

16.945493901942818