In [1]:
# Import packages
import pandas as pd
import sklearn as sk
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import metrics
import math
import pickle
import joblib

In [2]:
# Read data
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
# The first CSV column becomes the index.
df_raw = pd.read_csv(
    'C:/Users/karen/PycharmProjects/ycng228-project/.data/_SP500_data_all.csv',
    index_col=0,
)
# Parse the index as timestamps and name it 'date'; the helper functions below
# look the index level up by that name.
df_raw.index = pd.to_datetime(df_raw.index).rename('date')
# Order the rows by date.
df_raw = df_raw.sort_index(axis=0)

In [3]:
# Preview the first five rows of the date-sorted raw frame.
df_raw.head(5)

Unnamed: 0_level_0,open,high,low,close,adjclose,volume,ticker
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1970-03-25,6.78125,6.9375,6.78125,6.875,0.261931,68400,ED
1970-03-25,0.341564,0.368313,0.340535,0.349794,0.158033,2041200,MCD
1970-03-25,8.08403,8.434241,8.054846,8.200767,1.285789,382912,IP
1970-03-25,1.753906,1.796875,1.753906,1.789063,0.158143,2720000,XOM
1970-03-25,15.750478,16.108988,15.750478,15.917782,3.698164,1303316,IBM


In [4]:

# One two-column frame per field, each keeping the 'ticker' label next to the
# value so it can later be pivoted to one column per ticker.
df_open = df_raw.loc[:, ['open', 'ticker']]
df_high = df_raw.loc[:, ['high', 'ticker']]
df_low = df_raw.loc[:, ['low', 'ticker']]
df_close = df_raw.loc[:, ['close', 'ticker']]
df_adjclose = df_raw.loc[:, ['adjclose', 'ticker']]
df_volume = df_raw.loc[:, ['volume', 'ticker']]


In [5]:
# Functions
def pivot_df(_df):
    """Reshape a long frame to wide form with one column per ticker.

    The existing index is kept as the row index and the values of the
    'ticker' column become column labels (via ``DataFrame.pivot``).

    Parameters
    ----------
    _df : pandas.DataFrame
        Long-format frame containing a 'ticker' column.

    Returns
    -------
    pandas.DataFrame
        The pivoted frame.
    """
    return _df.pivot(columns="ticker")


def diff_df(_df):
    """Return the row-to-row differences of ``_df``.

    Each row is replaced by its difference from the previous row, so the
    first row of the result is all missing values.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame to difference.

    Returns
    -------
    pandas.DataFrame
        Result of ``_df.diff()``; same shape as the input.
    """
    return _df.diff()


def trim_df(_df, _start, _end):
    """Return the rows of ``_df`` whose 'date' index level lies in [_start, _end].

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame whose index (or one of its levels) is named 'date'.
    _start, _end
        Inclusive lower and upper bounds, compared directly against the
        'date' values (for a datetime index, timestamps or date strings).

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of ``_df`` and only the in-range
        rows, in their original order. ``_df`` itself is left untouched.
    """
    # Look the level up once and filter with the boolean mask directly; an
    # intermediate list of matching dates followed by ``isin`` selects exactly
    # the same rows at the cost of an extra pass over the index.
    dates = _df.index.get_level_values('date')
    in_range = (dates >= _start) & (dates <= _end)
    return _df.loc[in_range]


def create_input_0(_df, _pivot_start, _pivot_end, _lag=30):
    """Build one model-input row per (pivot date, ticker) from a pivoted frame.

    For every date of ``_df`` inside [_pivot_start, _pivot_end], the last
    ``_lag + 2`` rows up to and including that date are differenced row to
    row, which leaves ``_lag + 1`` changes per ticker. The first ``_lag`` of
    them become the feature columns ``day1`` .. ``day<_lag>`` and the change
    on the pivot date itself becomes ``output``.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame indexed by 'date' with one column per ticker; the column index
        must carry a level named 'ticker'.
    _pivot_start, _pivot_end
        Inclusive bounds of the pivot dates, compared against the 'date' index.
    _lag : int, default 30
        Number of lagged changes used as features.

    Returns
    -------
    pandas.DataFrame
        Columns ``day1`` .. ``day<_lag>``, ``output``, ``ticker``, ``date``
        with a fresh integer index. When no date falls inside the pivot range
        an empty frame with those columns is returned.

    Raises
    ------
    ValueError
        If a pivot does not yield exactly ``_lag + 1`` differenced rows, i.e.
        there is too little history before it or rows were dropped because
        they contained missing values.
    """
    value_columns = ['day' + str(i) for i in range(1, _lag + 1)] + ['output']
    column_names = value_columns + ['ticker', 'date']

    dates = _df.index.get_level_values('date')
    pivots = dates[(dates >= _pivot_start) & (dates <= _pivot_end)]
    if len(pivots) == 0:
        return pd.DataFrame(columns=column_names)

    tickers = _df.columns.get_level_values('ticker').to_list()
    first_date = _df.index[0]

    # Collect the per-pivot frames and concatenate once at the end: growing a
    # frame with concat inside the loop copies all earlier rows on every pass,
    # and seeding it with an empty object-dtype frame can turn the numeric
    # columns into object dtype.
    frames = []
    for pivot in pivots:
        history = _df.loc[(dates >= first_date) & (dates <= pivot)]
        # _lag + 2 rows -> _lag + 1 differences once the leading NaN row is dropped.
        changes = history.tail(_lag + 2).diff().dropna()
        if len(changes) != _lag + 1:
            raise ValueError(
                'pivot {}: expected {} differenced rows but got {}'.format(
                    pivot, _lag + 1, len(changes)
                )
            )
        # Transpose so each ticker becomes one row of consecutive changes.
        frame = pd.DataFrame(changes.transpose().values, columns=value_columns)
        frame['ticker'] = tickers
        frame['date'] = pivot
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)


def create_input_1(_df, _pivot_start, _pivot_end, _lag=45):
    """Build one model-input row per (pivot date, ticker) using a 45-step lag by default.

    For every date of ``_df`` inside [_pivot_start, _pivot_end], the last
    ``_lag + 2`` rows up to and including that date are differenced row to
    row, which leaves ``_lag + 1`` changes per ticker. The first ``_lag`` of
    them become the feature columns ``day1`` .. ``day<_lag>`` and the change
    on the pivot date itself becomes ``output``.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame indexed by 'date' with one column per ticker; the column index
        must carry a level named 'ticker'.
    _pivot_start, _pivot_end
        Inclusive bounds of the pivot dates, compared against the 'date' index.
    _lag : int, default 45
        Number of lagged changes used as features.

    Returns
    -------
    pandas.DataFrame
        Columns ``day1`` .. ``day<_lag>``, ``output``, ``ticker``, ``date``
        with a fresh integer index. When no date falls inside the pivot range
        an empty frame with those columns is returned.

    Raises
    ------
    ValueError
        If a pivot does not yield exactly ``_lag + 1`` differenced rows, i.e.
        there is too little history before it or rows were dropped because
        they contained missing values.
    """
    value_columns = ['day' + str(i) for i in range(1, _lag + 1)] + ['output']
    column_names = value_columns + ['ticker', 'date']

    dates = _df.index.get_level_values('date')
    pivots = dates[(dates >= _pivot_start) & (dates <= _pivot_end)]
    if len(pivots) == 0:
        return pd.DataFrame(columns=column_names)

    tickers = _df.columns.get_level_values('ticker').to_list()
    first_date = _df.index[0]

    # Collect the per-pivot frames and concatenate once at the end: growing a
    # frame with concat inside the loop copies all earlier rows on every pass,
    # and seeding it with an empty object-dtype frame can turn the numeric
    # columns into object dtype.
    frames = []
    for pivot in pivots:
        history = _df.loc[(dates >= first_date) & (dates <= pivot)]
        # _lag + 2 rows -> _lag + 1 differences once the leading NaN row is dropped.
        changes = history.tail(_lag + 2).diff().dropna()
        if len(changes) != _lag + 1:
            raise ValueError(
                'pivot {}: expected {} differenced rows but got {}'.format(
                    pivot, _lag + 1, len(changes)
                )
            )
        # Transpose so each ticker becomes one row of consecutive changes.
        frame = pd.DataFrame(changes.transpose().values, columns=value_columns)
        frame['ticker'] = tickers
        frame['date'] = pivot
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)

def create_input_2(_df, _pivot_start, _pivot_end, _lag=60):
    """Build one model-input row per (pivot date, ticker) using a 60-step lag by default.

    For every date of ``_df`` inside [_pivot_start, _pivot_end], the last
    ``_lag + 2`` rows up to and including that date are differenced row to
    row, which leaves ``_lag + 1`` changes per ticker. The first ``_lag`` of
    them become the feature columns ``day1`` .. ``day<_lag>`` and the change
    on the pivot date itself becomes ``output``.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame indexed by 'date' with one column per ticker; the column index
        must carry a level named 'ticker'.
    _pivot_start, _pivot_end
        Inclusive bounds of the pivot dates, compared against the 'date' index.
    _lag : int, default 60
        Number of lagged changes used as features.

    Returns
    -------
    pandas.DataFrame
        Columns ``day1`` .. ``day<_lag>``, ``output``, ``ticker``, ``date``
        with a fresh integer index. When no date falls inside the pivot range
        an empty frame with those columns is returned.

    Raises
    ------
    ValueError
        If a pivot does not yield exactly ``_lag + 1`` differenced rows, i.e.
        there is too little history before it or rows were dropped because
        they contained missing values.
    """
    value_columns = ['day' + str(i) for i in range(1, _lag + 1)] + ['output']
    column_names = value_columns + ['ticker', 'date']

    dates = _df.index.get_level_values('date')
    pivots = dates[(dates >= _pivot_start) & (dates <= _pivot_end)]
    if len(pivots) == 0:
        return pd.DataFrame(columns=column_names)

    tickers = _df.columns.get_level_values('ticker').to_list()
    first_date = _df.index[0]

    # Collect the per-pivot frames and concatenate once at the end: growing a
    # frame with concat inside the loop copies all earlier rows on every pass,
    # and seeding it with an empty object-dtype frame can turn the numeric
    # columns into object dtype.
    frames = []
    for pivot in pivots:
        history = _df.loc[(dates >= first_date) & (dates <= pivot)]
        # _lag + 2 rows -> _lag + 1 differences once the leading NaN row is dropped.
        changes = history.tail(_lag + 2).diff().dropna()
        if len(changes) != _lag + 1:
            raise ValueError(
                'pivot {}: expected {} differenced rows but got {}'.format(
                    pivot, _lag + 1, len(changes)
                )
            )
        # Transpose so each ticker becomes one row of consecutive changes.
        frame = pd.DataFrame(changes.transpose().values, columns=value_columns)
        frame['ticker'] = tickers
        frame['date'] = pivot
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)


def create_input_3(_df, _pivot_start, _pivot_end, _lag=90):
    """Build one model-input row per (pivot date, ticker) using a 90-step lag by default.

    For every date of ``_df`` inside [_pivot_start, _pivot_end], the last
    ``_lag + 2`` rows up to and including that date are differenced row to
    row, which leaves ``_lag + 1`` changes per ticker. The first ``_lag`` of
    them become the feature columns ``day1`` .. ``day<_lag>`` and the change
    on the pivot date itself becomes ``output``.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame indexed by 'date' with one column per ticker; the column index
        must carry a level named 'ticker'.
    _pivot_start, _pivot_end
        Inclusive bounds of the pivot dates, compared against the 'date' index.
    _lag : int, default 90
        Number of lagged changes used as features.

    Returns
    -------
    pandas.DataFrame
        Columns ``day1`` .. ``day<_lag>``, ``output``, ``ticker``, ``date``
        with a fresh integer index. When no date falls inside the pivot range
        an empty frame with those columns is returned.

    Raises
    ------
    ValueError
        If a pivot does not yield exactly ``_lag + 1`` differenced rows, i.e.
        there is too little history before it or rows were dropped because
        they contained missing values.
    """
    value_columns = ['day' + str(i) for i in range(1, _lag + 1)] + ['output']
    column_names = value_columns + ['ticker', 'date']

    dates = _df.index.get_level_values('date')
    pivots = dates[(dates >= _pivot_start) & (dates <= _pivot_end)]
    if len(pivots) == 0:
        return pd.DataFrame(columns=column_names)

    tickers = _df.columns.get_level_values('ticker').to_list()
    first_date = _df.index[0]

    # Collect the per-pivot frames and concatenate once at the end: growing a
    # frame with concat inside the loop copies all earlier rows on every pass,
    # and seeding it with an empty object-dtype frame can turn the numeric
    # columns into object dtype.
    frames = []
    for pivot in pivots:
        history = _df.loc[(dates >= first_date) & (dates <= pivot)]
        # _lag + 2 rows -> _lag + 1 differences once the leading NaN row is dropped.
        changes = history.tail(_lag + 2).diff().dropna()
        if len(changes) != _lag + 1:
            raise ValueError(
                'pivot {}: expected {} differenced rows but got {}'.format(
                    pivot, _lag + 1, len(changes)
                )
            )
        # Transpose so each ticker becomes one row of consecutive changes.
        frame = pd.DataFrame(changes.transpose().values, columns=value_columns)
        frame['ticker'] = tickers
        frame['date'] = pivot
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)


def tokenize_output(_df, _column='output'):
    """Return a copy of ``_df`` with ``_column`` collapsed to a -1 / 1 label.

    Values ``<= 0`` become -1 and values ``> 0`` become 1. Missing values
    satisfy neither comparison and are left as they are.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame containing ``_column``.
    _column : str, default 'output'
        Name of the column to tokenize.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``_df`` itself is not modified.
    """
    # Work on a copy: a plain assignment would only alias the caller's frame,
    # so the .loc writes below would overwrite the caller's data as well.
    df_tmp = _df.copy()
    # Both masks are taken from the untouched values before anything is written.
    non_positive = df_tmp[_column] <= 0
    positive = df_tmp[_column] > 0
    df_tmp.loc[non_positive, _column] = -1
    df_tmp.loc[positive, _column] = 1
    return df_tmp


In [6]:
# CREATE formatted dfs
# Each field is restricted to 2018-01-01..2022-09-01, pivoted to one column per
# ticker, and then back-filled so a gap takes the next available observation.
# `.bfill()` replaces `fillna(method='bfill')`, which is deprecated in pandas 2.x.
# NOTE(review): back-filling copies later values into earlier dates, so features
# built on these frames can contain information from the future - confirm intended.
df_open_clean = pivot_df(trim_df(df_open,'2018-01-01','2022-09-01')).bfill()
df_high_clean = pivot_df(trim_df(df_high,'2018-01-01','2022-09-01')).bfill()
df_low_clean = pivot_df(trim_df(df_low,'2018-01-01','2022-09-01')).bfill()
df_close_clean = pivot_df(trim_df(df_close,'2018-01-01','2022-09-01')).bfill()
df_adjclose_clean = pivot_df(trim_df(df_adjclose,'2018-01-01','2022-09-01')).bfill()
df_volume_clean = pivot_df(trim_df(df_volume,'2018-01-01','2022-09-01')).bfill()

In [44]:
# Iteration 0 - Baseline

In [16]:
# Training set: closes from 2019-01-01 to 2022-05-01, one sample per ticker for
# every pivot date from 2019-04-01 onwards; 'output' is reduced to -1 / 1.
df_train_0 = tokenize_output(
    create_input_0(
        trim_df(df_close_clean, '2019-01-01', '2022-05-01'),
        _pivot_start='2019-04-01',
        _pivot_end='2022-05-01',
    )
)
# Validation set: closes from 2022-01-01 to 2022-09-01, pivots from 2022-06-01 onwards.
df_test_0 = tokenize_output(
    create_input_0(
        trim_df(df_close_clean, '2022-01-01', '2022-09-01'),
        _pivot_start='2022-06-01',
        _pivot_end='2022-09-01',
    )
)

In [17]:
# Baseline classifier: logistic regression fitted with the liblinear solver, all other settings left at their defaults.
lr_0 = LogisticRegression(solver='liblinear')

In [18]:
# Features are all columns except the trailing 'output', 'ticker' and 'date';
# the label is column 30, i.e. 'output', which follows day1..day30.
X_0 = df_train_0.iloc[:, :-3]
y_0 = df_train_0.iloc[:, 30]
X_valid_0 = df_test_0.iloc[:, :-3]
y_valid_0 = df_test_0.iloc[:, 30]
# Seeded random 67/33 split of the training frame.
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(
    X_0, y_0, test_size=0.33, random_state=42
)
# fit() returns the estimator itself, so the hold-out prediction can be chained.
y_pred_0 = lr_0.fit(X_train_0, y_train_0).predict(X_test_0)
y_predv_0 = lr_0.predict(X_valid_0)

In [19]:
# Balanced accuracy on the hold-out split and on the later validation window.
# NOTE(review): the figure labelled "Training" is measured on the 33% hold-out
# split (y_test_0 vs y_pred_0), not on the rows the model was fitted on.
bal_acc_train_0 = metrics.balanced_accuracy_score(y_true=y_test_0, y_pred=y_pred_0)
bal_acc_valid_0 = metrics.balanced_accuracy_score(y_true=y_valid_0, y_pred=y_predv_0)
print('Training balanced accuracy: ', bal_acc_train_0, sep='')
print('Validation balanced accuracy: ', bal_acc_valid_0, sep='')

Training balanced accuracy: 0.5073219375692692
Validation balanced accuracy: 0.5055112982376558


In [20]:
# Iteration 1 - Adjust lag from 30 to 45 days

In [23]:
# Training set with a 45-step lag: closes from 2019-01-01 to 2022-05-01, one
# sample per ticker for every pivot date from 2019-04-01 onwards.
df_train_1 = tokenize_output(
    create_input_1(
        trim_df(df_close_clean, '2019-01-01', '2022-05-01'),
        _pivot_start='2019-04-01',
        _pivot_end='2022-05-01',
    )
)
# Validation set: closes from 2022-01-01 to 2022-09-01, pivots from 2022-06-01 onwards.
df_test_1 = tokenize_output(
    create_input_1(
        trim_df(df_close_clean, '2022-01-01', '2022-09-01'),
        _pivot_start='2022-06-01',
        _pivot_end='2022-09-01',
    )
)

In [24]:
# Classifier for the 45-step-lag iteration; same configuration as the baseline model.
lr_1 = LogisticRegression(solver='liblinear')

In [25]:
# Features are all columns except the trailing 'output', 'ticker' and 'date';
# the label is column 45, i.e. 'output', which follows day1..day45.
X_1 = df_train_1.iloc[:, :-3]
y_1 = df_train_1.iloc[:, 45]
X_valid_1 = df_test_1.iloc[:, :-3]
y_valid_1 = df_test_1.iloc[:, 45]
# Seeded random 67/33 split of the training frame.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X_1, y_1, test_size=0.33, random_state=42
)
# The training labels are cast to integers before fitting.
y_train_1 = y_train_1.astype('int')
# fit() returns the estimator itself, so the hold-out prediction can be chained.
y_pred_1 = lr_1.fit(X_train_1, y_train_1).predict(X_test_1)
y_predv_1 = lr_1.predict(X_valid_1)

In [29]:
# Cast the hold-out labels, the validation predictions and the validation
# labels to integers so they share a type with the integer-trained model's output.
y_test_1, y_predv_1, y_valid_1 = (
    labels.astype('int') for labels in (y_test_1, y_predv_1, y_valid_1)
)

In [30]:
# Balanced accuracy on the hold-out split and on the later validation window.
# NOTE(review): the figure labelled "Training" is measured on the 33% hold-out
# split (y_test_1 vs y_pred_1), not on the rows the model was fitted on.
bal_acc_train_1 = metrics.balanced_accuracy_score(y_true=y_test_1, y_pred=y_pred_1)
bal_acc_valid_1 = metrics.balanced_accuracy_score(y_true=y_valid_1, y_pred=y_predv_1)
print('Training balanced accuracy: ', bal_acc_train_1, sep='')
print('Validation balanced accuracy: ', bal_acc_valid_1, sep='')

Training balanced accuracy: 0.5098055076313895
Validation balanced accuracy: 0.5115039569297077


In [None]:
# Iteration 2 - Adjust lag from 45 to 60 days

In [25]:
# Training set with a 60-step lag: closes from 2019-01-01 to 2022-05-01, one
# sample per ticker for every pivot date from 2019-04-01 onwards.
df_train_2 = tokenize_output(
    create_input_2(
        trim_df(df_close_clean, '2019-01-01', '2022-05-01'),
        _pivot_start='2019-04-01',
        _pivot_end='2022-05-01',
    )
)
# Validation set: closes from 2022-01-01 to 2022-09-01, pivots from 2022-06-01 onwards.
df_test_2 = tokenize_output(
    create_input_2(
        trim_df(df_close_clean, '2022-01-01', '2022-09-01'),
        _pivot_start='2022-06-01',
        _pivot_end='2022-09-01',
    )
)

In [26]:
# Classifier for the 60-step-lag iteration; same configuration as the baseline model.
lr_2 = LogisticRegression(solver='liblinear')

In [27]:
# Features are all columns except the trailing 'output', 'ticker' and 'date';
# the label is column 60, i.e. 'output', which follows day1..day60.
X_2 = df_train_2.iloc[:, :-3]
y_2 = df_train_2.iloc[:, 60]
X_valid_2 = df_test_2.iloc[:, :-3]
y_valid_2 = df_test_2.iloc[:, 60]
# Seeded random 67/33 split of the training frame.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_2, y_2, test_size=0.33, random_state=42
)
# fit() returns the estimator itself, so the hold-out prediction can be chained.
y_pred_2 = lr_2.fit(X_train_2, y_train_2).predict(X_test_2)
y_predv_2 = lr_2.predict(X_valid_2)

In [28]:
# Balanced accuracy on the hold-out split and on the later validation window.
# NOTE(review): the figure labelled "Training" is measured on the 33% hold-out
# split (y_test_2 vs y_pred_2), not on the rows the model was fitted on.
bal_acc_train_2 = metrics.balanced_accuracy_score(y_true=y_test_2, y_pred=y_pred_2)
bal_acc_valid_2 = metrics.balanced_accuracy_score(y_true=y_valid_2, y_pred=y_predv_2)
print('Training balanced accuracy: ', bal_acc_train_2, sep='')
print('Validation balanced accuracy: ', bal_acc_valid_2, sep='')

Training balanced accuracy: 0.5114626955272751
Validation balanced accuracy: 0.5062041650195774


In [324]:
# Iteration 3 - Adjust lag from 60 to 90 days

In [31]:
# Training set with a 90-step lag. The history window starts at 2018-11-01,
# earlier than in the other iterations - presumably so the first pivot
# (2019-04-01) has the 92 rows the window needs; confirm.
df_train_3 = tokenize_output(
    create_input_3(
        trim_df(df_close_clean, '2018-11-01', '2022-05-01'),
        _pivot_start='2019-04-01',
        _pivot_end='2022-05-01',
    )
)
# Validation set: closes from 2022-01-01 to 2022-09-01, pivots from 2022-06-01 onwards.
df_test_3 = tokenize_output(
    create_input_3(
        trim_df(df_close_clean, '2022-01-01', '2022-09-01'),
        _pivot_start='2022-06-01',
        _pivot_end='2022-09-01',
    )
)

In [32]:
# Classifier for the 90-step-lag iteration; same configuration as the baseline model.
lr_3 = LogisticRegression(solver='liblinear')

In [35]:
# Features are all columns except the trailing 'output', 'ticker' and 'date';
# the label is column 90, i.e. 'output', which follows day1..day90.
X_3 = df_train_3.iloc[:, :-3]
y_3 = df_train_3.iloc[:, 90]
X_valid_3 = df_test_3.iloc[:, :-3]
y_valid_3 = df_test_3.iloc[:, 90]
# Seeded random 67/33 split of the training frame.
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(
    X_3, y_3, test_size=0.33, random_state=42
)
# fit() returns the estimator itself, so the hold-out prediction can be chained.
y_pred_3 = lr_3.fit(X_train_3, y_train_3).predict(X_test_3)
y_predv_3 = lr_3.predict(X_valid_3)

In [36]:
# Balanced accuracy on the hold-out split and on the later validation window.
# NOTE(review): the figure labelled "Training" is measured on the 33% hold-out
# split (y_test_3 vs y_pred_3), not on the rows the model was fitted on.
bal_acc_train_3 = metrics.balanced_accuracy_score(y_true=y_test_3, y_pred=y_pred_3)
bal_acc_valid_3 = metrics.balanced_accuracy_score(y_true=y_valid_3, y_pred=y_predv_3)
print('Training balanced accuracy: ', bal_acc_train_3, sep='')
print('Validation balanced accuracy: ', bal_acc_valid_3, sep='')

Training balanced accuracy: 0.5150177052885482
Validation balanced accuracy: 0.4994879580548367


In [31]:
# Persist the 45-step-lag model (lr_1) to the working directory; per the outputs
# recorded in this notebook it has the highest validation balanced accuracy.
filename = 'mdl.sav'
joblib.dump(value=lr_1, filename=filename)

['mdl.sav']