In [1]:
#import necessary libraries
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [2]:
# Load the raw stock data from the CSV file into a DataFrame and preview
# its first ten rows.
df = pd.read_csv(filepath_or_buffer='stock_data.csv')
print(df.head(n=10))

         date   open   high     low  close    volume Name   7_day_ma  \
0  2013-02-08  45.07  45.35  45.000  45.08   1824755    A        NaN   
1  2013-02-11  45.17  45.18  44.450  44.60   2915405    A        NaN   
2  2013-02-12  44.81  44.95  44.500  44.62   2373731    A        NaN   
3  2013-02-13  44.81  45.24  44.680  44.75   2052338    A        NaN   
4  2013-02-14  44.72  44.78  44.360  44.58   3826245    A        NaN   
5  2013-02-15  43.48  44.24  42.210  42.25  14657315    A        NaN   
6  2013-02-19  42.21  43.12  42.210  43.01   4116141    A  44.127143   
7  2013-02-20  42.84  42.85  42.225  42.24   3873183    A  43.721429   
8  2013-02-21  42.14  42.14  41.470  41.63   3415149    A  43.297143   
9  2013-02-22  41.83  42.07  41.580  41.80   3354862    A  42.894286   

   15_day_ma  30_day_ma  daily_returns  daily_volatility  target  
0        NaN        NaN            NaN               NaN       0  
1        NaN        NaN      -0.010648               NaN       0  
2     

In [3]:
# Remove the 'target' column; it is not needed for this analysis.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises a KeyError.
df = df.drop(columns='target', errors='ignore')
print(df.head(10))

         date   open   high     low  close    volume Name   7_day_ma  \
0  2013-02-08  45.07  45.35  45.000  45.08   1824755    A        NaN   
1  2013-02-11  45.17  45.18  44.450  44.60   2915405    A        NaN   
2  2013-02-12  44.81  44.95  44.500  44.62   2373731    A        NaN   
3  2013-02-13  44.81  45.24  44.680  44.75   2052338    A        NaN   
4  2013-02-14  44.72  44.78  44.360  44.58   3826245    A        NaN   
5  2013-02-15  43.48  44.24  42.210  42.25  14657315    A        NaN   
6  2013-02-19  42.21  43.12  42.210  43.01   4116141    A  44.127143   
7  2013-02-20  42.84  42.85  42.225  42.24   3873183    A  43.721429   
8  2013-02-21  42.14  42.14  41.470  41.63   3415149    A  43.297143   
9  2013-02-22  41.83  42.07  41.580  41.80   3354862    A  42.894286   

   15_day_ma  30_day_ma  daily_returns  daily_volatility  
0        NaN        NaN            NaN               NaN  
1        NaN        NaN      -0.010648               NaN  
2        NaN        NaN       

In [4]:
# Parse the 'date' column into datetimes, then split it into separate
# 'year', 'month' and 'day' columns. Keyword arguments to assign() are
# evaluated in order, so the later lambdas see the already-parsed 'date'.
df = df.assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    year=lambda frame: frame['date'].dt.year,
    month=lambda frame: frame['date'].dt.month,
    day=lambda frame: frame['date'].dt.day,
)

print(df.head(10))

        date   open   high     low  close    volume Name   7_day_ma  \
0 2013-02-08  45.07  45.35  45.000  45.08   1824755    A        NaN   
1 2013-02-11  45.17  45.18  44.450  44.60   2915405    A        NaN   
2 2013-02-12  44.81  44.95  44.500  44.62   2373731    A        NaN   
3 2013-02-13  44.81  45.24  44.680  44.75   2052338    A        NaN   
4 2013-02-14  44.72  44.78  44.360  44.58   3826245    A        NaN   
5 2013-02-15  43.48  44.24  42.210  42.25  14657315    A        NaN   
6 2013-02-19  42.21  43.12  42.210  43.01   4116141    A  44.127143   
7 2013-02-20  42.84  42.85  42.225  42.24   3873183    A  43.721429   
8 2013-02-21  42.14  42.14  41.470  41.63   3415149    A  43.297143   
9 2013-02-22  41.83  42.07  41.580  41.80   3354862    A  42.894286   

   15_day_ma  30_day_ma  daily_returns  daily_volatility  year  month  day  
0        NaN        NaN            NaN               NaN  2013      2    8  
1        NaN        NaN      -0.010648               NaN  2013  

In [5]:
# Remember every distinct company ticker before one-hot encoding replaces
# the 'Name' column.
unique_companies = pd.unique(df['Name'])

# One-hot encode the ticker so each company gets its own 'Name_<ticker>'
# indicator column.
df = pd.get_dummies(data=df, columns=['Name'])

print(df.head(10))

        date   open   high     low  close    volume   7_day_ma  15_day_ma  \
0 2013-02-08  45.07  45.35  45.000  45.08   1824755        NaN        NaN   
1 2013-02-11  45.17  45.18  44.450  44.60   2915405        NaN        NaN   
2 2013-02-12  44.81  44.95  44.500  44.62   2373731        NaN        NaN   
3 2013-02-13  44.81  45.24  44.680  44.75   2052338        NaN        NaN   
4 2013-02-14  44.72  44.78  44.360  44.58   3826245        NaN        NaN   
5 2013-02-15  43.48  44.24  42.210  42.25  14657315        NaN        NaN   
6 2013-02-19  42.21  43.12  42.210  43.01   4116141  44.127143        NaN   
7 2013-02-20  42.84  42.85  42.225  42.24   3873183  43.721429        NaN   
8 2013-02-21  42.14  42.14  41.470  41.63   3415149  43.297143        NaN   
9 2013-02-22  41.83  42.07  41.580  41.80   3354862  42.894286        NaN   

   30_day_ma  daily_returns  ...  Name_XL  Name_XLNX  Name_XOM  Name_XRAY  \
0        NaN            NaN  ...        0          0         0          0  

In [6]:
# Build lagged copies of the same-day columns, computed separately for each
# company so that values never leak from one ticker into another.
lag_columns = ['open', 'high', 'low', 'volume', '7_day_ma', '15_day_ma', '30_day_ma', 'daily_returns', 'daily_volatility']
lag_days = [1, 3, 5, 7, 15, 30]  # how many rows (trading days) to look back

# Each row belongs to the company whose one-hot column is set; argmax over the
# indicator columns recovers that company as an integer code.
ticker_columns = [f'Name_{ticker}' for ticker in unique_companies]
ticker_codes = df[ticker_columns].to_numpy().argmax(axis=1)

# Shift on a date-ordered working copy so a lag of n always means "n rows
# earlier in time" for that company, even if the file is not stored in
# chronological order. The stable sort keeps the file order for equal dates.
lag_source = (
    df[['date'] + lag_columns]
    .assign(ticker_code=ticker_codes)
    .sort_values('date', kind='stable')
)
grouped = lag_source.groupby('ticker_code', sort=False)[lag_columns]

# One grouped shift per lag replaces one masked assignment per
# (ticker, column, lag) combination.
shifted_by_lag = {n: grouped.shift(n) for n in lag_days}

# Assemble all lag columns at once (column-major, then lag) to keep the same
# column order as before and avoid inserting columns one at a time.
lag_features = pd.concat(
    [
        shifted_by_lag[n][col].rename(f'{col}_lag_{n}')
        for col in lag_columns
        for n in lag_days
    ],
    axis=1,
)

# The left join aligns on the index, so df keeps its original row order.
# Dropping existing lag columns first makes the cell safe to re-run.
df = df.drop(columns=lag_features.columns, errors='ignore').join(lag_features)

print(df.head(10))

        date   open   high     low  close    volume   7_day_ma  15_day_ma  \
0 2013-02-08  45.07  45.35  45.000  45.08   1824755        NaN        NaN   
1 2013-02-11  45.17  45.18  44.450  44.60   2915405        NaN        NaN   
2 2013-02-12  44.81  44.95  44.500  44.62   2373731        NaN        NaN   
3 2013-02-13  44.81  45.24  44.680  44.75   2052338        NaN        NaN   
4 2013-02-14  44.72  44.78  44.360  44.58   3826245        NaN        NaN   
5 2013-02-15  43.48  44.24  42.210  42.25  14657315        NaN        NaN   
6 2013-02-19  42.21  43.12  42.210  43.01   4116141  44.127143        NaN   
7 2013-02-20  42.84  42.85  42.225  42.24   3873183  43.721429        NaN   
8 2013-02-21  42.14  42.14  41.470  41.63   3415149  43.297143        NaN   
9 2013-02-22  41.83  42.07  41.580  41.80   3354862  42.894286        NaN   

   30_day_ma  daily_returns  ...  daily_returns_lag_5  daily_returns_lag_7  \
0        NaN            NaN  ...                  NaN                  NaN

In [7]:
# Discard every row that still contains at least one missing value.
df = df.dropna(axis='index', how='any')
print(df.head(n=10))

         date   open    high     low  close    volume   7_day_ma  15_day_ma  \
60 2013-05-07  42.18  42.410  41.900  42.40   3524022  41.662857  41.933333   
61 2013-05-08  42.40  42.950  42.300  42.94   2119765  41.874286  41.983333   
62 2013-05-09  42.97  43.195  42.630  43.16   3159293  42.120000  42.072000   
63 2013-05-10  43.12  43.850  43.040  43.63   4662252  42.451429  42.180667   
64 2013-05-13  43.43  43.560  42.720  43.04   4260335  42.674286  42.260000   
65 2013-05-14  42.98  44.060  42.882  43.97   6075845  43.020000  42.350667   
66 2013-05-15  44.90  46.490  44.890  45.68  10289000  43.545714  42.539333   
67 2013-05-16  45.43  45.840  44.970  44.99   4890962  43.915714  42.690000   
68 2013-05-17  45.02  45.830  44.990  45.56   3247851  44.290000  42.974000   
69 2013-05-20  45.48  47.450  45.390  46.34   5698804  44.744286  43.299333   

    30_day_ma  daily_returns  ...  daily_returns_lag_5  daily_returns_lag_7  \
60  42.018333       0.009524  ...            -0.000

In [8]:
# Remove the one-hot encoded ticker columns (every column whose name starts
# with 'Name_').
columns_to_drop = list(filter(lambda name: name.startswith('Name_'), df.columns))
df = df.drop(labels=columns_to_drop, axis=1)

In [9]:

# Define the feature columns and the columns that must be left out.

# Same-day columns to exclude: every base column that has lagged counterparts
# in the frame. Values from the day being predicted would not be available in
# practice, so only their lagged versions may be used.
exclude = [
    base
    for base in lag_columns
    if any(f'{base}_lag_' in name for name in df.columns)
]

# Features are all remaining columns except the date, the prediction target
# ('close') and the excluded same-day columns.
features = [
    name
    for name in df.columns
    if name not in {'date', 'close', *exclude}
]

# Show the features, a spacer, then the excluded columns.
print(features, '\n\n', exclude, sep='\n')

['year', 'month', 'day', 'open_lag_1', 'open_lag_3', 'open_lag_5', 'open_lag_7', 'open_lag_15', 'open_lag_30', 'high_lag_1', 'high_lag_3', 'high_lag_5', 'high_lag_7', 'high_lag_15', 'high_lag_30', 'low_lag_1', 'low_lag_3', 'low_lag_5', 'low_lag_7', 'low_lag_15', 'low_lag_30', 'volume_lag_1', 'volume_lag_3', 'volume_lag_5', 'volume_lag_7', 'volume_lag_15', 'volume_lag_30', '7_day_ma_lag_1', '7_day_ma_lag_3', '7_day_ma_lag_5', '7_day_ma_lag_7', '7_day_ma_lag_15', '7_day_ma_lag_30', '15_day_ma_lag_1', '15_day_ma_lag_3', '15_day_ma_lag_5', '15_day_ma_lag_7', '15_day_ma_lag_15', '15_day_ma_lag_30', '30_day_ma_lag_1', '30_day_ma_lag_3', '30_day_ma_lag_5', '30_day_ma_lag_7', '30_day_ma_lag_15', '30_day_ma_lag_30', 'daily_returns_lag_1', 'daily_returns_lag_3', 'daily_returns_lag_5', 'daily_returns_lag_7', 'daily_returns_lag_15', 'daily_returns_lag_30', 'daily_volatility_lag_1', 'daily_volatility_lag_3', 'daily_volatility_lag_5', 'daily_volatility_lag_7', 'daily_volatility_lag_15', 'daily_volat

In [10]:
# Separate the feature matrix (X) from the target variable (y, the closing price).
X, y = df[features], df['close']

In [11]:
# Positional 80/20 split: the first 80% of the rows (in the frame's current
# row order) form the training set and the remainder forms the test set.
# NOTE(review): the date-based split in the following cell reassigns
# X_train, y_train, X_test and y_test, so this split appears to be superseded.
split = int(0.8 * len(df))  # row position where the test portion starts
train, test = df.iloc[:split], df.iloc[split:]

X_train, y_train = train[features], train['close']  # training features / closing prices
X_test, y_test = test[features], test['close']  # testing features / closing prices


In [12]:

split_date = '2016-11-15'  # rows before this date train the model; rows on/after it test it

# Compute each membership mask once and reuse it for features and target.
in_train = df['date'] < split_date
in_test = df['date'] >= split_date

X_train, y_train = df.loc[in_train, features], df.loc[in_train, 'close']
X_test, y_test = df.loc[in_test, features], df.loc[in_test, 'close']

In [13]:
# Standardize the features. The scaler is fitted on the training set only and
# then applied unchanged to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:

# Create the linear regression model and fit it on the scaled training
# features against the training closing prices. The fit call is left as the
# last expression so the cell displays the fitted estimator.
lr = LinearRegression()
lr.fit(X=X_train_scaled, y=y_train)

LinearRegression()

In [15]:

# Predict closing prices for the scaled test features.
predictions = lr.predict(X_test_scaled)

# Score the predictions against the true test closing prices.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
r2 = r2_score(y_true=y_test, y_pred=predictions)

# Report both metrics, one per line.
print(f"Mean Squared Error (MSE): {mse}", f"R-squared (R2): {r2}", sep='\n')


Mean Squared Error (MSE): 5.015923799135143
R-squared (R2): 0.9996750469631156


In [16]:
from joblib import dump

# Write the fitted linear regression model to 'lr-model.joblib'. The call is
# the cell's last expression, so the list of written file names is displayed.
dump(lr, filename='lr-model.joblib')



['lr-model.joblib']