In [78]:
import pandas as pd
import datetime as dt
import numpy as np
import seaborn as sns

In [2]:
stock_data = pd.read_csv('sphist.csv')
stock_data['Date'] = pd.to_datetime(stock_data['Date'])
stock_data = stock_data.sort_values(by = 'Date')
stock_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
16589,1950-01-03,16.66,16.66,16.66,16.66,1260000.0,16.66
16588,1950-01-04,16.85,16.85,16.85,16.85,1890000.0,16.85
16587,1950-01-05,16.93,16.93,16.93,16.93,2550000.0,16.93
16586,1950-01-06,16.98,16.98,16.98,16.98,2010000.0,16.98
16585,1950-01-09,17.08,17.08,17.08,17.08,2520000.0,17.08


In [3]:
stock_data.reset_index(drop = True, inplace=True)
stock_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,1950-01-03,16.66,16.66,16.66,16.66,1260000.0,16.66
1,1950-01-04,16.85,16.85,16.85,16.85,1890000.0,16.85
2,1950-01-05,16.93,16.93,16.93,16.93,2550000.0,16.93
3,1950-01-06,16.98,16.98,16.98,16.98,2010000.0,16.98
4,1950-01-09,17.08,17.08,17.08,17.08,2520000.0,17.08


### In a normal machine learning exercise, we treat each row as independent. Stock market data is sequential, and each observation comes a day after the previous observation. Thus, the observations are not all independent.

### You have to be extra careful to not inject "future" knowledge into the past rows when you do training and prediction.

### Don't use the current row in the values you average!!!

## Some interesting indicators:
* The average price from the past 5/30/365 days (**exclude current day!!!**).  -->**trading days**
* The ratio between the average price for the past 5 days, and the average price for the past 365 days.
* The standard deviation of the price over the past 5/365 days.
* The ratio between the std for the past 5 days, and the std for the past 365 days.

In [62]:
def average_n_days(row, n):
    ## n should be less than the first index number
    current_index = row.name
    ## row.name to get the current index of the row
    previous_df = stock_data.iloc[current_index -n : current_index]
    average_close = np.average(previous_df['Close'])
    return average_close

In [65]:
stock_data['day_5'] = 0
stock_data['day_30'] = 0
stock_data['day_250'] = 0
day_5 = stock_data[stock_data['Date'] >= dt.datetime(1951,1,3)].apply(
    lambda row: average_n_days(row, 5), axis = 1)
day_30 = stock_data[stock_data['Date'] >= dt.datetime(1951,1,3)].apply(
    lambda row: average_n_days(row, 30), axis = 1)
day_250 = stock_data[stock_data['Date'] >= dt.datetime(1951,1,3)].apply(
    lambda row: average_n_days(row, 250), axis = 1)
stock_data.loc[stock_data['Date'] >= dt.datetime(1951,1,3), 'day_5'] = day_5
stock_data.loc[stock_data['Date'] >= dt.datetime(1951,1,3), 'day_30'] = day_30
stock_data.loc[stock_data['Date'] >= dt.datetime(1951,1,3), 'day_250'] = day_250
stock_data[stock_data['Date'] >= dt.datetime(1951,1,1)].head(11)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,day_5,day_30,day_250
249,1951-01-02,20.77,20.77,20.77,20.77,3030000.0,20.77,0.0,0.0,0.0
250,1951-01-03,20.690001,20.690001,20.690001,20.690001,3370000.0,20.690001,20.36,19.815,18.40676
251,1951-01-04,20.870001,20.870001,20.870001,20.870001,3390000.0,20.870001,20.514,19.842666,18.42288
252,1951-01-05,20.870001,20.870001,20.870001,20.870001,3390000.0,20.870001,20.628,19.874,18.43896
253,1951-01-08,21.0,21.0,21.0,21.0,2780000.0,21.0,20.726001,19.907,18.45472
254,1951-01-09,21.120001,21.120001,21.120001,21.120001,3800000.0,21.120001,20.840001,19.935,18.4708
255,1951-01-10,20.85,20.85,20.85,20.85,3270000.0,20.85,20.910001,19.961667,18.48696
256,1951-01-11,21.190001,21.190001,21.190001,21.190001,3490000.0,21.190001,20.942001,19.984,18.50224
257,1951-01-12,21.110001,21.110001,21.110001,21.110001,2950000.0,21.110001,21.006001,20.038333,18.51864
258,1951-01-15,21.299999,21.299999,21.299999,21.299999,2830000.0,21.299999,21.054001,20.096333,18.53604


In [7]:
## Another way to use iterrows
"""
day_5 = []
for index, row in stock_data[stock_data.Date >= dt.datetime(1951, 1,3)].iterrows():
    previous_df = stock_data.iloc[index - 5: index]
    day_5.append(np.mean(previous_df['Close']))
stock_data['day_5'] = np.NaN
stock_data.loc[stock_data.Date >= dt.datetime(1951, 1, 3), 'day_5'] = day_5

day_30 = []
for index, row in stock_data[stock_data.Date >= dt.datetime(1951, 1,3)].iterrows():
    previous_df = stock_data.iloc[index - 30: index]
    day_30.append(np.mean(previous_df['Close']))
stock_data['day_30'] = np.NaN
stock_data.loc[stock_data.Date >= dt.datetime(1951, 1, 3), 'day_30'] = day_30
stock_data[stock_data.Date >= dt.datetime(1951,1,3)].head(10)
"""

In [66]:
def std_n_days(row, n):
    ## n should be less than the first index number
    current_index = row.name
    ## row.name to get the current index of the row
    previous_df = stock_data.iloc[current_index -n : current_index]
    std_close = np.std(previous_df['Close'])
    return std_close

In [67]:
stock_data['std_5'] = 0
stock_data['std_250'] = 0
std_5 = stock_data[stock_data['Date'] >= dt.datetime(1951,1,3)].apply(
    lambda row: std_n_days(row, 5), axis = 1)

std_250 = stock_data[stock_data['Date'] >= dt.datetime(1951,1,3)].apply(
    lambda row: std_n_days(row, 250), axis = 1)
stock_data.loc[stock_data['Date'] >= dt.datetime(1951,1,3), 'std_5'] = std_5
stock_data.loc[stock_data['Date'] >= dt.datetime(1951,1,3), 'std_250'] = std_250
stock_data[stock_data['Date'] >= dt.datetime(1951,1,1)].head(11)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,day_5,day_30,day_250,std_5,std_250
249,1951-01-02,20.77,20.77,20.77,20.77,3030000.0,20.77,0.0,0.0,0.0,0.0,0.0
250,1951-01-03,20.690001,20.690001,20.690001,20.690001,3370000.0,20.690001,20.36,19.815,18.40676,0.27225,1.066244
251,1951-01-04,20.870001,20.870001,20.870001,20.870001,3390000.0,20.870001,20.514,19.842666,18.42288,0.182932,1.070171
252,1951-01-05,20.870001,20.870001,20.870001,20.870001,3390000.0,20.870001,20.628,19.874,18.43896,0.191458,1.076599
253,1951-01-08,21.0,21.0,21.0,21.0,2780000.0,21.0,20.726001,19.907,18.45472,0.162678,1.083212
254,1951-01-09,21.120001,21.120001,21.120001,21.120001,3800000.0,21.120001,20.840001,19.935,18.4708,0.10469,1.091011
255,1951-01-10,20.85,20.85,20.85,20.85,3270000.0,20.85,20.910001,19.961667,18.48696,0.144083,1.100172
256,1951-01-11,21.190001,21.190001,21.190001,21.190001,3490000.0,21.190001,20.942001,19.984,18.50224,0.103808,1.106341
257,1951-01-12,21.110001,21.110001,21.110001,21.110001,2950000.0,21.110001,21.006001,20.038333,18.51864,0.133955,1.115634
258,1951-01-15,21.299999,21.299999,21.299999,21.299999,2830000.0,21.299999,21.054001,20.096333,18.53604,0.118761,1.121974


In [68]:
stock_data = stock_data[stock_data['Date'] >= dt.datetime(1951,1,3)]
stock_data = stock_data.reset_index(drop = True)

In [69]:
stock_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,day_5,day_30,day_250,std_5,std_250
0,1951-01-03,20.690001,20.690001,20.690001,20.690001,3370000.0,20.690001,20.36,19.815,18.40676,0.27225,1.066244
1,1951-01-04,20.870001,20.870001,20.870001,20.870001,3390000.0,20.870001,20.514,19.842666,18.42288,0.182932,1.070171
2,1951-01-05,20.870001,20.870001,20.870001,20.870001,3390000.0,20.870001,20.628,19.874,18.43896,0.191458,1.076599
3,1951-01-08,21.0,21.0,21.0,21.0,2780000.0,21.0,20.726001,19.907,18.45472,0.162678,1.083212
4,1951-01-09,21.120001,21.120001,21.120001,21.120001,3800000.0,21.120001,20.840001,19.935,18.4708,0.10469,1.091011


In [70]:
stock_data['ratio_avg5_avg_250'] = stock_data.day_5 / stock_data.day_250
stock_data['ratio_std5_std_250'] = stock_data['std_5'] / stock_data['std_250']
stock_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,day_5,day_30,day_250,std_5,std_250,ratio_avg5_avg_250,ratio_std5_std_250
0,1951-01-03,20.690001,20.690001,20.690001,20.690001,3370000.0,20.690001,20.36,19.815,18.40676,0.27225,1.066244,1.106115,0.255335
1,1951-01-04,20.870001,20.870001,20.870001,20.870001,3390000.0,20.870001,20.514,19.842666,18.42288,0.182932,1.070171,1.113507,0.170937
2,1951-01-05,20.870001,20.870001,20.870001,20.870001,3390000.0,20.870001,20.628,19.874,18.43896,0.191458,1.076599,1.118718,0.177836
3,1951-01-08,21.0,21.0,21.0,21.0,2780000.0,21.0,20.726001,19.907,18.45472,0.162678,1.083212,1.123073,0.150181
4,1951-01-09,21.120001,21.120001,21.120001,21.120001,3800000.0,21.120001,20.840001,19.935,18.4708,0.10469,1.091011,1.128267,0.095957


In [84]:
stock_data.rename(columns={'day_5' : 'avg_5', 'day_30':'avg_30', 'day_250':'avg_250',
                          'ratio_avg5_avg_250':'ratio_avg5_avg250',
                          'ratio_std5_std_250':'ratio_std5_std250'}, inplace=True)
stock_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,avg_5,avg_30,avg_250,std_5,std_250,ratio_avg5_avg250,ratio_std5_std250
0,1951-01-03,20.690001,20.690001,20.690001,20.690001,3370000.0,20.690001,20.36,19.815,18.40676,0.27225,1.066244,1.106115,0.255335
1,1951-01-04,20.870001,20.870001,20.870001,20.870001,3390000.0,20.870001,20.514,19.842666,18.42288,0.182932,1.070171,1.113507,0.170937
2,1951-01-05,20.870001,20.870001,20.870001,20.870001,3390000.0,20.870001,20.628,19.874,18.43896,0.191458,1.076599,1.118718,0.177836
3,1951-01-08,21.0,21.0,21.0,21.0,2780000.0,21.0,20.726001,19.907,18.45472,0.162678,1.083212,1.123073,0.150181
4,1951-01-09,21.120001,21.120001,21.120001,21.120001,3800000.0,21.120001,20.840001,19.935,18.4708,0.10469,1.091011,1.128267,0.095957


In [85]:
stock_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16340 entries, 0 to 16339
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               16340 non-null  datetime64[ns]
 1   Open               16340 non-null  float64       
 2   High               16340 non-null  float64       
 3   Low                16340 non-null  float64       
 4   Close              16340 non-null  float64       
 5   Volume             16340 non-null  float64       
 6   Adj Close          16340 non-null  float64       
 7   avg_5              16340 non-null  float64       
 8   avg_30             16340 non-null  float64       
 9   avg_250            16340 non-null  float64       
 10  std_5              16340 non-null  float64       
 11  std_250            16340 non-null  float64       
 12  ratio_avg5_avg250  16340 non-null  float64       
 13  ratio_std5_std250  16340 non-null  float64       
dtypes: dat

# Split the data to train and test

In [91]:
train = stock_data[stock_data['Date'] < dt.datetime(2013,1,1)].copy()
test = stock_data[stock_data['Date'] >= dt.datetime(2013,1,1)].copy()

# Train data on train, recommend to use Mean Absolute Error, MAE, it will show you how 'close' you were to the price in intuitive terms.

In [92]:
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

In [93]:
## Use previous information not current for prediction, so leave out all of the original columns(Close, High, Low,
## Open, Volume, Adj Close, Date)
lr = LinearRegression()
features = ['avg_5', 'avg_30', 'avg_250', 'std_5', 'std_250', 'ratio_avg5_avg250', 'ratio_std5_std250']
target = 'Close'

lr.fit(train[features], train[target])
predictions = lr.predict(test[features])
test['prediction'] = predictions
test.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,avg_5,avg_30,avg_250,std_5,std_250,ratio_avg5_avg250,ratio_std5_std250,prediction
15601,2013-01-02,1426.189941,1462.430054,1426.189941,1462.420044,4202600000.0,1462.420044,1418.641992,1414.258667,1379.35416,8.783991,46.533328,1.028483,0.188768,1419.135505
15602,2013-01-03,1462.420044,1465.469971,1455.530029,1459.369995,3829730000.0,1459.369995,1425.793994,1417.676668,1380.0956,19.911131,46.373961,1.033112,0.42936,1425.505003
15603,2013-01-04,1459.369995,1467.939941,1458.98999,1466.469971,3424290000.0,1466.469971,1433.702002,1420.092668,1380.823879,23.500472,46.183156,1.038295,0.508854,1433.459067
15604,2013-01-07,1466.469971,1466.469971,1456.619995,1461.890015,3304970000.0,1461.890015,1443.376001,1422.714665,1381.565519,24.994984,46.063691,1.044739,0.542618,1443.482795
15605,2013-01-08,1461.890015,1461.890015,1451.640015,1457.150024,3601600000.0,1457.150024,1455.267993,1425.076664,1382.301839,14.716296,45.870129,1.052786,0.320825,1456.992309


In [94]:
mae = mean_absolute_error(test['Close'], test['prediction'])
mae

16.16392348887093

In [95]:
lr = LinearRegression()
features = ['avg_5', 'avg_30', 'avg_250', 'std_5', 'std_250']
target = 'Close'

lr.fit(train[features], train[target])
predictions = lr.predict(test[features])

mean_absolute_error(test['Close'], predictions)

16.158490251490967

In [96]:
lr = LinearRegression()
features = ['avg_5']
target = 'Close'

lr.fit(train[features], train[target])
predictions = lr.predict(test[features])

mean_absolute_error(test['Close'], predictions)

16.267533214534456

In [98]:
test[['Close', 'prediction']]

Unnamed: 0,Close,prediction
15601,1462.420044,1419.135505
15602,1459.369995,1425.505003
15603,1466.469971,1433.459067
15604,1461.890015,1443.482795
15605,1457.150024,1456.992309
...,...,...
16335,2102.629883,2088.942732
16336,2079.510010,2091.852031
16337,2049.620117,2089.557285
16338,2091.689941,2080.173478
