In [1]:
import pandas as pd
from datetime import datetime

# Load the price history, parse the dates, order the rows oldest-first and
# add the six indicator columns pre-filled with the placeholder value -1.
# The index is renumbered 0..n-1 at the end so that row labels equal row
# positions in the date-sorted frame.
data = (
    pd.read_csv('sphist.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], yearfirst=True))
    .sort_values(by='Date', ascending=True)
    .assign(**dict.fromkeys(
        [
            'close_avg_5_day',
            'close_rat_5_365',
            'close_std_5_day',
            'vol_avg_5_day',
            'vol_rat_5_365',
            'vol_std_5_day',
        ],
        -1,
    ))
    .reset_index(drop=True)
)

print(data.head())

        Date   Open   High    Low  Close     Volume  Adj Close  \
0 1950-01-03  16.66  16.66  16.66  16.66  1260000.0      16.66   
1 1950-01-04  16.85  16.85  16.85  16.85  1890000.0      16.85   
2 1950-01-05  16.93  16.93  16.93  16.93  2550000.0      16.93   
3 1950-01-06  16.98  16.98  16.98  16.98  2010000.0      16.98   
4 1950-01-09  17.08  17.08  17.08  17.08  2520000.0      17.08   

   close_avg_5_day  close_rat_5_365  close_std_5_day  vol_avg_5_day  \
0               -1               -1               -1             -1   
1               -1               -1               -1             -1   
2               -1               -1               -1             -1   
3               -1               -1               -1             -1   
4               -1               -1               -1             -1   

   vol_rat_5_365  vol_std_5_day  
0             -1             -1  
1             -1             -1  
2             -1             -1  
3             -1             -1  
4     

In [2]:
def get_metrics(index, frame=None, short_window=5, long_window=365):
    """Compute trailing Close/Volume indicators for the row labelled ``index``.

    Only rows strictly before ``index`` are read, so the returned values never
    include the row's own Close or Volume.

    Windows are counted in *rows*, not calendar days. When fewer than
    ``short_window`` / ``long_window`` rows precede ``index`` the statistics
    are computed from whatever rows are available; with no preceding rows
    every value is NaN, and with a single preceding row the standard
    deviations are NaN.

    Parameters
    ----------
    index : int
        Label of the row to compute indicators for. The label arithmetic
        below assumes an ascending integer index (e.g. a fresh RangeIndex).
    frame : pandas.DataFrame, optional
        Frame holding ``'Close'`` and ``'Volume'`` columns. Defaults to the
        notebook-level ``data`` frame.
    short_window : int, default 5
        Number of preceding rows in the short window.
    long_window : int, default 365
        Number of preceding rows in the long window.

    Returns
    -------
    tuple
        ``(close_avg, close_ratio, close_std, vol_avg, vol_ratio, vol_std)``
        where the averages and standard deviations are taken over the short
        window and each ratio is short-window mean / long-window mean.
    """
    if frame is None:
        # Fall back to the notebook-global frame so existing one-argument
        # callers keep working.
        frame = data

    last = index - 1
    columns = ['Close', 'Volume']

    # .loc slices by label and includes both end points, hence `last`.
    recent = frame.loc[index - short_window:last, columns]
    history = frame.loc[index - long_window:last, columns]

    close_avg_5_day = recent['Close'].mean()
    close_rat_5_365 = close_avg_5_day / history['Close'].mean()
    close_std_5_day = recent['Close'].std()

    vol_avg_5_day = recent['Volume'].mean()
    vol_rat_5_365 = vol_avg_5_day / history['Volume'].mean()
    vol_std_5_day = recent['Volume'].std()

    return close_avg_5_day, close_rat_5_365, close_std_5_day, vol_avg_5_day, vol_rat_5_365, vol_std_5_day

def calc_metrics(row):
    """Fill one row's six indicator fields and return the row.

    Intended for ``DataFrame.apply(..., axis=1)``: ``row.name`` is the row's
    index label, which is handed to ``get_metrics`` to obtain the values.
    """
    metric_fields = (
        'close_avg_5_day',
        'close_rat_5_365',
        'close_std_5_day',
        'vol_avg_5_day',
        'vol_rat_5_365',
        'vol_std_5_day',
    )
    # get_metrics returns its values in the same order as metric_fields.
    metric_values = get_metrics(row.name)
    for field, value in zip(metric_fields, metric_values):
        row[field] = value
    return row
    
# Run calc_metrics once per row (axis='columns' is the same as axis=1) and
# preview the first ten rows of the result.
data = data.apply(calc_metrics, axis='columns')
data.head(n=10)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,close_avg_5_day,close_rat_5_365,close_std_5_day,vol_avg_5_day,vol_rat_5_365,vol_std_5_day
0,1950-01-03,16.66,16.66,16.66,16.66,1260000.0,16.66,,,,,,
1,1950-01-04,16.85,16.85,16.85,16.85,1890000.0,16.85,16.66,1.0,,1260000.0,1.0,
2,1950-01-05,16.93,16.93,16.93,16.93,2550000.0,16.93,16.755,1.0,0.13435,1575000.0,1.0,445477.272148
3,1950-01-06,16.98,16.98,16.98,16.98,2010000.0,16.98,16.813333,1.0,0.138684,1900000.0,1.0,645058.136915
4,1950-01-09,17.08,17.08,17.08,17.08,2520000.0,17.08,16.855,1.0,0.140594,1927500.0,1.0,529551.697193
5,1950-01-10,17.030001,17.030001,17.030001,17.030001,2160000.0,17.030001,16.9,1.0,0.157956,2046000.0,1.0,529650.828377
6,1950-01-11,17.09,17.09,17.09,17.09,2630000.0,17.09,16.974,1.003093,0.089051,2226000.0,1.077966,298043.620968
7,1950-01-12,16.76,16.76,16.76,16.76,2970000.0,16.76,17.022,1.004502,0.067602,2374000.0,1.106391,272084.545684
8,1950-01-13,16.67,16.67,16.67,16.67,3330000.0,16.67,16.988,1.003871,0.134796,2458000.0,1.093052,382452.611444
9,1950-01-16,16.719999,16.719999,16.719999,16.719999,1460000.0,16.719999,16.926,1.001868,0.196545,2722000.0,1.149062,446172.612337


In [3]:
# Clean data
# Keep only rows dated after 1951-01-02, i.e. drop the first calendar year.
# NOTE(review): get_metrics counts its long window in rows (365 of them) and
# the data has no weekend rows, so one calendar year holds fewer than 365
# rows. Rows early in 1951 are therefore still computed from a partial long
# window -- confirm whether the cut-off should be later.
start_index = data[data['Date'] > datetime(year=1951, month=1, day=2)].index[0]
# start_index is an index *label*, so slice by label; rows are date-sorted.
data = data.loc[start_index:]

# Remove rows with NA values, then renumber the index so that labels equal
# positions again. The split below takes a label from .index and uses it
# with .iloc, which is only correct when the two agree.
data = data.dropna(axis=0).reset_index(drop=True)

# Train/test split: everything up to and including 2014-12-31 is training
# data, everything after it is test data.
train_test_index = data[data['Date'] > datetime(year=2014, month=12, day=31)].index[0]
train = data.iloc[:train_test_index]
test = data.iloc[train_test_index:]

print(train.shape)
print(test.shape)

(16105, 13)
(235, 13)


In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Fit a linear regression on the six trailing indicators and report the mean
# absolute error of its Close predictions on the held-out test rows.
feature_cols = ['close_avg_5_day','close_rat_5_365','close_std_5_day','vol_avg_5_day', 'vol_rat_5_365', 'vol_std_5_day']

lr = LinearRegression()
# fit() returns the estimator itself, so there is no need to reassign it.
lr.fit(train[feature_cols], train['Close'])
predictions = lr.predict(test[feature_cols])

# scikit-learn metrics take (y_true, y_pred) in that order. MAE is symmetric,
# so the value is unchanged, but the documented order keeps this line correct
# if the metric is ever swapped for an asymmetric one.
mae = mean_absolute_error(test['Close'], predictions)
print(mae)

19.85756792125505
