In [1]:
import pandas as pd
import math, datetime
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')

In [2]:
df = pd.read_csv('stock.csv')

In [3]:
df.corr()

Unnamed: 0,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
Open,1.0,0.999754,0.999623,0.999384,-0.552175,-0.001748,,0.815955,0.815283,0.816261,0.815542,-0.552175
High,0.999754,1.0,0.999577,0.999677,-0.548985,-0.001032,,0.815531,0.815145,0.815956,0.815432,-0.548985
Low,0.999623,0.999577,1.0,0.999746,-0.557796,-0.001859,,0.815933,0.81538,0.816688,0.815961,-0.557796
Close,0.999384,0.999677,0.999746,1.0,-0.553778,-0.001872,,0.815732,0.815374,0.816479,0.816047,-0.553778
Volume,-0.552175,-0.548985,-0.557796,-0.553778,1.0,-0.007876,,-0.549529,-0.547943,-0.552269,-0.550264,1.0
Ex-Dividend,-0.001748,-0.001032,-0.001859,-0.001872,-0.007876,1.0,,0.010884,0.011568,0.010737,0.010773,-0.007876
Split Ratio,,,,,,,,,,,,
Adj. Open,0.815955,0.815531,0.815933,0.815732,-0.549529,0.010884,,1.0,0.999906,0.999849,0.999758,-0.549529
Adj. High,0.815283,0.815145,0.81538,0.815374,-0.547943,0.011568,,0.999906,1.0,0.999832,0.999869,-0.547943
Adj. Low,0.816261,0.815956,0.816688,0.816479,-0.552269,0.010737,,0.999849,0.999832,1.0,0.999902,-0.552269


In [4]:
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']]

In [5]:
df['HL_PCT'] = ((df['Adj. High'] - df['Adj. Close']) / df['Adj. Close']) * 100
df['PCT_change'] = ((df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open']) * 100

In [6]:
df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]

In [7]:
df

Unnamed: 0,Adj. Close,HL_PCT,PCT_change,Adj. Volume
0,50.322842,3.712563,0.324968,44659000.0
1,54.322689,0.710922,7.227007,22834300.0
2,54.869377,3.729433,-1.227880,18256100.0
3,52.597363,6.417469,-5.726357,15247300.0
4,53.164113,1.886792,1.183658,9188600.0
...,...,...,...,...
3419,1094.000000,1.343693,0.130884,1990515.0
3420,1053.150000,2.921711,-2.487014,3418154.0
3421,1026.550000,3.918952,-2.360729,2413517.0
3422,1054.090000,0.491419,0.332191,3272409.0


In [8]:
forecast_col = 'Adj. Close'
df.fillna(-99999, inplace = True)

forecast_out = int(math.ceil(0.01*len(df)))

In [9]:
df['label'] = df[forecast_col].shift(-forecast_out)

In [10]:
len(df[forecast_col])

a = df[forecast_col].shift(forecast_out)

In [11]:
a

0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
         ...   
3419    1177.37
3420    1182.22
3421    1181.59
3422    1119.20
3423    1068.76
Name: Adj. Close, Length: 3424, dtype: float64

In [12]:
b =  df[forecast_col].shift(-forecast_out)

In [13]:
df.head()

Unnamed: 0,Adj. Close,HL_PCT,PCT_change,Adj. Volume,label
0,50.322842,3.712563,0.324968,44659000.0,69.078238
1,54.322689,0.710922,7.227007,22834300.0,67.839414
2,54.869377,3.729433,-1.22788,18256100.0,68.912727
3,52.597363,6.417469,-5.726357,15247300.0,70.668146
4,53.164113,1.886792,1.183658,9188600.0,71.219849


In [14]:
df.dropna(inplace = True)
df.tail()

Unnamed: 0,Adj. Close,HL_PCT,PCT_change,Adj. Volume,label
3384,1177.37,0.896914,-0.029718,1792602.0,1094.0
3385,1182.22,0.346805,-0.134312,1643877.0,1053.15
3386,1181.59,0.495942,0.476195,2774967.0,1026.55
3387,1119.2,1.081129,-0.729098,5798880.0,1054.09
3388,1068.76,4.325574,-2.89385,3742469.0,1006.94


In [15]:
df.head()

Unnamed: 0,Adj. Close,HL_PCT,PCT_change,Adj. Volume,label
0,50.322842,3.712563,0.324968,44659000.0,69.078238
1,54.322689,0.710922,7.227007,22834300.0,67.839414
2,54.869377,3.729433,-1.22788,18256100.0,68.912727
3,52.597363,6.417469,-5.726357,15247300.0,70.668146
4,53.164113,1.886792,1.183658,9188600.0,71.219849


In [16]:
X = np.array(df.drop(['label'],axis = 1))
y = np.array(df['label'])
df.dropna(inplace = True)

In [17]:
X = preprocessing.scale(X)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2)

In [19]:
clf = LinearRegression(n_jobs = -1)

In [20]:
model  = clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
predict = model.predict(X_test)

In [21]:
print('The accuracy of the model\'s prediction:', score)

The accuracy of the model's prediction: 0.9783974627582368


the y intercept

In [32]:
clf.intercept_

412.3288777373444

## Using the Regression built from scratch 

In [22]:
from LinearRegression import LinearRegression
reg = LinearRegression(eta =  0.01, n_iter = 1000)
model2 = reg.fit(X_train, y_train)
pre = reg.predict(X_test)

In [34]:
print('The model\'s accuracy is:',reg.score(y_test,pre))

The model's accuracy is: 0.9782476963907837


In [24]:
from sklearn.metrics import r2_score

In [35]:
print('The model\'s accuracy using sklearn:',r2_score(y_test,pre))

The model's accuracy using sklearn: 0.9782476963907837


squared error computed by the model is the same as that of sklearn

y_intercept

In [36]:
reg.intercept(X_train, y_train)

413.2819489400481