In [4]:
import pandas as pd
import math, datetime
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [5]:
# Load the raw stock price table from the project's Datasets folder.
df = pd.read_csv(filepath_or_buffer='../Datasets/stock.csv')

In [6]:
# Peek at the first five raw rows to see which columns are available.
df.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
0,2004-08-19,100.01,104.06,95.96,100.335,44659000.0,0.0,1.0,50.159839,52.191109,48.128568,50.322842,44659000.0
1,2004-08-20,101.01,109.08,100.5,108.31,22834300.0,0.0,1.0,50.661387,54.708881,50.405597,54.322689,22834300.0
2,2004-08-23,110.76,113.48,109.05,109.4,18256100.0,0.0,1.0,55.551482,56.915693,54.693835,54.869377,18256100.0
3,2004-08-24,111.24,111.6,103.57,104.87,15247300.0,0.0,1.0,55.792225,55.972783,51.94535,52.597363,15247300.0
4,2004-08-25,104.76,108.0,103.88,106.0,9188600.0,0.0,1.0,52.542193,54.167209,52.10083,53.164113,9188600.0


#### Feature selection

In [7]:
# Keep only the adjusted price/volume columns.
# .copy() makes this an independent frame: new columns are assigned to it in the
# following cell, and assigning into a slice of the raw frame triggers pandas'
# SettingWithCopyWarning.
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']].copy()

#### Feature Engineering

In [8]:
# HL_PCT: distance from the adjusted close up to the adjusted high, as a percentage of the close.
df['HL_PCT'] = df['Adj. High'].sub(df['Adj. Close']).div(df['Adj. Close']).mul(100)
# PCT_change: move from the adjusted open to the adjusted close, as a percentage of the open.
df['PCT_change'] = df['Adj. Close'].sub(df['Adj. Open']).div(df['Adj. Open']).mul(100)

In [9]:
# Keep the close price plus the engineered features and volume.
# .copy() detaches the result from the wider frame so the in-place fillna and the
# 'label' column assignment in later cells act on a real frame rather than a slice
# (which would raise SettingWithCopyWarning).
df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']].copy()

In [10]:
# Show the first five rows of the reduced feature frame.
df.head(n=5)

Unnamed: 0,Adj. Close,HL_PCT,PCT_change,Adj. Volume
0,50.322842,3.712563,0.324968,44659000.0
1,54.322689,0.710922,7.227007,22834300.0
2,54.869377,3.729433,-1.22788,18256100.0
3,52.597363,6.417469,-5.726357,15247300.0
4,53.164113,1.886792,1.183658,9188600.0


In [11]:
# Column whose future value the model will predict.
forecast_col = 'Adj. Close'

# Replace missing values with a large negative sentinel instead of dropping the rows.
# Reassigning (rather than inplace=True) avoids mutating what may be a slice of an
# earlier frame and keeps the cell safe to re-run.
df = df.fillna(-99999)

# Forecast horizon: 1% of the number of rows, rounded up.
# math.ceil already returns an int in Python 3, so no extra int() is needed.
forecast_out = math.ceil(0.01 * len(df))

In [12]:
# The label for each row is the forecast column's value forecast_out rows later
# (a negative shift pulls future values up onto the current row).
df['label'] = df[forecast_col].shift(periods=-forecast_out)

In [13]:
# Inspect the first five rows now that the label column exists.
df.head(n=5)

Unnamed: 0,Adj. Close,HL_PCT,PCT_change,Adj. Volume,label
0,50.322842,3.712563,0.324968,44659000.0,69.078238
1,54.322689,0.710922,7.227007,22834300.0,67.839414
2,54.869377,3.729433,-1.22788,18256100.0,68.912727
3,52.597363,6.417469,-5.726357,15247300.0,70.668146
4,53.164113,1.886792,1.183658,9188600.0,71.219849


In [14]:
# Remove, in place, every row that has a NaN in any column
# (for example the trailing rows whose label was left empty by the shift).
df.dropna(axis=0, how='any', inplace=True)
# Show the last five remaining rows.
df.tail(n=5)

Unnamed: 0,Adj. Close,HL_PCT,PCT_change,Adj. Volume,label
3384,1177.37,0.896914,-0.029718,1792602.0,1094.0
3385,1182.22,0.346805,-0.134312,1643877.0,1053.15
3386,1181.59,0.495942,0.476195,2774967.0,1026.55
3387,1119.2,1.081129,-0.729098,5798880.0,1054.09
3388,1068.76,4.325574,-2.89385,3742469.0,1006.94


In [15]:
# First five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,Adj. Close,HL_PCT,PCT_change,Adj. Volume,label
0,50.322842,3.712563,0.324968,44659000.0,69.078238
1,54.322689,0.710922,7.227007,22834300.0,67.839414
2,54.869377,3.729433,-1.22788,18256100.0,68.912727
3,52.597363,6.417469,-5.726357,15247300.0,70.668146
4,53.164113,1.886792,1.183658,9188600.0,71.219849


In [16]:
# Feature matrix: every column except the target.
X = np.array(df.drop(columns=['label']))
# Target vector: the shifted future price.
y = np.array(df['label'])
# The former trailing df.dropna(inplace=True) was removed: it ran after X and y had
# already been extracted, so it could never affect them, and the frame was already
# cleaned of NaN rows in an earlier cell.

In [17]:
# Standardise the feature matrix with sklearn's preprocessing.scale.
# NOTE(review): this is applied to the whole matrix before the train/test split,
# so the rows that later become the test set also contribute to the scaling
# statistics — consider fitting the scaler on the training rows only.
X = preprocessing.scale(X)

In [18]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so that Restart & Run All reproduces the same split
# (and therefore the same scores) on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Linear Regression (sklearn)

In [19]:
# Build the linear regression estimator; n_jobs=-1 is passed through to the constructor
# (in sklearn this requests all available CPU cores).
clf = LinearRegression(n_jobs = -1)

In [20]:
# Train on the training split; `model` is whatever fit() returns and is used for prediction below.
model = clf.fit(X_train, y_train)
# Predictions for the held-out rows.
predict = model.predict(X_test)
# Score of the fitted estimator on the held-out rows.
score = clf.score(X_test, y_test)

In [26]:
# Report the held-out score, rounded to four decimal places.
print("The accuracy of the model's prediction:", round(score, 4))

The accuracy of the model's prediction: 0.9765


**y-intercept**

In [22]:
# Intercept term stored on the fitted estimator.
clf.intercept_

412.8409867490967

## Linear Regression (custom --> built from scratch)

In [24]:
# Import the from-scratch implementation under an alias. Importing it as plain
# `LinearRegression` would shadow the sklearn class imported in the first cell, so
# re-running the sklearn cell afterwards would silently build the custom class instead.
from LinearRegression import LinearRegression as CustomLinearRegression

# Learning rate (eta) and number of iterations for the custom model.
reg = CustomLinearRegression(eta=0.01, n_iter=1000)
# Train on the same split as the sklearn model so the two scores are comparable.
model2 = reg.fit(X_train, y_train)
# Predictions for the held-out rows.
pre = reg.predict(X_test)

In [25]:
# The custom score() is called with the true targets and the predictions (not X and y).
print("The model's accuracy is:", round(reg.score(y_test, pre), 4))

The model's accuracy is: 0.9765


**y-intercept**

In [27]:
# Intercept reported by the custom model. Here it is a method called with the
# training data, whereas the sklearn cell above reads an attribute.
# NOTE(review): whether this recomputes the value or returns a stored one is not
# visible from this notebook — confirm in the LinearRegression module.
reg.intercept(X_train, y_train)

412.48989627744265

In [35]:
# Side-by-side comparison of the two held-out scores, each rounded to four decimals.
scores = dict(
    sklearn_regression_score=[round(score, 4)],
    custom_regression_score=[round(reg.score(y_test, pre), 4)],
)

# One-row table with one column per model.
pd.DataFrame(scores)

Unnamed: 0,sklearn_regression_score,custom_regression_score
0,0.9765,0.9765


**The aim of this notebook is not to demonstrate feature engineering, but to compare sklearn's Linear Regression model with the custom model written from scratch.**