# Simple Linear Regression for stock using scikit-learn


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import yfinance as yf
yf.pdr_override()

In [2]:
stock = 'WMT'
start = '2016-01-01' 
end = '2018-01-01'
data = yf.download(stock, start, end)
data.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-04,60.5,61.490002,60.360001,61.459999,55.496292,11989200
2016-01-05,62.029999,63.049999,61.849998,62.919998,56.814621,13326000
2016-01-06,62.48,63.950001,62.470001,63.549999,57.383499,16564600
2016-01-07,62.970001,65.230003,62.919998,65.029999,58.719879,26430000
2016-01-08,65.080002,65.410004,63.41,63.540001,57.374466,17767900


In [3]:
df = data.reset_index()
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2016-01-04,60.5,61.490002,60.360001,61.459999,55.496292,11989200
1,2016-01-05,62.029999,63.049999,61.849998,62.919998,56.814621,13326000
2,2016-01-06,62.48,63.950001,62.470001,63.549999,57.383499,16564600
3,2016-01-07,62.970001,65.230003,62.919998,65.029999,58.719879,26430000
4,2016-01-08,65.080002,65.410004,63.41,63.540001,57.374466,17767900


In [4]:
X = df.drop(['Date','Close'], axis=1, inplace=True)
y = df[['Adj Close']]

In [5]:
df = df.as_matrix()

In [6]:
from sklearn.model_selection import train_test_split

# Split X and y into X_
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.25,  random_state=0)

In [7]:
from sklearn.linear_model import LinearRegression

regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:
intercept = regression_model.intercept_[0]

print("The intercept for our model is {}".format(intercept))

The intercept for our model is -3.2570142138865776e-09


In [9]:
regression_model.score(X_test, y_test)

1.0

In [10]:
from sklearn.metrics import mean_squared_error

y_predict = regression_model.predict(X_test)

regression_model_mse = mean_squared_error(y_predict, y_test)

regression_model_mse

3.1337790113197897e-18

In [11]:
math.sqrt(regression_model_mse)

1.770248290867637e-09

In [12]:
# input the latest Open, High, Low, Close, Volume
# predicts the next day price
regression_model.predict([[167.81, 171.75, 165.19, 166.48, 37232900]])

array([[166.48000001]])