In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, model_selection, svm
from sklearn.linear_model import LinearRegression

Importing the data

In [2]:
df = pd.read_csv ("./data/Coca-Cola_stock_history.csv")
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,1962-01-02,0.051133,0.052525,0.051133,0.051133,806400,0.0,0.0
1,1962-01-03,0.050374,0.050374,0.049234,0.049994,1574400,0.0,0.0
2,1962-01-04,0.050121,0.050753,0.050121,0.050374,844800,0.0,0.0
3,1962-01-05,0.050374,0.051006,0.049108,0.049234,1420800,0.0,0.0
4,1962-01-08,0.048855,0.048855,0.047779,0.048728,2035200,0.0,0.0


The new field HL_PCT together with volume will measure the volatility of the stock

In [3]:
df["HL_PCT"] = (df["High"] - df["Close"]) / df["Close"] * 100

PCT_change will measure if the stock price goes up or down after a day

In [4]:
df["PCT_change"] = (df["Close"] - df["Open"]) / df["Open"] * 100

In [5]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,HL_PCT,PCT_change
0,1962-01-02,0.051133,0.052525,0.051133,0.051133,806400,0.0,0.0,2.722604,0.0
1,1962-01-03,0.050374,0.050374,0.049234,0.049994,1574400,0.0,0.0,0.759843,-0.754113
2,1962-01-04,0.050121,0.050753,0.050121,0.050374,844800,0.0,0.0,0.75371,0.505019
3,1962-01-05,0.050374,0.051006,0.049108,0.049234,1420800,0.0,0.0,3.599131,-2.261544
4,1962-01-08,0.048855,0.048855,0.047779,0.048728,2035200,0.0,0.0,0.259724,-0.259052


In [6]:
df = df[["HL_PCT", "PCT_change", "Close", "Volume"]]
forecast_col = "Close"

In [7]:
df.head()

Unnamed: 0,HL_PCT,PCT_change,Close,Volume
0,2.722604,0.0,0.051133,806400
1,0.759843,-0.754113,0.049994,1574400
2,0.75371,0.505019,0.050374,844800
3,3.599131,-2.261544,0.049234,1420800
4,0.259724,-0.259052,0.048728,2035200


In [8]:
df.isnull().any()

HL_PCT        False
PCT_change    False
Close         False
Volume        False
dtype: bool

Although there's is no NA value, we are cleaning the data for extra caution

In [9]:
df.dropna(inplace=True)

Adding response variable which is the Close price of Coke stock 30 day in the future

In [10]:
forecast_col = "Close"

forecast_out = 30

df["label"] = df[forecast_col].shift(-forecast_out)
df.dropna(inplace=True)
df

Unnamed: 0,HL_PCT,PCT_change,Close,Volume,label
0,2.722604,0.000000,0.051133,806400,0.047716
1,0.759843,-0.754113,0.049994,1574400,0.046830
2,0.753710,0.505019,0.050374,844800,0.046576
3,3.599131,-2.261544,0.049234,1420800,0.046576
4,0.259724,-0.259052,0.048728,2035200,0.047336
...,...,...,...,...,...
15087,0.380364,0.345326,55.209999,23832700,60.750000
15088,0.636361,-0.217705,55.000000,18026300,60.450001
15089,0.492163,0.200914,54.860001,13846400,59.959999
15090,0.071075,1.864251,56.279999,23151000,59.820000


Setting up the predictors and response variable

In [11]:
X = np.array(df.drop(columns="label"))
Y = np.array(df["label"])

Standardize predictors

In [12]:
X = preprocessing.scale(X)

Split predictors and responses into folds of cross validation

In [13]:
X_test, X_train, Y_test, Y_train = model_selection.train_test_split(X, Y, test_size=.2)

This is the accuracy of simple linear regression

In [14]:
clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, Y_train)

accuracy = clf.score(X_test, Y_test)
accuracy * 100

99.38969077361122

This is the accuracy of support vector machine

In [15]:
clf = svm.SVR()
clf.fit(X_train, Y_train)

accuracy = clf.score(X_test, Y_test)
accuracy * 100

97.15031054259778