In [2]:
import pandas as pd
import quandl, math
import numpy as np
from sklearn import preprocessing, cross_validation, svm
from sklearn.linear_model import LinearRegression



In [4]:
# Fetch the GOOGL price table from Quandl's WIKI database.
# The result is a DataFrame indexed by trading date ("Date").
dataset_code = 'WIKI/GOOGL'
df = quandl.get(dataset_code)

In [5]:
# Plain-text preview of the first five rows of the raw download, to see
# which columns are available.
first_rows = df.head()
print(first_rows)

              Open    High     Low    Close      Volume  Ex-Dividend  \
Date                                                                   
2004-08-19  100.01  104.06   95.96  100.335  44659000.0          0.0   
2004-08-20  101.01  109.08  100.50  108.310  22834300.0          0.0   
2004-08-23  110.76  113.48  109.05  109.400  18256100.0          0.0   
2004-08-24  111.24  111.60  103.57  104.870  15247300.0          0.0   
2004-08-25  104.76  108.00  103.88  106.000   9188600.0          0.0   

            Split Ratio  Adj. Open  Adj. High   Adj. Low  Adj. Close  \
Date                                                                   
2004-08-19          1.0  50.159839  52.191109  48.128568   50.322842   
2004-08-20          1.0  50.661387  54.708881  50.405597   54.322689   
2004-08-23          1.0  55.551482  56.915693  54.693835   54.869377   
2004-08-24          1.0  55.792225  55.972783  51.945350   52.597363   
2004-08-25          1.0  52.542193  54.167209  52.100830   53.1

In [6]:
# Keep only the adjusted ("Adj.") price and volume columns.
# .copy() makes the result an independent frame: later cells add columns to
# `df` and mutate it, which on a bare column selection raises pandas'
# SettingWithCopyWarning.
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']].copy()

In [7]:
# Show the first five rows of the trimmed frame.
df.head(5)

Unnamed: 0_level_0,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-08-19,50.159839,52.191109,48.128568,50.322842,44659000.0
2004-08-20,50.661387,54.708881,50.405597,54.322689,22834300.0
2004-08-23,55.551482,56.915693,54.693835,54.869377,18256100.0
2004-08-24,55.792225,55.972783,51.94535,52.597363,15247300.0
2004-08-25,52.542193,54.167209,52.10083,53.164113,9188600.0


In [8]:
# Daily high-low spread, expressed as a percentage of the day's low.
df['PCT_HL'] = df['Adj. High'].sub(df['Adj. Low']).div(df['Adj. Low']).mul(100)
# Open-to-close move, expressed as a percentage of the opening price.
df['PCT_change'] = df['Adj. Close'].sub(df['Adj. Open']).div(df['Adj. Open']).mul(100)

In [9]:
# Narrow the frame to the columns used as model features.
# .copy() detaches the selection from its parent frame, so the later
# column assignment and fill/drop steps operate on an independent frame
# instead of triggering SettingWithCopyWarning.
df = df[['Adj. Close', 'PCT_HL', 'PCT_change', 'Adj. Volume']].copy()

In [10]:
# Show the first five rows of the feature frame.
df.head(5)

Unnamed: 0_level_0,Adj. Close,PCT_HL,PCT_change,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-08-19,50.322842,8.441017,0.324968,44659000.0
2004-08-20,54.322689,8.537313,7.227007,22834300.0
2004-08-23,54.869377,4.062357,-1.22788,18256100.0
2004-08-24,52.597363,7.75321,-5.726357,15247300.0
2004-08-25,53.164113,3.966115,1.183658,9188600.0


In [11]:
# Column whose future value becomes the regression target.
forecast_col = 'Adj. Close'
# Replace missing values with a large negative sentinel (-99999) rather than
# dropping the rows. The frame is rebound instead of using inplace=True:
# in-place mutation of a frame obtained by column selection can raise
# SettingWithCopyWarning, and it makes the cell harder to re-run safely.
df = df.fillna(-99999)

In [12]:
# Forecast horizon: 1% of the number of rows, rounded up to a whole row count.
row_count = len(df)
forecast_out = int(math.ceil(row_count * 0.01))
forecast_out

35

In [13]:
# The label for each row is the 'Adj. Close' value found `forecast_out` rows
# later, i.e. that many trading rows into the future. Shifting the column up
# by that amount leaves NaN labels in the final `forecast_out` rows.
future_close = df[forecast_col].shift(periods=-forecast_out)
df['label'] = future_close
df.head(5)

Unnamed: 0_level_0,Adj. Close,PCT_HL,PCT_change,Adj. Volume,label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-08-19,50.322842,8.441017,0.324968,44659000.0,69.078238
2004-08-20,54.322689,8.537313,7.227007,22834300.0,67.839414
2004-08-23,54.869377,4.062357,-1.22788,18256100.0,68.912727
2004-08-24,52.597363,7.75321,-5.726357,15247300.0,70.668146
2004-08-25,53.164113,3.966115,1.183658,9188600.0,71.219849


In [14]:
# Drop every row that contains a NaN (at minimum the trailing rows whose
# shifted label is undefined). The frame is rebound instead of using
# inplace=True, which can raise SettingWithCopyWarning on a frame that was
# produced by column selection.
df = df.dropna()
# The last rows shown are now the most recent ones that still have a label.
df.tail()

Unnamed: 0_level_0,Adj. Close,PCT_HL,PCT_change,Adj. Volume,label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-30,1177.37,1.142604,-0.029718,1792602.0,1094.0
2018-01-31,1182.22,1.213207,-0.134312,1643877.0,1053.15
2018-02-01,1181.59,1.547,0.476195,2774967.0,1026.55
2018-02-02,1119.2,1.811604,-0.729098,5798880.0,1054.09
2018-02-05,1068.76,5.512236,-2.89385,3742469.0,1006.94


In [15]:
# Feature matrix: every column except the target. df.drop returns a new
# frame, so `df` itself is left untouched.
# The columns= keyword is used because passing `axis` positionally
# (df.drop(['label'], 1)) was removed in pandas 2.0 and raises TypeError.
X = np.array(df.drop(columns=['label']))
# Target vector: the future 'Adj. Close' stored in the label column.
y = np.array(df['label'])

In [21]:
# Standardise the feature matrix with sklearn's preprocessing.scale.
# NOTE(review): the scaling statistics are computed over all rows, before
# the train/test split in the next cell, so the held-out rows influence the
# scaling -- confirm this is acceptable.
X = preprocessing.scale(X)
# Sanity check: features and labels must have the same number of rows.
n_feature_rows, n_labels = len(X), len(y)
print(n_feature_rows, n_labels)

3389 3389


In [22]:
# Hold out 20% of the rows for evaluation.
# train_test_split is imported from sklearn.model_selection because
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
# in 0.20 (on those versions the `cross_validation` name in the notebook's
# top-level sklearn import must be dropped as well).
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle, and so the scores below,
# reproducible across kernel restarts.
# NOTE(review): rows are shuffled, so the test set is interleaved in time
# with the training set -- confirm that is intended for time-series data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [31]:
# Official documentation for the estimator used here:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

# Build the model and train it on the training split; fit() returns the
# fitted estimator, so construction and training are chained.
clf = LinearRegression(n_jobs=-1).fit(X_train, y_train)

# Evaluate on the held-out split. For regressors, score() returns the
# coefficient of determination (R^2), not a squared error and not a
# classification accuracy; the variable name is kept for later cells.
accuracy = clf.score(X_test, y_test)
accuracy

0.9767140560557455

In [32]:
# For comparison, train a support vector regressor with a polynomial kernel
# on the same split. This rebinds `clf`, replacing the linear model.
clf = svm.SVR(kernel='poly').fit(X_train, y_train)

# R^2 on the held-out split, directly comparable to the linear model's score.
accuracy = clf.score(X_test, y_test)
accuracy

0.44026938478320626