# Google stock prediction using linear regression

## We will use quandl to get the dataset

In [163]:
import pandas as pd
import quandl
df=quandl.get('WIKI/GOOGL')

### Lets check out how the data looks like.
- So lets print first 5 rows.

In [164]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2004-08-19,100.01,104.06,95.96,100.335,44659000.0,0.0,1.0,50.159839,52.191109,48.128568,50.322842,44659000.0
2004-08-20,101.01,109.08,100.5,108.31,22834300.0,0.0,1.0,50.661387,54.708881,50.405597,54.322689,22834300.0
2004-08-23,110.76,113.48,109.05,109.4,18256100.0,0.0,1.0,55.551482,56.915693,54.693835,54.869377,18256100.0
2004-08-24,111.24,111.6,103.57,104.87,15247300.0,0.0,1.0,55.792225,55.972783,51.94535,52.597363,15247300.0
2004-08-25,104.76,108.0,103.88,106.0,9188600.0,0.0,1.0,52.542193,54.167209,52.10083,53.164113,9188600.0


In [165]:
df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Ex-Dividend', 'Split Ratio',
       'Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume'],
      dtype='object')

**Okay so we have some columns out here.There are 2 columns that should be added by us to make good predictions.**
- one column that we should add is change in stocks which is described below as 'Change'

- and other column we should add is percentage change in high and close columns described by 'High_low_percent'

In [166]:
df=df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']]
df['Change']=(df['Adj. Close']-df['Adj. Open'])/df['Adj. Open']
df['High_low_percent']=(df['Adj. High']-df['Adj. Close'])/df['Adj. Close']

In [167]:
df=df[['Adj. Volume','Adj. Close','Change','High_low_percent']]

In [168]:
df.head()

Unnamed: 0_level_0,Adj. Volume,Adj. Close,Change,High_low_percent
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-08-19,44659000.0,50.322842,0.00325,0.037126
2004-08-20,22834300.0,54.322689,0.07227,0.007109
2004-08-23,18256100.0,54.869377,-0.012279,0.037294
2004-08-24,15247300.0,52.597363,-0.057264,0.064175
2004-08-25,9188600.0,53.164113,0.011837,0.018868


### Shifting out 1% data upwards

In [169]:
prediction_shift=int(0.01*len(df))

In [170]:
prediction_shift

34

In [171]:
temp_variable='Adj. Close'

### defining labels

In [172]:
df['labels']=df[temp_variable].shift(-prediction_shift)

In [173]:
df.head()

Unnamed: 0_level_0,Adj. Volume,Adj. Close,Change,High_low_percent,labels
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-08-19,44659000.0,50.322842,0.00325,0.037126,69.639972
2004-08-20,22834300.0,54.322689,0.07227,0.007109,69.078238
2004-08-23,18256100.0,54.869377,-0.012279,0.037294,67.839414
2004-08-24,15247300.0,52.597363,-0.057264,0.064175,68.912727
2004-08-25,9188600.0,53.164113,0.011837,0.018868,70.668146


In [174]:
from sklearn import preprocessing,cross_validation,svm
from sklearn.linear_model import LinearRegression

### Finally we define features and labels for prediction

In [175]:
import numpy as np

In [176]:
X=df[['Adj. Volume','Adj. Close','Change','High_low_percent']]
Y=df['labels']
# X=np.array(df.drop(['labels']))
# Y=np.array[df['labels']]

In [177]:
X.head()

Unnamed: 0_level_0,Adj. Volume,Adj. Close,Change,High_low_percent
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-08-19,44659000.0,50.322842,0.00325,0.037126
2004-08-20,22834300.0,54.322689,0.07227,0.007109
2004-08-23,18256100.0,54.869377,-0.012279,0.037294
2004-08-24,15247300.0,52.597363,-0.057264,0.064175
2004-08-25,9188600.0,53.164113,0.011837,0.018868


In [178]:
Y.head()

Date
2004-08-19    69.639972
2004-08-20    69.078238
2004-08-23    67.839414
2004-08-24    68.912727
2004-08-25    70.668146
Name: labels, dtype: float64

In [179]:
X=(X-X.max())/(X.max()-X.min())

In [180]:
X.head()

Unnamed: 0_level_0,Adj. Volume,Adj. Close,Change,High_low_percent
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-08-19,-0.459293,-0.999857,-0.47018,-0.667258
2004-08-20,-0.726655,-0.99634,-0.085441,-0.936283
2004-08-23,-0.78274,-0.995859,-0.55674,-0.665746
2004-08-24,-0.819599,-0.997857,-0.807498,-0.424828
2004-08-25,-0.89382,-0.997359,-0.422314,-0.830894


### Dropping nan

In [181]:


Y.dropna(inplace=True)
Y.isnull().sum()

0

In [182]:
print(len(X),len(Y))

3424 3390


### We are making sure that we have features only for the labels we had as we shifted the labels up by prediction_shift

In [183]:
X=X[:-prediction_shift]

In [184]:
print(len(X),len(Y))

3390 3390


In [185]:
print(X.isnull().sum(),Y.isnull().sum())

Adj. Volume         0
Adj. Close          0
Change              0
High_low_percent    0
dtype: int64 0


In [186]:
print(len(X),len(Y))

3390 3390


In [204]:
X_train=X[:2900]

In [205]:
Y_train=Y[:2900]

In [206]:
print(len(X_train),len(Y_train))

2900 2900


In [207]:
X_test=X[2901:]
Y_test=Y[2901:]

In [208]:
# X_train,X_test,Y_train,Y_test=cross_validation.train_test_split(X,Y)

In [209]:
clf=LinearRegression()

In [210]:
clf.fit(X_train,Y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [212]:
clf.score(X_test,Y_test)

0.81758819995925092