# Imports

In [52]:
import pandas as pd
import quandl
import math
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Data import and preprocessing

In [4]:
df = quandl.get('WIKI/GOOGL')

In [5]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2004-08-19,100.01,104.06,95.96,100.335,44659000.0,0.0,1.0,50.159839,52.191109,48.128568,50.322842,44659000.0
2004-08-20,101.01,109.08,100.5,108.31,22834300.0,0.0,1.0,50.661387,54.708881,50.405597,54.322689,22834300.0
2004-08-23,110.76,113.48,109.05,109.4,18256100.0,0.0,1.0,55.551482,56.915693,54.693835,54.869377,18256100.0
2004-08-24,111.24,111.6,103.57,104.87,15247300.0,0.0,1.0,55.792225,55.972783,51.94535,52.597363,15247300.0
2004-08-25,104.76,108.0,103.88,106.0,9188600.0,0.0,1.0,52.542193,54.167209,52.10083,53.164113,9188600.0


You want features, but you need meaningful features, which actually bring info to our regression model.
You need to simplify your data as much as possible. Useless, correlated features can cause more problems than bring info.

In [6]:
df = df[['Adj. Open','Adj. High','Adj. Low','Adj. Close'
         ,'Adj. Volume']]

We want to keep features that have a meaningful relantionship. E.g.: Adj. High & Adj. Low tells us something about the volatility of the market that day. And, we wan to drop features that don't bring additional info.

In [7]:
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Close'] * 100.0
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0

In [37]:
df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]

In [38]:
df.head()

Unnamed: 0_level_0,Adj. Close,HL_PCT,PCT_change,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-08-19,50.322842,8.072956,0.324968,44659000.0
2004-08-20,54.322689,7.921706,7.227007,22834300.0
2004-08-23,54.869377,4.04936,-1.22788,18256100.0
2004-08-24,52.597363,7.657099,-5.726357,15247300.0
2004-08-25,53.164113,3.886792,1.183658,9188600.0


Features are like attributes that make up the labels, and labels are like predictions.

So, which column is the label and which are the features?

Adj. Close can be feature or none of the above. It could be a label if we had chosen other features b/c we wouldn't know the High - Low or percent change until the close had already occured. 

If you trained an algorithm to predict that value, it would be very biased.

What we'll do is take the last 10 values of Adj. and try to predict another value. Close and that's a feature, but that's for when we write the algorithm ourselves.

A label will be a future price, and the only column that fits is Adj. Price, but it's for the next day or for the next 5 days.

In [39]:
forecast_col = 'Adj. Close'

We can't use empty rows in ML, so we need to fill NaN's. The below method will treat such rows, examples as outliers.

In [40]:
df.fillna(-99999, inplace = True)

We try to predict out 10% of the dataframe. In reality, we get tomorrow's price, and next day's price. We're using data that came 10 days ago to predict today.

Float var can be played with.

In [41]:
forecast_out = int(math.ceil(0.01*len(df)))

We define the label and shifting the "rows" negatively, so up. This way each row's label column will be the Adjusted Close price 10 days into the future.

Our features are these attributes that, we consider, may cause the adjusted close price in 10 days to change. Actually, it's 10% of the timeframe.

In [42]:
df['label'] = df[forecast_col].shift(-forecast_out)

In [43]:
df.dropna(inplace = True)

In [44]:
df.head()

Unnamed: 0_level_0,Adj. Close,HL_PCT,PCT_change,Adj. Volume,label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-08-19,50.322842,8.072956,0.324968,44659000.0,69.639972
2004-08-20,54.322689,7.921706,7.227007,22834300.0,69.078238
2004-08-23,54.869377,4.04936,-1.22788,18256100.0,67.839414
2004-08-24,52.597363,7.657099,-5.726357,15247300.0,68.912727
2004-08-25,53.164113,3.886792,1.183658,9188600.0,70.668146


In [45]:
X = np.array(df.drop(['label'], 1))
y = np.array(df['label'])
X = preprocessing.scale(X)
y = np.array(df['label'])

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

**Using Linear Regression model**

In [56]:
clf = LinearRegression()

In [57]:
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [59]:
accuracy = clf.score(X_test, y_test)
accuracy

0.9769466420264561

In [60]:
print(forecast_out)

34


The accuracy is 97.7% for approx. 34 days in advance.

Caveats:

* Accuracy for Linear Regression is the mean squared error.
* It is *very high* for 34 days in advance using a simple module. Clearly, stock is more difficult to predict than this.
* Because the accuracy is defined as above, it's more like saying it's directionall accurate.

**Using Support Vector Machine's Regression model**

In [61]:
clf = svm.SVR()

In [62]:
clf.fit(X_train, y_train)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [63]:
accuracy = clf.score(X_test, y_test)
accuracy

0.813940117836997

The SVM model is a lot more inaccurate.