# Practical Machine Learning Tutorial with Python Introduction

https://pythonprogramming.net/machine-learning-tutorial-python-introduction/
This Jupyter Notebook contains student notes taken while following the course. The code here is written for Python 2.7, whereas the course author uses Python 3.

## Lesson 2: Regression - Intro and Data

In [2]:
import os

import pandas as pd
import quandl

In [3]:
# Download the GOOGL price history from Quandl's WIKI database.
# The API key is read from the QUANDL_API_KEY environment variable so that the
# secret never lives in the notebook itself. When the variable is unset, no
# token is passed at all and the request is made without credentials.
quandl_kwargs = {}
quandl_token = os.environ.get("QUANDL_API_KEY")
if quandl_token:
    quandl_kwargs["authtoken"] = quandl_token

df_orig = quandl.get("WIKI/GOOGL", **quandl_kwargs)
print(df_orig.head())

              Open    High     Low    Close      Volume  Ex-Dividend  \
Date                                                                   
2004-08-19  100.01  104.06   95.96  100.335  44659000.0          0.0   
2004-08-20  101.01  109.08  100.50  108.310  22834300.0          0.0   
2004-08-23  110.76  113.48  109.05  109.400  18256100.0          0.0   
2004-08-24  111.24  111.60  103.57  104.870  15247300.0          0.0   
2004-08-25  104.76  108.00  103.88  106.000   9188600.0          0.0   

            Split Ratio  Adj. Open  Adj. High   Adj. Low  Adj. Close  \
Date                                                                   
2004-08-19          1.0  50.159839  52.191109  48.128568   50.322842   
2004-08-20          1.0  50.661387  54.708881  50.405597   54.322689   
2004-08-23          1.0  55.551482  56.915693  54.693835   54.869377   
2004-08-24          1.0  55.792225  55.972783  51.945350   52.597363   
2004-08-25          1.0  52.542193  54.167209  52.100830   53.1

In [7]:
# Work on a deep copy so that df_orig stays untouched in memory; the notebook
# can then be re-run from this cell without querying Quandl again.
df = df_orig.copy(deep=True)

adj_high, adj_low = df['Adj. High'], df['Adj. Low']
adj_open, adj_close = df['Adj. Open'], df['Adj. Close']

# Two derived features, both expressed in percent:
#   HL_PCT     - the day's high/low spread relative to the low
#   PCT_change - the open-to-close move relative to the open
df['HL_PCT'] = (adj_high - adj_low) / adj_low * 100
df['PCT_change'] = (adj_close - adj_open) / adj_open * 100

# Only these four columns are carried forward.
keep_columns = ['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']
df = df[keep_columns]

print(df.head())

            Adj. Close    HL_PCT  PCT_change  Adj. Volume
Date                                                     
2004-08-19   50.322842  8.441017    0.324968   44659000.0
2004-08-20   54.322689  8.537313    7.227007   22834300.0
2004-08-23   54.869377  4.062357   -1.227880   18256100.0
2004-08-24   52.597363  7.753210   -5.726357   15247300.0
2004-08-25   53.164113  3.966115    1.183658    9188600.0


## Lesson 3: Regression - Features and Labels

In [210]:
import quandl
import math
import numpy as np
import pandas as pd
from sklearn import preprocessing, cross_validation, svm
from sklearn.linear_model import LinearRegression

In [211]:
# Column whose future value we want to predict.
forecast_col = 'Adj. Close'

# Replace every missing value with the sentinel -99999 (in place).
df.fillna(value=-99999, inplace=True)

print(df.head())
print(df.tail())

# forecast_out is 1% of the number of rows, rounded up by math.ceil and
# converted to an int (30 for the data shown in this notebook), i.e. the
# number of rows we predict into the future.
# shift(-forecast_out) moves the column up by that many rows, so the label at
# row 1 holds the Adj. Close of row 31, and so on. The last forecast_out rows
# have no future value and therefore get NaN as their label.
forecast_out = int(math.ceil(0.01 * len(df)))
# Fixed: this previously read from `df2`, which is never defined anywhere in
# the notebook and raises a NameError on a fresh kernel.
df['label'] = df[forecast_col].shift(-forecast_out)

print(df.head())
print(df.tail())

            Adj. Close    HL_PCT  PCT_change  Adj. Volume
Date                                                     
2004-08-19      50.170  8.441017    0.340000   44659000.0
2004-08-20      54.155  8.537313    7.227007   22834300.0
2004-08-23      54.700  4.062357   -1.218962   18256100.0
2004-08-24      52.435  7.753210   -5.726357   15247300.0
2004-08-25      53.000  3.966115    0.990854    9188600.0
            Adj. Close    HL_PCT  PCT_change  Adj. Volume
Date                                                     
2016-07-05      704.89  1.285884   -0.017021    1399205.0
2016-07-06      708.97  2.002861    1.304584    1428996.0
2016-07-07      707.26  1.355845   -0.401346    1053642.0
2016-07-08      717.78  1.382554    1.016100    1463884.0
2016-07-11      727.20  1.399971    1.081427    1430124.0
            Adj. Close    HL_PCT  PCT_change  Adj. Volume   label
Date                                                             
2004-08-19      50.170  8.441017    0.340000   44659000.

## Lesson 4: Training and testing

In [212]:
# Features: every column except the label, standardised with preprocessing.scale.
X = preprocessing.scale(np.array(df.drop(['label'], axis=1)))

# Split the scaled rows in one step:
#   X        - all rows except the last forecast_out (these have a label); used for training
#   X_lately - the last forecast_out rows (no label yet); used to make predictions against
X, X_lately = X[:-forecast_out], X[-forecast_out:]

# Drop the rows that contain NaN (in place) so that the labels line up with X.
df.dropna(inplace=True)

# Labels
y = np.array(df['label'])

# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2
)

In [213]:
# Fit one Support Vector Regression (SVR) model per kernel and print how well
# each one scores on the held-out test split.
for kernel_name in ('linear', 'poly', 'rbf', 'sigmoid'):
    clf = svm.SVR(kernel=kernel_name)
    clf.fit(X_train, y_train)

    # score the freshly trained model against the test data
    confidence = clf.score(X_test, y_test)
    print("%s %s" % (kernel_name, confidence))

linear 0.964266461789
poly 0.658263989065
rbf 0.800077964775
sigmoid -0.0639080456893


In [214]:
# Fit an ordinary linear regression; n_jobs=-1 is passed through to scikit-learn.
clf = LinearRegression(n_jobs=-1).fit(X_train, y_train)

# Score the fitted model on the held-out test split and show the result.
confidence = clf.score(X_test, y_test)
print(confidence)

0.965385598191


## Lesson 5: Predicting

In [215]:
# Predict a value for each of the most recent feature rows, i.e. the rows that
# have no known label.
# !!! clf is whichever model was fitted last above (the LinearRegression when
# the cells are run top to bottom) !!!
forecast_set = clf.predict(X_lately)
print(forecast_set)

[ 750.04687707  751.83169348  752.01554506  747.25393799  739.68293008
  733.62236884  735.10379677  746.08803972  745.99532983  736.67388275
  735.35288561  736.50955029  736.23525211  726.81143363  706.10602939
  709.61635713  712.5164007   714.50055157  717.40269343  686.70817454
  684.62011142  695.36918557  699.22069361  706.1710037   713.86804517
  708.81961134  711.83565101  711.32778352  721.09557574  730.31620768]


In [216]:
import datetime
import time
import matplotlib.pyplot as plt
from matplotlib import style

# Empty column that will receive the predicted prices; every existing row is NaN.
df['Forecast'] = np.nan

# Use matplotlib's 'ggplot' style sheet for the figures drawn below.
plt.style.use('ggplot')

In [217]:
# Anchor the forecast dates on the last row still present in df: the first
# prediction is stamped one day after it.
# NOTE(review): the trailing forecast_out rows were dropped from df earlier and
# each label is the price forecast_out rows ahead, so the predictions appear to
# describe dates after the end of the downloaded data rather than the days
# right after last_date -- confirm the intended alignment.
last_date = df.index[-1]

# Local-time midnight of that day, expressed as a Unix timestamp.
last_unix = time.mktime(
    datetime.date(last_date.year, last_date.month, last_date.day).timetuple()
)

one_day = 86400  # seconds in a day
next_unix = last_unix + one_day

In [218]:
# add forecasts to the existing dataframe
for i in forecast_set:
    next_date = datetime.datetime.fromtimestamp(next_unix)
    next_unix += 86400
    df.loc[next_date] = [np.nan for _ in range(len(df.columns)-1)] + [i]

print df.tail(forecast_out+5)

            Adj. Close    HL_PCT  PCT_change  Adj. Volume   label    Forecast
2016-05-20      721.71  1.774800    0.732769    1710030.0  704.89         NaN
2016-05-23      717.24  0.914986   -0.380566    1228165.0  708.97         NaN
2016-05-24      733.03  2.022525    1.830937    1879157.0  707.26         NaN
2016-05-25      738.10  0.995086    0.421769    1606551.0  717.78         NaN
2016-05-26      736.93  1.105048    0.126359    1283896.0  727.20         NaN
2016-05-27         NaN       NaN         NaN          NaN     NaN  750.046877
2016-05-28         NaN       NaN         NaN          NaN     NaN  751.831693
2016-05-29         NaN       NaN         NaN          NaN     NaN  752.015545
2016-05-30         NaN       NaN         NaN          NaN     NaN  747.253938
2016-05-31         NaN       NaN         NaN          NaN     NaN  739.682930
2016-06-01         NaN       NaN         NaN          NaN     NaN  733.622369
2016-06-02         NaN       NaN         NaN          NaN     Na

In [219]:
# Draw the historical adjusted close and the forecast on the same axes.
ax = df['Adj. Close'].plot()
df['Forecast'].plot(ax=ax)

ax.legend(loc=4)  # location code 4 is the lower-right corner
ax.set_xlabel('Date')
ax.set_ylabel('Price')
plt.show()

## Lesson 6: Scaling by using Pickle

In [221]:
import pickle

# Save the trained classifier to a binary file for later use.
with open('linearregression.pickle', 'wb') as f:
    pickle.dump(clf, f)

# Read it back from disk. The context manager closes the file handle, which
# the previous bare open() call left open.
# Caution: pickle.load can execute arbitrary code, so only load pickle files
# that you created yourself or otherwise trust.
with open('linearregression.pickle', 'rb') as pickle_in:
    clf = pickle.load(pickle_in)