# M1: direction ~ Lag1 + Lag2

In [None]:
'''
We want to predict whether the stock market will go up or down based on the returns from the previous few days of trading history.
We will fit two models then compare them on a test set.

create a training set and a test set

using logistic regression fit the following two models on the training set

M1: direction ~ Lag1 + Lag2
M2: direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume

output the beta coefficients for M1 and M2 (for M2 they should be close to those from ISLR)

evaluate M1 and M2 on the training and test set using the following three loss functions
L1: 0-1 loss (the fraction of incorrect predictions, i.e. the misclassification rate)
L2: cross entropy loss
L3: logistic loss
e.g. for each of M1 and M2 you should have a 2x3 table of [train, test] x [L1, L2, L3]
'''

In [1]:
# %load ../standard_import.txt
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score, log_loss
from sklearn import preprocessing
from sklearn import neighbors

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Turn off the HTML representation of DataFrames so they are shown as plain text in the notebook.
pd.set_option('display.notebook_repr_html', False)

# Draw matplotlib figures inline in the notebook and apply the 'seaborn-white' style sheet.
%matplotlib inline
plt.style.use('seaborn-white')


# Get dataset from R into desktop

In [2]:
# R commands, kept here as a plain string (they are not run from Python): they install and load
# the ISLR and xlsx packages and write the Smarket data set to an Excel file on the desktop.
'''
install.packages("ISLR")
library(ISLR)
install.packages("xlsx")
library(xlsx)

write.xlsx(Smarket, "c:/Users/Michael/Desktop/Smarket.xlsx")
'''

'\ninstall.packages("ISLR")\nlibrary(ISLR)\ninstall.packages("xlsx")\nlibrary(xlsx)\n\nwrite.xlsx(Smarket, "c:/Users/Michael/Desktop/Smarket.xlsx")\n'

# Load dataset

In [3]:
# Location of the Excel export of the Smarket data (written by the R snippet earlier in this notebook).
# NOTE: this is a machine-specific absolute path; point it at your own copy of Smarket.xlsx.
SMARKET_XLSX = "c:/Users/Michael/Desktop/Smarket.xlsx"

# Full data set; later cells slice it by Year to build the training and test sets.
df = pd.read_excel(SMARKET_XLSX)

# Preview only the first few rows instead of rendering the whole frame.
df.head()

      Year   Lag1   Lag2   Lag3   Lag4   Lag5   Volume  Today Direction
1     2001  0.381 -0.192 -2.624 -1.055  5.010  1.19130  0.959        Up
2     2001  0.959  0.381 -0.192 -2.624 -1.055  1.29650  1.032        Up
3     2001  1.032  0.959  0.381 -0.192 -2.624  1.41120 -0.623      Down
4     2001 -0.623  1.032  0.959  0.381 -0.192  1.27600  0.614        Up
5     2001  0.614 -0.623  1.032  0.959  0.381  1.20570  0.213        Up
6     2001  0.213  0.614 -0.623  1.032  0.959  1.34910  1.392        Up
7     2001  1.392  0.213  0.614 -0.623  1.032  1.44500 -0.403      Down
8     2001 -0.403  1.392  0.213  0.614 -0.623  1.40780  0.027        Up
9     2001  0.027 -0.403  1.392  0.213  0.614  1.16400  1.303        Up
10    2001  1.303  0.027 -0.403  1.392  0.213  1.23260  0.287        Up
11    2001  0.287  1.303  0.027 -0.403  1.392  1.30900 -0.498      Down
12    2001 -0.498  0.287  1.303  0.027 -0.403  1.25800 -0.189      Down
13    2001 -0.189 -0.498  0.287  1.303  0.027  1.09800  0.680   

In [37]:
row = df.loc[df['Year'] == 2001]
print row
print tuple(row.iloc[0].tolist())

print type(row.iloc[0].tolist())

     Year   Lag1   Lag2   Lag3   Lag4   Lag5   Volume  Today Direction
1    2001  0.381 -0.192 -2.624 -1.055  5.010  1.19130  0.959        Up
2    2001  0.959  0.381 -0.192 -2.624 -1.055  1.29650  1.032        Up
3    2001  1.032  0.959  0.381 -0.192 -2.624  1.41120 -0.623      Down
4    2001 -0.623  1.032  0.959  0.381 -0.192  1.27600  0.614        Up
5    2001  0.614 -0.623  1.032  0.959  0.381  1.20570  0.213        Up
6    2001  0.213  0.614 -0.623  1.032  0.959  1.34910  1.392        Up
7    2001  1.392  0.213  0.614 -0.623  1.032  1.44500 -0.403      Down
8    2001 -0.403  1.392  0.213  0.614 -0.623  1.40780  0.027        Up
9    2001  0.027 -0.403  1.392  0.213  0.614  1.16400  1.303        Up
10   2001  1.303  0.027 -0.403  1.392  0.213  1.23260  0.287        Up
11   2001  0.287  1.303  0.027 -0.403  1.392  1.30900 -0.498      Down
12   2001 -0.498  0.287  1.303  0.027 -0.403  1.25800 -0.189      Down
13   2001 -0.189 -0.498  0.287  1.303  0.027  1.09800  0.680        Up
14   2

In [39]:
df.loc[df['Year'] == 2001].values.tolist()

tuple_of_tuples = tuple(tuple(x) for x in df.values.tolist())[0]
print tuple_of_tuples

(2001L, 0.381, -0.192, -2.624, -1.055, 5.01, 1.1913, 0.959, u'Up')


# ---------------------------------------------------------------------------------------------------------------

# Logistic Regression on Training Set = All Years

In [4]:
y_train = df['Direction']

# dataset pre-2005--for training set
x_train = df[['Lag1', 'Lag2']]

clf = skl_lm.LogisticRegression(solver='newton-cg')
clf.fit(x_train, y_train)

print 'classes: ',clf.classes_
print 'coefficients: ',clf.coef_
print 'intercept :', clf.intercept_

classes:  [u'Down' u'Up']
coefficients:  [[-0.07132777 -0.04437961]]
intercept : [ 0.07424785]


# Acquire Probabilities of Direction == Up for Training Set = All Years

In [5]:
# predict_proba returns one row per observation in x_train and one column per class, in the
# order given by clf.classes_ (printed above as Down, Up): column 0 = Down, column 1 = Up.
prob = clf.predict_proba(x_train)

# Probability of the UP direction, kept as a single-column 2-D slice.
prob_up = prob[:,1:2]

# The same probabilities as a flat list of Python floats.
prob_up2 = prob_up[:, 0].tolist()

# Predict "Up" only when the Up probability is strictly above 0.5, otherwise "Down".
y_predicted = ["Up" if p > 0.5 else "Down" for p in prob_up2]

# L1: 0-1 Loss

In [6]:
right_prediction = [i for i,j in zip(y_train, y_predicted) if i==j]
number_right = len(right_prediction)
zero_one_loss = number_right/len(y_predicted)

# I had to get the percentage wrong; not the percentage correct (same as error rate in classification)
print "L1 (0-1 loss): ", 1-zero_one_loss

L1 (0-1 loss):  0.472


# L2: Cross Entropy Loss

In [7]:
# source: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html
#         https://en.wikipedia.org/wiki/Cross_entropy

# using sci-kit:
y_train2 = []
for i in y_train:
    if i=='Up':
        y_train2.append(1)
    else:
        y_train2.append(0)

print "L2 (Cross Entropy Loss) by sci-kit: ", log_loss(y_train2, prob_up2)

# manually:
def ln(x):
    return np.log(x)

cross_entropy_losses = [-i*ln(j)-(1-i)*ln(1-j) for i,j in zip(y_train2, prob_up2)]
print "L2 (Cross Entropy Loss) by manually: ", sum(cross_entropy_losses)/len(cross_entropy_losses)

L2 (Cross Entropy Loss) by sci-kit:  0.69136137137
L2 (Cross Entropy Loss) by manually:  0.69136137137


# L3: Logistic Loss with ln(2) vs. without ln(2)

In [8]:
# source (with ln(2)): https://en.wikipedia.org/wiki/Loss_functions_for_classification

# source (without ln(2)): https://people.eecs.berkeley.edu/~russell/classes/cs194/f11/lectures/CS194%20Fall%202011%20Lecture%2006.pdf
#                         https://github.com/JohnLangford/vowpal_wabbit/wiki/Loss-functions
#                         http://www.cs.cmu.edu/~yandongl/loss.html

logistic_losses = [(1/ln(2))*(ln(1+np.exp(-i*j))) for i,j in zip(y_train2, prob_up2)]        
print "L3 (Logistic Loss) with ln(2): ", sum(logistic_losses)/len(logistic_losses)

logistic_losses = [ln(1+np.exp(-i*j)) for i,j in zip(y_train2, prob_up2)]
print "L3 (Logistic Loss) without ln(2): ", sum(logistic_losses)/len(logistic_losses)

L3 (Logistic Loss) with ln(2):  0.830742496587
L3 (Logistic Loss) without ln(2):  0.575826819281


# ---------------------------------------------------------------------------------------------------------------

# Run Logistic Regression on Training Set (Pre-2005)

In [40]:
# dataset pre-2005--for training set
x_train = df[(df['Year'] != 2005)][['Lag1', 'Lag2']]

# directions pre-2005--for training set
y_train = df[(df['Year'] != 2005)]['Direction']

clf = skl_lm.LogisticRegression(solver='newton-cg')
clf.fit(x_train, y_train)

print 'classes: ',clf.classes_
print 'coefficients: ',clf.coef_
print 'intercept :', clf.intercept_

classes:  [u'Down' u'Up']
coefficients:  [[-0.05547007 -0.04436492]]
intercept : [ 0.0322169]


In [45]:
# random code testing
a = clf.coef_[0].tolist()
b = clf.intercept_.tolist()

print a
print b
print b+a

[-0.055470069397048495, -0.044364924475245905]
[0.032216904838310474]
[0.032216904838310474, -0.055470069397048495, -0.044364924475245905]


# Acquire Probabilities of Direction == Up for our Test Set (2005)

In [11]:
# Test rows: every observation whose Year is 2005.
is_test_row = df['Year'] == 2005
x_test = df.loc[is_test_row, ['Lag1', 'Lag2']]

# Actual 2005 directions, kept to compare against the predictions later.
# The labels are re-encoded from unicode to ASCII byte strings.
y_test_unicode = df.loc[is_test_row, 'Direction'].tolist()
y_test = [label.encode('ascii') for label in y_test_unicode]

# predict_proba returns one row per 2005 observation and one column per class, in the
# order given by clf.classes_ (printed above as Down, Up): column 0 = Down, column 1 = Up.
prob = clf.predict_proba(x_test)

# Probability of the UP direction in 2005, kept as a single-column 2-D slice.
prob_up = prob[:,1:2]

# The same probabilities as a flat list of Python floats.
prob_up2 = prob_up[:, 0].tolist()

# Predict "Up" only when the Up probability is strictly above 0.5, otherwise "Down".
y_predicted = ["Up" if p > 0.5 else "Down" for p in prob_up2]

# L1: 0-1 Loss

In [12]:
right_prediction = [i for i,j in zip(y_test, y_predicted) if i==j]
number_right = len(right_prediction)
zero_one_loss = number_right/len(y_predicted)

# I had to get the percentage wrong; not the percentage correct (same as error rate in classification)
print "L1 (0-1 loss): ", 1-zero_one_loss, " which matches the book!"

L1 (0-1 loss):  0.440476190476  which matches the book!


# L2: Cross Entropy Loss

In [13]:
# source: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html
#         https://en.wikipedia.org/wiki/Cross_entropy

# using sci-kit:
y_test2 = []
for i in y_test:
    if i=='Up':
        y_test2.append(1)
    else:
        y_test2.append(0)

print "L2 (Cross Entropy Loss) by sci-kit: ", log_loss(y_test2, prob_up2)

# manually:
def ln(x):
    return np.log(x)

cross_entropy_losses = [-i*ln(j)-(1-i)*ln(1-j) for i,j in zip(y_test2, prob_up2)]
print "L2 (Cross Entropy Loss) by manually: ", sum(cross_entropy_losses)/len(cross_entropy_losses)

L2 (Cross Entropy Loss) by sci-kit:  0.689785443022
L2 (Cross Entropy Loss) by manually:  0.689785443022


# L3 Logistic Loss with ln(2) vs. without ln(2)

In [14]:
# source (with ln(2)): https://en.wikipedia.org/wiki/Loss_functions_for_classification

# source (without ln(2)): https://people.eecs.berkeley.edu/~russell/classes/cs194/f11/lectures/CS194%20Fall%202011%20Lecture%2006.pdf
#                         https://github.com/JohnLangford/vowpal_wabbit/wiki/Loss-functions
#                         http://www.cs.cmu.edu/~yandongl/loss.html

logistic_losses = [(1/ln(2))*(ln(1+np.exp(-i*j))) for i,j in zip(y_test2, prob_up2)]        
print "L3 (Logistic Loss) with ln(2): ", sum(logistic_losses)/len(logistic_losses)

logistic_losses = [ln(1+np.exp(-i*j)) for i,j in zip(y_test2, prob_up2)]
print "L3 (Logistic Loss) without ln(2): ", sum(logistic_losses)/len(logistic_losses)

L3 (Logistic Loss) with ln(2):  0.820587590405
L3 (Logistic Loss) without ln(2):  0.568787974691


# ---------------------------------------------------------------------------------------------------------------

# Table

In [15]:
# Summary table of the losses printed by the cells above.
# NOTE: the numbers are copied by hand from those printed outputs, not recomputed here, so they
# must be re-transcribed whenever the loss cells are re-run and print different values.
column_order = ['Loss Functions', 'Train->Train (All)', 'Train->Test (2005)']

loss_functions = [
    'L1 (0-1 Loss)',
    'L2 (Cross Entropy Loss)',
    'L3 (Logistic Loss) without ln(2)',
    'L3 (Logistic Loss) with ln(2)',
]
# Model fitted on all years, evaluated on all years.
train_train_scores = [0.472, 0.69136137137, 0.575826819281, 0.830742496587]
# Model fitted on the non-2005 rows, evaluated on 2005.
train_test_scores = [0.440476190476, 0.689785443022, 0.568787974691, 0.820587590405]

# Map each column label to its values; `columns=` fixes the left-to-right display order.
data = dict(zip(column_order, [loss_functions, train_train_scores, train_test_scores]))
DF = pd.DataFrame(data, columns=column_order)

DF

                     Loss Functions  Train->Train (All)  Train->Test (2005)
0                     L1 (0-1 Loss)            0.472000            0.440476
1           L2 (Cross Entropy Loss)            0.691361            0.689785
2  L3 (Logistic Loss) without ln(2)            0.575827            0.568788
3     L3 (Logistic Loss) with ln(2)            0.830742            0.820588