In [130]:
import pandas as pd
import numpy as np
import glob
import datetime as dt
import pandas_datareader.data as web
import quandl
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

In [131]:
data_folder = "data"
csvDataFiles = sorted(glob.glob(data_folder + "/*.csv"))
dataFiles = []
for dataFile in csvDataFiles:
    df = pd.read_csv(dataFile)
    dataFiles.append(df)

In [132]:
#The number of stocks we have in our data folder
print(len(dataFiles))

30


In [133]:
#Lets store the ratings in a form we can understand
ratings_folder = "Ratings"
xlsxRatingFiles = sorted(glob.glob(ratings_folder + "/*.xlsx"))
ratingsFiles = []
for ratingFile in xlsxRatingFiles:
    rf = pd.ExcelFile(ratingFile)
    ratingsFiles.append(rf.parse())

In [134]:
#The number of ratings files we have in our ratings folder
print(len(ratingsFiles))

30


In [160]:
start3MonthsAgo = dt.datetime(2017, 12, 28)
end2MonthsAgo = dt.datetime(2018, 1, 31)

start2MonthsAgo = dt.datetime(2018, 2, 1)
end1MonthAgo = dt.datetime(2018, 2, 28)

start1MonthAgo = dt.datetime(2018, 3, 1)
end1MonthAgo = dt.datetime(2018, 3, 29)

nasdaqData1MonthAgo = quandl.get("NASDAQOMX/COMP-NASDAQ", trim_start=start1MonthAgo, trim_end=end1MonthAgo)

In [161]:
nasdaqData1MonthAgo = nasdaqData1MonthAgo[['Index Value']]
nasdaqData1MonthAgo.head()

Unnamed: 0_level_0,Index Value
Trade Date,Unnamed: 1_level_1
2018-03-01,7180.56
2018-03-02,7257.87
2018-03-05,7330.7
2018-03-06,7372.01
2018-03-07,7396.65


In [162]:
firstDay = 0
initPrice = nasdaqData1MonthAgo.iloc[firstDay]['Index Value']

lastDay = nasdaqData1MonthAgo.shape[0] - 1
finalPrice = nasdaqData1MonthAgo.iloc[lastDay]['Index Value']

#Market growth
oneMonthAgoROI = (finalPrice - initPrice) / initPrice
print(oneMonthAgoROI)

-0.016310705571710396


In [165]:
#Let's do the same thing for the 2 earlier months
nasdaqData2MonthsAgo = quandl.get("NASDAQOMX/COMP-NASDAQ", trim_start=dt.datetime(2018, 2, 1), trim_end=dt.datetime(2018, 2, 28))
nasdaqData2MonthsAgo = nasdaqData2MonthsAgo[['Index Value']]

firstDay = 0
initPrice = nasdaqData2MonthsAgo.iloc[firstDay]['Index Value']

lastDay = nasdaqData2MonthsAgo.shape[0] - 1
finalPrice = nasdaqData2MonthsAgo.iloc[lastDay]['Index Value']

#Market growth
twoMonthsAgoROI = (finalPrice - initPrice) / initPrice
print(twoMonthsAgoROI)
nasdaqData2MonthsAgo.shape

-0.015279195652232706


(19, 1)

In [139]:
#And for 3rd month ago
nasdaqData3MonthsAgo = quandl.get("NASDAQOMX/COMP-NASDAQ", trim_start=start3MonthsAgo, trim_end=end2MonthsAgo)
nasdaqData23MonthsAgo = nasdaqData3MonthsAgo[['Index Value']]

firstDay = 0
initPrice = nasdaqData3MonthsAgo.iloc[firstDay]['Index Value']

lastDay = nasdaqData3MonthsAgo.shape[0] - 1
finalPrice = nasdaqData3MonthsAgo.iloc[lastDay]['Index Value']

#Market growth
threeMonthsAgoROI = (finalPrice - initPrice) / initPrice
print(threeMonthsAgoROI)

0.06637545034934443


In [140]:
#Let's see how to professionals' ratings for specific stocks compared to the market returns

#The NASDAQ growth looks like this:
#01/01/18 - 01/31/18 : 5.7740227490045516 %
#02/01/18 - 02/28/18 : -4.695958162202421 %
#03/01/18 - 03/29/18 : -1.6310705571710396 %

#We don't need the current rating, so we'll drop it from each of the ratings data frames
#Then, we can calculate the avg rating for one month ago for each stock

avgRating1MonthAgo = []
avgRating2MonthsAgo = []
avgRating3MonthsAgo = []

    #for ratingFile in ratingsFiles:

        #remove current rating
        #del ratingFile[0]

for ratingFile in ratingsFiles:
    avgRating = 0
    xMonthAgo = 1
    totalReviewers = 0

    #Aggreate the ratings for the month
    #ratingFile.iloc[5 - rating][month]
    for index in range(len(ratingFile)):
        rating = 5 - index
        reviewers = ratingFile.iloc[index][xMonthAgo]
        totalReviewers += reviewers
        avgRating += rating * reviewers

        #set reviewers to 0 for next iteration
        reviewers = 0
    avgRating /= float(totalReviewers)
    avgRating1MonthAgo.append(avgRating)

#same thing for the 2nd month ago
for ratingFile in ratingsFiles:
    avgRating = 0
    xMonthAgo = 2
    totalReviewers = 0

    #Aggreate the ratings for the month
    #ratingFile.iloc[5 - rating][month]
    for index in range(len(ratingFile)):
        rating = 5 - index
        reviewers = ratingFile.iloc[index][xMonthAgo]
        totalReviewers += reviewers
        avgRating += rating * reviewers

        #set reviewers to 0 for next iteration
        reviewers = 0
    avgRating /= float(totalReviewers)
    avgRating2MonthsAgo.append(avgRating)

#And again for the 3rd month ago
for ratingFile in ratingsFiles:
    avgRating = 0
    xMonthAgo = 3
    totalReviewers = 0

    #Aggreate the ratings for the month
    #ratingFile.iloc[5 - rating][month]
    for index in range(len(ratingFile)):
        rating = 5 - index
        reviewers = ratingFile.iloc[index][xMonthAgo]
        totalReviewers += reviewers
        avgRating += rating * reviewers

        #set reviewers to 0 for next iteration
        reviewers = 0
    avgRating /= float(totalReviewers)
    avgRating3MonthsAgo.append(avgRating)
    

In [141]:
#Now, let's create a data frame with 2 dimensions: the average ranking at the beginning of the month for each specific stock
#and the sign of the difference of the NASDAQ growth and the stock growth

stockROIsMarch = []
#This is for the month of March
for index in range(len(dataFiles)):
    
    stockROI = 0
    
    firstRowIndex = 103
    openIndex = 1
    March1Open = dataFiles[index].iloc[firstRowIndex][openIndex]

    lastRowIndex = dataFiles[index].shape[0] - 1
    closeIndex = 4
    March29Close = dataFiles[index].iloc[lastRowIndex][closeIndex]
    
    stockROI = (March29Close - March1Open) / March1Open
    stockROIsMarch.append(stockROI)
    
stockROIsFebruary = []
#now for feb
for index in range(len(dataFiles)):
    
    stockROI = 0
    
    firstRowIndex = 84
    openIndex = 1
    Feb1Open = dataFiles[index].iloc[firstRowIndex][openIndex]

    lastRowIndex = 102
    closeIndex = 4
    Feb28Close = dataFiles[index].iloc[lastRowIndex][closeIndex]
    
    stockROI = (Feb28Close - Feb1Open) / Feb1Open
    stockROIsFebruary.append(stockROI)
    
stockROIsJanuary = []
#Now for January -- ** Note, market opens on the 2nd of January, not the 1st
for index in range(len(dataFiles)):
    
    stockROI = 0
    
    firstRowIndex = 63
    openIndex = 1
    Jan2Open = dataFiles[index].iloc[firstRowIndex][openIndex]

    lastRowIndex = 83
    closeIndex = 4
    Jan31Close = dataFiles[index].iloc[lastRowIndex][closeIndex]
    
    stockROI = (Jan31Close - Jan2Open) / Jan2Open
    stockROIsJanuary.append(stockROI)


In [142]:
#Using the stockROIs for each month, let's create 3 data frames--one for each month--that includes:
#the avg rating with a label that's positive if the stock beat the market and negative otherwise

#Subtract NASDAQ returns from stocks
didStockBeatMarket3MonthsAgo = list(stockROIsJanuary)
didStockBeatMarket3MonthsAgo[:] = [stockROI - threeMonthsAgoROI for stockROI in didStockBeatMarket3MonthsAgo]

didStockBeatMarket2MonthsAgo = list(stockROIsFebruary)
didStockBeatMarket2MonthsAgo[:] = [stockROI - twoMonthsAgoROI for stockROI in didStockBeatMarket2MonthsAgo]

didStockBeatMarket1MonthAgo = list(stockROIsMarch)
didStockBeatMarket1MonthAgo[:] = [stockROI - oneMonthAgoROI for stockROI in didStockBeatMarket1MonthAgo]

#Turn ratings' arrays into data frame
ratingsMarch = pd.DataFrame(np.array(avgRating1MonthAgo).reshape(30,1))
ratingsFebruary = pd.DataFrame(np.array(avgRating2MonthsAgo).reshape(30,1))
ratingsJanuary = pd.DataFrame(np.array(avgRating3MonthsAgo).reshape(30,1))

resultsMarch = pd.DataFrame(np.array(didStockBeatMarket1MonthAgo).reshape(30,1))
resultsFebruary = pd.DataFrame(np.array(didStockBeatMarket2MonthsAgo).reshape(30,1))
resultsJanuary = pd.DataFrame(np.array(didStockBeatMarket3MonthsAgo).reshape(30,1))

#Apply a sign function to the labels
resultsMarch[resultsMarch > 0] = 1
resultsMarch[resultsMarch < 0] = -1

In [143]:
#And again
resultsFebruary[resultsFebruary > 0] = 1
resultsFebruary[resultsFebruary < 0] = -1

resultsJanuary[resultsJanuary > 0] = 1
resultsJanuary[resultsJanuary < 0] = -1

In [144]:
#create the data frame for each month -- include rating and the label indicating whether the stock outperformed market
dfMarch=pd.concat([ratingsMarch, resultsMarch], axis = 1)

In [145]:
#Create the full data frames for the other months too
dfFeb=pd.concat([ratingsFebruary, resultsFebruary], axis = 1)
dfJan=pd.concat([ratingsJanuary, resultsJanuary], axis = 1)


In [146]:
dataFiles[19][60:65]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
60,2017-12-27,85.650002,85.980003,85.220001,85.709999,85.309265,14678000
61,2017-12-28,85.900002,85.93,85.550003,85.720001,85.319214,10594300
62,2017-12-29,85.629997,86.050003,85.5,85.540001,85.14006,18717400
63,2018-01-02,86.129997,86.309998,85.5,85.949997,85.548134,22483800
64,2018-01-03,86.059998,86.510002,85.970001,86.349998,85.946266,26061400


In [201]:
#Create extra features -- growth the day b4 and stock market growth the day b4
nasdaqData = nasdaqData3MonthsAgo
nasdaqData = nasdaqData.append(nasdaqData2MonthsAgo)
nasdaqData = nasdaqData.append(nasdaqData1MonthAgo)
nasdaqData.head()

Unnamed: 0_level_0,Dividend Market Value,High,Index Value,Low,Total Market Value
Trade Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-12-28,1218903000.0,6954.8,6950.16,6936.75,10591590000000.0
2017-12-29,111289200.0,6954.98,6903.39,6903.39,10519760000000.0
2018-01-02,737365900.0,7006.91,7006.9,6924.08,10670660000000.0
2018-01-03,3703689.0,7069.15,7065.53,7016.7,10788420000000.0
2018-01-04,1565174000.0,7098.05,7077.91,7072.38,10807380000000.0


In [230]:
#reset indices on nasdaqData to match dataFiles indices
nasdaqData2 = nasdaqData.copy()
nasdaqData2.index = range(61,124)


dataWOFirstMonths = []
for dataFile in dataFiles:
    
    #remove adjusted and volume columns and drop data b4 12/29/17
    
    update = dataFile.iloc[61:]
    update = update.drop(update.columns[[5,6]], axis=1)
    update["Stock_Yesterday_Growth"] = "Growth"
    update["Market_Yesterday_Growth"] = "Growth"
    update["DidStockOutperformToday_LABEL"] = "DidOutPerform"
    
    prevClose = 0
    prevPrevClose = 0
    prevMarketClose = 0
    prevPrevMarketClose = 0
    
    for index, row in update.iterrows():
        if(index > 62):
            update.set_value(index, "Stock_Yesterday_Growth", (prevClose - prevPrevClose)/prevPrevClose)
            update.set_value(index, "Market_Yesterday_Growth", (prevMarketClose - prevPrevMarketClose)/prevPrevMarketClose)
            
            stockGrowthToday = (row["Close"] - row["Open"])/row["Open"]
            marketGrowthToday = (nasdaqData2.loc[index]['Index Value'] - prevMarketClose)/prevMarketClose
            #Update whether we outperformed or not
            update.set_value(index, "DidStockOutperformToday_LABEL", int(stockGrowthToday >= marketGrowthToday))
            
            prevPrevClose = prevClose
            prevClose = row["Close"]
            
            prevPrevMarketClose = prevMarketClose
            prevMarketClose = nasdaqData2.loc[index]['Index Value']
        elif(index == 61):
            prevPrevClose = row["Close"]
            prevPrevMarketClose = nasdaqData2.loc[index]['Index Value']
        elif(index == 62):
            prevClose = row["Close"]
            prevMarketClose = nasdaqData2.loc[index]['Index Value']
    
    dataWOFirstMonths.append(update)



In [231]:
dataWOFirstMonths[4].head()

Unnamed: 0,Date,Open,High,Low,Close,Stock_Yesterday_Growth,Market_Yesterday_Growth,DidStockOutperformToday_LABEL
61,2017-12-28,38.73,38.73,38.450001,38.59,Growth,Growth,DidOutPerform
62,2017-12-29,38.41,38.619999,38.299999,38.299999,Growth,Growth,DidOutPerform
63,2018-01-02,38.669998,38.950001,38.43,38.860001,-0.00751493,-0.00672934,0
64,2018-01-03,38.720001,39.279999,38.529999,39.169998,0.0146215,0.0149941,1
65,2018-01-04,39.049999,39.540001,38.93,38.990002,0.00797728,0.00836747,0


Unnamed: 0,Date,Open,High,Low,Close,Stock_Yesterday_Growth,Market_Yesterday_Growth,DidStockOutperformToday_LABEL
61,12/28/2017,171.0,171.850006,170.479996,171.080002,Growth,Growth,DidOutPerform
62,12/29/2017,170.520004,170.589996,169.220001,169.229996,Growth,Growth,DidOutPerform
63,1/2/2018,170.160004,172.300003,169.259995,172.259995,-0.0108137,-0.00672934,0
64,1/3/2018,172.529999,174.550003,171.960007,172.229996,0.0179046,0.0149941,0
65,1/4/2018,172.539993,173.470001,172.080002,173.029999,-0.00017415,0.00836747,1


In [236]:
#Let's try GP's 
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import Matern, RBF

GP_RBF = GaussianProcessClassifier(kernel = 1.0 * RBF(length_scale=1.0))
GP_Matern = GaussianProcessClassifier(kernel = Matern(length_scale=2, nu=3/2))



In [238]:
dataWOFirstMonths[0].head()

Unnamed: 0,Date,Open,High,Low,Close,Stock_Yesterday_Growth,Market_Yesterday_Growth,DidStockOutperformToday_LABEL
61,12/28/2017,171.0,171.850006,170.479996,171.080002,Growth,Growth,DidOutPerform
62,12/29/2017,170.520004,170.589996,169.220001,169.229996,Growth,Growth,DidOutPerform
63,1/2/2018,170.160004,172.300003,169.259995,172.259995,-0.0108137,-0.00672934,0
64,1/3/2018,172.529999,174.550003,171.960007,172.229996,0.0179046,0.0149941,0
65,1/4/2018,172.539993,173.470001,172.080002,173.029999,-0.00017415,0.00836747,1


In [239]:
dataWOFirstMonths[0].tail()

Unnamed: 0,Date,Open,High,Low,Close,Stock_Yesterday_Growth,Market_Yesterday_Growth,DidStockOutperformToday_LABEL
119,3/23/2018,168.389999,169.919998,164.940002,164.940002,-0.0141297,-0.0243163,1
120,3/26/2018,168.070007,173.100006,166.440002,172.770004,-0.0231567,-0.0242804,0
121,3/27/2018,173.679993,175.149994,166.919998,168.339996,0.0474718,0.032587,0
122,3/28/2018,167.25,170.020004,165.190002,166.479996,-0.0256411,-0.0293233,1
123,3/29/2018,167.809998,171.75,166.899994,167.779999,-0.0110491,-0.00850073,0


In [303]:
firstFile = dataWOFirstMonths[0].iloc[2:]
#test = #[63,113]

#Split data and test and second half of march
train = firstFile.iloc[:50]
test = firstFile.iloc[50:]

In [304]:
train

Unnamed: 0,Date,Open,High,Low,Close,Stock_Yesterday_Growth,Market_Yesterday_Growth,DidStockOutperformToday_LABEL
63,1/2/2018,170.160004,172.300003,169.259995,172.259995,-0.0108137,-0.00672934,0
64,1/3/2018,172.529999,174.550003,171.960007,172.229996,0.0179046,0.0149941,0
65,1/4/2018,172.539993,173.470001,172.080002,173.029999,-0.00017415,0.00836747,1
66,1/5/2018,173.440002,175.369995,173.050003,175.0,0.00464497,0.00175217,1
67,1/8/2018,174.350006,175.610001,173.929993,174.350006,0.0113853,0.00828634,0
68,1/9/2018,174.550003,175.059998,173.410004,174.330002,-0.00371425,0.00291877,0
69,1/10/2018,173.160004,174.300003,173.0,174.289993,-0.000114735,0.00086484,1
70,1/11/2018,174.589996,175.490005,174.490005,175.279999,-0.000229502,-0.00139735,0
71,1/12/2018,176.179993,177.360001,175.649994,177.089996,0.00568022,0.0081372,0
72,1/16/2018,177.899994,179.389999,176.139999,176.190002,0.0103263,0.00683326,0


In [305]:
test

Unnamed: 0,Date,Open,High,Low,Close,Stock_Yesterday_Growth,Market_Yesterday_Growth,DidStockOutperformToday_LABEL
113,3/15/2018,178.5,180.240005,178.070007,178.649994,-0.00850141,-0.00189056,1
114,3/16/2018,178.649994,179.119995,177.619995,178.020004,0.00117682,-0.00201019,0
115,3/19/2018,177.320007,177.470001,173.660004,175.300003,-0.00352639,3.34147e-05,1
116,3/20/2018,175.240005,176.800003,174.940002,175.240005,-0.0152792,-0.0184109,0
117,3/21/2018,175.039993,175.089996,171.259995,171.270004,-0.000342259,0.00273139,0
118,3/22/2018,170.0,172.679993,168.600006,168.850006,-0.0226547,-0.00258137,1
119,3/23/2018,168.389999,169.919998,164.940002,164.940002,-0.0141297,-0.0243163,1
120,3/26/2018,168.070007,173.100006,166.440002,172.770004,-0.0231567,-0.0242804,0
121,3/27/2018,173.679993,175.149994,166.919998,168.339996,0.0474718,0.032587,0
122,3/28/2018,167.25,170.020004,165.190002,166.479996,-0.0256411,-0.0293233,1


In [306]:
y_train = train.iloc[:,-1]
train = train.drop(train.columns[[7]], axis=1)

y_test = test.iloc[:,-1]
test = test.drop(test.columns[[7]], axis=1)
test
y_train

63     0
64     0
65     1
66     1
67     0
68     0
69     1
70     0
71     0
72     0
73     1
74     0
75     0
76     0
77     0
78     0
79     0
80     0
81     0
82     1
83     1
84     1
85     0
86     1
87     1
88     0
89     1
90     0
91     1
92     1
93     1
94     1
95     1
96     0
97     0
98     1
99     0
100    1
101    1
102    1
103    0
104    1
105    0
106    0
107    0
108    1
109    0
110    1
111    0
112    0
Name: DidStockOutperformToday_LABEL, dtype: object

In [442]:
train = train.drop('Date', 1)
test = test.drop('Date',1)
GP_RBF.fit(train.as_matrix(), y_train.to_frame().as_matrix())
#RBF_Test = accuracy_score(y_test.to_frame().as_matrix(), GP_RBF.predict(test.as_matrix()))
#RBF_Test

(50, 6)


In [480]:
import plotly.plotly as py
import plotly.graph_objs as go

def data_to_plotly(x):
    k = []
    
    for i in range(0, len(x)):
        k.append(x[i][0])
        
    return k


p1 = go.Scatter(x=data_to_plotly(train.as_matrix()), y=data_to_plotly(y_train.to_frame().as_matrix()), 
                mode='markers',
                line=dict(color='red', dash='dot'),
                name='classification of stock results')
data = [p1]
layout = go.Layout(xaxis=dict(title='<i>x</i>'),
                   yaxis=dict(title='classification of stock results'),
                  )
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [462]:
#RBF_Test = accuracy_score(y_test.to_frame().as_matrix().astype(int), GP_RBF.predict(test.as_matrix()))
#RBF_Test

trainCopy = train.copy()
y_trainCopy = y_train.copy()

count = 0
correctPred = 0

for index, row in test.iterrows():
    
    count = count + 1
    y_test_val = y_test.loc[index]
    
    #Keep track of the amount correct
    pred_X =  row.to_frame().as_matrix().T
    correctPred += int(GP_RBF.predict(pred_X) != y_test_val)

    trainCopy = trainCopy.append(row)
    y_trainCopy = y_trainCopy.append(pd.Series(y_test[index]))
    
    #Fit GP w/ new row
    GP_RBF.fit(trainCopy, y_trainCopy)
print(correctPred/float(count))

0.454545454545


In [397]:
trainCopy.head()

Unnamed: 0,Open,High,Low,Close,Stock_Yesterday_Growth,Market_Yesterday_Growth
63,170.160004,172.300003,169.259995,172.259995,-0.0108137,-0.00672934
64,172.529999,174.550003,171.960007,172.229996,0.0179046,0.0149941
65,172.539993,173.470001,172.080002,173.029999,-0.00017415,0.00836747
66,173.440002,175.369995,173.050003,175.0,0.00464497,0.00175217
67,174.350006,175.610001,173.929993,174.350006,0.0113853,0.00828634
