In [91]:
%matplotlib inline
import pandas as pd
import glob
import datetime as dt
import numpy as np
import theano.tensor as tt
import pymc3 as pm
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
sns.set_context('notebook')

In [3]:
data_folder = "data"
csvDataFiles = sorted(glob.glob(data_folder + "/*.csv"))
dataFiles = []
for dataFile in csvDataFiles:
    df = pd.read_csv(dataFile)
    dataFiles.append(df)

In [4]:
#The number of stocks we have in our data folder
print(len(dataFiles))

30


In [5]:
#This is what our data looks like
dataFiles[0].head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,10/2/2017,154.259995,154.449997,152.720001,153.809998,152.636734,18698800
1,10/3/2017,154.009995,155.089996,153.910004,154.479996,153.30162,16230300
2,10/4/2017,153.630005,153.860001,152.460007,153.479996,152.30925,20163800
3,10/5/2017,154.179993,155.440002,154.050003,155.389999,154.204681,21283800
4,10/6/2017,154.970001,155.490005,154.559998,155.300003,154.115372,17407600


In [6]:
#The dimensions of the stock data
dataFiles[0].shape

(124, 7)

In [7]:
ratings_file = "Ratings.xlsx"
rating_df = pd.read_excel(ratings_file)
rating_cols = rating_df.columns.tolist()
rating_rows = rating_df.index.values.tolist()[1:] # skip first row
rating_rows

[('Buy', 5.0),
 ('Outperform', 4.0),
 ('Hold', 3.0),
 ('Underperform', 2.0),
 ('Sell', 1.0)]

In [8]:
rating_cols

['Current Rating', '1 Month Ago', '2 Months Ago', '3 Months Ago']

In [9]:
#Lets store the ratings in a form we can understand
ratings_folder = "Ratings"
xlsxRatingFiles = sorted(glob.glob(ratings_folder + "/*.xlsx"))
ratingsFiles = []
for ratingFile in xlsxRatingFiles:
    rf = pd.read_excel(ratingFile, names=rating_cols, index=False)
    ratingsFiles.append(rf)

In [10]:
#This is what our ratings look like
ratingsFiles[0].head()

Unnamed: 0,Current Rating,1 Month Ago,2 Months Ago,3 Months Ago
5,6,6,7,8
4,0,0,0,0
3,5,5,4,3
2,0,0,0,0
1,0,0,0,0


In [11]:
#The dimensions of the ratings data -- 5 rows and 4 columns
ratingsFiles[0].shape

(5, 4)

In [12]:
#  assume ratings come at the end of the month
# rows are ratings
# columns are months
# entries are number of analysts that gave a stock a having a certain rating during that month
# get stock ROI for each month for each stock

start3MonthsAgo = dt.datetime(2018, 1, 1)
end2MonthsAgo = dt.datetime(2018, 1, 31)

start2MonthsAgo = dt.datetime(2018, 2, 1)
end1MonthAgo = dt.datetime(2018, 2, 28)

start1MonthAgo = dt.datetime(2018, 3, 1)
end1MonthAgo = dt.datetime(2018, 3, 29)

In [13]:
allPriceDifs = []
for idx, dataFile in enumerate(dataFiles):
# first select data from 3 months ago
    df = dataFile
    df['Date'] = pd.to_datetime(df['Date'])  
    threeMonthmask = (df['Date'] >= start3MonthsAgo) & (df['Date'] <= end2MonthsAgo)
    twoMonthmask = (df['Date'] >= start2MonthsAgo) & (df['Date'] <= end1MonthAgo)
    oneMonthmask = (df['Date'] >= start1MonthAgo) & (df['Date'] <= end1MonthAgo)
    dfLocList = [df.loc[threeMonthmask], df.loc[twoMonthmask], df.loc[oneMonthmask]]
    priceDifs = []
    for dfLoc in dfLocList:
        first_day = dfLoc.iloc[0]
        last_day  = dfLoc.iloc[-1]
        price_diff = last_day["Adj Close"] - first_day["Adj Close"]
        pct_dif = 100 * (price_diff / 2)
        priceDifs.append(pct_dif)
    allPriceDifs.append(priceDifs)

In [89]:
priceDifs = pd.DataFrame(data=allPriceDifs, columns=["jan_pct_dif", "feb_pct_dif", "mar_pct_dif"])
priceDifs.head()

Unnamed: 0,jan_pct_dif,feb_pct_dif,mar_pct_dif
0,-240.51895,34.06375,-361.00005
1,40.3927,-336.00005,-93.0
2,2862.3703,-1365.33355,-1090.49985
3,323.27655,-743.0,-142.49955
4,148.38525,59.4999,-45.5


In [90]:
ratings_january = [f["3 Months Ago"].mean() for f in ratingsFiles]
ratings_february = [f["2 Months Ago"].mean() for f in ratingsFiles]
ratings_march = [f["1 Month Ago"].mean() for f in ratingsFiles]

X_train = pd.DataFrame(data=list(zip(ratings_january, ratings_february)), columns=["jan_mean_rating", "feb_mean_rating"])
y_train = priceDifs[["feb_pct_dif", "jan_pct_dif"]]
X_test = pd.DataFrame(data=ratings_march, columns=["mar_mean_rating"])
y_test = priceDifs["mar_pct_dif"]

In [122]:
# http://docs.pymc.io/notebooks/GLM-logistic.html# The likelihood is the product of Bernoulli trials
# data = pd.concat([y_test, X_test], axis=1)
# with pm.Model() as logistic_model:
#     pm.glm.GLM.from_formula('mar_pct_dif ~ mar_mean_rating', data, family=pm.glm.families.Binomial())
#     trace_logistic_model = pm.sample(2000, chains=1, tune=1000)

In [144]:
# first stack x and y labels
## TODO: make X 30-dimensional so that we get 30 weights (flip rows and columns)
print(X_train.head())
reg = linear_model.BayesianRidge()
reg.fit(X_train["jan_mean_rating"].reshape(-1, 1), y_train["jan_pct_dif"].reshape(-1, 1))
reg.predict(X_test)


   jan_mean_rating  feb_mean_rating
0              2.2              2.2
1              1.4              1.6
2              1.8              1.8
3              1.4              1.4
4              1.6              1.6


array([324.64350562, 324.64353542, 324.64352052, 324.64355032,
       324.64355032, 324.64355032, 324.64355032, 324.64355032,
       324.64358012, 324.64353542, 324.64353542, 324.64352052,
       324.64349072, 324.64356522, 324.64350562, 324.64352052,
       324.64350562, 324.64359502, 324.64358012, 324.64352052,
       324.64352052, 324.64355032, 324.64356522, 324.64359502,
       324.64353542, 324.64358012, 324.64355032, 324.64352052,
       324.64353542, 324.64355032])

In [136]:
ard = linear_model.ARDRegression()
ard.fit(X_train.stack().values.reshape(-1, 1), y_train.stack().values.reshape(-1, 1))
ard.predict(X_test)

array([-75.64556155, -75.64607093, -75.64581624, -75.64632563,
       -75.64632563, -75.64632563, -75.64632563, -75.64632563,
       -75.64683501, -75.64607093, -75.64607093, -75.64581624,
       -75.64530686, -75.64658032, -75.64556155, -75.64581624,
       -75.64556155, -75.6470897 , -75.64683501, -75.64581624,
       -75.64581624, -75.64632563, -75.64658032, -75.6470897 ,
       -75.64607093, -75.64683501, -75.64632563, -75.64581624,
       -75.64607093, -75.64632563])