In [73]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from matplotlib import style
import statsmodels.api as sm
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from os import system 
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import plotly.plotly as py
import plotly.graph_objs as go
py.sign_in('kaushik.316', '86eaqb1c8w')
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
style.use('ggplot')

### Data Retrieval

The code written to obtain the data is in dead_cat.py and yahoo_finance.py. An abridged explanation of the process used to obtain data is written below 

I started by collecting instances of Dead Cat bounces and the date that the bounce occurred. I was able to compile a list of around 90 positives (The final number used was around 64 because some of these were not in the Quandl database).  I stored these in a dataframe along with the start date (how far back I wanted statistics for) which was always one year prior to the 'Bounce' date. 
I then made a list with the tickers of each stock. Looping through the list, I made a call to the Quandl API to obtain historical trading data for one year for each stock. The below functions make the call for each ticker (which returns about 256 rows of trading data for each stock) and then appends each one to a large csv file.

In [None]:
"""
def api_call(ticker, start, end):
    prices_df = quandl.get('WIKI/'+ticker, authtoken=api_key, start_date=start, end_date=end)
    prices_df.index = pd.to_datetime(prices_df.index, infer_datetime_format=True)
    prices_df['Ticker'] = ticker
    prices_df.to_csv('trading_data.csv', mode='a', header=False)

def get_DCBprices(array):
	for row in array:
		try:
			ticker = str(row[1])
			start = str(row[2].strftime("%Y-%m-%d"))
			end = str(row[0].strftime("%Y-%m-%d"))
			print ticker + ' ' + start + ' ' + end
			api_call(ticker, start, end)
		except:
			print "Ticker does not exist in database"
""""""

I then calculated the short interest for each stock, along with the change in closing price from high for the year, change from low for the year and the change from the 50 day moving average using the following functions.

In [None]:
"""
def get_short_interest(array):
    for row in array:
        ticker = str(row[1])
        if ticker in final_DCBlist:
            try:
            	end = str(row[0].strftime("%Y-%m-%d"))			
            	short_df = quandl.get('SI/'+ticker+'_SI', authtoken=api_key, end_date=end)
                short_df = short_df.tail(1)
                short_df['Ticker'] = ticker
                short_df.index = pd.to_datetime(short_df.index, infer_datetime_format=True)
                short_df.to_csv('shortInterest_data.csv', mode='a', header=False)
                print ticker 
            except:
                print ticker + "Does not exist in database"
"""


In [None]:
"""
# Function to get the % change from Yr High, Yr Low and 50 day mean
def get_DCBstats(df):
	global change_from_high, change_from_low, change_from_50dMean
	High = df.Close.max()
	current_close = df.iat[-3,4]
	change_from_high = (((current_close-High)/High)*100)
	print change_from_high 

	Low = df.Close.min()
	change_from_low = (((current_close-Low)/Low)*100)

	fifty_daydf = df.tail(52)
	fifty_day_series = fifty_daydf.Close[0:49]
	Mean = fifty_day_series.mean()
	change_from_50dMean = (((current_close-Mean)/Mean)*100)
	print change_from_50dMean


# Calculate 30d average trading volume, need this to calculate short ratio.
def get_avgvolume(df):
	global volume_mean
	vseries = df.Volume.tail(32)
	volume_series = vseries[0:29]
	volume_mean = volume_series.mean()
	print volume_mean
"""

After creating a consolidated csv for all the Dead cat bounce stocks with the metrics I wanted, I went about obtaining the same metrics for stocks that had not shown dead cat bounce behavior (the negatives). I created a long ticker string of over 300 stocks and got the data from Yahoo Finance.

In [None]:
"""
def write_stats_toCSV(tickerstr, csvfile, codes):
	r = requests.get('http://finance.yahoo.com/d/quotes.csv?s='+ tickerstr + '&' +'f=' + 'codes')
	csvfile.write(r.text)
	testing123.close()

"""

I then combined the data to create a csv file named CONSOLIDATED_DCB_DATA.csv that contained the relevant information for both Dead Cat Bounce stocks (the first 64 or so rows) and Non Dead Cat Bounce stocks (the next 360 or so rows)

### Reading in the Data

In [74]:
project_df = pd.read_csv('CONSOLIDATED_DCB_DATA.csv')
project_df = project_df.drop(project_df.columns[[0]],axis=1)
project_df = project_df[np.isfinite(project_df['Chg_from_50davg'])]
project_df.count()

Ticker             425
Chg_from_Hi        425
Chg_from_Lo        425
Chg_from_50davg    425
Short_Ratio        425
Is_Dead_Cat        425
dtype: int64

In [75]:
# Statsmodels requires the creation of an intercept
project_df['Intercept'] = 1
project_df.head()


Unnamed: 0,Ticker,Chg_from_Hi,Chg_from_Lo,Chg_from_50davg,Short_Ratio,Is_Dead_Cat,Intercept
0,USCR,-7.649554,72.938921,1.453999,4.083659,1,1
1,SRPT,-45.872218,110.614525,23.350694,3.639042,1,1
2,SPWR,-49.756256,14.60341,-8.774085,7.637297,1,1
3,CMA,-23.00885,29.01354,-8.893328,3.340925,1,1
4,AKS,-8.971963,166.120219,13.191348,2.565646,1,1


### Visualizations

In [76]:
DCB_df = project_df[project_df['Is_Dead_Cat']==1]
nonDCB_df = project_df[project_df['Is_Dead_Cat']==0]

#### Distribution

In [77]:
from plotly.tools import FigureFactory as FF

x1 = DCB_df['Chg_from_Hi']
x2 = DCB_df['Chg_from_Lo'] 
x3 = DCB_df['Chg_from_50davg']
x4 = DCB_df['Short_Ratio']

hist_data = [x1,x2,x3,x4]
group_labels = ['Chg_from_Hi','Chg_from_Lo','Chg_from_50davg','Short_Ratio']
#fig = FF.create_distplot(hist_data, group_labels, bin_size=.2)
fig = FF.create_distplot(hist_data, group_labels, show_hist=False)

py.iplot(fig, filename='DCB Distplot with Multiple Datasets', validate=False)

#### Mean Comparison

In [78]:
mean_cols = ['Chg_from_Hi','Chg_from_Lo','Chg_from_50davg','Short_Ratio']
DCB_means = DCB_df[mean_cols].mean()
nonDCB_means = nonDCB_df[mean_cols].mean()

x1 = go.Bar(
    x= mean_cols,
    y=DCB_means,
    name='DCB Stocks'
)

x2 = go.Bar(
    x=mean_cols,
    y=nonDCB_means,
    name='nonDCB stocks'
)

data = [x1,x2]
layout = go.Layout(
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='DCB mean grouped-bar', title="Averages")

#### Median Comparison

In [79]:
median_cols = ['Chg_from_Hi','Chg_from_Lo','Chg_from_50davg','Short_Ratio']
DCB_medians = DCB_df[median_cols].median()
nonDCB_medians = nonDCB_df[median_cols].median()

m1 = go.Bar(
    x= median_cols,
    y=DCB_medians,
    name='nonDCB Stocks'
)

m2 = go.Bar(
    x=median_cols,
    y=nonDCB_medians,
    name='nonDCB stocks'
)

data = [m1,m2]
layout = go.Layout(
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='DCB median grouped-bar')

### Creating the Training and Test Dataset

 I used approximately 70% of the data as the training dataset, consisting of almost 300 values of which 39 were DCB stocks. The test dataset contained about 127 values of which 23 were DCB stocks

#### Training Dataset

In [80]:
train_positives = project_df.iloc[0:39,:]
train_negatives = project_df[project_df['Is_Dead_Cat']==0].iloc[0:259,:]
df_train = train_positives.append(train_negatives, ignore_index=True)
df_train.count()

Ticker             298
Chg_from_Hi        298
Chg_from_Lo        298
Chg_from_50davg    298
Short_Ratio        298
Is_Dead_Cat        298
Intercept          298
dtype: int64

In [81]:
df_train['Is_Dead_Cat'].value_counts()

0    259
1     39
Name: Is_Dead_Cat, dtype: int64

#### Test Dataset

In [82]:
test_positives = project_df.iloc[39:62]
test_negatives = project_df[project_df['Is_Dead_Cat']==0].iloc[259:,:]
df_test = test_positives.append(test_negatives, ignore_index=True)
df_test.count()

Ticker             127
Chg_from_Hi        127
Chg_from_Lo        127
Chg_from_50davg    127
Short_Ratio        127
Is_Dead_Cat        127
Intercept          127
dtype: int64

In [83]:
df_test['Is_Dead_Cat'].value_counts()

0    104
1     23
Name: Is_Dead_Cat, dtype: int64

### Logistic Regression Model (StatsModels)

In [84]:
x_cols = ['Chg_from_Hi','Chg_from_Lo','Chg_from_50davg','Short_Ratio','Intercept']
x_train = df_train[x_cols]
y_train = df_train['Is_Dead_Cat']

In [85]:
logit = sm.Logit(y_train, x_train)
DCB_model = logit.fit()
print DCB_model.summary()

Optimization terminated successfully.
         Current function value: 0.334720
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:            Is_Dead_Cat   No. Observations:                  298
Model:                          Logit   Df Residuals:                      293
Method:                           MLE   Df Model:                            4
Date:                Thu, 11 Aug 2016   Pseudo R-squ.:                  0.1374
Time:                        20:48:42   Log-Likelihood:                -99.746
converged:                       True   LL-Null:                       -115.64
                                        LLR p-value:                 2.122e-06
                      coef    std err          z      P>|z|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------
Chg_from_Hi        -0.0258      0.007     -3.701      0.000        -0.039    -0.012
Chg_from_Lo   

In [86]:
# Make predictions on the model
x_test = df_test[x_cols]
df_test['DCB_Prob'] = DCB_model.predict(x_test)
df_test.head()

Unnamed: 0,Ticker,Chg_from_Hi,Chg_from_Lo,Chg_from_50davg,Short_Ratio,Is_Dead_Cat,Intercept,DCB_Prob
0,GERN,-11.520737,107.567568,-2.431942,11.02712,1,1,0.11195
1,SM,-76.817489,36.936937,-46.337051,6.238982,1,1,0.737415
2,GLNG,-29.862069,79.301834,14.257349,5.214547,1,1,0.079744
3,CLF,-72.387239,16.744186,-8.482774,8.084908,1,1,0.385894
4,SAIA,-14.505572,60.564784,-9.375586,2.542879,1,1,0.112825


In [87]:
# Create a threshold function to classify as DCB or non DCB
# Keep threshold low to minimize false negatives rather than minimizing false positives.
"""Model or dataset needs to be adjusted to account for the low probability of Dead Cat Bounce stocks occurring
in real life"""

def binary_predictor(x):
    if x > 0.1:
        return int(1)
    else:
        return int(0)

If we use a higher threshold and obtain a more accurate model overall but this would increase the number of false negatives (predicting a DCB stock is not a DCB). Given the small sample size of DCB stocks relative to the overall size of the sample, it's more important to minimize false negatives.

#### Accuracy measure

To calculate the accuracy of the model I mapped a value into the "Check" column based on the predicted value generated by the model ( a 1 or 0 for each stock). If the prediction matched the classification in the "Is_Dead_Cat" then the value in "Check" for that row would be True. Counting the percentage of True vs False values gives us the accuracy oof the model. 

In [88]:
# I classify each stock as a 1 or 0 and then see if that value matches with the value in the Is_Dead_Cat column
df_test['Log_Prediction'] = df_test.DCB_Prob.map(binary_predictor)
df_test['Check'] = df_test.Log_Prediction == df_test.Is_Dead_Cat
print df_test['Check'].value_counts(True)

True     0.677165
False    0.322835
Name: Check, dtype: float64


Looks like the model predicts whether a stock is a dead cat or not correctly 68% of the time.

In [102]:
# Check shows accuracy for all DCB stocks
df_test.head(22)

Unnamed: 0,Ticker,Chg_from_Hi,Chg_from_Lo,Chg_from_50davg,Short_Ratio,Is_Dead_Cat,Intercept,DCB_Prob,Log_Prediction,Check
0,GERN,-11.520737,107.567568,-2.431942,11.02712,1,1,0.11195,1,True
1,SM,-76.817489,36.936937,-46.337051,6.238982,1,1,0.737415,1,True
2,GLNG,-29.862069,79.301834,14.257349,5.214547,1,1,0.079744,0,False
3,CLF,-72.387239,16.744186,-8.482774,8.084908,1,1,0.385894,1,True
4,SAIA,-14.505572,60.564784,-9.375586,2.542879,1,1,0.112825,1,True
5,SN,-18.683652,234.497817,-6.188453,3.557872,1,1,0.135022,1,True
6,HOS,-54.241262,9.933436,-12.9741,6.098567,1,1,0.30215,1,True
7,OPK,-1.796407,84.269663,11.994797,12.536319,1,1,0.055254,0,False
8,CCOI,-11.080471,19.186828,0.784575,7.103883,1,1,0.081235,0,False
9,CUDA,0.0,91.607811,15.939442,2.219147,1,1,0.033378,0,False


In [90]:
df_test.tail()

Unnamed: 0,Ticker,Chg_from_Hi,Chg_from_Lo,Chg_from_50davg,Short_Ratio,Is_Dead_Cat,Intercept,DCB_Prob,Log_Prediction,Check
122,BOKF,-13.23,47.81,4.69,13.55,0,1,0.093276,0,True
123,AKP,-0.89,18.31,0.91,0.0,0,1,0.050757,0,True
124,CTS,-5.63,48.48,4.19,8.4,0,1,0.067812,0,True
125,ACRX,-38.27,51.25,24.35,8.86,0,1,0.074651,0,True
126,DSPG,-1.63,37.26,4.17,5.46,0,1,0.055616,0,True


If we look at the subset of data for stocks that are Dead Cat stocks, the accuracy is a little different.

In [91]:
# Test accuracy on subset where stocks are Dead Cats
subset = df_test[df_test['Is_Dead_Cat']==1]
print subset['Check'].value_counts(True)

True     0.73913
False    0.26087
Name: Check, dtype: float64


The accuracy is slightly better for Dead Cat stocks but this may just be due to their smaller sample size. The overall probablity predicted by the model is pretty low but in general if we look at stock not classified as dead cat bounce stocks the probability is < 0.1.

#### Confusion Matrix

In [92]:
y_true = df_test.Is_Dead_Cat
y_pred = df_test.Log_Prediction
matrix = confusion_matrix(y_true, y_pred)

print "True Negatives: %s" % (matrix[0][0])
print "False Positives: %s" % (matrix[0][1])
print "False Negatives: %s" % (matrix[1][0])
print "True Positives: %s" % (matrix[1][1])

True Negatives: 69
False Positives: 35
False Negatives: 6
True Positives: 17


In [106]:
matrixplot = [
    go.Heatmap(
        z= [[6,17],[69,35]],
        x=['Predicted No', 'Predicted Yes'],
        y=['Actual Yes', 'Actual No'],
    )
]
py.iplot(matrixplot, filename='labelled-heatmap')


### Logistic Regression Model (Scikit Learn)

In [53]:
x2_cols = ['Chg_from_Hi','Chg_from_Lo','Chg_from_50davg','Short_Ratio']
x2_train = df_train[x2_cols]
y2_train = df_train['Is_Dead_Cat']

In [54]:
logModel = linear_model.LogisticRegression()
DCB_model = logModel.fit(x2_train, y2_train)


In [55]:
print logModel.score(df_test[x2_cols],df_test['Is_Dead_Cat'])

0.818897637795


### Decision Tree Model

In [56]:
treeModel = DecisionTreeClassifier(max_depth = 4, criterion="entropy")
tree_cols = ['Chg_from_Hi','Chg_from_Lo','Chg_from_50davg','Short_Ratio']

In [57]:
treedf_train = df_train.copy()
X_tree = treedf_train[tree_cols]
Y_tree = treedf_train['Is_Dead_Cat']

treeModel.fit(X_tree, Y_tree)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [58]:
def visualize_tree(model):
    dotfile = open("tree.dot", 'w')
    export_graphviz(model, out_file = dotfile, feature_names = X_tree.columns)
    dotfile.close()
    system("dot -Tpng tree.dot -o tree.png")

<img src="Tree.png">

In [59]:
# Get the test dataset but remove the columns attached by the log regression model
treedf_test = df_test.copy()
treedf_test = treedf_test.drop(['Intercept','DCB_Prob','Log_Prediction','Check'], axis=1)
X_treetest = treedf_test[tree_cols]
treedf_test.head()

Unnamed: 0,Ticker,Chg_from_Hi,Chg_from_Lo,Chg_from_50davg,Short_Ratio,Is_Dead_Cat
0,GERN,-11.520737,107.567568,-2.431942,11.02712,1
1,SM,-76.817489,36.936937,-46.337051,6.238982,1
2,GLNG,-29.862069,79.301834,14.257349,5.214547,1
3,CLF,-72.387239,16.744186,-8.482774,8.084908,1
4,SAIA,-14.505572,60.564784,-9.375586,2.542879,1


In [60]:
treedf_test['Tree_Prediction'] = treeModel.predict(X_treetest)
treedf_test['Check'] = treedf_test.Tree_Prediction == treedf_test.Is_Dead_Cat
print treedf_test['Check'].value_counts(True)

True     0.826772
False    0.173228
Name: Check, dtype: float64


In [61]:
treedf_test.head(10)

Unnamed: 0,Ticker,Chg_from_Hi,Chg_from_Lo,Chg_from_50davg,Short_Ratio,Is_Dead_Cat,Tree_Prediction,Check
0,GERN,-11.520737,107.567568,-2.431942,11.02712,1,0,False
1,SM,-76.817489,36.936937,-46.337051,6.238982,1,1,True
2,GLNG,-29.862069,79.301834,14.257349,5.214547,1,0,False
3,CLF,-72.387239,16.744186,-8.482774,8.084908,1,1,True
4,SAIA,-14.505572,60.564784,-9.375586,2.542879,1,1,True
5,SN,-18.683652,234.497817,-6.188453,3.557872,1,0,False
6,HOS,-54.241262,9.933436,-12.9741,6.098567,1,1,True
7,OPK,-1.796407,84.269663,11.994797,12.536319,1,0,False
8,CCOI,-11.080471,19.186828,0.784575,7.103883,1,0,False
9,CUDA,0.0,91.607811,15.939442,2.219147,1,0,False


As shown above, the decision tree model does not accurately predict Dead Cat Bounces without adjusting for the natural probability of a Dead Cat Bounce occuring, which is quite low. This is verified when looking at the accuracy for only the Dead Cat Bounce stocks

In [77]:
subset2 = treedf_test[treedf_test['Is_Dead_Cat']==1]
print subset2['Check'].value_counts(True)

False    0.695652
True     0.304348
Name: Check, dtype: float64
