In [1]:
import pandas as pd
import numpy as np 
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from os import system 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from random import randint
import plotly.plotly as py
import plotly.graph_objs as go
import warnings
warnings.simplefilter(action='ignore')
py.sign_in('kaushik.316', '86eaqb1c8w')

### Data Format

In [2]:
project_df = pd.read_csv('../Data/CONSOLIDATED_DCB_DATA.csv')
project_df = project_df.drop(project_df.columns[[0]],axis=1)
project_df = project_df[np.isfinite(project_df['Chg_from_50davg'])]
project_df.count()

Ticker             425
Chg_from_Hi        425
Chg_from_Lo        425
Chg_from_50davg    425
Short_Ratio        425
Is_Dead_Cat        425
dtype: int64

In [3]:
project_df.head()

Unnamed: 0,Ticker,Chg_from_Hi,Chg_from_Lo,Chg_from_50davg,Short_Ratio,Is_Dead_Cat
0,USCR,-7.649554,72.938921,1.453999,4.083659,1
1,SRPT,-45.872218,110.614525,23.350694,3.639042,1
2,SPWR,-49.756256,14.60341,-8.774085,7.637297,1
3,CMA,-23.00885,29.01354,-8.893328,3.340925,1
4,AKS,-8.971963,166.120219,13.191348,2.565646,1


### Visualizations

In [4]:
DCB_df = project_df[project_df['Is_Dead_Cat']==1]
nonDCB_df = project_df[project_df['Is_Dead_Cat']==0]

#### Distribution

In [5]:
from plotly.tools import FigureFactory as FF

x1 = DCB_df['Chg_from_Hi']
x2 = DCB_df['Chg_from_Lo'] 
x3 = DCB_df['Chg_from_50davg']
x4 = DCB_df['Short_Ratio']

hist_data = [x1,x2,x3,x4]
group_labels = ['Chg_from_Hi','Chg_from_Lo','Chg_from_50davg','Short_Ratio']
fig = FF.create_distplot(hist_data, group_labels, show_hist=False)

py.iplot(fig, filename='DCB Distplot with Multiple Datasets', validate=False)

#### Mean Comparison

In [6]:
mean_cols = ['Chg_from_Hi','Chg_from_Lo','Chg_from_50davg','Short_Ratio']
DCB_means = DCB_df[mean_cols].mean()
nonDCB_means = nonDCB_df[mean_cols].mean()

x1 = go.Bar(
    x= mean_cols,
    y=DCB_means,
    name='DCB Stocks'
)

x2 = go.Bar(
    x=mean_cols,
    y=nonDCB_means,
    name='nonDCB stocks'
)

data = [x1,x2]
layout = go.Layout(
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='DCB mean grouped-bar', title="Averages")

#### Median Comparison

In [7]:
median_cols = ['Chg_from_Hi','Chg_from_Lo','Chg_from_50davg','Short_Ratio']
DCB_medians = DCB_df[median_cols].median()
nonDCB_medians = nonDCB_df[median_cols].median()

m1 = go.Bar(
    x= median_cols,
    y=DCB_medians,
    name='nonDCB Stocks'
)

m2 = go.Bar(
    x=median_cols,
    y=nonDCB_medians,
    name='nonDCB stocks'
)

data = [m1,m2]
layout = go.Layout(
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='DCB median grouped-bar')

### Creating the Training and Test Dataset

In [8]:
# Function that makes sure duplicates of class '1' are not all exactly the same
def make_noise(x):
    rand = randint(0,1)
    if rand == 0:
        return x * .95
    else:
        return x * 1.05    

In [9]:
# Oversampling - create copies of less represented class. 
DCB_df_copy = DCB_df
DCB_df_copy.head()

Unnamed: 0,Ticker,Chg_from_Hi,Chg_from_Lo,Chg_from_50davg,Short_Ratio,Is_Dead_Cat
0,USCR,-7.649554,72.938921,1.453999,4.083659,1
1,SRPT,-45.872218,110.614525,23.350694,3.639042,1
2,SPWR,-49.756256,14.60341,-8.774085,7.637297,1
3,CMA,-23.00885,29.01354,-8.893328,3.340925,1
4,AKS,-8.971963,166.120219,13.191348,2.565646,1


In [10]:
# Add a little noise to the copied values
for col in DCB_df_copy.columns[1:-1]:
    DCB_df_copy[col] = DCB_df_copy[col].apply(make_noise)

DCB_df_copy.head()

Unnamed: 0,Ticker,Chg_from_Hi,Chg_from_Lo,Chg_from_50davg,Short_Ratio,Is_Dead_Cat
0,USCR,-8.032032,69.291975,1.526699,4.287842,1
1,SRPT,-43.578607,105.083799,24.518229,3.45709,1
2,SPWR,-47.268443,15.33358,-8.335381,8.019161,1
3,CMA,-21.858407,30.464217,-8.448662,3.507972,1
4,AKS,-9.420561,174.42623,12.531781,2.693928,1


In [11]:
project_df = pd.concat([project_df, DCB_df_copy])
print project_df.Is_Dead_Cat.value_counts()

0    363
1    124
Name: Is_Dead_Cat, dtype: int64


#### Training Dataset

In [12]:
# shuffle data
project_df = project_df.sample(frac=1)

X = project_df.iloc[:, 1:-1]
y = project_df.iloc[:, -1:]

In [13]:
x_traindf, x_testdf, y_traindf, y_testdf = train_test_split(X, y, test_size=0.25, random_state=42)

In [14]:
x_traindf.count()

Chg_from_Hi        365
Chg_from_Lo        365
Chg_from_50davg    365
Short_Ratio        365
dtype: int64

In [15]:
y_traindf['Is_Dead_Cat'].value_counts()

0    280
1     85
Name: Is_Dead_Cat, dtype: int64

#### Test Dataset

In [16]:
x_testdf.count()

Chg_from_Hi        122
Chg_from_Lo        122
Chg_from_50davg    122
Short_Ratio        122
dtype: int64

In [17]:
y_testdf['Is_Dead_Cat'].value_counts()

0    83
1    39
Name: Is_Dead_Cat, dtype: int64

In [18]:
# Convert to numpy array for uniformity
x_train = x_traindf.as_matrix()
y_train = y_traindf.as_matrix()
x_test = x_testdf.as_matrix()
y_test = y_testdf.as_matrix()

# reshape y
y_train = np.reshape(y_train, (len(y_train), ))
y_test = np.reshape(y_test, (len(y_test), ))

### Support Vector Machine 

In [19]:
# Parameter grid for grid search
# Omit C values from the grid as class_weight param does this automatically for us
svm_parameters = {'kernel': ['linear', 'rbf'],
			  	  'degree': [1,3,5],
				  'gamma': [1e-1, 1e2, 5]}

cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2)
svm_clf = SVC(class_weight='balanced', probability=True)

In [20]:
grid = GridSearchCV(svm_clf, param_grid=svm_parameters, cv=cv)
grid.fit(x_train, y_train)
print("The best parameters are %s with a score of %0.2f" % (grid.best_params_, grid.best_score_))

grid_score = grid.score(x_test, y_test)
print "Grid search's score on new test data was {}".format(grid_score)

The best parameters are {'kernel': 'rbf', 'gamma': 0.1, 'degree': 1} with a score of 0.82
Grid search's score on new test data was 0.77868852459


In [21]:
# Map from predictions to probabilities based on set threshold
def map_predictions(x):
    threshold = 0.3
    if x > threshold:
        return 1
    else:
        return 0

In [22]:
# Make predictions on the model
x_testdf['predicted_prob'] = grid.predict_proba(x_test)[:, 1]
x_testdf['predictions'] = x_testdf['predicted_prob'].apply(map_predictions)
x_testdf['actuals'] = y_test
x_testdf.head()

Unnamed: 0,Chg_from_Hi,Chg_from_Lo,Chg_from_50davg,Short_Ratio,predicted_prob,predictions,actuals
140,-2.31,13.27,-0.1,0.0,0.008542,0,0
61,-55.151157,60.075503,13.501665,8.382043,0.229425,0,1
30,-86.499596,52.372263,15.721296,6.448787,0.229469,0,1
429,-5.16,20.02,0.25,13.78,0.163676,0,0
226,-7.34,38.36,-2.97,2.15,1.0,1,0


#### Accuracy measure

Since occurrences of dead cat bounces are rare, our data is unbalanced. To ensure that our model is not simply predicting a zero every time, we can examine the precision, which should be high. A high precision indicates that the number of true positives is high relative to the number of false positives.

In [23]:
grid_score = grid.score(x_test, y_test)
grid_precision = precision_score(x_testdf["predictions"], x_testdf["actuals"])
grid_recall = recall_score(x_testdf["predictions"], x_testdf["actuals"])

print "Model accuracy is {}".format(grid_score)
print "Model precision is {}".format(grid_precision)
print "Model recall is {}".format(grid_recall)

Model accuracy is 0.77868852459
Model precision is 0.666666666667
Model recall is 0.928571428571


We can see that the precision of our model is lower than overall accuracy, though the overall precision is not bad. As shown in the heat map below, the number of false positives and false negatives predicted is acceptable.

#### Confusion Matrix

In [24]:
y_true = x_testdf["actuals"]
y_pred = x_testdf["predictions"] 
matrix = confusion_matrix(y_true, y_pred)


print "True Negatives: %s" % (matrix[0][0])
print "False Positives: %s" % (matrix[0][1])
print "False Negatives: %s" % (matrix[1][0])
print "True Positives: %s" % (matrix[1][1])

True Negatives: 81
False Positives: 2
False Negatives: 13
True Positives: 26


In [25]:
matrixplot = [
    go.Heatmap(
        z= [[matrix[1][0], matrix[1][1]],[matrix[0][0], matrix[0][1]]],
        x=['Predicted No', 'Predicted Yes'],
        y=['Actual Yes', 'Actual No'],
    )
]
py.iplot(matrixplot, filename='labelled-heatmap')


### Decision Tree Model

In [26]:
tree_parameters = {'criterion': ['gini', 'entropy'],
			  	  'max_depth': [2,4,6],}

cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2)
tree_clf = DecisionTreeClassifier(class_weight="balanced")

In [27]:
tree_grid = GridSearchCV(tree_clf, param_grid=tree_parameters, cv=cv)
tree_grid.fit(x_train, y_train)
print("The best parameters are %s with a score of %0.2f" % (tree_grid.best_params_, tree_grid.best_score_))

tree_grid_score = tree_grid.score(x_test, y_test)
print "Grid search's score on new test data was {}".format(tree_grid_score)

The best parameters are {'criterion': 'gini', 'max_depth': 6} with a score of 0.82
Grid search's score on new test data was 0.819672131148


In [28]:
def visualize_tree(model):
    dotfile = open("tree.dot", 'w')
    export_graphviz(model, out_file = dotfile, feature_names = DCB_df.columns[1:5])
    dotfile.close()
    system("dot -Tpng tree.dot -o ../tree.png")

tree_clf.fit(x_train, y_train)
visualize_tree(tree_clf)

<img src="Tree.png">

In [29]:
# Make predictions on the model
x_testdf['t_predicted_prob'] = tree_grid.predict_proba(x_test)[:, 1]
x_testdf['t_predictions'] = x_testdf['predicted_prob'].apply(map_predictions)
x_testdf.head()

Unnamed: 0,Chg_from_Hi,Chg_from_Lo,Chg_from_50davg,Short_Ratio,predicted_prob,predictions,actuals,t_predicted_prob,t_predictions
140,-2.31,13.27,-0.1,0.0,0.008542,0,0,0.0,0
61,-55.151157,60.075503,13.501665,8.382043,0.229425,0,1,0.0,0
30,-86.499596,52.372263,15.721296,6.448787,0.229469,0,1,0.89172,0
429,-5.16,20.02,0.25,13.78,0.163676,0,0,0.147757,0
226,-7.34,38.36,-2.97,2.15,1.0,1,0,0.900585,1


In [30]:
tree_grid_score = tree_grid.score(x_test, y_test)
tree_grid_precision = precision_score(x_testdf["t_predictions"], x_testdf["actuals"])
tree_grid_recall = recall_score(x_testdf["t_predictions"], x_testdf["actuals"])

print "Model accuracy is {}".format(grid_score)
print "Model precision is {}".format(grid_precision)
print "Model recall is {}".format(grid_recall)

Model accuracy is 0.77868852459
Model precision is 0.666666666667
Model recall is 0.928571428571


In [31]:
y_true = x_testdf["actuals"]
y_pred = x_testdf["t_predictions"] 
matrix = confusion_matrix(y_true, y_pred)

print "True Negatives: %s" % (matrix[0][0])
print "False Positives: %s" % (matrix[0][1])
print "False Negatives: %s" % (matrix[1][0])
print "True Positives: %s" % (matrix[1][1])

True Negatives: 81
False Positives: 2
False Negatives: 13
True Positives: 26


In [32]:
matrixplot = [
    go.Heatmap(
        z= [[matrix[1][0], matrix[1][1]],[matrix[0][0], matrix[0][1]]],
        x=['Predicted No', 'Predicted Yes'],
        y=['Actual Yes', 'Actual No'],
    )
]
py.iplot(matrixplot, filename='labelled-heatmap', title='Decision Tree Heatmap')
