In [0]:
# https://www.datacamp.com/community/tutorials/decision-tree-classification-python
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics 

In [0]:
# Load the bid/ask imbalance dataset from its hosted CSV into a DataFrame
IMBAL_CSV_URL = "https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/data/20131002-bidAskImbal5.csv"
imbal = pd.read_csv(IMBAL_CSV_URL)

# Last expression of the cell: display the loaded frame
imbal

In [0]:
# Scatter plot of the L1 vs. L2 imbalance, one colour per direction label
downs = imbal[imbal['Dir'] == 'd']  # rows labelled 'd' (plotted as "Down")
ups = imbal[imbal['Dir'] == 'u']    # rows labelled 'u' (plotted as "Up")

# Explicit figure/axes interface so every setting is attached to this axes
fig, ax = plt.subplots(figsize=(16, 10))
ax.scatter(downs['L1'], downs['L2'], c='blue', label='Down')
ax.scatter(ups['L1'], ups['L2'], c='red', label='Up')

# Label both axes so the figure can be read on its own
ax.set_xlabel('L1 imbalance')
ax.set_ylabel('L2 imbalance')
ax.set_title('L1 & L2 Order Book Imbalance')
ax.legend(loc='upper left')  # same position as the numeric legend code 2
plt.show()

In [0]:
# Separate the predictor columns from the label column
feature_cols = ['L1', 'L2', 'L3']  # columns used as model inputs
X = imbal.loc[:, feature_cols]     # feature matrix
y = imbal['Dir']                   # target variable

In [0]:
# Hold out 20% of the rows as the test set; the remaining 80% form the training set.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [0]:
# Create a Decision Tree classifier object limited to a maximum depth of 3 levels.
# random_state is fixed (same seed as the train/test split) because scikit-learn
# permutes the features at every split, so equally good candidate splits could
# otherwise yield a different tree from one run to the next.
clf = tree.DecisionTreeClassifier(max_depth=3, random_state=1)

# Train the Decision Tree classifier on the training set
clf = clf.fit(X_train, y_train)

In [0]:
# Visualize the decision tree
import graphviz

# Plain export of the fitted tree, rendered to disk under the name "imbal"
dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("imbal")

# Styled export for inline display.
# Class labels are derived from the fitted classifier rather than hard-coded:
# export_graphviz expects class_names in the same order as clf.classes_, so this
# keeps the node labels correct whatever label values the target column holds.
class_labels = [str(label).upper() for label in clf.classes_]
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=feature_cols,
    class_names=class_labels,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)

# Last expression of the cell: display the styled tree
graph

In [0]:
# gini: quantifies the impurity of the node/leaf.
# A gini score greater than zero means that the samples within that node belong to different classes.
# A gini score of zero means that the node is pure, i.e. only a single class of samples exists within that node.
 
# value: the number of samples in each class (Down vs. Up in this case)

In [0]:
# Evaluate the fitted classifier on both the training and the test data
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# Accuracy = fraction of predictions that match the true labels
train_accuracy = metrics.accuracy_score(y_train, y_pred_train)
test_accuracy = metrics.accuracy_score(y_test, y_pred_test)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)