In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# https://www.datacamp.com/community/tutorials/decision-tree-classification-python
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics 

# Shared screen/figure dimensions for the notebook.
# NOTE(review): presumably a matplotlib figsize (width, height) - confirm at the point of use.
SCREEN_X = 12
SCREEN_Y = 8

In [0]:
# Load the bid/ask imbalance dataset straight from the hosted CSV file
imbal_csv_url = "https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/data/20131002-bidAskImbal5.csv"
imbal = pd.read_csv(imbal_csv_url)

# Leave the frame as the last expression so the notebook renders it
imbal

In [0]:
# Pairwise scatter plots of the order book features, coloured by the "Dir" column
sns.pairplot(
    data=imbal,
    hue="Dir",
    palette="bright",
)

In [0]:
# Separate the dataset into the model inputs (features) and the label to predict (target)
feature_cols = ["L1", "L2", "L3"]

# Features: only the columns listed in feature_cols
X = imbal[feature_cols]

# Target variable: the "Dir" column
y = imbal["Dir"]

In [0]:
# Hold out 20% of the rows as the test set; the remaining 80% form the training set.
# random_state is pinned so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [0]:
# Create the Decision Tree classifier object with a maximum of 3 levels (max_depth=3).
# max_depth is the hyperparameter that determines model capacity.
# random_state is fixed because scikit-learn randomly permutes the features at
# every split (even with the default "best" splitter), so without a seed the
# fitted tree -- and the accuracies reported later -- can differ between runs.
dtree = tree.DecisionTreeClassifier(max_depth=3, random_state=1)

# Train the Decision Tree classifier on the training split only
dtree = dtree.fit(X_train, y_train)

In [0]:
# Visualize the decision tree
import graphviz

# First pass: export the tree with default options (no feature/class names)
# and render it to disk under the name "imbal".
graphviz.Source(tree.export_graphviz(dtree, out_file=None)).render("imbal")

# Second pass: export a styled version labelled with the feature names and
# class names, used for the inline display below.
# NOTE(review): class_names is hard-coded; it presumably matches the order of
# the fitted classes (Down, then Up) - confirm against dtree.classes_.
dot_data = tree.export_graphviz(
    dtree,
    out_file=None,
    feature_names=feature_cols,
    class_names=["D", "U"],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)

# Last expression of the cell: the notebook renders the graph inline
graph

In [0]:
# gini: quantifies the impurity of the node/leaf.
# A gini score greater than zero implies that the samples contained within that node belong to different classes.
# A gini score of zero means that the node is pure, i.e. only a single class of samples exists within that node.
 
# value: how many samples in each category (Down vs Up in this case)

In [0]:
# Evaluate the decision tree: predict on both splits, then compare with the true labels
y_dtree_pred_train = dtree.predict(X_train)
y_dtree_pred_test = dtree.predict(X_test)

# Accuracy = fraction of predictions that match the true label
dtree_train_accuracy = metrics.accuracy_score(y_train, y_dtree_pred_train)
dtree_test_accuracy = metrics.accuracy_score(y_test, y_dtree_pred_test)

print("Train Accuracy:", dtree_train_accuracy)
print("Test Accuracy:", dtree_test_accuracy)

In [0]:
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.svm import SVC

# Use an SVM with default settings.
# Fit on the training split only: fitting on the full X, y would let the model
# see the held-out rows, making the test accuracy measured afterwards
# optimistic and not comparable with the decision tree's.
svm = SVC()
svm.fit(X_train, y_train)

In [0]:
# Evaluate the SVM: predict on both splits, then compare with the true labels
y_svm_pred_train = svm.predict(X_train)
y_svm_pred_test = svm.predict(X_test)

# Accuracy = fraction of predictions that match the true label
svm_train_accuracy = metrics.accuracy_score(y_train, y_svm_pred_train)
svm_test_accuracy = metrics.accuracy_score(y_test, y_svm_pred_test)

print("Train Accuracy:", svm_train_accuracy)
print("Test Accuracy:", svm_test_accuracy)