In [1]:
# (Re)load the autoreload extension and reload all modules before each cell
# executes, so edits to the local dtree module are picked up without a restart.
%reload_ext autoreload
%autoreload 2
# Render figures with the interactive notebook backend.
%matplotlib notebook

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Project-local decision tree implementation exercised by this notebook.
from dtree import DecisionTree

# Turn off pyplot interactive mode: figures are only displayed on plt.show().
plt.ioff()

## Part 1: Tennis example

The tennis data consists of 4 features (Outlook, Temperature, Humidity, Wind) and a label (PlayTennis).

In [34]:
# Load the tennis table from disk. Every column but the last is treated as a
# feature; the last column is the class label.
data = pd.read_csv('tennis.csv', header=0)
features, label = data.columns[:-1], data.columns[-1]
data

Unnamed: 0,Outlook,Temperature,Humidity,Wind,PlayTennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


First, as a sanity check, we train and test the decision tree on the same instances.

In [35]:
# Build an untrained tree and ask it which feature yields the highest
# information gain on the full tennis data.
d = DecisionTree()
gain_result = d.max_info_gain(data, features, label)
maxig, name, split = gain_result
# By inspection of the data, Outlook is expected to be the best attribute.
print(maxig, name)

0.246749819774 Outlook


In [36]:
# Fit the tree on the full tennis data.
d.train(data, label)
# Expected structure after training: the root splits on Outlook, and the
# child reached via the 'Overcast' value is a leaf carrying a label.
root = d.root
print('Root:', root)
print('Children:', root.children)

Root: Outlook node
Children: {'Overcast': Yes label, 'Rain': Wind node, 'Sunny': Humidity node}


In [37]:
# Sanity check: the trained tree should reproduce every training label.
predictions = d.predict(data)
(predictions == data[label]).all()

True

Next, we split the data into a training set (the first 10 rows) and a test set (the last 4 rows).

In [38]:
# Hold out the last 4 rows for testing and train on the first 10.
train = data.head(10)  # training set
test = data.tail(4)    # held-out test set
d = DecisionTree()
# Train and collect the error history for each named data set passed in `validation`.
hist = d.train(train, label, validation={'test': test, 'train': train})
# The mean of the boolean match vector is the fraction of correct predictions.
print('Accuracy:', (d.predict(test) == test[label]).mean())
# plotting
# Open a dedicated figure: with the notebook backend, plotting without one
# draws onto whichever figure is currently active (e.g. on a re-run).
plt.figure()
plt.plot(hist['test'], label='Test error')
plt.plot(hist['train'], label='Train error')
plt.xlabel('Iterations')
plt.ylabel('Error')
plt.title('Error History')
plt.legend()
plt.show()

Accuracy: 0.75


<IPython.core.display.Javascript object>

## Part 2: Exoplanet Survey Data

The survey is a collection of more than 9000 astronomical observations and their status as confirmed, false positive, or candidate exoplanets. We only consider *confirmed* or *false positive* labels. Additionally, the 14 features are continuous and span wide ranges of values, so as a preprocessing step the values are scaled logarithmically and each feature is discretized into `nbins` categories.

In [21]:
# Load the survey and shuffle the rows. The shuffle is seeded so that the
# positional train/test split taken from this frame is reproducible across
# kernel restarts.
raw_data = pd.read_csv('exoplanets.csv', header=0)
raw_data = raw_data.sample(frac=1, random_state=0)  # shuffling data
features = raw_data.columns[1:]  # feature column names
label = raw_data.columns[0]      # label column name
# plotting
# Open a dedicated figure so the boxplot is not drawn onto a previous figure.
plt.figure()
# whis=(0, 100) puts the whiskers at the data min/max; it is the documented
# replacement for the removed whis='range' spelling.
plt.boxplot(raw_data[features].values, labels=features, whis=(0, 100))
plt.xticks(rotation=70)
plt.title('Feature distributions')
plt.show()
raw_data.head(5)

<IPython.core.display.Javascript object>

Unnamed: 0,Disposition,Orbital Period,Transit Epoch,Impact Parameter,Transit Duration,Transit Depth,Planetary Radius,Equilibrium Temperature,Transit Signal-to-Noise,Stellar Effective Temperature,Stellar Surface Gravity,Stellar Radius,Right Ascension,Declination,Kepler-band
1413,CONFIRMED,17.13,147.15,0.3,3.06,850.0,2.31,611.0,26.4,5477.0,4.58,0.79,295.57,42.48,15.84
455,CONFIRMED,6.58,174.79,0.14,3.09,2320.0,4.56,881.0,116.2,5188.0,4.4,0.97,297.32,46.02,15.47
3379,CONFIRMED,62.87,166.01,0.95,6.84,342.0,1.96,349.0,16.2,4762.0,4.54,0.78,282.42,46.01,14.59
4282,FALSE POSITIVE,112.65,183.0,1.28,7.04,1960.0,35.36,377.0,23.3,5895.0,4.49,0.99,292.56,47.33,15.7
1995,FALSE POSITIVE,0.93,131.51,0.99,6.35,518.0,3.18,1594.0,55.5,5402.0,4.58,0.82,292.78,47.45,15.81


In [29]:
# preprocessing raw_data by logarithmic scaling and discretization
nbins = 3
# scaling features: log(1 + x), computed with log1p for accuracy near zero
data = np.log1p(raw_data[features])
data[label] = raw_data[label]
# discretizing features into exactly `nbins` equal-width categories (0 .. nbins-1)
for feature in features:
    _, edges = np.histogram(data[feature], nbins)
    # Only the interior edges are passed to digitize. Passing all nbins+1
    # edges with right=True yields codes 0..nbins, i.e. one extra category
    # that contains nothing but the minimum value.
    data[feature] = np.digitize(data[feature], edges[1:-1], right=True)
# splitting into train/test sets
ntrain = 500
ntest = 100
nval = 100
# The train rows come from the top and the test rows from the bottom of the
# frame; make sure the two slices cannot overlap.
assert ntrain + ntest <= len(data), 'train and test slices would overlap'
train = data.iloc[:ntrain]
test = data.tail(ntest)
# visualizing change in feature distributions
# Open a dedicated figure so the boxplot is not drawn onto a previous figure.
plt.figure()
# whis=(0, 100) puts the whiskers at the data min/max (replacement for whis='range').
plt.boxplot(data[features].values, labels=features, whis=(0, 100))
plt.xticks(rotation=70)
plt.title('Feature distributions after scaling & discretization')
plt.show()

<IPython.core.display.Javascript object>

In [33]:
# Fit a fresh tree on the exoplanet training split, recording the error on
# both the training and the test set while training.
d = DecisionTree()
tracked_sets = {'test': test, 'train': train}
hist = d.train(train, label, validation=tracked_sets)

# Draw both error curves on a new figure.
plt.figure()
for key, legend_text in (('test', 'Test error'), ('train', 'Train error')):
    plt.plot(hist[key], label=legend_text)
plt.xlabel('Iterations')
plt.ylabel('Error')
plt.title('Error History')
plt.legend()
plt.show()

<IPython.core.display.Javascript object>

## Part 3: Pruning

In [31]:
# Split the training set into a sub-training set and a validation set.
# The shuffle is seeded and stored under a new name, so re-running this cell
# gives the same split and `train` itself is left untouched.
shuffled = train.sample(frac=1, random_state=0)
val = shuffled.iloc[:nval]
subtrain = shuffled.iloc[nval:]
# Train on the sub-training set only. The validation rows must stay unseen
# during training, otherwise pruning against them is not a fair estimate.
d = DecisionTree()
hist = d.train(subtrain, label, validation={'test': test, 'train': subtrain, 'val': val})
# Prune against the validation set. The 'train' error is tracked on
# `subtrain` here as well, so the curve is continuous across the pruning boundary.
# NOTE(review): prune() is assumed to take (val, label, train, test) and to
# return the error histories in (val, train, test) order, as indexed below.
# Confirm against dtree.
prune_hist = d.prune(val, label, subtrain, test)
nsteps = len(hist['val'])  # number of steps before pruning
# plotting: training history followed by pruning history for each data set
plt.figure()
plt.plot(hist['val'] + prune_hist[0], label='Validation error')
plt.plot(hist['train'] + prune_hist[1], label='Train error')
plt.plot(hist['test'] + prune_hist[2], label='Test error')
# Mark the last training step, where pruning begins.
plt.axvline(x=nsteps-1, label='Pruning start', linestyle='dotted', linewidth=1)
plt.xlabel('Iterations')
plt.ylabel('Error')
plt.title('Error History')
plt.legend()
plt.show()

<IPython.core.display.Javascript object>