In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
from pathlib import Path
from sklearn.tree import DecisionTreeRegressor
from treeinterpreter import treeinterpreter
from utils import draw_tree, waterfallplot
# Do not truncate wide DataFrames: show every column when a frame is displayed.
pd.set_option('display.max_columns', None)
# Render matplotlib figures directly in the notebook output area.
%matplotlib inline

# Data

Source: <http://data.princeton.edu/wws509/datasets/#salary>

These are the salary data used in Weisberg's book, consisting of observations on six variables for 52 tenure-track professors in a small college. The variables are:

    sx = Sex
    rk = Rank
    yr = Number of years in current rank
    dg = Highest degree
    yd = Number of years since highest degree was earned
    sl = Academic year salary, in dollars. 

## Read data

In [None]:
# The file is whitespace-delimited, so the separator is the regex `\s+`.
# A raw string keeps `\s` from being parsed as an (invalid) Python string
# escape, which triggers a DeprecationWarning/SyntaxWarning on recent Pythons.
df = pd.read_csv('data/salaries_data.dat', sep=r'\s+')
# Show (rows, columns) and then the first rows as the cell output.
print(df.shape)
df.head()

## Explore variables

In [None]:
# Numerical variables: summary statistics (count, mean, std, quartiles, min/max).
# With default arguments, describe() reports only the numeric columns of a mixed frame.
df.describe()

In [None]:
# Categorical variables: list the distinct levels observed in each column.
# `cat_vars` is reused by the later counting and encoding cells.
cat_vars = ['sx', 'rk', 'dg']
for var in cat_vars:
    levels = df[var].unique()
    print(f'{var}: ', levels)

In [None]:
# Frequency of each level per categorical column, printed as
# "level=count (share of all rows)".
n_rows = len(df)
print(f'total={n_rows}')
for var in cat_vars:
    # np.unique returns the sorted distinct values and their occurrence counts.
    levels, counts = np.unique(df[var], return_counts=True)
    summary = [f'{v}={c} ({c/n_rows:.2})' for v, c in zip(levels, counts)]
    print(f'{var}: ', summary)

## Preprocess

In [None]:
# Replace each categorical column by 1-based integer codes and remember the
# code -> original level mapping in `cat_dict` so the codes stay interpretable.
# NOTE: this overwrites the columns of `df` in place; re-running the cell
# without re-reading the data would rebuild the mapping from the integer codes.
cat_dict = {}
for var in cat_vars:
    as_category = df[var].astype('category')
    # Category position i (0-based) is stored as code i + 1.
    cat_dict[var] = dict(enumerate(as_category.cat.categories, start=1))
    df[var] = as_category.cat.codes + 1

pprint(cat_dict)
df.head()

In [None]:
# Target is the salary column `sl`; every remaining column is a feature.
y = df['sl']
x = df.drop(columns='sl')

# Decision tree

In [None]:
# Shallow regression tree (depth 3) so the whole tree can be drawn and read.
# random_state is fixed because scikit-learn permutes the features at every
# split: without a seed, ties between equally good splits can produce a
# different tree on each run.
m = DecisionTreeRegressor(max_depth=3, random_state=0)
m.fit(x, y)

In [None]:
# Coefficient of determination (R^2) evaluated on the same x, y the tree was
# fit on, i.e. a training-set score rather than a held-out estimate.
m.score(x, y)

# View the tree

In [None]:
# Render the fitted tree with the project's `draw_tree` helper (from utils).
# x is passed alongside the model — presumably to label the features; TODO confirm.
draw_tree(m, x)

# Using dummies

In [None]:
# Start again from the raw file (the earlier cells replaced the categorical
# columns by integer codes). Raw string: `\s` in a normal string literal is an
# invalid escape sequence and warns on recent Python versions.
df = pd.read_csv('data/salaries_data.dat', sep=r'\s+')
# One-hot encode the non-numeric columns instead of using integer codes.
df = pd.get_dummies(df)
df.head()

In [None]:
# Feature/target split on the one-hot encoded frame, then a depth-3 tree.
y = df['sl']
x = df.drop(columns='sl')
# Fixed random_state: scikit-learn permutes features at each split, so tied
# splits could otherwise yield a different tree (and drawing) on each run.
m = DecisionTreeRegressor(max_depth=3, random_state=0)
m.fit(x, y)
# R^2 on the training data itself (no held-out set is used here).
m.score(x, y)

In [None]:
# Render the tree fitted on the one-hot encoded features with the project's
# `draw_tree` helper (from utils); x presumably supplies the feature names — TODO confirm.
draw_tree(m, x)

# Feature importance

In [None]:
# Tree with default settings (no depth limit), used for feature importances
# and the interpreter below. random_state is fixed so that tie-breaking between
# equally good splits, and therefore the importances, are reproducible.
m = DecisionTreeRegressor(random_state=0)
m.fit(x, y)

In [None]:
# Pair every feature column with the fitted tree's importance score and list
# the features from most to least important.
importance_table = pd.DataFrame({'cols': x.columns, 'imp': m.feature_importances_})
feat_importance = importance_table.sort_values('imp', ascending=False)
feat_importance

In [None]:
# Horizontal bar chart of importance per feature; the trailing `;` suppresses
# the Axes repr in the cell output.
feat_importance.plot(x='cols', y='imp', kind='barh', figsize=(12, 7), legend=False);

# Tree interpreter

In [None]:
# Draw one row of the feature matrix to explain with treeinterpreter.
# The seed is fixed so that Restart & Run All explains the same row every time;
# change random_state to inspect a different observation.
sample = x.sample(random_state=0)
sample

In [None]:
# treeinterpreter returns the model prediction, a bias term and per-feature
# contributions for each input row (presumably prediction = bias + sum of
# contributions — confirm against the treeinterpreter docs).
prediction, bias, contributions = treeinterpreter.predict(m, sample)
# Show prediction and bias for the single sampled row next to its true salary,
# looked up by the row's index label.
row_label = sample.index[0]
prediction[0], bias[0], y.loc[row_label]

In [None]:
# Waterfall chart of the per-feature contributions for the sampled row
# (`contributions[0]` is the first and only row, since `sample` holds one row).
# `waterfallplot` is the project helper from utils; `;` hides the return repr.
waterfallplot(sample, contributions[0], size=(12,5));