In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

In [3]:
# Load the HR data and one-hot encode its categorical columns.
hr = pd.read_csv('HR.csv')
hr_dummies = pd.get_dummies(hr)

# Split the row labels 80/20 with a fixed seed, then take an independent copy of each partition.
train_indices, test_indices = train_test_split(
    hr_dummies.index.to_numpy(),
    test_size=0.2,
    random_state=0,
)

hr_train = hr_dummies.loc[train_indices].copy()
hr_test = hr_dummies.loc[test_indices].copy()

In [4]:
# Separate the predictors from the 'left' target for both partitions.
x_train, y_train = hr_train.drop(columns='left'), hr_train['left']
x_test, y_test = hr_test.drop(columns='left'), hr_test['left']

In [5]:
# Baseline: a shallow (depth-2) entropy tree, seeded so the fit is repeatable.
tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=2,
    min_samples_leaf=5,
    random_state=0,
)
tree.fit(x_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=2, min_samples_leaf=5,
                       random_state=0)

In [8]:
# Score the baseline tree on the held-out rows.
y_pred = tree.predict(x_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.6493506493506493

# Random Search

In [9]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [10]:
# Search space: every integer leaf size in 1..99 and every depth in 1..19.
parameters = {'min_samples_leaf': np.arange(1, 100),
              'max_depth': np.arange(1, 20)}

# Seed the estimator as well as the search. The search's random_state only fixes which
# 25 candidates are sampled; the tree needs its own seed for its fits (and therefore the
# 5-fold F1 scores) to be repeatable across runs.
random_search = RandomizedSearchCV(DecisionTreeClassifier(criterion='entropy', random_state=0), parameters,
                                   n_iter=25, cv=5, scoring="f1", random_state=0)
model = random_search.fit(x_train, y_train)

In [11]:
# Best cross-validated F1 (scoring="f1", cv=5) among the sampled random-search candidates.
model.best_score_

0.9427067093673112

In [12]:
# Hyper-parameter combination that achieved the best score in the random search.
model.best_params_

{'min_samples_leaf': 9, 'max_depth': 7}

# Coordinate Descent

In [13]:
# Coordinate step 1: hold min_samples_leaf at the random-search result (9) and sweep max_depth over 1..19.
# random_state is fixed so repeated runs of this sweep give the same CV scores,
# matching the seeded trees used elsewhere in this notebook.
base_model = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=9, random_state=0)
parameters = {'max_depth': np.arange(1, 20)}

search = GridSearchCV(base_model, parameters, cv=5, scoring="f1")
search.fit(x_train, y_train);

In [14]:
# Best max_depth found with min_samples_leaf held at 9.
search.best_params_

{'max_depth': 8}

In [15]:
# Coordinate step 2: hold max_depth at 8 (previous sweep's result) and sweep min_samples_leaf over 1..99.
# random_state is fixed so repeated runs of this sweep give the same CV scores.
base_model = DecisionTreeClassifier(criterion='entropy', max_depth=8, random_state=0)
parameters = {'min_samples_leaf': np.arange(1, 100)}

search = GridSearchCV(base_model, parameters, cv=5, scoring="f1")
search.fit(x_train, y_train);

In [16]:
# Best min_samples_leaf found with max_depth held at 8.
search.best_params_

{'min_samples_leaf': 2}

In [17]:
# Coordinate step 3: hold min_samples_leaf at 2 (previous sweep's result) and sweep max_depth over 1..19.
# random_state is fixed so repeated runs of this sweep give the same CV scores.
base_model = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=2, random_state=0)
parameters = {'max_depth': np.arange(1, 20)}

search = GridSearchCV(base_model, parameters, cv=5, scoring="f1")
search.fit(x_train, y_train);

In [18]:
# Best max_depth found with min_samples_leaf held at 2.
search.best_params_

{'max_depth': 9}

In [20]:
# Coordinate step 4: hold max_depth at 9 (previous sweep's result) and sweep min_samples_leaf over 1..99.
# random_state is fixed so repeated runs of this sweep give the same CV scores.
base_model = DecisionTreeClassifier(criterion='entropy', max_depth=9, random_state=0)
parameters = {'min_samples_leaf': np.arange(1, 100)}

search = GridSearchCV(base_model, parameters, cv=5, scoring="f1")
search.fit(x_train, y_train);

In [21]:
# Best min_samples_leaf found with max_depth held at 9.
search.best_params_

{'min_samples_leaf': 1}

In [22]:
# Coordinate step 5: hold min_samples_leaf at 1 (previous sweep's result) and sweep max_depth over 1..19.
# random_state is fixed so repeated runs of this sweep give the same CV scores.
base_model = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=1, random_state=0)
parameters = {'max_depth': np.arange(1, 20)}

search = GridSearchCV(base_model, parameters, cv=5, scoring="f1")
search.fit(x_train, y_train);

# Display the best max_depth from this final sweep.
search.best_params_

{'max_depth': 9}

In [24]:
# Best cross-validated F1 from the most recently fitted grid search.
search.best_score_

0.9554346389374981

# Visualize Tree

In [25]:
from sklearn.tree import export_graphviz
import graphviz

In [26]:
# Refit on the training partition with the hyper-parameters reported by the random search.
tree = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=9, max_depth=7, random_state=0)
tree.fit(x_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=9,
                       random_state=0)

In [28]:
# Encoded feature columns of the training partition, in their stored order.
x_train.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'sales_IT', 'sales_RandD', 'sales_accounting',
       'sales_hr', 'sales_management', 'sales_marketing', 'sales_product_mng',
       'sales_sales', 'sales_support', 'sales_technical', 'salary_high',
       'salary_low', 'salary_medium'],
      dtype='object')

In [27]:
# Human-readable label for each encoded column. Labels are keyed by column name rather than
# matched by list position, so a reordered or unexpected column raises a KeyError instead of
# silently attaching the wrong label to a tree node.
feature_labels = {
    'satisfaction_level': 'Satisfaction Level',
    'last_evaluation': 'Last Evaluation Score',
    'number_project': 'Number of Projects',
    'average_montly_hours': 'Average Monthly Hours',
    'time_spend_company': 'Time Spent at the Company',
    'Work_accident': 'Work Accident',
    'promotion_last_5years': 'Promotion in the Last 5 Years',
    'sales_IT': 'Department: IT',
    'sales_RandD': 'Department: R&D',
    'sales_accounting': 'Department: Accounting',
    'sales_hr': 'Department: HR',
    'sales_management': 'Department: Management',
    'sales_marketing': 'Department: Marketing',
    'sales_product_mng': 'Department: Product Management',
    'sales_sales': 'Department: Sales',
    'sales_support': 'Department: Support',
    'sales_technical': 'Department: Technical',
    'salary_high': 'Salary: High',
    'salary_low': 'Salary: Low',
    'salary_medium': 'Salary: Medium',
}
# Build the label list in the same column order the tree was fitted on.
feature_names = [feature_labels[column] for column in x_train.columns]

# Export only the top 3 levels of the tree and render them to Tree.pdf.
dot_data = export_graphviz(tree, max_depth=3, impurity=False, feature_names=feature_names,
                           class_names=['Stayed', 'Left'], rounded=True, filled=True)
graph = graphviz.Source(dot_data)
graph.render('Tree')

'Tree.pdf'

# Perth Housing

In [29]:
from sklearn.metrics import mean_squared_log_error
from sklearn.tree import DecisionTreeRegressor

In [30]:
# `infer_datetime_format` is deprecated in pandas 2.x, where format inference is the default,
# so parse_dates alone is enough.
perth = pd.read_csv('PerthHousing.csv', parse_dates=['date_sold'])

# We only really care about the year in which the house was sold
perth['date_sold'] = pd.DatetimeIndex(perth['date_sold']).year

# Model the base-10 log of the price, then drop the raw price and the free-text columns.
# Reassigning (rather than inplace=True) makes the frame's lineage explicit.
perth['log10_price'] = np.log10(perth['price'])
perth = perth.drop(columns=['price', 'address', 'nearest_sch', 'nearest_stn'])
perth = pd.get_dummies(perth, columns=['suburb'])

# Seeded 80/20 split on the row labels, then an independent copy of each partition.
train_indices, test_indices = train_test_split(np.array(perth.index), test_size=0.2, random_state=0)

perth_train = perth.loc[train_indices].copy()
perth_test = perth.loc[test_indices].copy()

In [32]:
# Imputation values, computed once from the training partition before either frame is filled:
#   garage           -> 0 (a null garage means the house has 0 garage spaces)
#   build_year       -> mean build year of the training rows
#   nearest_sch_rank -> 150
fill_values = {
    'garage': 0,
    'build_year': perth_train['build_year'].mean(),
    'nearest_sch_rank': 150,
}
perth_test = perth_test.fillna(fill_values)
perth_train = perth_train.fillna(fill_values)

## Coordinate Descent

In [33]:
# Candidate grid: tree depth 1..24 crossed with leaf size 1..49.
parameters = {
    'max_depth': np.arange(1, 25),
    'min_samples_leaf': np.arange(1, 50),
}

# Sample 5 combinations (seeded) and score each by 5-fold negative MSE on log10 price.
random_search = RandomizedSearchCV(
    estimator=DecisionTreeRegressor(criterion='squared_error', random_state=0),
    param_distributions=parameters,
    n_iter=5,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=0,
)

random_search.fit(perth_train.drop(columns='log10_price'), perth_train['log10_price'])
random_search.best_params_

{'min_samples_leaf': 29, 'max_depth': 16}

In [34]:
# Coordinate step 1: hold max_depth at the random-search result (16) and sweep min_samples_leaf over 1..49.
# random_state is fixed, as in the random search above, so the sweep's CV scores are repeatable.
base_model = DecisionTreeRegressor(criterion='squared_error', max_depth=16, random_state=0)
parameters = {'min_samples_leaf': np.arange(1, 50)}

model = GridSearchCV(base_model, parameters, cv=5, scoring='neg_mean_squared_error')
model.fit(perth_train.drop(columns='log10_price'), perth_train['log10_price']);

In [35]:
# Best min_samples_leaf found with max_depth held at 16.
model.best_params_

{'min_samples_leaf': 13}

In [36]:
# Coordinate step 2: hold min_samples_leaf at 13 (previous sweep's result) and sweep max_depth over 1..24.
# random_state is fixed so repeated runs of this sweep give the same CV scores.
base_model = DecisionTreeRegressor(criterion='squared_error', min_samples_leaf=13, random_state=0)
parameters = {'max_depth': np.arange(1, 25)}

model = GridSearchCV(base_model, parameters, cv=5, scoring='neg_mean_squared_error')
model.fit(perth_train.drop(columns='log10_price'), perth_train['log10_price']);

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(min_samples_leaf=13),
             param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24])},
             scoring='neg_mean_squared_error')

In [37]:
# Best max_depth found with min_samples_leaf held at 13.
model.best_params_

{'max_depth': 18}

In [40]:
# Best cross-validated score; negative because scoring='neg_mean_squared_error' (error is on log10 price).
model.best_score_

-0.011334684940916516

In [38]:
# Final regressor using the coordinate-descent result (min_samples_leaf=13, max_depth=18).
# random_state is fixed so refitting gives the same tree, and so the same test error, on every run.
final_model = DecisionTreeRegressor(criterion='squared_error', min_samples_leaf=13, max_depth=18,
                                    random_state=0)
final_model.fit(perth_train.drop(columns='log10_price'), perth_train['log10_price'])

DecisionTreeRegressor(max_depth=18, min_samples_leaf=13)

In [39]:
# Undo the log10 transform so the error is measured on actual prices.
predictions = 10**final_model.predict(perth_test.drop(columns='log10_price'))
# RMSLE as the square root of MSLE. The `squared=False` flag is deprecated and removed in
# recent scikit-learn releases, so take the root explicitly; the value is the same.
np.sqrt(mean_squared_log_error(10**perth_test['log10_price'], predictions))

0.2435026391081913

In [41]:
# Same search space as the earlier random search: depth 1..24, leaf size 1..49.
parameters = {'max_depth': np.arange(1, 25),
              'min_samples_leaf': np.arange(1, 50)}

# Sample 10 combinations this time (seeded), scored by 5-fold negative MSE on log10 price.
random_search = RandomizedSearchCV(DecisionTreeRegressor(criterion='squared_error', random_state=0),
                                   parameters, n_iter=10, cv=5,
                                   scoring='neg_mean_squared_error', random_state=0)

model = random_search.fit(perth_train.drop(columns='log10_price'), perth_train['log10_price'])

print(random_search.best_params_)

# Predict through the fitted search object and undo the log10 transform before scoring.
predictions = 10**model.predict(perth_test.drop(columns='log10_price'))
# RMSLE as the square root of MSLE. The `squared=False` flag is deprecated and removed in
# recent scikit-learn releases, so take the root explicitly; the value is the same.
np.sqrt(mean_squared_log_error(10**perth_test['log10_price'], predictions))

{'min_samples_leaf': 17, 'max_depth': 23}


0.24383778611380308

In [42]:
# Best cross-validated negative MSE (on log10 price) from the 10-iteration random search.
model.best_score_

-0.011414564024239708

## Visualize Tree

In [49]:
# Refit a seeded regressor with the hyper-parameters reported by the 10-iteration random search.
tree = DecisionTreeRegressor(criterion = 'squared_error', max_depth=23, min_samples_leaf=17, random_state=0)
tree.fit(perth_train.drop(columns='log10_price'), perth_train['log10_price'])

# Label nodes with the training columns, in the order the tree was fitted on.
feature_names = perth_train.drop(columns='log10_price').columns

# Export only the top 3 levels of the tree.
dot_data = export_graphviz(tree, max_depth=3, impurity=False, feature_names=feature_names, rounded=True,
                           filled=True)
graph = graphviz.Source(dot_data)
# Render under its own name: the HR section already writes 'Tree.pdf', and reusing
# 'Tree' here would overwrite that file.
graph.render('PerthTree')

'PerthTree.pdf'

In [50]:
# Mean log10 price over the whole training partition (presumably a cross-check against the rendered tree's root node; confirm).
perth_train['log10_price'].mean()

5.747511815001846

In [52]:
# Mean log10 price of the training rows whose nearest-school rank is above 34.5.
split = perth_train["log10_price"][perth_train["nearest_sch_rank"] > 34.5]
split.mean()

5.705587956874365