In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

In [40]:
hr = pd.read_csv('HR.csv')
hr_dummies = pd.get_dummies(hr)

train_indices, test_indices = train_test_split(np.array(hr_dummies.index), test_size=0.2, random_state=0)

hr_train = hr_dummies.loc[train_indices].copy()
hr_test = hr_dummies.loc[test_indices].copy() 

In [41]:
x_train = hr_train.drop('left', axis=1)
y_train = hr_train['left']

x_test = hr_test.drop('left', axis=1)
y_test = hr_test['left']

In [42]:
model = DecisionTreeClassifier(criterion='entropy', max_depth=2, min_samples_leaf=5, random_state=0)
model.fit(x_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=2, min_samples_leaf=5,
                       random_state=0)

In [43]:
y_pred = model.predict(x_test)
f1_score(y_test, y_pred)

0.6493506493506493

# Forward Selection - Classifica

In [45]:
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import log_loss

In [48]:
scores = []

for predictor in x_train.columns:
    X_split = x_train.loc[:, [predictor]]

    model = LogisticRegression(penalty="none")
    model.fit(X_split, y_train)

    y_pred = model.predict(X_split)
    ce = log_loss(y_train, y_pred)

    scores.append((ce, predictor))
    
best_ce, best_predictor = sorted(scores)[0]

In [49]:
best_ce, best_predictor

(7.9849230648046765, 'satisfaction_level')

# Randomize Search

In [8]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [9]:
parameters = {'min_samples_leaf': np.arange(1, 100),
              'max_depth': np.arange(1, 20)}

random_search = RandomizedSearchCV(DecisionTreeClassifier(criterion='entropy'), parameters, 
                                   n_iter=25, cv=5, scoring="f1", random_state=0)
model = random_search.fit(x_train, y_train)

In [10]:
model.best_score_

0.9427067093673112

In [11]:
model.best_params_

{'min_samples_leaf': 9, 'max_depth': 7}

# Co-Ordinate Descent

In [13]:
base_model = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=9)
parameters = {'max_depth': np.arange(1, 20)}

search = GridSearchCV(base_model, parameters, cv=5, scoring='f1')
search.fit(x_train, y_train)

GridSearchCV(cv=5,
             estimator=DecisionTreeClassifier(criterion='entropy',
                                              min_samples_leaf=9),
             param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])},
             scoring='f1')

In [14]:
search.best_params_

{'max_depth': 9}

In [15]:
base_model = DecisionTreeClassifier(criterion='entropy', max_depth=9)
parameters = {'min_samples_leaf': np.arange(1, 100)}

search = GridSearchCV(base_model, parameters, cv=5, scoring='f1')
search.fit(x_train, y_train)

GridSearchCV(cv=5,
             estimator=DecisionTreeClassifier(criterion='entropy', max_depth=9),
             param_grid={'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
       86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])},
             scoring='f1')

In [16]:
search.best_params_

{'min_samples_leaf': 1}

In [17]:
base_model = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=1)
parameters = {'max_depth': np.arange(1, 20)}

search = GridSearchCV(base_model, parameters, cv=5, scoring='f1')
search.fit(x_train, y_train);

In [18]:
search.best_params_

{'max_depth': 9}

In [19]:
search.best_score_

0.9559443289149921

# Visualize Tree

In [20]:
from sklearn.tree import export_graphviz
import graphviz

In [21]:
tree = DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=9, random_state=0)
tree.fit(hr_train.drop(columns='left'), hr_train['left'])

DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=9,
                       random_state=0)

In [23]:
x_train.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'sales_IT', 'sales_RandD', 'sales_accounting',
       'sales_hr', 'sales_management', 'sales_marketing', 'sales_product_mng',
       'sales_sales', 'sales_support', 'sales_technical', 'salary_high',
       'salary_low', 'salary_medium'],
      dtype='object')

In [22]:
feature_names = ['Satisfaction Level', 'Last Evaluation Score', 'Number of Projects',
                 'Average Monthly Hours', 'Time Spent at the Company', 'Work Accident',
                 'Promotion in the Last 5 Years', 'Department: IT', 'Department: R&D', 
                 'Department: Accounting', 'Department: HR', 'Department: Management', 
                 'Department: Marketing', 'Department: Product Management', 'Department: Sales', 
                 'Department: Support', 'Department: Technical', 'Salary: High', 
                 'Salary: Low', 'Salary: Medium']

dot_data = export_graphviz(tree, max_depth=3, impurity=False, feature_names=feature_names,
                           class_names=['Stayed', 'Left'], rounded=True, filled=True) 
graph = graphviz.Source(dot_data)
graph.render('Tree')

'Tree.pdf'

# Perth Housing

In [30]:
from sklearn.metrics import mean_squared_log_error
from sklearn.tree import DecisionTreeRegressor

In [28]:
perth = pd.read_csv('PerthHousing.csv', parse_dates=['date_sold'], infer_datetime_format=True)

# We only really care about the year in which the house was sold
perth['date_sold'] = pd.DatetimeIndex(perth['date_sold']).year

perth['log10_price'] = np.log10(perth['price'])
perth.drop(columns=['price', 'address', 'nearest_sch', 'nearest_stn'], inplace=True)
perth = pd.get_dummies(perth, columns=['suburb'])

train_indices, test_indices = train_test_split(np.array(perth.index), test_size=0.2, random_state=0)

perth_train = perth.loc[train_indices].copy()
perth_test = perth.loc[test_indices].copy()

In [29]:
# Null values in the garage column are houses that have 0 garage spaces
perth_test.fillna({'garage': 0, 'build_year': perth_train['build_year'].mean(), 'nearest_sch_rank': 150}, 
                  inplace=True)
perth_train.fillna({'garage': 0, 'build_year': perth_train['build_year'].mean(), 'nearest_sch_rank': 150}, 
                   inplace=True)

In [36]:
parameters = {'max_depth': np.arange(1, 25),
              'min_samples_leaf': np.arange(1, 50)}

random_search = RandomizedSearchCV(DecisionTreeRegressor(criterion='squared_error', random_state=0), 
                                   parameters, n_iter=5, cv=5, 
                                   scoring='neg_mean_squared_error', random_state=0)
                                   
random_search.fit(perth_train.drop(columns='log10_price'), perth_train['log10_price'])
random_search.best_params_

{'min_samples_leaf': 29, 'max_depth': 16}

In [37]:
random_search.best_score_

-0.011578414206504618

## Co-Ordinate Descent

In [31]:
base_model = DecisionTreeRegressor(criterion='squared_error', max_depth=16)
parameters = {'min_samples_leaf': np.arange(1, 50)}

model = GridSearchCV(base_model, parameters, cv=5, scoring='neg_mean_squared_error')
model.fit(perth_train.drop(columns='log10_price'), perth_train['log10_price']);

In [32]:
model.best_params_

{'min_samples_leaf': 13}

In [33]:
base_model = DecisionTreeRegressor(criterion='squared_error', min_samples_leaf=13)
parameters = {'max_depth': np.arange(1, 25)}

model = GridSearchCV(base_model, parameters, cv=5, scoring='neg_mean_squared_error')
model.fit(perth_train.drop(columns='log10_price'), perth_train['log10_price']);

In [34]:
model.best_params_

{'max_depth': 18}

In [35]:
model.best_score_

-0.011331706862915394

# Visualize Tree

In [38]:
tree = DecisionTreeRegressor(criterion = 'squared_error', max_depth=23, min_samples_leaf=17, random_state=0)
tree.fit(perth_train.drop(columns='log10_price'), perth_train['log10_price'])

feature_names = perth_train.drop(columns='log10_price').columns

dot_data = export_graphviz(tree, max_depth=3, impurity=False, feature_names=feature_names, rounded=True, 
                           filled=True) 
graph = graphviz.Source(dot_data)
graph.render('Tree')

'Tree.pdf'