In [91]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score, GridSearchCV
# need to install conda install python-graphviz to enable this!
import graphviz
# dtreeviz has installation instructions on their site, follow it!
from dtreeviz.trees import *

In [2]:
trainingFeaturesFilename = 'ModelInput/features.csv'
trainingLabelsFilename = 'ModelInput/labels.csv'

trainingFeatures = pd.read_csv(trainingFeaturesFilename)
trainingLabels = pd.read_csv(trainingLabelsFilename)

# For classification this isn't necessary!
#trainingFeatures['cityHasRiver'] = trainingFeatures['cityHasRiver'].astype('category')

print("Training features: {}".format(trainingFeatures.shape))
print("Training labels: {}".format(trainingLabels.shape))

#print(trainingFeatures.head())
#print(trainingFeatures.dtypes)

Training features: (128, 70)
Training labels: (128, 13)


In [3]:
tempDF = pd.merge(trainingFeatures, trainingLabels, on='cityId', how='inner')
print(tempDF.shape)

(128, 82)


In [45]:
# Start at 2nd column, i.e. exclude country_code
X = pd.get_dummies(trainingFeatures.iloc[:,2:len(trainingFeatures)])
del X['hasRiver']
y = tempDF['cityScore'].values
print(X.shape)
print(type(X))
print(y)

(128, 67)
<class 'pandas.core.frame.DataFrame'>
[1 3 1 0 0 3 3 3 4 3 2 1 2 0 1 2 0 3 4 4 4 3 4 3 4 3 3 3 4 0 0 3 1 0 1 0 0
 0 0 2 2 2 0 0 1 0 0 3 2 1 4 4 3 4 2 0 1 2 0 0 1 0 3 0 1 1 4 4 3 4 4 1 0 0
 1 0 3 0 2 1 3 4 4 4 3 4 2 2 1 1 2 0 2 1 0 4 3 4 4 3 4 2 2 1 1 2 0 2 1 0 4
 3 4 4 3 2 4 1 0 1 1 2 1 0 0 0 0 3]


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [6]:
clf = tree.DecisionTreeClassifier()
#clf.max_depth = 4
#clf.min_samples_leaf = 4

clf = clf.fit(X_train, y_train)

In [None]:
# This is basically the "simplest" version.
y_predict = clf.predict(X_test)
metrics.accuracy_score(y_test, y_predict)

In [50]:
scores = cross_val_score(clf, X, y, cv=10)
print(scores)

[0.2        0.2        0.26666667 0.35714286 0.15384615 0.66666667
 0.63636364 0.81818182 0.81818182 0.36363636]


In [80]:
param_grid = {"criterion": ["gini", "entropy"],
              "min_samples_split": [2, 10, 20],
              "max_depth": [None, 2, 5, 10],
              "min_samples_leaf": [1, 5, 10],
              "max_leaf_nodes": [None, 5, 10, 20],
              }
clf = tree.DecisionTreeClassifier()
grid_search = GridSearchCV(clf,param_grid=param_grid, cv=10)
grid_search.fit(X, y)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ['gini', 'entropy'], 'min_samples_split': [2, 10, 20], 'max_depth': [None, 2, 5, 10], 'min_samples_leaf': [1, 5, 10], 'max_leaf_nodes': [None, 5, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [87]:
# We now have optimised version. Unlimited depth and breadth. tests as 100 reliable!?
y_predict = grid_search.best_estimator_.predict(X_test)
metrics.accuracy_score(y_test, y_predict)
#grid_search.best_estimator_
#gives...
#DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
#            max_features=None, max_leaf_nodes=None,
#            min_impurity_decrease=0.0, min_impurity_split=None,
#            min_samples_leaf=1, min_samples_split=2,
#            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
#            splitter='best')

1.0

In [89]:
dot_data = tree.export_graphviz(grid_search.best_estimator_, out_file=None, 
                      feature_names=X.columns.values,  
                      class_names=['poor', 'okay', 'average', 'good', 'excellent'],
                      filled=True, rounded=True,  
                      special_characters=True)  
graph = graphviz.Source(dot_data)  
graph.render("Civ6CitySettlementDT")

'Civ6CitySettlementDT.pdf'

In [88]:
viz = dtreeviz(grid_search.best_estimator_,
               X_train,
               y_train,
               target_name = 'city grading',
               feature_names=X.columns.values,  
               class_names=['poor', 'okay', 'average', 'good', 'excellent'],
#               fancy=False
              )
viz.view()

Consider this version 1 of the model.

'Civ6CitySettlementDT.pdf' and 'Civ6CitySettlementDT_dtree.pdf' has the pretty pictures.

Next step is to simply inputs based on this Decision Tree!