In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import tree
#from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score, GridSearchCV
# need to install conda install python-graphviz to enable this!
import graphviz
# dtreeviz has installation instructions on their site, follow it!
from dtreeviz.trees import *

In [2]:
trainingFeaturesFilename = 'ModelInput/features.csv'
trainingLabelsFilename = 'ModelInput/labels.csv'

trainingFeatures = pd.read_csv(trainingFeaturesFilename)
trainingLabels = pd.read_csv(trainingLabelsFilename)

# For classification this isn't necessary!
#trainingFeatures['cityHasRiver'] = trainingFeatures['cityHasRiver'].astype('category')

print("Training features: {}".format(trainingFeatures.shape))
print("Training labels: {}".format(trainingLabels.shape))

#print(trainingFeatures.head())
#print(trainingFeatures.dtypes)

Training features: (128, 36)
Training labels: (128, 13)


In [3]:
tempDF = pd.merge(trainingFeatures, trainingLabels, on='cityId', how='inner')
print(tempDF.shape)

(128, 48)


In [4]:
# Start at 2nd column, i.e. exclude country_code
#X = pd.get_dummies(trainingFeatures.iloc[:,2:len(trainingFeatures)])
X = pd.get_dummies(trainingFeatures[['PlainsHillsRainforest', 'GrasslandMarsh', 'GrasslandHillsWoods', 'Lux',
       'Bonus', 'GrasslandHills']])
y = tempDF['cityScore'].values
print(X.shape)
#print(type(X))
print(y)

(128, 6)
[1 2 0 0 0 2 1 1 2 2 1 0 1 0 0 0 0 1 2 2 2 2 2 2 2 1 1 2 2 0 0 1 0 0 0 0 0
 0 0 1 1 1 0 0 0 0 0 2 1 0 2 2 1 2 1 0 0 1 0 0 0 0 1 0 0 0 2 2 2 2 2 1 0 0
 1 0 1 0 1 0 2 2 2 2 1 2 1 1 1 0 1 0 1 0 0 2 1 2 2 1 2 1 1 1 0 1 0 1 0 0 2
 1 2 2 1 1 2 1 0 0 0 0 0 0 0 0 0 1]


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [6]:
clf = tree.DecisionTreeClassifier()
#clf.max_depth = 4
#clf.min_samples_leaf = 4

clf = clf.fit(X_train, y_train)
#clf = clf.fit(X, y)

In [7]:
# This is basically the "simplest" version.
y_predict = clf.predict(X_test)
metrics.accuracy_score(y_test, y_predict)

0.46875

In [8]:
scores = cross_val_score(clf, X, y, cv=10)
print(scores)

[0.42857143 0.5        0.21428571 0.21428571 0.23076923 0.33333333
 0.75       0.58333333 0.75       0.36363636]


In [9]:
param_grid = {"criterion": ["gini", "entropy"],
              "min_samples_split": [2, 10, 20],
              "max_depth": [None, 2, 5, 10],
              "min_samples_leaf": [1, 5, 10],
              "max_leaf_nodes": [None, 5, 10, 20],
              }
clf = tree.DecisionTreeClassifier()
grid_search = GridSearchCV(clf,param_grid=param_grid, cv=10)
grid_search.fit(X, y)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ['gini', 'entropy'], 'min_samples_split': [2, 10, 20], 'max_depth': [None, 2, 5, 10], 'min_samples_leaf': [1, 5, 10], 'max_leaf_nodes': [None, 5, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [10]:
# We now have optimised version. Unlimited depth and breadth. tests as 100 reliable!?
y_predict = grid_search.best_estimator_.predict(X_test)
metrics.accuracy_score(y_test, y_predict)
#grid_search.best_estimator_
#gives...
#DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
#            max_features=None, max_leaf_nodes=None,
#            min_impurity_decrease=0.0, min_impurity_split=None,
#            min_samples_leaf=1, min_samples_split=2,
#            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
#            splitter='best')

0.875

In [11]:
dot_data = tree.export_graphviz(grid_search.best_estimator_, out_file=None, 
                      feature_names=X.columns.values,  
                      class_names=['poor', 'average', 'good'],
                      filled=True, rounded=True,  
                      special_characters=True)  
graph = graphviz.Source(dot_data)  
graph.render("Civ6CitySettlementDT_Ver02")

'Civ6CitySettlementDT_Ver02.pdf'

In [12]:
viz = dtreeviz(grid_search.best_estimator_,
               X_train,
               y_train,
               target_name = 'city grading',
               feature_names=X.columns.values,  
               class_names=['poor', 'average', 'good']
#               fancy=False
              )
viz.view()

Consider this version 2 of the model.

'Civ6CitySettlementDT_Ver02.pdf' and 'Civ6CitySettlementDT_dtree_Ver02.pdf' has the pretty pictures.

Next step is to simply inputs based on this Decision Tree!