In [8]:
# Import Libraries 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix

from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn import tree
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# Experimental 1: (100 Funder & Installer Categories)

In [3]:
# Import experimental dataframe 1

df1 = pd.read_csv('experimental1_training_set')
df1.head()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,basin,region,region_code,district_code,lga,...,payment,payment_type,water_quality,quantity,source,waterpoint_type,well_age,status_group,top_funded,top_installers
0,6000.0,1390,34.938093,-9.856322,0,Lake Nyasa,Iringa,11,5,Ludewa,...,pay annually,annually,soft,enough,spring,communal standpipe,12,functional,Roman,Roman
1,0.0,1399,34.698766,-2.147466,0,Lake Victoria,Mara,20,2,Serengeti,...,never pay,never pay,soft,insufficient,rainwater harvesting,communal standpipe,3,functional,other,other
2,25.0,686,37.460664,-3.821329,0,Pangani,Manyara,21,4,Simanjiro,...,pay per bucket,per bucket,soft,enough,dam,communal standpipe multiple,4,functional,other,World Vision
3,0.0,263,38.486161,-11.155298,0,Ruvuma / Southern Coast,Mtwara,90,63,Nanyumbu,...,never pay,never pay,soft,dry,machine dbh,communal standpipe multiple,27,non functional,Unicef,UNICEF
4,0.0,0,31.130847,-1.825359,0,Lake Victoria,Kagera,18,1,Karagwe,...,never pay,never pay,soft,seasonal,rainwater harvesting,communal standpipe,12,functional,other,Artisan


In [6]:
# Change data type of district_code and region_code to 'string' in order to be one-hot encoded

df1['district_code'] = df1['district_code'].astype('str')
df1['region_code'] = df1['region_code'].astype('str')

In [7]:
# Isolate continuous variables 

df1_cont = df1[['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population', 'well_age']]

# Isolate categorical variables 

df1_cat = df1.drop(columns = ['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population',
                                        'well_age', 'status_group'], axis = 1)

# One-hot encode categorical variables using pd.get_dummies

cat_dummies_1 = pd.get_dummies(df1_cat, drop_first = True)
cat_dummies_1.head()

Unnamed: 0,num_private,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,region_Dar es Salaam,...,top_installers_WATER AID,top_installers_WEDECO,top_installers_WU,top_installers_WVT,top_installers_Wizara ya maji,top_installers_World,top_installers_World Bank,top_installers_World Vision,top_installers_other,top_installers_wananchi
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Convert the outcome column 'status_group' into numeric values

df1['status_group'].replace('functional', 1, inplace = True)
df1['status_group'].replace('non functional', 0, inplace = True)
df1['status_group'].replace('functional needs repair', 2, inplace = True)

In [9]:
# Concatenate the OHE categorical and continuous variables and the target variable back together 

processed_df1 = pd.concat([cat_dummies_1, df1_cont, df1['status_group']], axis = 1)
processed_df1.head()

Unnamed: 0,num_private,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,region_Dar es Salaam,...,top_installers_World Vision,top_installers_other,top_installers_wananchi,amount_tsh,gps_height,longitude,latitude,population,well_age,status_group
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,6000.0,1390,34.938093,-9.856322,109,12,1
1,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0.0,1399,34.698766,-2.147466,280,3,1
2,0,0,0,0,0,1,0,0,0,0,...,1,0,0,25.0,686,37.460664,-3.821329,250,4,1
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0.0,263,38.486161,-11.155298,58,27,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0.0,0,31.130847,-1.825359,0,12,1


In [10]:
# Split data into target (y) and predictors (X)

X = processed_df1.drop(['status_group'], axis = 1)
y = processed_df1['status_group']

In [11]:
# Split data using train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [13]:
# Instantiate XGBClassifier
clf1 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.4, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs= -1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

# Fit XGBClassifier
clf1.fit(X_train, y_train)

# Predict on training and test sets
training_preds = clf1.predict(X_train)
test_preds = clf1.predict(X_test)

# Accuracy of training and test sets
training_accuracy = accuracy_score(y_train, training_preds)
test_accuracy = accuracy_score(y_test, test_preds)

print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
print('Validation accuracy: {:.4}%'.format(test_accuracy * 100))

Training Accuracy: 87.42%
Validation accuracy: 80.38%


# Experimental 2: (25 Funder & Installer Categories)

In [17]:
# Import experimental dataframe 2

df2 = pd.read_csv('experimental2_training_set')
df2.head()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,basin,region,region_code,district_code,lga,population,...,management,payment,water_quality,quantity,source,waterpoint_type,well_age,status_group,top_funded,top_installers
0,6000.0,1390,34.938093,-9.856322,Lake Nyasa,Iringa,11,5,Ludewa,109,...,vwc,pay annually,soft,enough,spring,communal standpipe,12,functional,other,other
1,0.0,1399,34.698766,-2.147466,Lake Victoria,Mara,20,2,Serengeti,280,...,wug,never pay,soft,insufficient,rainwater harvesting,communal standpipe,3,functional,other,other
2,25.0,686,37.460664,-3.821329,Pangani,Manyara,21,4,Simanjiro,250,...,vwc,pay per bucket,soft,enough,dam,communal standpipe multiple,4,functional,other,World Vision
3,0.0,263,38.486161,-11.155298,Ruvuma / Southern Coast,Mtwara,90,63,Nanyumbu,58,...,vwc,never pay,soft,dry,machine dbh,communal standpipe multiple,27,non functional,Unicef,other
4,0.0,0,31.130847,-1.825359,Lake Victoria,Kagera,18,1,Karagwe,0,...,other,never pay,soft,seasonal,rainwater harvesting,communal standpipe,12,functional,other,other


In [19]:
# Change data type of district_code and region_code to 'string' in order to be one-hot encoded

df2['district_code'] = df2['district_code'].astype('str')
df2['region_code'] = df2['region_code'].astype('str')

# Isolate continuous variables 

df2_cont = df2[['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population', 'well_age']]

# Isolate categorical variables 

df2_cat = df2.drop(columns = ['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population',
                                        'well_age', 'status_group'], axis = 1)

# One-hot encode categorical variables using pd.get_dummies

cat_dummies_2 = pd.get_dummies(df2_cat, drop_first = True)
cat_dummies_2.head()

# Convert the outcome column 'status_group' into numeric values

df2['status_group'].replace('functional', 1, inplace = True)
df2['status_group'].replace('non functional', 0, inplace = True)
df2['status_group'].replace('functional needs repair', 2, inplace = True)

# Concatenate the OHE categorical and continuous variables and the target variable back together 

processed_df2 = pd.concat([cat_dummies_2, df2_cont, df2['status_group']], axis = 1)
processed_df2.head()

Unnamed: 0,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,region_Dar es Salaam,region_Dodoma,...,top_installers_WU,top_installers_World Vision,top_installers_other,amount_tsh,gps_height,longitude,latitude,population,well_age,status_group
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,6000.0,1390,34.938093,-9.856322,109,12,1
1,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0.0,1399,34.698766,-2.147466,280,3,1
2,0,0,0,0,1,0,0,0,0,0,...,0,1,0,25.0,686,37.460664,-3.821329,250,4,1
3,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0.0,263,38.486161,-11.155298,58,27,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0.0,0,31.130847,-1.825359,0,12,1


In [25]:
# Split data into target (y) and predictors (X)

X2 = processed_df2.drop(['status_group'], axis = 1)
y2 = processed_df2['status_group']

# Split data using train_test_split

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size = 0.20, random_state = 42)

In [26]:
# Instantiate XGBClassifier
clf2 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.4, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs= -1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

# Fit XGBClassifier
clf2.fit(X2_train, y2_train)

# Predict on training and test sets
training_preds2 = clf2.predict(X2_train)
test_preds2 = clf2.predict(X2_test)

# Accuracy of training and test sets
training_accuracy2 = accuracy_score(y2_train, training_preds2)
test_accuracy2 = accuracy_score(y2_test, test_preds2)

print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
print('Validation accuracy: {:.4}%'.format(test_accuracy * 100))

Training Accuracy: 88.03%
Validation accuracy: 80.36%


# Experimental 3: Drop 'amount_tsh'

In [22]:
df3 = processed_df2.drop(['amount_tsh'], axis = 1)
df3.head()

Unnamed: 0,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,region_Dar es Salaam,region_Dodoma,...,top_installers_WEDECO,top_installers_WU,top_installers_World Vision,top_installers_other,gps_height,longitude,latitude,population,well_age,status_group
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1390,34.938093,-9.856322,109,12,1
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,1399,34.698766,-2.147466,280,3,1
2,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,686,37.460664,-3.821329,250,4,1
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,263,38.486161,-11.155298,58,27,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,31.130847,-1.825359,0,12,1


In [27]:
# Split data into target (y) and predictors (X)

X3 = df3.drop(['status_group'], axis = 1)
y3 = df3['status_group']

# Split data using train_test_split

X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size = 0.20, random_state = 42)

In [28]:
# Instantiate XGBClassifier
clf3 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.4, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs= -1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

# Fit XGBClassifier
clf3.fit(X3_train, y3_train)

# Predict on training and test sets
training_preds3 = clf3.predict(X3_train)
test_preds3 = clf3.predict(X3_test)

# Accuracy of training and test sets
training_accuracy3 = accuracy_score(y3_train, training_preds3)
test_accuracy3 = accuracy_score(y3_test, test_preds3)

print('Training Accuracy: {:.4}%'.format(training_accuracy3 * 100))
print('Validation accuracy: {:.4}%'.format(test_accuracy3 * 100))

Training Accuracy: 87.73%
Validation accuracy: 80.28%


# Experimental 4: All Columns

In [29]:
# Import experimental dataframe 2

df4 = pd.read_csv('all_features_training_set')
df4.head()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,basin,region,region_code,district_code,lga,population,...,source,waterpoint_type,well_age,status_group,top_funded,top_installers,top_wpt_name,top_ward,top_subvillage,top_scheme_name
0,6000.0,1390,34.938093,-9.856322,Lake Nyasa,Iringa,11,5,Ludewa,109,...,spring,communal standpipe,12,functional,Roman,other,none,other,other,Roman
1,0.0,1399,34.698766,-2.147466,Lake Victoria,Mara,20,2,Serengeti,280,...,rainwater harvesting,communal standpipe,3,functional,other,other,Zahanati,other,other,other
2,25.0,686,37.460664,-3.821329,Pangani,Manyara,21,4,Simanjiro,250,...,dam,communal standpipe multiple,4,functional,other,World Vision,other,other,Majengo,other
3,0.0,263,38.486161,-11.155298,Ruvuma / Southern Coast,Mtwara,90,63,Nanyumbu,58,...,machine dbh,communal standpipe multiple,27,non functional,Unicef,UNICEF,other,other,other,other
4,0.0,0,31.130847,-1.825359,Lake Victoria,Kagera,18,1,Karagwe,0,...,rainwater harvesting,communal standpipe,12,functional,other,other,Shuleni,other,other,other


In [30]:
# Change data type of district_code and region_code to 'string' in order to be one-hot encoded

df4['district_code'] = df4['district_code'].astype('str')
df4['region_code'] = df4['region_code'].astype('str')

# Isolate continuous variables 

df4_cont = df4[['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population', 'well_age']]

# Isolate categorical variables 

df4_cat = df4.drop(columns = ['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population',
                                        'well_age', 'status_group'], axis = 1)

# One-hot encode categorical variables using pd.get_dummies

cat_dummies_4 = pd.get_dummies(df4_cat, drop_first = True)
cat_dummies_4.head()

# Convert the outcome column 'status_group' into numeric values

df4['status_group'].replace('functional', 1, inplace = True)
df4['status_group'].replace('non functional', 0, inplace = True)
df4['status_group'].replace('functional needs repair', 2, inplace = True)

# Concatenate the OHE categorical and continuous variables and the target variable back together 

processed_df4 = pd.concat([cat_dummies_4, df4_cont, df4['status_group']], axis = 1)
processed_df4.head()

Unnamed: 0,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,region_Dar es Salaam,region_Dodoma,...,top_scheme_name_upper Ruvu,top_scheme_name_wanging'ombe supply scheme,top_scheme_name_wanging'ombe water supply s,amount_tsh,gps_height,longitude,latitude,population,well_age,status_group
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,6000.0,1390,34.938093,-9.856322,109,12,1
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0.0,1399,34.698766,-2.147466,280,3,1
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,25.0,686,37.460664,-3.821329,250,4,1
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0.0,263,38.486161,-11.155298,58,27,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0.0,0,31.130847,-1.825359,0,12,1


In [31]:
# Split data into target (y) and predictors (X)

X4 = processed_df4.drop(['status_group'], axis = 1)
y4 = processed_df4['status_group']

# Split data using train_test_split

X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, test_size = 0.20, random_state = 42)

In [32]:
# Instantiate XGBClassifier
clf4 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.4, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs= -1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

# Fit XGBClassifier
clf4.fit(X4_train, y4_train)

# Predict on training and test sets
training_preds4 = clf4.predict(X4_train)
test_preds4 = clf4.predict(X4_test)

# Accuracy of training and test sets
training_accuracy4 = accuracy_score(y4_train, training_preds4)
test_accuracy4 = accuracy_score(y4_test, test_preds4)

print('Training Accuracy: {:.4}%'.format(training_accuracy * 100))
print('Validation accuracy: {:.4}%'.format(test_accuracy * 100))

Training Accuracy: 88.03%
Validation accuracy: 80.36%


# Experimental 5: MinMaxScaling Data

In [5]:
df5 = pd.read_csv('processed_training_set')
df5.head()

Unnamed: 0,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,region_Dar es Salaam,region_Dodoma,...,top_installers_WVT,top_installers_World Vision,top_installers_other,amount_tsh,gps_height,longitude,latitude,population,well_age,status_group
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,6000.0,1390,34.938093,-9.856322,109,12,1
1,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0.0,1399,34.698766,-2.147466,280,3,1
2,0,0,0,0,1,0,0,0,0,0,...,0,1,0,25.0,686,37.460664,-3.821329,250,4,1
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0.0,263,38.486161,-11.155298,58,27,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0.0,0,31.130847,-1.825359,0,12,1


In [6]:
# Split data into target (y) and predictors (X)

X5 = df5.drop(['status_group'], axis = 1)
y5 = df5['status_group']

# Split data using train_test_split

X5_train, X5_test, y5_train, y5_test = train_test_split(X5, y5, test_size = 0.20, random_state = 42)

In [9]:
#scale to make sure that all of our data is represented at the same scale
scale = MinMaxScaler()

#fit
X5_train = scale.fit_transform(X5_train) #to create a scaled version of the training dataset
X5_test = scale.transform(X5_test) #to create a scaled version of the test dataset

In [10]:
# Instantiate XGBClassifier
clf5 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=9, min_child_weight=1, missing=None, n_estimators=250,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

# Fit XGBClassifier
clf5.fit(X5_train, y5_train)

# Predict on training and test sets
training_preds5 = clf5.predict(X5_train)
test_preds5 = clf5.predict(X5_test)

# Accuracy of training and test sets
training_accuracy5 = accuracy_score(y5_train, training_preds5)
test_accuracy5 = accuracy_score(y5_test, test_preds5)

print('Training Accuracy: {:.4}%'.format(training_accuracy5 * 100))
print('Validation accuracy: {:.4}%'.format(test_accuracy5 * 100))

Training Accuracy: 88.82%
Validation accuracy: 81.17%


# Experimental 6: Modeling with SMOTE

In [None]:
df6 = pd.read_csv('processed_training_set')
df6.head()

In [None]:
# Split data into target (y) and predictors (X)

X6 = df6.drop(['status_group'], axis = 1)
y6 = df6['status_group']

# Split data using train_test_split

X6_train, X6_test, y6_train, y6_test = train_test_split(X6, y6, test_size = 0.20, random_state = 42)

In [11]:
#previous original class distribution
print('Class Distribution Before:')
print('Train Set')
print(y5_train.value_counts())

#fit
smote = SMOTE()
X5_train, y5_train = smote.fit_resample(X5_train, y5_train)

#check class distribution
print('\n')
print('Class Distribution After:')
print('Train Set')
print(pd.Series(y5_train).value_counts())

Class Distribution Before:
Train Set
1    25802
0    18252
2     3466
Name: status_group, dtype: int64


Class Distribution After:
Train Set
2    25802
1    25802
0    25802
Name: status_group, dtype: int64


In [12]:
# Instantiate XGBClassifier
clf5 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=9, min_child_weight=1, missing=None, n_estimators=250,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

# Fit XGBClassifier
clf5.fit(X5_train, y5_train)

# Predict on training and test sets
training_preds5 = clf5.predict(X5_train)
test_preds5 = clf5.predict(X5_test)

# Accuracy of training and test sets
training_accuracy5 = accuracy_score(y5_train, training_preds5)
test_accuracy5 = accuracy_score(y5_test, test_preds5)

print('Training Accuracy: {:.4}%'.format(training_accuracy5 * 100))
print('Validation accuracy: {:.4}%'.format(test_accuracy5 * 100))

Training Accuracy: 90.51%
Validation accuracy: 78.59%


# Experimental 7: Reducing Number of Features

In [None]:
#feature importance
decision_tree_clf.feature_importances_;

def plot_feature_importances(model):
    n_features = X_train.shape[1]
    plt.figure(figsize = (20, 100))
    plt.barh(range(n_features), model.feature_importances_, align = 'center') 
    plt.yticks(np.arange(n_features), X.columns.values) 
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

plot_feature_importances(decision_tree_clf)

#save
plt.savefig('./Data/decision_tree_clf_feature_importances')