# Import Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
df = pd.read_csv('cleaned_status_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40495 entries, 0 to 40494
Columns: 182 entries, Unnamed: 0 to wp_type_improved_spring
dtypes: float64(5), int64(177)
memory usage: 56.2 MB


In [3]:
df.drop(columns='Unnamed: 0', axis=1, inplace=True)

# Assign dependent/independent variables

In [4]:
y = df['target']
X = df.drop('target', axis=1)
X.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,district_code,population,public_meeting,permit,...,quantity_seasonal,source_dam,source_rainwater_harvesting,source_river_lake,source_shallow_well,source_spring,wp_type_communal_standpipe,wp_type_dam,wp_type_hand_pump,wp_type_improved_spring
0,34310,0.105961,0.264384,0.735732,0.733417,0,4,0.587264,1,1,...,0,1,0,0,0,0,1,0,0,0
1,67743,0.105961,0.115072,0.831861,0.040709,0,63,0.134434,1,1,...,0,0,0,0,0,0,1,0,0,0
2,19816,0.105961,0.258135,0.351565,0.738609,0,3,0.421958,1,1,...,0,0,0,0,0,0,0,0,1,0
3,53934,0.105961,0.258135,0.290512,0.608232,0,6,0.421958,1,1,...,1,0,0,0,0,0,0,0,1,0
4,50409,0.057763,0.397106,0.577275,0.095597,0,5,0.587264,1,0,...,0,0,0,0,1,0,0,0,1,0


# Training and testing split

In [5]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# GridSearchCV for Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier()
forest_model = forest_clf.fit(X_train, y_train)

In [8]:
forest_params = {'criterion' : ['gini', 'entropy'],
                 'min_samples_split' : [4, 6, 8, 10, 12, 14],
                 'min_samples_leaf' : [4, 6, 8, 10, 12, 14]
                }
grid_search = GridSearchCV(forest_model, param_grid=forest_params, scoring='accuracy')
grid_search.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'min_samples_leaf': [4, 6, 8, 10, 12, 14],
                         'min_samples_split': [4, 6, 8, 10, 12, 14]},
             scoring='accuracy')

In [9]:
grid_search.best_params_

{'criterion': 'gini', 'min_samples_leaf': 4, 'min_samples_split': 8}

# GridSearchCV For XGBoost

In [7]:
from xgboost import XGBClassifier

In [8]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

**Tree_Method - 'Auto'**

In [8]:
tree_params = {
    'tree_method' : ['auto', 'hist', 'approx'],
              }
tree_grid_search = GridSearchCV(xgb_model, param_grid=tree_params, scoring='accuracy')
tree_grid_search.fit(X_train, y_train)

GridSearchCV(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0, gpu_id=-1,
                                     importance_type='gain',
                                     interaction_constraints='',
                                     learning_rate=0.300000012,
                                     max_delta_step=0, max_depth=6,
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints='()',
                                     n_estimators=100, n_jobs=0,
                                     num_parallel_tree=1,
                                     objective='multi:softprob', random_state=0,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=None, subsample=1,
                                     tree

In [9]:
tree_best = tree_grid_search.best_params_
tree_best

{'tree_method': 'auto'}

**Learning_rate - 0.4**

In [12]:
learn_params = {
    'learning_rate' : [0.1, 0.2, 0.3, 0.4]
               }
learn_grid_search = GridSearchCV(xgb_model, param_grid=learn_params, scoring='accuracy')
learn_grid_search.fit(X_train, y_train)

GridSearchCV(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0, gpu_id=-1,
                                     importance_type='gain',
                                     interaction_constraints='',
                                     learning_rate=0.300000012,
                                     max_delta_step=0, max_depth=6,
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints='()',
                                     n_estimators=100, n_jobs=0,
                                     num_parallel_tree=1,
                                     objective='multi:softprob', random_state=0,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=None, subsample=1,
                                     tree

In [13]:
learn_best = learn_grid_search_2.best_params_
learn_best

{'learning_rate': 0.4}

**Number_estimators - 350**

In [14]:
n_estimators = range(50, 400, 50)
num_params = dict(n_estimators=n_estimators)
num_grid_search = GridSearchCV(xgb_model, param_grid=num_params, scoring='accuracy')
num_grid_search.fit(X_train, y_train)

GridSearchCV(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0, gpu_id=-1,
                                     importance_type='gain',
                                     interaction_constraints='',
                                     learning_rate=0.300000012,
                                     max_delta_step=0, max_depth=6,
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints='()',
                                     n_estimators=100, n_jobs=0,
                                     num_parallel_tree=1,
                                     objective='multi:softprob', random_state=0,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=None, subsample=1,
                                     tree

In [16]:
num_best = num_grid_search_1.best_params_
num_best

{'n_estimators': 350}

**Max_depth - 13**

In [17]:
max_depth = range(1, 30, 1)
max_params = dict(max_depth=max_depth)
max_grid_search = GridSearchCV(xgb_model, param_grid=max_params, scoring='accuracy')
max_grid_search.fit(X_train, y_train)

GridSearchCV(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0, gpu_id=-1,
                                     importance_type='gain',
                                     interaction_constraints='',
                                     learning_rate=0.300000012,
                                     max_delta_step=0, max_depth=6,
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints='()',
                                     n_estimators=100, n_jobs=0,
                                     num_parallel_tree=1,
                                     objective='multi:softprob', random_state=0,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=None, subsample=1,
                                     tree

In [18]:
max_best = max_grid_search.best_params_
max_best

{'max_depth': 13}

**There could be more tuning.**