In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import  compute_sample_weight
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [8]:
train_df = pd.read_csv('data/ml_training.txt', index_col=[0])
test_df = pd.read_csv('data/ml_testing.txt', index_col=[0])

In [9]:
df_corr=train_df.corr().abs().stack().reset_index().sort_values(0, ascending=False)
df_corr['pairs'] = list(zip(df_corr.level_0, df_corr.level_1))
df_corr.set_index(['pairs'], inplace = True)
df_corr.drop(columns=['level_1', 'level_0'], inplace = True)
df_corr.columns = ['cc']
df_corr.drop_duplicates(inplace=True)
df_corr = df_corr[df_corr['cc'] < 1.0000]
df_corr.head(10)

Unnamed: 0_level_0,cc
pairs,Unnamed: 1_level_1
"(source_class_non_functional, source_class_functional_needs_repair)",0.997228
"(extraction_type_group_functional, extraction_type_functional)",0.997188
"(quantity_non_functional, quantity_group_functional)",0.996439
"(quantity_group_functional, quantity_group_non_functional)",0.996439
"(waterpoint_type_group_non_functional, waterpoint_type_group_functional)",0.996435
"(quality_group_non_functional, quality_group_functional)",0.996039
"(water_quality_functional, quality_group_functional)",0.995305
"(extraction_type_group_non_functional, extraction_type_non_functional)",0.995145
"(waterpoint_type_non_functional, waterpoint_type_functional)",0.99497
"(water_quality_functional, quality_group_non_functional)",0.991363


In [None]:
corr_feats = set()
corr = df_.corr()

for i in range(len(corr.columns)):
    for j in range(i):
        if abs(corr.iloc[i, j]) > 0.85:
            colname = corr.columns[i]
            corr_feats.add(colname)

In [None]:
df_ = df_.drop(corr_feats, axis=1)
test_df = test_df.drop(corr_feats, axis=1)

In [None]:
unwanted_cols = ['date_recorded', 'recorded_by', 'wpt_name', 'Unnamed: 0.1', 'id']
df_.drop(unwanted_cols, axis=1, inplace=True)
test_df.drop(unwanted_cols, axis=1, inplace=True)

In [None]:
df_.drop('index', axis=1, inplace=True)

In [None]:
cols = [col for col in df_.columns.tolist() if col != 'status_group']

test_df = test_df.reindex(columns=[col for col in df_.columns]).drop('status_group', axis=1)

test_df.columns.tolist() == cols

In [18]:
X = train_df.drop(['status_group', 'id'], axis=1).select_dtypes(['int', 'float'])
y = train_df['status_group']

sample_weights = compute_sample_weight('balanced', y)

le = LabelEncoder()
y = le.fit_transform(y)

X_train_, X_test_, y_train_, y_test_ = train_test_split(X,y, random_state=42, stratify=y)
X_val, X_test_, y_val, y_test_ = train_test_split(X_test_, y_test_, random_state=42, stratify=y_test_)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_)
X_test = scaler.transform(X_test_)

rf = RandomForestClassifier(random_state=42, n_jobs=-1)

rf.fit(X_train, y_train_)
y_hat_train = rf.predict(X_train)
y_hat = rf.predict(X_test)

print('Train Accuracy Score:', accuracy_score(y_hat_train, y_train_))
print('Test Accuracy Score:', accuracy_score(y_test_, y_hat))

Train Accuracy Score: 0.9994612794612795
Test Accuracy Score: 0.834365741987611


In [17]:
X_train_

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,population,well_age,funder_functional,funder_functional_needs_repair,funder_non_functional,...,source_type_non_functional,source_class_functional,source_class_functional_needs_repair,source_class_non_functional,waterpoint_type_functional,waterpoint_type_functional_needs_repair,waterpoint_type_non_functional,waterpoint_type_group_functional,waterpoint_type_group_functional_needs_repair,waterpoint_type_group_non_functional
32062,27.634856,927.475118,35.891117,-6.292893e+00,0,676.520000,22,0.834341,0.022975,0.142684,...,0.462131,0.542320,0.059855,0.397825,0.366213,0.106177,0.527609,0.576491,0.083986,0.339523
13545,4000.000000,937.000000,36.016881,-7.286883e+00,0,407.243013,32,0.550064,0.051027,0.398909,...,0.302744,0.542320,0.059855,0.397825,0.621485,0.079237,0.299278,0.576491,0.083986,0.339523
57464,3124.988836,1283.673823,32.975596,-3.730475e+00,0,668.600054,22,0.457460,0.277435,0.265105,...,0.448348,0.542320,0.059855,0.397825,0.617852,0.058840,0.323307,0.617852,0.058840,0.323307
28328,108.010342,930.199419,0.000000,-2.000000e-08,0,453.385992,22,0.657143,0.057143,0.285714,...,0.259259,0.545168,0.117872,0.336960,0.621485,0.079237,0.299278,0.576491,0.083986,0.339523
18040,12000.000000,1027.000000,35.326772,-9.877166e+00,0,1400.000000,20,0.792683,0.024390,0.182927,...,0.338923,0.545168,0.117872,0.336960,0.621485,0.079237,0.299278,0.576491,0.083986,0.339523
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23875,440.844038,2231.144161,33.638030,-9.355884e+00,0,93.983571,22,0.544930,0.120088,0.334982,...,0.338923,0.545168,0.117872,0.336960,0.621485,0.079237,0.299278,0.576491,0.083986,0.339523
39128,25.000000,1512.000000,31.048300,-7.520390e+00,0,277.893343,29,0.219048,0.019048,0.761905,...,0.448348,0.542320,0.059855,0.397825,0.131661,0.045925,0.822414,0.131661,0.045925,0.822414
39454,600.000000,789.000000,29.740101,-4.890159e+00,0,210.000000,19,0.250000,0.450000,0.300000,...,0.448348,0.542320,0.059855,0.397825,0.131661,0.045925,0.822414,0.131661,0.045925,0.822414
5809,20.000000,1553.000000,37.065571,-3.051792e+00,0,170.941685,11,0.977654,0.011173,0.011173,...,0.302744,0.542320,0.059855,0.397825,0.621485,0.079237,0.299278,0.576491,0.083986,0.339523


In [25]:
xgb = XGBClassifier(use_label_encoder=False, \
                    n_jobs=-1,
                    tree_method='gpu_hist',
                    objective='multi:softprob',
                    sample_weight=sample_weights)

watchlist = [(X_train_, y_train_), (X_val, y_val)]

xgb.fit(X_train_, y_train_, eval_set=watchlist, eval_metric='merror')

y_hat = xgb.predict(X_test_)
print('Train Accuracy score:', accuracy_score(y_hat_train, y_train_))
# print('Accuracy score:', accuracy_score(y_test, y_hat))
print('Test Accuracy Score:', accuracy_score(y_test_, y_hat))

Parameters: { "sample_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-merror:0.19917	validation_1-merror:0.19664
[1]	validation_0-merror:0.19109	validation_1-merror:0.19098
[2]	validation_0-merror:0.18857	validation_1-merror:0.18820
[3]	validation_0-merror:0.18727	validation_1-merror:0.18793
[4]	validation_0-merror:0.18274	validation_1-merror:0.18434


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[5]	validation_0-merror:0.18063	validation_1-merror:0.18281
[6]	validation_0-merror:0.17973	validation_1-merror:0.18326
[7]	validation_0-merror:0.17863	validation_1-merror:0.18219
[8]	validation_0-merror:0.17690	validation_1-merror:0.18012
[9]	validation_0-merror:0.17571	validation_1-merror:0.17904
[10]	validation_0-merror:0.17421	validation_1-merror:0.17824
[11]	validation_0-merror:0.17394	validation_1-merror:0.17698
[12]	validation_0-merror:0.17347	validation_1-merror:0.17590
[13]	validation_0-merror:0.17138	validation_1-merror:0.17339
[14]	validation_0-merror:0.17012	validation_1-merror:0.17249
[15]	validation_0-merror:0.16992	validation_1-merror:0.17222
[16]	validation_0-merror:0.16945	validation_1-merror:0.17024
[17]	validation_0-merror:0.16772	validation_1-merror:0.16934
[18]	validation_0-merror:0.16700	validation_1-merror:0.16908
[19]	validation_0-merror:0.16678	validation_1-merror:0.16908
[20]	validation_0-merror:0.16541	validation_1-merror:0.16782
[21]	validation_0-merror:0.16

In [None]:
def score(params):
    print("Training with params: ")
    print(params)
    watchlist = [(X_train_, y_train_), (X_val, y_val)]
    gbm_model = XGBClassifier(
        use_label_encoder=False,
        n_estimators = int(params['n_estimators']),
        max_depth= params['max_depth'],
        min_child_weight= params['min_child_weight'],
        subsample= params['subsample'],
        gamma= params['gamma'],
        colsample_bytree= params['colsample_bytree'],
        eta= params['eta'],
        eval_metric= 'merror',
        objective= 'multi:softprob',
        tree_method='gpu_hist',
        booster= 'gbtree',
        silent= 1,
        seed= 42,
        # num_class= 3,
        sample_weight=sample_weights
    )
    gbm_model.fit(X_train_, y_train_,
                  eval_set=watchlist)
    predictions = gbm_model.predict(X_test_)

    score_ = cross_val_score(gbm_model, X_train_, y_train_, cv=5, scoring='accuracy', n_jobs=-1).mean()

    print("\tScore {0}\n\n".format(score_))

    return {'loss': -score_, 'status': STATUS_OK}


def optimize(trials):
    """
    This is the optimization function that given a space (space here) of
    hyperparameters and a scoring function (score here), finds the best hyperparameters.
    """

    space = {
        'n_estimators': hp.quniform('n_estimators', 10, 1000, 1),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        'max_depth':  hp.choice('max_depth', np.arange(1, 20, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    }
    # Use the fmin function from Hyperopt to find the best hyperparameters
    best = fmin(score, \
                space, \
                algo=tpe.suggest, \
                trials=trials, \
                max_evals=100)
    return best

trials = Trials()

best_hyperparams = optimize(trials)

Training with params:                                  
{'colsample_bytree': 0.8500000000000001, 'eta': 0.30000000000000004, 'gamma': 0.8500000000000001, 'max_depth': 16, 'min_child_weight': 4.0, 'n_estimators': 164.0, 'subsample': 0.65}
Parameters: { "sample_weight", "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):



[0]	validation_0-merror:0.14909	validation_1-merror:0.18416
[1]	validation_0-merror:0.13645	validation_1-merror:0.17824
[2]	validation_0-merror:0.13008	validation_1-merror:0.17464
[3]	validation_0-merror:0.12563	validation_1-merror:0.17231
[4]	validation_0-merror:0.12274	validation_1-merror:0.17069
[5]	validation_0-merror:0.11924	validation_1-merror:0.16854
[6]	validation_0-merror:0.11612	validation_1-merror:0.16611
[7]	validation_0-merror:0.11432	validation_1-merror:0.16620
[8]	validation_0-merror:0.11253	validation_1-merror:0.16441
[9]	validation_0-merror:0.11041	validation_1-merror:0.16432
[10]	validation_0-merror:0.10813	validation_1-merror:0.16288
[11]	validation_0-merror:0.10622	validation_1-merror:0.16378
[12]	validation_0-merror:0.10453	validation_1-merror:0.16279
[13]	validation_0-merror:0.10328	validation_1-merror:0.16297
[14]	validation_0-merror:0.10189	validation_1-merror:0.16414
[15]	validation_0-merror:0.10121	validation_1-merror:0.16360
[16]	validation_0-merror:0.09847	v

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):



[18]	validation_0-merror:0.20123	validation_1-merror:0.19682                      
[19]	validation_0-merror:0.19883	validation_1-merror:0.19503                      
[20]	validation_0-merror:0.19769	validation_1-merror:0.19386                      
[21]	validation_0-merror:0.19740	validation_1-merror:0.19314                      
[22]	validation_0-merror:0.19681	validation_1-merror:0.19287                      
[23]	validation_0-merror:0.19665	validation_1-merror:0.19242                      
[24]	validation_0-merror:0.19587	validation_1-merror:0.19170                      
[25]	validation_0-merror:0.19513	validation_1-merror:0.19152                      
[26]	validation_0-merror:0.19432	validation_1-merror:0.19054                      
[27]	validation_0-merror:0.19412	validation_1-merror:0.19054                      
[28]	validation_0-merror:0.19376	validation_1-merror:0.18955                      
[29]	validation_0-merror:0.19282	validation_1-merror:0.18964                      
[30]

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):



[4]	validation_0-merror:0.16685	validation_1-merror:0.17258                       
[5]	validation_0-merror:0.16682	validation_1-merror:0.17249                       
[6]	validation_0-merror:0.16451	validation_1-merror:0.17213                       
[7]	validation_0-merror:0.16368	validation_1-merror:0.16970                       
[8]	validation_0-merror:0.16020	validation_1-merror:0.16800                       
[9]	validation_0-merror:0.15845	validation_1-merror:0.16611                       
[10]	validation_0-merror:0.15706	validation_1-merror:0.16566                      
[11]	validation_0-merror:0.15522	validation_1-merror:0.16531                      
[12]	validation_0-merror:0.15421	validation_1-merror:0.16503                      
[13]	validation_0-merror:0.15084	validation_1-merror:0.16279                      
[14]	validation_0-merror:0.15003	validation_1-merror:0.16198                      
[15]	validation_0-merror:0.14954	validation_1-merror:0.16144                      
[16]

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):



[16]	validation_0-merror:0.19430	validation_1-merror:0.19170                        
[17]	validation_0-merror:0.19315	validation_1-merror:0.19054                        
[18]	validation_0-merror:0.19302	validation_1-merror:0.19098                        
[19]	validation_0-merror:0.19187	validation_1-merror:0.19045                        
[20]	validation_0-merror:0.19158	validation_1-merror:0.19045                        
[21]	validation_0-merror:0.19066	validation_1-merror:0.18991                        
[22]	validation_0-merror:0.18929	validation_1-merror:0.18901                        
[23]	validation_0-merror:0.18837	validation_1-merror:0.18865                        
[24]	validation_0-merror:0.18806	validation_1-merror:0.18829                        
[25]	validation_0-merror:0.18721	validation_1-merror:0.18748                        
[26]	validation_0-merror:0.18678	validation_1-merror:0.18730                        
[27]	validation_0-merror:0.18651	validation_1-merror:0.18623     

In [None]:
print("The best hyperparameters are: ", "\n")
print(best_hyperparams)

In [None]:
xgb = XGBClassifier(use_label_encoder=False,
                    eval_metric= 'mlogloss',
                    objective= 'multi:softmax',
                    tree_method='gpu_hist',
                    booster= 'gbtree',
                    silent= 1,
                    seed= 42,
                    num_class= 3,
                    sample_weight=sample_weights,
                    colsample_bytree=0.7,
                    eta=0.075,
                    gamma=0.65,
                    max_depth=6,
                    mind_child_weight=6,
                    n_estimators=166,
                    subsample=0.75
                    )

watchlist = [(X_train_, y_train_), (X_val, y_val)]

xgb.fit(X_train_, y_train_, eval_set=watchlist)
y_hat_train = xgb.predict(X_train_)
y_hat = xgb.predict(X_test_)
#
# y_hat = clf.predict(X_test)
print('Train Accuracy score:', accuracy_score(y_hat_train, y_train_))
# # print('Accuracy score:', accuracy_score(y_test, y_hat))
print('Test Accuracy Score:', accuracy_score(y_test_, y_hat))

In [None]:
test_df_ = test_df[[col for col in X.columns]]
pred = pd.DataFrame(le.inverse_transform(xgb.predict(test_df_)), columns=['status_group'])
sub_idx = pd.DataFrame(pd.read_csv(test_url)['id'])
sub_idx.reset_index(drop=True, inplace=True)
final_sub = pd.concat([sub_idx, pred], axis=1)
final_sub.reset_index(drop=True, inplace=True)
final_sub.set_index('id').to_csv('data/final_sub.csv')