In [None]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_sample_weight

from xgboost import XGBClassifier

from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe


In [None]:
# Absolute pairwise correlations between the numeric (and boolean) columns.
# The frame is narrowed explicitly because DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, and df_ appears to still hold some here.
corr_abs = df_.select_dtypes(include=['number', 'bool']).corr().abs()

# Keep only the strict upper triangle so every unordered pair is listed exactly
# once and the diagonal (a column against itself) is excluded. This replaces
# de-duplicating on the correlation value, which also discarded unrelated pairs
# that happened to share a value, and filtering on cc < 1, which hid perfectly
# correlated distinct columns.
upper_mask = np.triu(np.ones(corr_abs.shape, dtype=bool), k=1)
df_corr = corr_abs.where(upper_mask).stack().dropna().reset_index()
df_corr.columns = ['level_0', 'level_1', 'cc']

# Index each row by its (column_a, column_b) tuple and rank by correlation.
df_corr['pairs'] = list(zip(df_corr['level_0'], df_corr['level_1']))
df_corr = df_corr.set_index('pairs')[['cc']].sort_values('cc', ascending=False)
df_corr.head(10)

In [None]:
# A column is flagged when its absolute correlation with any column that
# precedes it in the frame exceeds this threshold.
CORR_THRESHOLD = 0.85

# Restrict to numeric/boolean columns explicitly: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0.
corr = df_.select_dtypes(include=['number', 'bool']).corr()

# Strict lower triangle: entry (i, j) with j < i compares column i against an
# earlier column j, so only the later column of each correlated pair is flagged.
lower_mask = np.tril(np.ones(corr.shape, dtype=bool), k=-1)
too_correlated = (corr.abs().to_numpy() > CORR_THRESHOLD) & lower_mask

corr_feats = set(corr.columns[too_correlated.any(axis=1)])

In [None]:
# Remove the flagged highly-correlated columns from both frames.
dropped_cols = list(corr_feats)
df_ = df_.drop(columns=dropped_cols)
test_df = test_df.drop(columns=dropped_cols)

In [None]:
# Columns removed in place from both the training and the test frame.
unwanted_cols = ['date_recorded', 'recorded_by', 'wpt_name', 'Unnamed: 0.1', 'id']
for frame in (df_, test_df):
    frame.drop(columns=unwanted_cols, inplace=True)

In [None]:
# Drop the 'index' column from the training frame in place.
df_.drop(columns=['index'], inplace=True)

In [None]:
# Feature column names: every training column except the target.
cols = [name for name in df_.columns if name != 'status_group']

# Re-order the test frame to the training layout (columns absent from the test
# frame are created as NaN by reindex), then remove the target column.
test_df = test_df.reindex(columns=list(df_.columns)).drop(columns='status_group')

# Displays whether the test columns now match the feature list.
list(test_df.columns) == cols

In [None]:


# Features: int/float columns only. Target: label-encoded status_group.
X = df_.drop(['status_group'], axis=1).select_dtypes(['int', 'float'])
y = df_['status_group']

le = LabelEncoder()
y = le.fit_transform(y)

# First split: train vs. held-out (train_test_split's default proportions).
# Second split: the held-out part is divided again into validation and test.
X_train_, X_test_, y_train_, y_test_ = train_test_split(X, y, random_state=42, stratify=y)
X_val, X_test_, y_val, y_test_ = train_test_split(X_test_, y_test_, random_state=42, stratify=y_test_)

# Class-balanced weights, one per training row. They are computed after the
# split (previously this line ran before `y` existed) and from the training
# labels, so the array lines up with the rows a model is fitted on.
sample_weights = compute_sample_weight('balanced', y_train_)

# Standardised copies of the train/test features. The scaler is fitted on the
# training rows only; the unsuffixed names are the scaled split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_)
X_test = scaler.transform(X_test_)
y_train, y_test = y_train_, y_test_

rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

y_hat_train = rf.predict(X_train)
y_hat = rf.predict(X_test)

print('Train Accuracy Score:', accuracy_score(y_train, y_hat_train))
print('Test Accuracy Score:', accuracy_score(y_test, y_hat))

In [None]:
# Class-balanced weights for the rows this model is fitted on, so the array
# length matches X_train / y_train.
sample_weights = compute_sample_weight('balanced', y_train)

xgb = XGBClassifier(use_label_encoder=False,
                    n_jobs=-1,
                    tree_method='gpu_hist',
                    objective='multi:softprob')

# sample_weight is an argument of fit(); passed to the constructor it is not a
# model parameter and the weights never reach training.
xgb.fit(X_train, y_train, sample_weight=sample_weights)

# Predict with this model for both splits; the train score previously reused
# predictions left over from an earlier model.
y_hat_train = xgb.predict(X_train)
y_hat = xgb.predict(X_test)

print('Train Accuracy score:', accuracy_score(y_train, y_hat_train))
print('Test Accuracy Score:', accuracy_score(y_test, y_hat))

In [None]:
def score(params):
    """Hyperopt objective: mean 5-fold cross-validated accuracy on the training split.

    For each stratified fold an XGBClassifier is built from ``params``, fitted
    on the fold's training rows with class-balanced sample weights, and scored
    with plain accuracy on the fold's held-out rows. Reads the notebook globals
    ``X_train_`` and ``y_train_``; the validation and test splits are not used.

    Args:
        params: dict sampled from the search space with the keys
            ``n_estimators``, ``max_depth``, ``min_child_weight``,
            ``subsample``, ``gamma``, ``colsample_bytree`` and ``eta``.

    Returns:
        dict with ``loss`` (negated mean accuracy, because hyperopt minimises)
        and ``status`` set to ``STATUS_OK``.
    """
    print("Training with params: ")
    print(params)

    # Plain arrays so fold indices can be applied positionally whether the
    # split is held as a DataFrame or an ndarray.
    features = np.asarray(X_train_)
    labels = np.asarray(y_train_)

    fold_scores = []
    # Unshuffled stratified 5-fold split, the same scheme cross_val_score
    # applies to a classifier when given cv=5.
    for fit_idx, holdout_idx in StratifiedKFold(n_splits=5).split(features, labels):
        gbm_model = XGBClassifier(
            use_label_encoder=False,
            n_estimators=int(params['n_estimators']),  # quniform yields floats
            max_depth=int(params['max_depth']),
            min_child_weight=params['min_child_weight'],
            subsample=params['subsample'],
            gamma=params['gamma'],
            colsample_bytree=params['colsample_bytree'],
            eta=params['eta'],
            eval_metric='mlogloss',
            objective='multi:softmax',
            tree_method='gpu_hist',
            booster='gbtree',
            verbosity=0,        # replaces the deprecated `silent` flag
            random_state=42,
            num_class=3,
        )
        # Weights must be handed to fit(); given to the constructor they are
        # not a model parameter and have no effect on training.
        fold_weights = compute_sample_weight('balanced', labels[fit_idx])
        gbm_model.fit(features[fit_idx], labels[fit_idx], sample_weight=fold_weights)

        fold_pred = gbm_model.predict(features[holdout_idx])
        fold_scores.append(accuracy_score(labels[holdout_idx], fold_pred))

    # The earlier extra fit on the full training split and the unused
    # prediction on the test split were removed: neither fed into the loss.
    score_ = float(np.mean(fold_scores))

    print("\tScore {0}\n\n".format(score_))

    return {'loss': -score_, 'status': STATUS_OK}


def optimize(trials, max_evals=1000):
    """Search the XGBoost hyperparameter space with hyperopt's TPE algorithm.

    Args:
        trials: hyperopt ``Trials`` object that records every evaluation.
        max_evals: number of configurations to evaluate (default 1000).

    Returns:
        dict mapping each hyperparameter name to the best value found. Values
        are the actual parameter values, not hyperopt's internal encoding.
    """
    space = {
        'n_estimators': hp.quniform('n_estimators', 10, 1000, 1),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        'max_depth': hp.choice('max_depth', np.arange(1, 20, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    }

    # Minimise the loss returned by `score` over the space above.
    best = fmin(score,
                space,
                algo=tpe.suggest,
                trials=trials,
                max_evals=max_evals)

    # fmin reports hp.choice parameters as the index of the selected option
    # (so a reported max_depth of 7 would mean depth 8 here). space_eval maps
    # the result back onto the real values defined in the space.
    return space_eval(space, best)

# Trials object that collects the history of every evaluation in the search.
trials = Trials()

# Run the hyperparameter search and print whatever it returns.
best_hyperparams = optimize(trials)
print("The best hyperparameters are: ", "\n")
print(best_hyperparams)

# NOTE(review): the line below prints a fixed string literal, not the value of
# best_hyperparams from the run above; presumably a record of an earlier run.
print("Best Parameters: {'colsample_bytree': 0.6000000000000001, 'eta': 0.025, 'gamma': 0.55, 'max_depth': 7, 'min_child_weight': 2.0, 'n_estimators': 680.0, 'subsample': 1.0}")

In [None]:
# Final XGBoost model with fixed hyperparameters.
xgb = XGBClassifier(use_label_encoder=False,
                    eval_metric='mlogloss',
                    objective='multi:softmax',
                    tree_method='gpu_hist',
                    booster='gbtree',
                    verbosity=0,          # replaces the deprecated `silent` flag
                    random_state=42,
                    num_class=3,
                    colsample_bytree=0.7,
                    eta=0.075,
                    gamma=0.65,
                    max_depth=6,
                    min_child_weight=6,   # was misspelled `mind_child_weight`, so it never took effect
                    n_estimators=166,
                    subsample=0.75)

# Evaluation sets reported on during boosting: training split, then validation split.
watchlist = [(X_train_, y_train_), (X_val, y_val)]

# Class-balanced weights belong to fit() and must line up with the training
# rows; passed to the constructor they had no effect on training.
xgb.fit(X_train_, y_train_,
        sample_weight=compute_sample_weight('balanced', y_train_),
        eval_set=watchlist)

y_hat_train = xgb.predict(X_train_)
y_hat = xgb.predict(X_test_)

print('Train Accuracy score:', accuracy_score(y_train_, y_hat_train))
print('Test Accuracy Score:', accuracy_score(y_test_, y_hat))

In [None]:
# Score the test frame on exactly the feature columns the model was trained with.
feature_columns = list(X.columns)
test_df_ = test_df[feature_columns]

# Predict encoded classes and map them back to the original status labels.
decoded_labels = le.inverse_transform(xgb.predict(test_df_))
pred = pd.DataFrame(decoded_labels, columns=['status_group'])

# Row ids are re-read from the test source and given a fresh 0..n-1 index so
# they pair with the predictions by position.
sub_idx = pd.read_csv(test_url)[['id']].reset_index(drop=True)

final_sub = pd.concat([sub_idx, pred], axis=1).reset_index(drop=True)
final_sub.set_index('id').to_csv('data/final_sub.csv')