## Data Handling

In [37]:
import pandas as pd
import numpy as np
import pickle
# Seed NumPy's global RNG up front so later draws from it are repeatable.
np.random.seed(4444)

# Location of the pickled per-document outputs, kept in one named constant so
# the path can be changed without touching the loading code.
# NOTE(review): this relative path climbs five directory levels, so it only
# resolves when the notebook is run from its original working directory.
EMB_PICKLE_PATH = '../../../../../jaeyeun/01_nh_poc/23_BERT_NH_NO_2/nh_output_20200812_raw_text_embedding/test_set_prediction.pickle'

# SECURITY: pickle.load can execute arbitrary code embedded in the file --
# only load pickles that come from a trusted source.
with open(EMB_PICKLE_PATH, 'rb') as f:
    emb = pickle.load(f)

In [38]:
# Peek at one record to see which fields each entry of `emb` carries.
emb['R1102079-1.txt'].keys()

dict_keys(['pred', 'dataset_n', 'train_val_test', 'softmax', 'label', 'feature'])

In [42]:
# Accumulators for the feature vectors and labels of each split.
X_train_feature, X_val_feature, X_test_feature = [], [], []
y_train, y_val, y_test = [], [], []

# Route every record to a split using its 'dataset_n' and 'train_val_test' tags.
for title, data in emb.items():
    dataset_n = data['dataset_n']
    tvt = data['train_val_test']

    # Any record whose dataset tag is not '0' is held out as test data.
    if dataset_n != '0':
        X_test_feature.append(data['feature'])
        y_test.append(data['label'])
        continue

    # Within dataset '0': tags '0' and '1' both go to training,
    # every other tag goes to validation.
    if tvt in ('0', '1'):
        X_train_feature.append(data['feature'])
        y_train.append(data['label'])
    else:
        X_val_feature.append(data['feature'])
        y_val.append(data['label'])

In [43]:
# Stack each split's list of feature vectors into a single NumPy array.
X_train, X_val, X_test = map(
    np.array, (X_train_feature, X_val_feature, X_test_feature)
)

In [44]:
# Shapes of the train / validation / test feature matrices, one per line.
print(X_train.shape, X_val.shape, X_test.shape, sep='\n')

(9174, 768)
(1626, 768)
(273, 768)


In [46]:
# Number of labels in the train / validation / test splits, one per line;
# these should match the row counts of the corresponding feature matrices.
print(len(y_train), len(y_val), len(y_test), sep='\n')

9174
1626
273


## Classify

* XGBoost hyperparameter tuning with randomized search (`RandomizedSearchCV`)

In [47]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

In [48]:
# Base estimator; hyperparameters not listed in the grid below are left unset here.
xgb_clf = xgb.XGBClassifier()

# Discrete candidate values that the randomized search samples from.
xgb_param_grid = {'learning_rate': [.01, .015, .025, .05, .1],
#                   'gamma': [.05, .1, .3, .5, .7, .9, 1],
                  'max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
                  'min_child_weight': [1, 3, 5, 7],
                  'subsample': np.linspace(0.6, 1, 5)}

# Extra keyword arguments that the search forwards to XGBClassifier.fit.
# With several eval sets, early stopping monitors the last one, i.e. (X_val, y_val).
# NOTE(review): because X_val steers early stopping, the validation accuracy
# reported further down is an optimistic estimate; only the test split is untouched.
fit_params = {
    "early_stopping_rounds": 20,
    "eval_metric": "merror",
    "eval_set": [(X_train, y_train), (X_val, y_val)],
}

# Create a random search object.
# random_state pins the parameter sampler so the same 20 candidates are drawn on
# every run, regardless of how much of NumPy's global RNG state earlier cells used.
xgb_random = RandomizedSearchCV(estimator=xgb_clf,
                                param_distributions=xgb_param_grid,
                                n_iter=20,  # number of sampled parameter combinations
                                scoring='accuracy',
                                n_jobs=-1,
                                cv=5,
                                refit=True,  # retrain the best candidate on all of X_train
                                return_train_score=True,
                                random_state=4444,
                                verbose=10)

# Fit to the training data (5 folds x 20 candidates, then one final refit).
xgb_random.fit(X_train, y_train, **fit_params)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 17.1min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed: 24.7min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 34.1min
[Parallel(n_jobs=-1)]: Done  48 out of 100 | elapsed: 42.4min remaining: 46.0min
[Parallel(n_jobs=-1)]: Done  59 out of 100 | elapsed: 52.0min remaining: 36.1min
[Parallel(n_jobs=-1)]: Done  70 out of 100 | elapsed: 56.9min remaining: 24.4min
[Parallel(n_jobs=-1)]: Done  81 out of 100 | elapsed: 63.7min remaining: 14.9min
[Parallel(n_jobs=-1)]: Done  92 out of 100 | elapsed: 66.6min remaining:  5.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 74.5min finished


[0]	validation_0-merror:0.37377	validation_1-merror:0.49816
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.28363	validation_1-merror:0.43235
[2]	validation_0-merror:0.23359	validation_1-merror:0.39299
[3]	validation_0-merror:0.20013	validation_1-merror:0.36900
[4]	validation_0-merror:0.17582	validation_1-merror:0.36285
[5]	validation_0-merror:0.15871	validation_1-merror:0.35424
[6]	validation_0-merror:0.14476	validation_1-merror:0.34010
[7]	validation_0-merror:0.13353	validation_1-merror:0.33825
[8]	validation_0-merror:0.12121	validation_1-merror:0.32595
[9]	validation_0-merror:0.11413	validation_1-merror:0.31857
[10]	validation_0-merror:0.10813	validation_1-merror:0.31857
[11]	validation_0-merror:0.10017	validation_1-merror:0.31550
[12]	validation_0-merror:0.09124	validation_1-merror:0.30812
[13]	validation_0-merror:0.08448	validation_1-merror:0

RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                                           reg_lambda=None,
                                           scale_pos_weight=None,
                                           subsample=No

In [49]:
# Predict labels for every split with the refit best model from the search.
pred_train, pred_val, pred_test = map(
    xgb_random.best_estimator_.predict, (X_train, X_val, X_test)
)

In [50]:
from sklearn.metrics import accuracy_score

In [51]:
# Accuracy on the train, validation and test splits, one value per line.
print(
    accuracy_score(y_train, pred_train),
    accuracy_score(y_val, pred_val),
    accuracy_score(y_test, pred_test),
    sep='\n',
)

0.9982559407019839
0.7736777367773677
0.36996336996337


In [52]:
# Display the refit model carrying the best hyperparameters found by the search.
xgb_random.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=17,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.6,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [53]:
# Put the search's per-candidate cross-validation results into a DataFrame for inspection.
xgb_cv_result_df = pd.DataFrame(xgb_random.cv_results_)

In [54]:
# Expand the per-candidate `params` dicts into one row per candidate with a
# single DataFrame construction. The row count follows the search results, so
# it stays correct if n_iter changes, and the resulting 0..n-1 index matches
# the row positions in xgb_cv_result_df.
xgb_param_table = pd.DataFrame(xgb_cv_result_df["params"].tolist())

In [55]:
# Attach each candidate's mean cross-validated score. `.values` hands pandas a
# bare array, so scores are assigned by row position rather than aligned on index labels.
xgb_param_table['mean_test_score'] = xgb_cv_result_df['mean_test_score'].values

In [56]:
# Rank candidates by mean CV score, lowest first (sort_values is ascending by default).
xgb_param_table.sort_values(by='mean_test_score', axis=0)

Unnamed: 0,subsample,min_child_weight,max_depth,learning_rate,mean_test_score
0,1.0,3,9,0.015,0.67844
0,1.0,7,12,0.015,0.684435
0,0.6,7,25,0.01,0.688249
0,0.9,1,25,0.01,0.691193
0,0.7,1,7,0.01,0.69141
0,0.9,3,12,0.01,0.695116
0,0.7,3,7,0.015,0.698823
0,0.7,3,12,0.01,0.702746
0,0.8,5,9,0.015,0.703401
0,1.0,7,12,0.025,0.70678
