## Data Handling

In [7]:
import pandas as pd
import numpy as np
import pickle
np.random.seed(4444)
with open ('../../../../../jaeyeun/01_nh_poc/23_BERT_NH_NO_2/nh_output_20200812_raw_text_embedding/test_set_prediction.pickle', 'rb') as f:
    emb = pickle.load(f)

In [8]:
X_train_feature = []
X_val_feature = []
X_test_feature = []
y_train = []
y_val = []
y_test = []

In [9]:
for title in emb.keys():
    data = emb[title]
    dataset_n = data['dataset_n']
    tvt = data['train_val_test']
    
    if dataset_n == '0':
        if tvt == '0' or tvt == '1':
            X_train_feature.append(data['feature'])
            y_train.append(data['label'])
        else:
            X_val_feature.append(data['feature'])
            y_val.append(data['label'])
    else:
        X_test_feature.append(data['feature'])
        y_test.append(data['label'])

In [10]:
X_train = np.array(X_train_feature)
X_val = np.array(X_val_feature)
X_test = np.array(X_test_feature)

In [11]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(9174, 768)
(1626, 768)
(273, 768)


In [12]:
print(len(y_train))
print(len(y_val))
print(len(y_test))

9174
1626
273


## Classify

* XGBoost RandomSearch

In [29]:
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgbm
from scipy import stats

In [18]:
lgbm_clf = lgbm.LGBMClassifier()

lgbm_param_grid = {'learning_rate': [.01, .015, .025, .05, .1],
#                   'Gamma': [.05, .1, .3, .5, .7, .9, 1],
                  'max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
                  'min_child_weight': [1, 3, 5, 7],
                  'subsample': np.linspace(0.6, 1, 5)}

fit_params = {"early_stopping_rounds" : 20,
             "eval_metric" : "multi_error",
             "eval_set" : [(X_train, y_train), (X_val, y_val)]}

# Create a random search object
lgbm_random = RandomizedSearchCV(estimator = lgbm_clf,
                                param_distributions = lgbm_param_grid,
                                n_iter = 20, # n_iters in param combinations
                                scoring='accuracy',
                                n_jobs=-1,
                                cv = 5,
                                refit=True,
                                return_train_score = True,
                                verbose=10)

# Fit to the training data
lgbm_random.fit(X_train, y_train, **fit_params)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  48 out of 100 | elapsed:  6.8min remaining:  7.4min
[Parallel(n_jobs=-1)]: Done  59 out of 100 | elapsed:  7.5min remaining:  5.2min
[Parallel(n_jobs=-1)]: Done  70 out of 100 | elapsed: 10.1min remaining:  4.3min
[Parallel(n_jobs=-1)]: Done  81 out of 100 | elapsed: 11.1min remaining:  2.6min
[Parallel(n_jobs=-1)]: Done  92 out of 100 | elapsed: 11.5min remaining:   59.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 11.8min finished


[1]	training's multi_error: 0.684979	training's multi_logloss: 2.09772	valid_1's multi_error: 0.704182	valid_1's multi_logloss: 2.12312
Training until validation scores don't improve for 20 rounds
[2]	training's multi_error: 0.53499	training's multi_logloss: 1.9702	valid_1's multi_error: 0.594096	valid_1's multi_logloss: 2.01905
[3]	training's multi_error: 0.431328	training's multi_logloss: 1.8598	valid_1's multi_error: 0.50738	valid_1's multi_logloss: 1.92812
[4]	training's multi_error: 0.370722	training's multi_logloss: 1.76411	valid_1's multi_error: 0.464945	valid_1's multi_logloss: 1.85543
[5]	training's multi_error: 0.334096	training's multi_logloss: 1.68282	valid_1's multi_error: 0.437884	valid_1's multi_logloss: 1.79211
[6]	training's multi_error: 0.305755	training's multi_logloss: 1.60628	valid_1's multi_error: 0.418204	valid_1's multi_logloss: 1.73226
[7]	training's multi_error: 0.283628	training's multi_logloss: 1.53784	valid_1's multi_error: 0.402829	valid_1's multi_logloss:

RandomizedSearchCV(cv=5, estimator=LGBMClassifier(), n_iter=20, n_jobs=-1,
                   param_distributions={'learning_rate': [0.01, 0.015, 0.025,
                                                          0.05, 0.1],
                                        'max_depth': [3, 5, 7, 9, 12, 15, 17,
                                                      25],
                                        'min_child_weight': [1, 3, 5, 7],
                                        'subsample': array([0.6, 0.7, 0.8, 0.9, 1. ])},
                   return_train_score=True, scoring='accuracy', verbose=10)

In [19]:
pred_train = lgbm_random.best_estimator_.predict(X_train)
pred_val = lgbm_random.best_estimator_.predict(X_val)
pred_test = lgbm_random.best_estimator_.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_val, pred_val))
print(accuracy_score(y_test, pred_test))

0.986047525615871
0.7564575645756457
0.3553113553113553


In [22]:
lgbm_random.best_estimator_

LGBMClassifier(max_depth=5, min_child_weight=3)

In [23]:
lgbm_cv_result_df = pd.DataFrame(lgbm_random.cv_results_)

In [24]:
df_list = []
for i in range(20):
    df_list.append(pd.DataFrame([lgbm_cv_result_df.loc[i, "params"]]))
    
lgbm_param_table = pd.concat(df_list)

In [26]:
lgbm_param_table['mean_test_score'] = lgbm_cv_result_df['mean_test_score'].values

In [28]:
lgbm_param_table.sort_values(by='mean_test_score', axis=0)

Unnamed: 0,subsample,min_child_weight,max_depth,learning_rate,mean_test_score
0,0.8,1,25,0.01,0.629278
0,0.7,5,7,0.01,0.632004
0,0.6,7,15,0.01,0.63353
0,0.8,1,5,0.015,0.641378
0,1.0,3,5,0.015,0.642577
0,0.8,3,3,0.05,0.654567
0,0.9,3,25,0.015,0.6598
0,0.8,3,15,0.015,0.6598
0,1.0,7,9,0.015,0.664487
0,0.8,3,3,0.1,0.706997
