## Data Handling

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import numpy as np
np.random.seed(4444)

In [8]:
df = pd.read_csv('../../../../jaeyeun/01_nh_poc/15_split_data_set_and_make_json_for_train_test_set/split_70_15_15.csv')

df.head()

Unnamed: 0,file_name,label,raw_text,dataset_n,train_val_test
0,R1509261.txt,0,보 도 자 료\nhttp://www.msip.go.kr 보도일시 2015. 9. 4...,0,0
1,R2003733.txt,0,보도일시 2020. 3. 18.(수) 조간(온라인 3. 17. 12:00)부터 보도...,0,0
2,D1507076-1.txt,0,보 도 자 료\nhttp://www.msip.go.kr 보도일시 2015. 7. 1...,0,0
3,R2005031.txt,0,보 도 자 료\n배포일시 2020. 4. 29.(수) 총 4매(본문2) 담당 부서 ...,0,0
4,R2006226.txt,0,<전매체> 2020년 6월 3일(수) 10:00(국무회의 개최시)부터 보도하여 주시...,0,0


* Train : KDI 70+15 (train+val)
* Val : KDI 15(test)
* Test : legal 273(train+val+test)

In [9]:
X_train_df = df[(df['dataset_n'] == 0) & ((df['train_val_test'] == 0) | (df['train_val_test'] == 1))]
X_val_df = df[(df['dataset_n'] == 0) & (df['train_val_test'] == 2)]
X_test_df = df[df['dataset_n'] != 0]
y_train = df[(df['dataset_n'] == 0) & ((df['train_val_test'] == 0) | (df['train_val_test'] == 1))]['label']
y_val = df[(df['dataset_n'] == 0) & (df['train_val_test'] == 2)]['label']
y_test = df[df['dataset_n'] != 0]['label']

In [10]:
print(X_train_df.shape)
print(X_val_df.shape)
print(X_test_df.shape)
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

(9174, 5)
(1626, 5)
(273, 5)
(9174,)
(1626,)
(273,)


In [11]:
n_features = 100000
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.9, # 0.9 만큼의 문서 이상에서 나오면 거른다.
    min_df=5, # 5개 미만의 문서에서 나오면 거른다.
    sublinear_tf = True, # tf value를 완만하게 처리 (outlier 처리 효과)
#     ngram_range = (1, 3),
    max_features=n_features)

In [12]:
X_train_tfidf = tfidf_vectorizer.fit(X_train_df['raw_text'])
# X_test_tfidf = tfidf_vectorizer.fit_transform(prep_text_test)
# X_test_hash = hash_vectorizer.fit_transform(prep_text_test)

In [13]:
X_train_tfidf_transformed = X_train_tfidf.transform(X_train_df['raw_text'])
X_val_tfidf_transformed = X_train_tfidf.transform(X_val_df['raw_text'])
X_test_tfidf_transformed = X_train_tfidf.transform(X_test_df['raw_text'])

In [14]:
print(X_train_tfidf_transformed.shape)
print(X_val_tfidf_transformed.shape)
print(X_test_tfidf_transformed.shape)

(9174, 96971)
(1626, 96971)
(273, 96971)


In [15]:
svd = TruncatedSVD(n_components=5000)
svd.fit(X_train_tfidf_transformed)

TruncatedSVD(n_components=5000)

In [16]:
X_train = svd.transform(X_train_tfidf_transformed)
X_val = svd.transform(X_val_tfidf_transformed)
X_test = svd.transform(X_test_tfidf_transformed)

In [17]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(9174, 5000)
(1626, 5000)
(273, 5000)


## Classify

* XGBoost RandomSearch

In [18]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

In [19]:
xgb_clf = xgb.XGBClassifier()

xgb_param_grid = {'learning_rate': [.01, .015, .025, .05, .1],
#                   'Gamma': [.05, .1, .3, .5, .7, .9, 1],
                  'max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
                  'min_child_weight': [1, 3, 5, 7],
                  'subsample': np.linspace(0.6, 1, 5)}

fit_params = {"early_stopping_rounds" : 20,
             "eval_metric" : "merror",
             "eval_set" : [(X_train, y_train), (X_val, y_val)]}

# Create a random search object
xgb_random = RandomizedSearchCV(estimator = xgb_clf,
                                param_distributions = xgb_param_grid,
                                n_iter = 20, # n_iters in param combinations
                                scoring='accuracy',
                                n_jobs=-1,
                                cv = 5,
                                refit=True,
                                return_train_score = True,
                                verbose=10)

# Fit to the training data
xgb_random.fit(X_train, y_train, **fit_params)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 206.1min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed: 284.2min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 373.6min
[Parallel(n_jobs=-1)]: Done  48 out of 100 | elapsed: 399.9min remaining: 433.2min
[Parallel(n_jobs=-1)]: Done  59 out of 100 | elapsed: 557.1min remaining: 387.1min
[Parallel(n_jobs=-1)]: Done  70 out of 100 | elapsed: 609.4min remaining: 261.2min
[Parallel(n_jobs=-1)]: Done  81 out of 100 | elapsed: 703.4min remaining: 165.0min
[Parallel(n_jobs=-1)]: Done  92 out of 100 | elapsed: 736.7min remaining: 64.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 815.5min finished


[0]	validation_0-merror:0.24351	validation_1-merror:0.32595
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 20 rounds.
[1]	validation_0-merror:0.18999	validation_1-merror:0.28536
[2]	validation_0-merror:0.16874	validation_1-merror:0.26876
[3]	validation_0-merror:0.14933	validation_1-merror:0.26138
[4]	validation_0-merror:0.13844	validation_1-merror:0.26199
[5]	validation_0-merror:0.12743	validation_1-merror:0.25154
[6]	validation_0-merror:0.11990	validation_1-merror:0.24969
[7]	validation_0-merror:0.11271	validation_1-merror:0.23924
[8]	validation_0-merror:0.10639	validation_1-merror:0.23862
[9]	validation_0-merror:0.09843	validation_1-merror:0.23493
[10]	validation_0-merror:0.09342	validation_1-merror:0.23370
[11]	validation_0-merror:0.09134	validation_1-merror:0.23309
[12]	validation_0-merror:0.08557	validation_1-merror:0.23124
[13]	validation_0-merror:0.08034	validation_1-merror:0

RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                                           reg_lambda=None,
                                           scale_pos_weight=None,
                                           subsample=No

In [20]:
pred_train = xgb_random.best_estimator_.predict(X_train)
pred_val = xgb_random.best_estimator_.predict(X_val)
pred_test = xgb_random.best_estimator_.predict(X_test)

In [21]:
from sklearn.metrics import accuracy_score

In [22]:
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_val, pred_val))
print(accuracy_score(y_test, pred_test))

0.9991279703509919
0.8148831488314883
0.20146520146520147


In [23]:
xgb_random.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=12,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.6,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
xgb_cv_result_df = pd.DataFrame(xgb_random.cv_results_)

In [25]:
df_list = []
for i in range(20):
    df_list.append(pd.DataFrame([xgb_cv_result_df.loc[i, "params"]]))
    
xgb_param_table = pd.concat(df_list)

In [26]:
xgb_param_table['mean_test_score'] = xgb_cv_result_df['mean_test_score'].values

In [27]:
xgb_param_table.sort_values(by='mean_test_score', axis=0)

Unnamed: 0,subsample,min_child_weight,max_depth,learning_rate,mean_test_score
0,1.0,3,3,0.01,0.70362
0,0.7,5,3,0.025,0.744932
0,0.8,7,17,0.01,0.757905
0,0.9,3,12,0.01,0.762807
0,0.8,1,5,0.015,0.763571
0,0.9,3,5,0.015,0.764116
0,1.0,3,25,0.015,0.770548
0,1.0,5,9,0.015,0.771093
0,0.8,5,5,0.025,0.775889
0,0.6,1,9,0.015,0.776107
