## Data Handling

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
np.random.seed(4444)

In [2]:
df = pd.read_csv('../../../../../../jaeyeun/01_nh_poc/17_add_prep_text_to_excel/split_70_15_15_prep_v9.csv')

df.head()

Unnamed: 0,file_name,label,raw_text,dataset_n,train_val_test,prep_v9_text
0,R1509261.txt,0,보 도 자 료\nhttp://www.msip.go.kr 보도일시 2015. 9. 4...,0,0,온라인N 제도N 혁신N 관호N 미래N 사이트N 플러그인N 지원N 대응N 현황N 공개...
1,R2003733.txt,0,보도일시 2020. 3. 18.(수) 조간(온라인 3. 17. 12:00)부터 보도...,0,0,온라인N 정보N 산업N 기반N 유승N ictS 분야N 창업N 벤처N 지원N 사업N ...
2,D1507076-1.txt,0,보 도 자 료\nhttp://www.msip.go.kr 보도일시 2015. 7. 1...,0,0,이후N 연구N 예산N 내년도N 연구N 개발N r&dN 경제N 혁신N 미래N 성장N ...
3,R2005031.txt,0,보 도 자 료\n배포일시 2020. 4. 29.(수) 총 4매(본문2) 담당 부서 ...,0,0,첨단N 항공N 과장N 문석N 홍일산N 이후N 가능N 일상N 시대N 개막N 시행N 전...
4,R2006226.txt,0,<전매체> 2020년 6월 3일(수) 10:00(국무회의 개최시)부터 보도하여 주시...,0,0,국무회의N 개최N 문의N 기획N 재정관N 과장N 서기관N 위기N 혁신N 기회N 벤처...


* Train : KDI 70+15 (train+val)
* Val : KDI 15(test)
* Test : legal 273(train+val+test)

In [3]:
X_train_df = df[(df['dataset_n'] == 0) & ((df['train_val_test'] == 0) | (df['train_val_test'] == 1))]
X_val_df = df[(df['dataset_n'] == 0) & (df['train_val_test'] == 2)]
X_test_df = df[df['dataset_n'] != 0]
y_train = df[(df['dataset_n'] == 0) & ((df['train_val_test'] == 0) | (df['train_val_test'] == 1))]['label']
y_val = df[(df['dataset_n'] == 0) & (df['train_val_test'] == 2)]['label']
y_test = df[df['dataset_n'] != 0]['label']

In [4]:
print(X_train_df.shape)
print(X_val_df.shape)
print(X_test_df.shape)
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

(9174, 6)
(1626, 6)
(273, 6)
(9174,)
(1626,)
(273,)


In [5]:
n_features = 100000
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.9, # 0.9 만큼의 문서 이상에서 나오면 거른다.
    min_df=5, # 5개 미만의 문서에서 나오면 거른다.
    sublinear_tf = True, # tf value를 완만하게 처리 (outlier 처리 효과)
    ngram_range = (1, 3),
    max_features=n_features)

In [18]:
X_train_tfidf = tfidf_vectorizer.fit(X_train_df['prep_v9_text'])
# X_test_tfidf = tfidf_vectorizer.fit_transform(prep_text_test)
# X_test_hash = hash_vectorizer.fit_transform(prep_text_test)

In [19]:
X_train = X_train_tfidf.transform(X_train_df['prep_v9_text'])
X_val = X_train_tfidf.transform(X_val_df['prep_v9_text'])
X_test = X_train_tfidf.transform(X_test_df['prep_v9_text'])

In [20]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(9174, 100000)
(1626, 100000)
(273, 100000)


## Classify

* LightGBM RandomSearch

In [21]:
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgbm

In [23]:
lgbm_clf = lgbm.LGBMClassifier()

lgbm_param_grid = {'learning_rate': [.01, .015, .025, .05, .1],
#                   'Gamma': [.05, .1, .3, .5, .7, .9, 1],
                  'max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
                  'min_child_weight': [1, 3, 5, 7],
                  'subsample': np.linspace(0.6, 1, 5)}

fit_params = {"early_stopping_rounds" : 20,
             "eval_metric" : "multi_error",
             "eval_set" : [(X_train, y_train), (X_val, y_val)]}

# Create a random search object
lgbm_random = RandomizedSearchCV(estimator = lgbm_clf,
                                param_distributions = lgbm_param_grid,
                                n_iter = 20, # n_iters in param combinations
                                scoring='accuracy',
                                n_jobs=-1,
                                cv = 5,
                                refit=True,
                                return_train_score = True,
                                verbose=10)

# Fit to the training data
lgbm_random.fit(X_train, y_train, **fit_params)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 34.8min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed: 50.8min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 68.6min
[Parallel(n_jobs=-1)]: Done  48 out of 100 | elapsed: 92.6min remaining: 100.3min
[Parallel(n_jobs=-1)]: Done  59 out of 100 | elapsed: 100.6min remaining: 69.9min
[Parallel(n_jobs=-1)]: Done  70 out of 100 | elapsed: 111.0min remaining: 47.6min
[Parallel(n_jobs=-1)]: Done  81 out of 100 | elapsed: 148.7min remaining: 34.9min
[Parallel(n_jobs=-1)]: Done  92 out of 100 | elapsed: 157.3min remaining: 13.7min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 172.7min finished


[1]	training's multi_error: 0.842163	training's multi_logloss: 2.10079	valid_1's multi_error: 0.842558	valid_1's multi_logloss: 2.11186
Training until validation scores don't improve for 20 rounds
[2]	training's multi_error: 0.530412	training's multi_logloss: 1.97036	valid_1's multi_error: 0.54797	valid_1's multi_logloss: 1.99263
[3]	training's multi_error: 0.328646	training's multi_logloss: 1.85849	valid_1's multi_error: 0.383149	valid_1's multi_logloss: 1.89053
[4]	training's multi_error: 0.249073	training's multi_logloss: 1.75975	valid_1's multi_error: 0.317343	valid_1's multi_logloss: 1.7999
[5]	training's multi_error: 0.197951	training's multi_logloss: 1.67168	valid_1's multi_error: 0.265683	valid_1's multi_logloss: 1.72034
[6]	training's multi_error: 0.175496	training's multi_logloss: 1.5927	valid_1's multi_error: 0.242312	valid_1's multi_logloss: 1.64932
[7]	training's multi_error: 0.160671	training's multi_logloss: 1.52091	valid_1's multi_error: 0.228167	valid_1's multi_logloss

RandomizedSearchCV(cv=5, estimator=LGBMClassifier(), n_iter=20, n_jobs=-1,
                   param_distributions={'learning_rate': [0.01, 0.015, 0.025,
                                                          0.05, 0.1],
                                        'max_depth': [3, 5, 7, 9, 12, 15, 17,
                                                      25],
                                        'min_child_weight': [1, 3, 5, 7],
                                        'subsample': array([0.6, 0.7, 0.8, 0.9, 1. ])},
                   return_train_score=True, scoring='accuracy', verbose=10)

In [24]:
pred_train = lgbm_random.best_estimator_.predict(X_train)
pred_val = lgbm_random.best_estimator_.predict(X_val)
pred_test = lgbm_random.best_estimator_.predict(X_test)

In [25]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_val, pred_val))
print(accuracy_score(y_test, pred_test))

0.9456071506431218
0.8198031980319803
0.3772893772893773


In [26]:
cv_result_df = pd.DataFrame(lgbm_random.cv_results_)

df_list = []
for i in range(20):
    df_list.append(pd.DataFrame([cv_result_df.loc[i, "params"]]))
    
param_table = pd.concat(df_list)

param_table['mean_test_score'] = cv_result_df['mean_test_score'].values

param_table.sort_values(by='mean_test_score', axis=0)

Unnamed: 0,subsample,min_child_weight,max_depth,learning_rate,mean_test_score
0,0.9,1,3,0.01,0.751581
0,1.0,3,3,0.015,0.763354
0,0.7,7,7,0.01,0.790386
0,0.6,5,25,0.01,0.793112
0,0.9,1,17,0.015,0.797144
0,0.6,1,17,0.015,0.797144
0,0.6,3,15,0.025,0.799871
0,1.0,7,9,0.015,0.800741
0,0.6,3,25,0.025,0.801179
0,0.9,1,15,0.025,0.802269
