## Data Handling

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
np.random.seed(4444)

In [17]:
# Location of the pre-split dataset (relative to this notebook).
SPLIT_CSV_PATH = '../../../../jaeyeun/01_nh_poc/15_split_data_set_and_make_json_for_train_test_set/split_70_15_15.csv'

# Load every row; `dataset_n` and `train_val_test` are used below to carve out the splits.
df = pd.read_csv(SPLIT_CSV_PATH)
df.head()

Unnamed: 0,file_name,label,raw_text,dataset_n,train_val_test
0,R1509261.txt,0,보 도 자 료\nhttp://www.msip.go.kr 보도일시 2015. 9. 4...,0,0
1,R2003733.txt,0,보도일시 2020. 3. 18.(수) 조간(온라인 3. 17. 12:00)부터 보도...,0,0
2,D1507076-1.txt,0,보 도 자 료\nhttp://www.msip.go.kr 보도일시 2015. 7. 1...,0,0
3,R2005031.txt,0,보 도 자 료\n배포일시 2020. 4. 29.(수) 총 4매(본문2) 담당 부서 ...,0,0
4,R2006226.txt,0,<전매체> 2020년 6월 3일(수) 10:00(국무회의 개최시)부터 보도하여 주시...,0,0


* Train: KDI documents only — the 70% split (train)
* Val: KDI documents only — the 15% split (val)
* Test: legal documents only — all 273 (train + val + test combined)

In [3]:
# Boolean masks, computed once and reused for both the index and label selections.
is_primary = df['dataset_n'] == 0
in_train = is_primary & (df['train_val_test'] == 0)
in_val = is_primary & (df['train_val_test'] == 1)
in_test = ~is_primary

# Text of the dataset_n == 0 rows only; this is what the vectorizer is fitted on.
raw_text = df.loc[is_primary, 'raw_text']

# NOTE: these are index *labels* of `df`, not row positions within `raw_text`.
train_idx = df[in_train].index
val_idx = df[in_val].index
test_idx = df[in_test].index

# Targets for each split, aligned with the index objects above.
y_train = df.loc[in_train, 'label']
y_val = df.loc[in_val, 'label']
y_test = df.loc[in_test, 'label']

In [4]:
# Upper bound on the vocabulary size kept by the vectorizer.
n_features = 100000

tfidf_options = dict(
    max_df=0.9,           # drop terms that occur in more than 90% of the documents
    min_df=5,             # drop terms that occur in fewer than 5 documents
    sublinear_tf=True,    # dampen raw term frequency, softening the effect of outliers
    ngram_range=(1, 3),   # unigrams, bigrams and trigrams
    max_features=n_features,
)
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)

In [5]:
# Learn the vocabulary / IDF weights from `raw_text` and vectorize it in one pass.
# Row i of the result corresponds to the i-th entry of `raw_text` (by position).
# NOTE(review): `raw_text` holds every dataset_n == 0 row, so validation rows also
# influence the fitted vocabulary — confirm this is intended.
X_tfidf = tfidf_vectorizer.fit_transform(raw_text)
# Earlier experiments, kept for reference (disabled):
# X_test_tfidf = tfidf_vectorizer.fit_transform(prep_text_test)
# X_test_hash = hash_vectorizer.fit_transform(prep_text_test)

In [6]:
# Sanity check: (number of vectorized documents, number of n-gram features).
X_tfidf.shape

(11073, 100000)

* split x values

In [7]:
# The fit_transform output is a sparse matrix. It is expanded to a dense array
# here so it can be row-indexed into the train / val / test parts, which are
# converted back to sparse form afterwards.
# Caution: with the (11073, 100000) shape reported above, a dense float64 copy
# takes roughly 8.9 GB of memory.
dense_X_tfidf = X_tfidf.toarray()

In [8]:
# `train_idx` / `val_idx` hold index labels of `df`, while the rows of
# `dense_X_tfidf` follow the order of `raw_text` (the dataset_n == 0 subset).
# Translate labels into row positions before slicing so that every feature row
# stays aligned with its label in y_train / y_val.
train_pos = raw_text.index.get_indexer(train_idx)
val_pos = raw_text.index.get_indexer(val_idx)
X_tfidf_train = dense_X_tfidf[train_pos]
X_tfidf_val = dense_X_tfidf[val_pos]

# The test documents (dataset_n != 0) are not part of `raw_text`, so they have
# no rows in `dense_X_tfidf`. Vectorize them with the already-fitted vocabulary
# instead of indexing into the matrix.
X_tfidf_test = tfidf_vectorizer.transform(df.loc[test_idx, 'raw_text']).toarray()

In [9]:
# Report the (rows, features) shape of each dense split: train, val, test.
for split_matrix in (X_tfidf_train, X_tfidf_val, X_tfidf_test):
    print(split_matrix.shape)

(7554, 100000)
(1620, 100000)
(273, 100000)


In [10]:
from scipy import sparse

In [11]:
# Convert each dense split back to compressed-sparse-row form.
X_train_tfidf_csr, X_val_tfidf_csr, X_test_tfidf_csr = (
    sparse.csr_matrix(dense_split)
    for dense_split in (X_tfidf_train, X_tfidf_val, X_tfidf_test)
)

In [12]:
# Confirm the sparse conversions kept the same shapes as the dense splits.
for csr_split in (X_train_tfidf_csr, X_val_tfidf_csr, X_test_tfidf_csr):
    print(csr_split.shape)

(7554, 100000)
(1620, 100000)
(273, 100000)


In [13]:
# Short aliases for the sparse feature matrices used by the classifier cells.
X_train, X_val, X_test = X_train_tfidf_csr, X_val_tfidf_csr, X_test_tfidf_csr

## Classify

* XGBoost RandomSearch

In [14]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

In [15]:
# Base estimator with default settings; the search supplies the tuned hyper-parameters.
xgb_clf = xgb.XGBClassifier()

# Candidate values the randomized search samples from.
xgb_param_grid = {
    'learning_rate': [.01, .015, .025, .05, .1],
    # 'Gamma': [.05, .1, .3, .5, .7, .9, 1],
    'max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
    'min_child_weight': [1, 3, 5, 7],
    'subsample': np.linspace(0.6, 1, 5),
}

# Extra keyword arguments forwarded to the estimator's fit() on every call.
fit_params = dict(
    early_stopping_rounds=20,
    eval_metric="merror",
    eval_set=[(X_train, y_train), (X_val, y_val)],
)

# Randomized search: 20 sampled parameter combinations, cv=5, scored by accuracy,
# refitting the best candidate when the search completes.
xgb_random = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=xgb_param_grid,
    n_iter=20,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    refit=True,
    return_train_score=True,
    verbose=10,
)

# Run the search on the training split.
xgb_random.fit(X_train, y_train, **fit_params)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done  43 out of  64 | elapsed: 13.5min remaining:  6.6min
[Parallel(n_jobs=-1)]: Done  50 out of  64 | elapsed: 13.5min remaining:  3.8min
[Parallel(n_jobs=-1)]: Done  57 out of  64 | elapsed: 13.5min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed: 13.5min remaining:    0.0s


KeyboardInterrupt: 

In [None]:
# Predict every split with the refit best estimator from the search.
best_model = xgb_random.best_estimator_
pred_train, pred_val, pred_test = (
    best_model.predict(features) for features in (X_train, X_val, X_test)
)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy per split, each scored against its own labels.
# Validation predictions must be compared with y_val: y_test belongs to a
# different set of rows and has a different length than pred_val.
print(accuracy_score(y_train, pred_train))
print(accuracy_score(y_val, pred_val))
print(accuracy_score(y_test, pred_test))

In [None]:
# Training accuracy of the best estimator. `pred_train` is the prediction array
# produced above by xgb_random.best_estimator_; the previously referenced
# `pred_rand_best_train` is not defined anywhere in this notebook.
print(accuracy_score(y_train, pred_train))

In [None]:
# Display the estimator that the search refit with the best-scoring parameters.
xgb_random.best_estimator_

In [None]:
# Tabulate the cross-validation results: one row per sampled parameter combination.
hr_random_df = pd.DataFrame(xgb_random.cv_results_)

# Show only the mean validation-fold score next to the parameters that produced it.
hr_random_df[['mean_test_score', 'params']]

In [None]:
# Inspect the parameter dict of the first sampled candidate.
hr_random_df.at[0, "params"]

In [None]:
# Expand the per-candidate parameter dicts into one column per hyper-parameter.
# Building the frame in a single call works for any number of candidates (no
# hard-coded 20), avoids concatenating one-row frames, and gives a unique
# 0..n-1 index instead of an index made entirely of zeros.
hr_random_new = pd.DataFrame(hr_random_df["params"].tolist())

In [None]:
# Mean cross-validation score of each candidate, in candidate order.
a = hr_random_df.loc[:, 'mean_test_score']

In [None]:
# Attach the scores positionally (as a plain array) so index labels are ignored.
hr_random_new['mean_test_score'] = a.to_numpy()

In [None]:
# Display each candidate's hyper-parameters together with its mean CV score.
hr_random_new

In [None]:
# Rank the candidates by mean CV score, ascending (best candidate last).
hr_random_new.sort_values('mean_test_score')