In [182]:
import pandas as pd
import numpy as np

In [76]:
train = pd.read_csv('train.csv')

In [77]:
test = pd.read_csv('test.csv')

In [78]:
from konlpy.tag import Mecab

In [18]:
mc = Mecab()

In [138]:
def mecab_pos_process(data) :
    pos_list = []
    pos_token = []
    
    for doc in data['document'] :
        doc_pos = mc.pos(doc)
        poss = ''
        for pos in doc_pos :
            if pos[1].startswith('V') or pos[1].startswith('N') or pos[1].startswith('MA'):
                poss += f' {pos[0]}'
            else :
                pass
        pos_token.append(poss[1:])
            
    data['pos_token'] = pos_token
    return data

In [218]:
train = mecab_pos_process(train)
test = mecab_pos_process(test)

In [219]:
train['document'] = train.document.str.replace('ㅡㅡ', '화남')
test['document'] = test.document.str.replace('ㅡㅡ', '화남')

In [220]:
train['cnt'] = [mc.morphs(text) for text in train['document']]

In [221]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [222]:
tfidf = TfidfVectorizer(max_features = 1500, lowercase = False)

In [223]:
tfidf_vect = tfidf.fit_transform(train['pos_token'])

In [224]:
tr_result = pd.DataFrame(tfidf_vect.todense(), columns = tfidf.get_feature_names())

In [225]:
te_result = pd.DataFrame(tfidf.transform(test['pos_token']).todense(), columns = tfidf.get_feature_names())

In [226]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool

In [227]:
y = train.label

In [228]:
skf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)

In [230]:
cb_acc = []
cb_pred = np.zeros(test.shape[0])

for i, (tr_idx, val_idx) in enumerate(skf.split(tr_result, y)) :
    tr_x, tr_y = tr_result.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = tr_result.iloc[val_idx], y.iloc[val_idx]
    
    tr_data = Pool(data = tr_x, label = tr_y)
    val_data = Pool(data = val_x, label = val_y)
    
    cb = CatBoostClassifier(random_state = 42, n_estimators = 3000, max_depth = 6, learning_rate = 0.1, use_best_model = True)
    cb.fit(tr_data, eval_set = val_data, early_stopping_rounds = 500, verbose = 1000)
    
    val_pred = cb.predict_proba(val_x)[:, 1]
    val_cls = [1 if p >= 0.5 else 0 for p in val_pred]
    val_acc = accuracy_score(val_y, val_cls)
    cb_acc.append(val_acc)
    print(f'{i + 1}FOLD ACC = {val_acc}\n')
    
    fold_pred = cb.predict_proba(te_result)[:, 1]
    cb_pred += (fold_pred / 10)
print(f'{cb.__class__.__name__}의 10FOLD 평균 정확도는 {np.mean(cb_acc)}이고 편차는 {np.std(cb_acc)}')

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 0.6730334	test: 0.6716686	best: 0.6716686 (0)	total: 7.55ms	remaining: 22.7s
1000:	learn: 0.2102869	test: 0.4280185	best: 0.4263824 (827)	total: 8.27s	remaining: 16.5s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.4263824321
bestIteration = 827

Shrink model to first 828 iterations.
1FOLD ACC = 0.778

0:	learn: 0.6736628	test: 0.6704995	best: 0.6704995 (0)	total: 7.92ms	remaining: 23.7s
1000:	learn: 0.2118764	test: 0.4260571	best: 0.4260483 (947)	total: 8.24s	remaining: 16.5s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.4251656861
bestIteration = 1037

Shrink model to first 1038 iterations.
2FOLD ACC = 0.796

0:	learn: 0.6762765	test: 0.6775324	best: 0.6775324 (0)	total: 8.1ms	remaining: 24.3s
1000:	learn: 0.2099441	test: 0.4332353	best: 0.4313734 (946)	total: 8.25s	remaining: 16.5s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.431373356
bestIteration = 946

Shrink model to first 947 iterations.
3FOLD ACC =

In [231]:
submission = pd.read_csv('sample_submission.csv')

In [232]:
submission['label'] = [1 if p >= 0.5 else 0 for p in cb_pred]

In [233]:
submission.label.value_counts()

0    2770
1    2230
Name: label, dtype: int64

In [202]:
submission.to_csv("0110_cb.csv", index = False)