In [None]:
# Mount Google Drive at /content/drive; later cells read the competition CSVs
# and the stopword list from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Environment check and MeCab installation


In [None]:
# Show the GPU attached to this runtime (shell command run through IPython's `!` escape).
!nvidia-smi

Sat Jul 31 11:40:48 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# System prerequisites for KoNLPy: a C++ toolchain, a JDK (Java 1.7+ is required),
# the Python 3 headers and curl for the MeCab installer below.
# `-y` answers apt's confirmation prompt so the cell can run unattended.
!sudo apt-get install -y g++ openjdk-8-jdk python3-dev curl
# `%pip` installs into the environment of the running kernel.
%pip install konlpy
# Build and install MeCab-ko and its dictionary with the installer script from the KoNLPy repository.
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

## Install XGBoost and Optuna

In [None]:
# Replace the preinstalled xgboost with the current release.
# `-y` skips pip's interactive "Proceed (y/n)?" prompt, which would otherwise
# block an unattended "Run all".
%pip uninstall -y xgboost
%pip install xgboost


In [None]:
# Optuna drives the hyperparameter search further down.
%pip install optuna

## PREPROCESSING

In [None]:
import pandas as pd
import re
import numpy as np
from konlpy.tag import Mecab
mecab = Mecab()

import optuna 
from optuna import Trial, visualization
from optuna.samplers import TPESampler

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score, log_loss, f1_score

In [None]:
import inspect  # only needed here, to detect which bad-line keyword this pandas supports

#################
STOPWORDSPATH = "/content/drive/MyDrive/Colab Notebooks/stopwords.txt"
#################
# Directory on the mounted Drive that holds the competition files.
DATA_DIR = "/content/drive/MyDrive/DACON/topic_classification"

# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour of
# `on_bad_lines`. Pick whichever keyword the installed pandas accepts; both
# settings drop malformed rows and emit a warning for each one.
if "on_bad_lines" in inspect.signature(pd.read_csv).parameters:
    _BAD_LINE_KWARGS = {"on_bad_lines": "warn"}
else:
    _BAD_LINE_KWARGS = {"error_bad_lines": False}


def _read_competition_csv(filename):
    """Read ``filename`` from DATA_DIR, skipping rows that cannot be parsed."""
    return pd.read_csv(f"{DATA_DIR}/{filename}", **_BAD_LINE_KWARGS)


## Import DATA, submission file
train = _read_competition_csv("train_data.csv")
test = _read_competition_csv("test_data.csv")
submission = _read_competition_csv("sample_submission.csv")
topic_dict = _read_competition_csv("topic_dict.csv")

In [None]:
# Characters that clean_punc surrounds with spaces so they become separate tokens.
# The backslash before "×" is written as "\\" because "\×" is not a valid escape
# sequence (newer Python versions warn about it); the resulting string is unchanged.
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

# Symbol -> plain-text substitute, applied by clean_punc before the padding step.
# (The original literal listed the '“' key twice with the same value; it is kept once.)
punct_mapping = {
    "‘": "'",
    "₹": "e",
    "´": "'",
    "°": "",
    "€": "e",
    "™": "tm",
    "√": " sqrt ",
    "×": "x",
    "²": "2",
    "—": "-",
    "–": "-",
    "’": "'",
    "_": "-",
    "`": "'",
    '“': '"',
    '”': '"',
    "£": "e",
    '∞': 'infinity',
    'θ': 'theta',
    '÷': '/',
    'α': 'alpha',
    '•': '.',
    'à': 'a',
    '−': '-',
    'β': 'beta',
    '∅': '',
    '³': '3',
    'π': 'pi',
}

In [None]:
def clean_punc(text, punct, mapping):
    """Normalise symbols in ``text`` and isolate punctuation with spaces.

    Args:
        text: The string to clean.
        punct: Iterable of single characters; every occurrence of each one is
            wrapped in spaces. A character listed twice is padded twice.
        mapping: Dict of ``{symbol: substitute}`` applied before the padding.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
    """
    # Step 1: swap special symbols for their plain-text substitutes.
    for symbol, substitute in mapping.items():
        text = text.replace(symbol, substitute)

    # Step 2: put a space on both sides of every punctuation character.
    for mark in punct:
        text = text.replace(mark, ' ' + mark + ' ')

    # Step 3: zero-width space, ellipsis, BOM and two stray Devanagari words.
    leftovers = (
        ('\u200b', ' '),
        ('…', ' ... '),
        ('\ufeff', ''),
        ('करना', ''),
        ('है', ''),
    )
    for token, substitute in leftovers:
        text = text.replace(token, substitute)

    return text.strip()

In [None]:
# Run the punctuation normalisation over every title of both splits.
cleaned_train_corpus = [
    clean_punc(title, punct, punct_mapping) for title in train['title']
]
cleaned_test_corpus = [
    clean_punc(title, punct, punct_mapping) for title in test['title']
]

In [None]:
# Ordered (needle, replacement) pairs that spell out the single-character hanja
# abbreviations used in headlines. Order matters: "外人" has to be replaced
# before the bare "人".
_HEADLINE_TERM_REPLACEMENTS = (
    ("外人", "외국인"),
    ("日", "일본"),
    ("美", "미국"),
    ("北", "북한"),
    ("英", "영국"),
    ("中", "중국"),
    ("與", "여당"),
    ("靑", "청와대"),
    ("野", "야당"),
    ("伊", "이탈리아"),
    ("韓", "한국"),
    ("南", "한국"),
    ("獨", "독일"),
    ("佛", "프랑스"),
    ("檢", "검찰"),
    ("銀", "은행"),
    ("亞", "아시아"),
    ("人", "사람"),
    ("孫", "손혜원"),
    ("企", "기업"),
    ("前", "이전"),
    ("反", "반대"),
    ("安", "안철수"),
    ("展", "전시회"),
    ("故", "사망"),
    ("文", "문재인"),
    ("新", "새로운"),
    ("曺", "조국"),
    ("朴", "박정치인"),
    ("株", "주식"),
    ("男", "남자"),
    ("硏", "연구"),
    ("車", "자동차"),
    ("軍", "군대"),
    ("重", "중공업"),
)

# "R&D" with optional whitespace around the ampersand. clean_punc pads "&" with
# spaces, so by the time titles reach clean_text the term reads "R & D" and a
# literal "R&D" replacement never matched. The lookarounds keep the pattern from
# firing inside longer Latin words.
_RND_PATTERN = re.compile(r'(?<![A-Za-z])R\s*&\s*D(?![A-Za-z])')


def clean_text(texts):
    """Normalise headline strings for morphological analysis.

    For every title: expand hanja abbreviations and a couple of fixed terms to
    Korean words, strip punctuation and digits, lower-case, collapse whitespace,
    drop HTML-like tags and finally remove any remaining CJK ideographs.

    Args:
        texts: Iterable of ``str`` titles. It is not modified.

    Returns:
        A new list with one cleaned string per input title.
    """
    corpus = []
    for raw_title in texts:
        # Work on a local copy so the caller's list is left untouched.
        text = raw_title
        for needle, replacement in _HEADLINE_TERM_REPLACEMENTS:
            text = text.replace(needle, replacement)
        text = _RND_PATTERN.sub("연구개발", text)
        text = text.replace("문정부", "문재인정부")

        review = re.sub(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"]', '', str(text))  # remove punctuation
        review = re.sub(r'\d+', '', review)  # remove numbers
        review = review.lower()
        review = re.sub(r'\s+', ' ', review)  # collapse whitespace runs
        review = re.sub(r'<[^>]+>', '', review)  # remove HTML-like tags
        # Tag removal can leave double spaces; collapse again and trim both ends.
        review = re.sub(r'\s+', ' ', review).strip()
        review = re.sub("[一-龥]", '', review)  # drop ideographs that were not mapped above

        corpus.append(review)
    return corpus

In [None]:
# Apply the text normalisation to the punctuation-cleaned titles (train first, then test).
basic_preprocessed_train_corpus, basic_preprocessed_test_corpus = (
    clean_text(cleaned_train_corpus),
    clean_text(cleaned_test_corpus),
)

In [None]:
# Load the stopword list, one entry per line.
# NOTE(review): STOPWORDSPATH is defined in the data-loading cell but this cell
# reads a different hard-coded path — confirm which file is the intended one.
with open("/content/drive/MyDrive/공민표/xgboost/stopwords.txt", encoding="utf-8") as f:
    stopwords = [line.strip() for line in f]

# Hashed copies for constant-time membership tests inside the token loop.
_STOPWORD_SET = frozenset(stopwords)

# POS tags to keep: common noun, proper noun, verb, adjective, positive copula,
# negative copula, determiner, general adverb, noun prefix, foreign word,
# Chinese character.
_KEPT_POS_TAGS = frozenset(
    ["NNG", "NNP", "VV", "VA", "VCP", "VCN", "MM", "MAG", "XPN", "SL", "SH"]
)


def _filter_tokens(corpus):
    """Return one space-joined string of kept morphemes per sentence in ``corpus``.

    Each sentence is tagged with ``mecab.pos``; a morpheme is kept only when its
    tag is in ``_KEPT_POS_TAGS`` and the morpheme itself is not a stopword.
    """
    filtered = []
    for sentence in corpus:
        kept = [
            morpheme
            for morpheme, pos_tag in mecab.pos(sentence)
            if morpheme not in _STOPWORD_SET and pos_tag in _KEPT_POS_TAGS
        ]
        filtered.append(' '.join(kept))
    return filtered


removed_stopword_train_corpus = _filter_tokens(basic_preprocessed_train_corpus)
removed_stopword_test_corpus = _filter_tokens(basic_preprocessed_test_corpus)

In [None]:
# Print the first ten filtered titles of the train split, a blank gap, then
# the first ten of the test split.
preview_corpora = (removed_stopword_train_corpus, removed_stopword_test_corpus)
for position, corpus in enumerate(preview_corpora):
    if position > 0:
        print('\n')
    for row in range(10):
        print(corpus[row])

인천 핀란드 항공기 결항 휴가철 여행객 분통
실리콘밸리 넘어서 구글 조원 미국 전역 거점
이란 외무 긴장 완화 해결책 미국 경제 전쟁 멈추
nyt 클린턴 측근 한국 기업 특수 관계 조명 공과 종합
시진핑 트럼프 중미 무역 협상 조속 타결 희망
팔레스타인 가 세 소년 이스라엘 총격 사망
인도 파키스탄 공습 테러 캠프 폭격 종 합보
미국 대선 tv 토론 음담패설 만회 실패 트럼프 사과 대신 빌클린턴 공격 역효과
푸틴 한반도 상황 진전 방안 김정은 위원장 논의
특검 면죄부 받 트럼프 스캔들 보도 언론 맹공 국민 적


유튜브 내달 일 크리에이터 지원 공간 운영
어버이날 맑 남부 지방 옅 황사
내년 국가 rd 평가 때 논문 건수 반영 않
김명자 신임 과 총 회장 원로 젊 과학자 지혜
회색 인간 작가 김동식 심 새 소설 집 출간
야외 생 방송 하 액션 캠 전용 요금제
월드컵 태극전사 강 전 초기 레오강 입 종합
미세먼지 속 출근길
왓츠 앱 원 레바논 민심 총리 사퇴 종 합보
베트남 경제 고성장 지속 분기 gdp 성장


In [None]:
# Shorter aliases for the filtered corpora, plus the label vector as an array.
train_text, test_text = removed_stopword_train_corpus, removed_stopword_test_corpus
train_label = np.asarray(train["topic_idx"])

# Store the filtered titles next to the raw ones; the vectorizer reads this column.
train["clear_title"] = train_text
test["clear_title"] = test_text

In [None]:
def split(text):
    """Tokenize ``text`` on runs of whitespace and return the list of tokens."""
    return text.split()

# Titles are already space-joined morphemes, so tokenization is a plain
# whitespace split. token_pattern is set to None explicitly: it is ignored when
# a tokenizer is supplied, and leaving the default in place triggers the
# "token_pattern will not be used" warning.
tfidf_vect = TfidfVectorizer(tokenizer=split, token_pattern=None)
# Learn the vocabulary and IDF weights from the training titles only, then
# reuse them for the test titles.
tfidf_matrix_train = tfidf_vect.fit_transform(train['clear_title'])
tfidf_matrix_test = tfidf_vect.transform(test['clear_title'])


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



## XGBoost hyperparameter search (Optuna)

In [None]:
rand_num = 977


def objective(trial: Trial) -> float:
    """Optuna objective: train one XGBoost model and score it on a hold-out split.

    Samples a hyperparameter set from ``trial``, fits an ``XGBClassifier`` on
    80% of the TF-IDF training matrix with early stopping, and evaluates it on
    the remaining 20%.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Multiclass log loss on the validation split (lower is better).
    """
    xgb_params = {
        "random_state": rand_num,
        "nthread": -1,
        "learning_rate": trial.suggest_float("learning_rate", 0.1, 0.3),
        # Number of boosting rounds; too many leads to overfitting.
        "n_estimators": trial.suggest_int('n_estimators', 3000, 10000),
        "objective": 'multi:softmax',
        # L2 regularisation (controls overfitting).
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 9e-2),
        # Maximum depth of each tree.
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "gamma": trial.suggest_int("gamma", 0, 10),
        # Fraction of features sampled at random for each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        # Fraction of rows sampled per tree to control overfitting (0.5 = half the data).
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        # Minimum amount of data required in a leaf node (controls overfitting).
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }

    # Fixed seed: every trial is scored on the same train/validation split.
    X_train, X_valid, y_train, y_valid = train_test_split(
        tfidf_matrix_train,
        train['topic_idx'],
        random_state=rand_num,
        test_size=0.2,
    )

    model = XGBClassifier(**xgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=50,
        verbose=100,
    )

    valid_proba = model.predict_proba(X_valid)
    # Pass the label set explicitly so the probability columns stay aligned with
    # the classes even if the validation split happens to miss one of them.
    return log_loss(y_valid, valid_proba, labels=model.classes_)

In [None]:
# TPE sampler seeded with rand_num, handed to a study that minimises the
# objective's validation log loss.
sampler = TPESampler(seed=rand_num)
study = optuna.create_study(
    direction="minimize",
    sampler=sampler,
    study_name="xgb_parameter_opt",
)

# Run 20 trials, then report the best score and the parameters that produced it.
study.optimize(objective, n_trials=20)
print(f"Best Score: {study.best_value}")
print(f"Best trial: {study.best_trial.params}")

[32m[I 2021-07-31 08:20:06,299][0m A new study created in memory with name: xgb_parameter_opt[0m




[0]	validation_0-mlogloss:1.85392	validation_1-mlogloss:1.85324
[1]	validation_0-mlogloss:1.77875	validation_1-mlogloss:1.77874
[2]	validation_0-mlogloss:1.72312	validation_1-mlogloss:1.72350
[3]	validation_0-mlogloss:1.67653	validation_1-mlogloss:1.67750
[4]	validation_0-mlogloss:1.63140	validation_1-mlogloss:1.63317
[5]	validation_0-mlogloss:1.59045	validation_1-mlogloss:1.59143
[6]	validation_0-mlogloss:1.55271	validation_1-mlogloss:1.55442
[7]	validation_0-mlogloss:1.51951	validation_1-mlogloss:1.52229
[8]	validation_0-mlogloss:1.48954	validation_1-mlogloss:1.49221
[9]	validation_0-mlogloss:1.46229	validation_1-mlogloss:1.46504
[10]	validation_0-mlogloss:1.43695	validation_1-mlogloss:1.44118
[11]	validation_0-mlogloss:1.41501	validation_1-mlogloss:1.41908
[12]	validation_0-mlogloss:1.39278	validation_1-mlogloss:1.39687
[13]	validation_0-mlogloss:1.37489	validation_1-mlogloss:1.37871
[14]	validation_0-mlogloss:1.35690	validation_1-mlogloss:1.36108
[15]	validation_0-mlogloss:1.33931	

[32m[I 2021-07-31 08:33:12,659][0m Trial 0 finished with value: 0.5675616140138935 and parameters: {'learning_rate': 0.11169641469356031, 'n_estimators': 5742, 'reg_lambda': 0.005519740604278561, 'max_depth': 6, 'gamma': 4, 'colsample_bytree': 0.5984680356993906, 'subsample': 0.686109724295435, 'min_child_weight': 1}. Best is trial 0 with value: 0.5675616140138935.[0m




[0]	validation_0-mlogloss:1.83969	validation_1-mlogloss:1.84041
[1]	validation_0-mlogloss:1.75586	validation_1-mlogloss:1.75725
[2]	validation_0-mlogloss:1.69444	validation_1-mlogloss:1.69599
[3]	validation_0-mlogloss:1.64411	validation_1-mlogloss:1.64636
[4]	validation_0-mlogloss:1.59442	validation_1-mlogloss:1.59738
[5]	validation_0-mlogloss:1.55164	validation_1-mlogloss:1.55411
[6]	validation_0-mlogloss:1.51111	validation_1-mlogloss:1.51465
[7]	validation_0-mlogloss:1.47547	validation_1-mlogloss:1.47967
[8]	validation_0-mlogloss:1.44487	validation_1-mlogloss:1.44909
[9]	validation_0-mlogloss:1.41572	validation_1-mlogloss:1.42030
[10]	validation_0-mlogloss:1.39007	validation_1-mlogloss:1.39508
[11]	validation_0-mlogloss:1.36746	validation_1-mlogloss:1.37300
[12]	validation_0-mlogloss:1.34390	validation_1-mlogloss:1.34935
[13]	validation_0-mlogloss:1.32574	validation_1-mlogloss:1.33050
[14]	validation_0-mlogloss:1.30727	validation_1-mlogloss:1.31218
[15]	validation_0-mlogloss:1.28950	

[32m[I 2021-07-31 08:43:57,904][0m Trial 1 finished with value: 0.5665515024989071 and parameters: {'learning_rate': 0.1347035836484785, 'n_estimators': 9694, 'reg_lambda': 0.08946131175705235, 'max_depth': 6, 'gamma': 4, 'colsample_bytree': 0.5352668062930858, 'subsample': 0.6626777120705869, 'min_child_weight': 1}. Best is trial 1 with value: 0.5665515024989071.[0m




[0]	validation_0-mlogloss:1.83092	validation_1-mlogloss:1.83019
[1]	validation_0-mlogloss:1.73639	validation_1-mlogloss:1.73435
[2]	validation_0-mlogloss:1.67161	validation_1-mlogloss:1.66865
[3]	validation_0-mlogloss:1.61834	validation_1-mlogloss:1.61572
[4]	validation_0-mlogloss:1.56736	validation_1-mlogloss:1.56546
[5]	validation_0-mlogloss:1.52293	validation_1-mlogloss:1.52076
[6]	validation_0-mlogloss:1.48064	validation_1-mlogloss:1.47998
[7]	validation_0-mlogloss:1.44499	validation_1-mlogloss:1.44497
[8]	validation_0-mlogloss:1.41465	validation_1-mlogloss:1.41376
[9]	validation_0-mlogloss:1.38633	validation_1-mlogloss:1.38631
[10]	validation_0-mlogloss:1.36120	validation_1-mlogloss:1.36084
[11]	validation_0-mlogloss:1.33982	validation_1-mlogloss:1.33943
[12]	validation_0-mlogloss:1.31736	validation_1-mlogloss:1.31758
[13]	validation_0-mlogloss:1.29924	validation_1-mlogloss:1.29957
[14]	validation_0-mlogloss:1.28117	validation_1-mlogloss:1.28167
[15]	validation_0-mlogloss:1.26415	

[32m[I 2021-07-31 08:58:43,942][0m Trial 2 finished with value: 0.6446038002242807 and parameters: {'learning_rate': 0.17792983053063136, 'n_estimators': 7037, 'reg_lambda': 0.008646390181712755, 'max_depth': 5, 'gamma': 7, 'colsample_bytree': 0.5029423578575457, 'subsample': 0.7629041391307959, 'min_child_weight': 5}. Best is trial 1 with value: 0.5665515024989071.[0m




[0]	validation_0-mlogloss:1.86159	validation_1-mlogloss:1.86087
[1]	validation_0-mlogloss:1.79283	validation_1-mlogloss:1.79066
[2]	validation_0-mlogloss:1.74760	validation_1-mlogloss:1.74644
[3]	validation_0-mlogloss:1.70813	validation_1-mlogloss:1.70661
[4]	validation_0-mlogloss:1.67092	validation_1-mlogloss:1.66951
[5]	validation_0-mlogloss:1.63735	validation_1-mlogloss:1.63573
[6]	validation_0-mlogloss:1.60675	validation_1-mlogloss:1.60510
[7]	validation_0-mlogloss:1.57882	validation_1-mlogloss:1.57786
[8]	validation_0-mlogloss:1.55563	validation_1-mlogloss:1.55359
[9]	validation_0-mlogloss:1.53432	validation_1-mlogloss:1.53226
[10]	validation_0-mlogloss:1.51311	validation_1-mlogloss:1.51157
[11]	validation_0-mlogloss:1.49477	validation_1-mlogloss:1.49278
[12]	validation_0-mlogloss:1.47704	validation_1-mlogloss:1.47440
[13]	validation_0-mlogloss:1.46157	validation_1-mlogloss:1.45903
[14]	validation_0-mlogloss:1.44580	validation_1-mlogloss:1.44326
[15]	validation_0-mlogloss:1.43178	

[32m[I 2021-07-31 09:10:30,920][0m Trial 3 finished with value: 0.664787510030443 and parameters: {'learning_rate': 0.14984611948066903, 'n_estimators': 4742, 'reg_lambda': 0.009738520398986625, 'max_depth': 3, 'gamma': 8, 'colsample_bytree': 0.6125690514972519, 'subsample': 0.7440972234337493, 'min_child_weight': 4}. Best is trial 1 with value: 0.5665515024989071.[0m




[0]	validation_0-mlogloss:1.71323	validation_1-mlogloss:1.71271
[1]	validation_0-mlogloss:1.56967	validation_1-mlogloss:1.56823
[2]	validation_0-mlogloss:1.47539	validation_1-mlogloss:1.47492
[3]	validation_0-mlogloss:1.40261	validation_1-mlogloss:1.40320
[4]	validation_0-mlogloss:1.33982	validation_1-mlogloss:1.34227
[5]	validation_0-mlogloss:1.28534	validation_1-mlogloss:1.28908
[6]	validation_0-mlogloss:1.23757	validation_1-mlogloss:1.24336
[7]	validation_0-mlogloss:1.19656	validation_1-mlogloss:1.20410
[8]	validation_0-mlogloss:1.16147	validation_1-mlogloss:1.16992
[9]	validation_0-mlogloss:1.13010	validation_1-mlogloss:1.13990
[10]	validation_0-mlogloss:1.10301	validation_1-mlogloss:1.11372
[11]	validation_0-mlogloss:1.07930	validation_1-mlogloss:1.08977
[12]	validation_0-mlogloss:1.05693	validation_1-mlogloss:1.06788
[13]	validation_0-mlogloss:1.03676	validation_1-mlogloss:1.04818
[14]	validation_0-mlogloss:1.01800	validation_1-mlogloss:1.03057
[15]	validation_0-mlogloss:1.00051	

[32m[I 2021-07-31 09:15:27,108][0m Trial 4 finished with value: 0.633941211503699 and parameters: {'learning_rate': 0.2538957111278106, 'n_estimators': 4553, 'reg_lambda': 0.08010238382110212, 'max_depth': 9, 'gamma': 4, 'colsample_bytree': 0.5819737076182503, 'subsample': 0.9916851419357203, 'min_child_weight': 9}. Best is trial 1 with value: 0.5665515024989071.[0m




[0]	validation_0-mlogloss:1.79702	validation_1-mlogloss:1.79501
[1]	validation_0-mlogloss:1.69778	validation_1-mlogloss:1.69623
[2]	validation_0-mlogloss:1.62893	validation_1-mlogloss:1.62859
[3]	validation_0-mlogloss:1.57576	validation_1-mlogloss:1.57561
[4]	validation_0-mlogloss:1.52618	validation_1-mlogloss:1.52608
[5]	validation_0-mlogloss:1.48409	validation_1-mlogloss:1.48369
[6]	validation_0-mlogloss:1.44765	validation_1-mlogloss:1.44721
[7]	validation_0-mlogloss:1.41604	validation_1-mlogloss:1.41796
[8]	validation_0-mlogloss:1.38822	validation_1-mlogloss:1.39111
[9]	validation_0-mlogloss:1.36267	validation_1-mlogloss:1.36611
[10]	validation_0-mlogloss:1.33849	validation_1-mlogloss:1.34233
[11]	validation_0-mlogloss:1.31665	validation_1-mlogloss:1.32142
[12]	validation_0-mlogloss:1.29655	validation_1-mlogloss:1.30140
[13]	validation_0-mlogloss:1.27785	validation_1-mlogloss:1.28289
[14]	validation_0-mlogloss:1.26070	validation_1-mlogloss:1.26652
[15]	validation_0-mlogloss:1.24424	

[32m[I 2021-07-31 09:22:07,852][0m Trial 5 finished with value: 0.5186562176549009 and parameters: {'learning_rate': 0.22174263767666802, 'n_estimators': 8842, 'reg_lambda': 0.03682855362340644, 'max_depth': 4, 'gamma': 1, 'colsample_bytree': 0.7013365325453821, 'subsample': 0.9769213047446981, 'min_child_weight': 2}. Best is trial 5 with value: 0.5186562176549009.[0m




[0]	validation_0-mlogloss:1.78279	validation_1-mlogloss:1.78067
[1]	validation_0-mlogloss:1.68649	validation_1-mlogloss:1.68618
[2]	validation_0-mlogloss:1.61536	validation_1-mlogloss:1.61420
[3]	validation_0-mlogloss:1.55988	validation_1-mlogloss:1.55937
[4]	validation_0-mlogloss:1.51409	validation_1-mlogloss:1.51258
[5]	validation_0-mlogloss:1.47433	validation_1-mlogloss:1.47360
[6]	validation_0-mlogloss:1.43953	validation_1-mlogloss:1.43847
[7]	validation_0-mlogloss:1.40922	validation_1-mlogloss:1.40887
[8]	validation_0-mlogloss:1.38240	validation_1-mlogloss:1.38341
[9]	validation_0-mlogloss:1.35807	validation_1-mlogloss:1.35957
[10]	validation_0-mlogloss:1.33496	validation_1-mlogloss:1.33739
[11]	validation_0-mlogloss:1.31447	validation_1-mlogloss:1.31648
[12]	validation_0-mlogloss:1.29509	validation_1-mlogloss:1.29694
[13]	validation_0-mlogloss:1.27734	validation_1-mlogloss:1.27903
[14]	validation_0-mlogloss:1.26016	validation_1-mlogloss:1.26264
[15]	validation_0-mlogloss:1.24419	

[32m[I 2021-07-31 09:34:05,792][0m Trial 6 finished with value: 0.6772237389245013 and parameters: {'learning_rate': 0.2284146784377603, 'n_estimators': 9056, 'reg_lambda': 0.03456736045175999, 'max_depth': 4, 'gamma': 9, 'colsample_bytree': 0.9775295325755127, 'subsample': 0.6380682020110822, 'min_child_weight': 5}. Best is trial 5 with value: 0.5186562176549009.[0m




[0]	validation_0-mlogloss:1.71573	validation_1-mlogloss:1.71442
[1]	validation_0-mlogloss:1.58292	validation_1-mlogloss:1.58233
[2]	validation_0-mlogloss:1.49107	validation_1-mlogloss:1.49022
[3]	validation_0-mlogloss:1.42159	validation_1-mlogloss:1.42254
[4]	validation_0-mlogloss:1.36569	validation_1-mlogloss:1.36811
[5]	validation_0-mlogloss:1.31818	validation_1-mlogloss:1.32439
[6]	validation_0-mlogloss:1.27656	validation_1-mlogloss:1.28521
[7]	validation_0-mlogloss:1.24109	validation_1-mlogloss:1.25126
[8]	validation_0-mlogloss:1.20870	validation_1-mlogloss:1.22077
[9]	validation_0-mlogloss:1.18047	validation_1-mlogloss:1.19460
[10]	validation_0-mlogloss:1.15491	validation_1-mlogloss:1.17028
[11]	validation_0-mlogloss:1.13154	validation_1-mlogloss:1.14979
[12]	validation_0-mlogloss:1.10942	validation_1-mlogloss:1.12884
[13]	validation_0-mlogloss:1.08855	validation_1-mlogloss:1.10865
[14]	validation_0-mlogloss:1.06972	validation_1-mlogloss:1.09250
[15]	validation_0-mlogloss:1.05170	

[32m[I 2021-07-31 09:38:34,195][0m Trial 7 finished with value: 0.5365039756225329 and parameters: {'learning_rate': 0.2788128382362757, 'n_estimators': 7689, 'reg_lambda': 0.053040599876820894, 'max_depth': 6, 'gamma': 2, 'colsample_bytree': 0.8015673207492311, 'subsample': 0.5593663080184921, 'min_child_weight': 1}. Best is trial 5 with value: 0.5186562176549009.[0m




[0]	validation_0-mlogloss:1.80914	validation_1-mlogloss:1.80625
[1]	validation_0-mlogloss:1.71714	validation_1-mlogloss:1.71664
[2]	validation_0-mlogloss:1.64850	validation_1-mlogloss:1.64656
[3]	validation_0-mlogloss:1.59524	validation_1-mlogloss:1.59354
[4]	validation_0-mlogloss:1.55144	validation_1-mlogloss:1.54939
[5]	validation_0-mlogloss:1.51218	validation_1-mlogloss:1.50974
[6]	validation_0-mlogloss:1.47767	validation_1-mlogloss:1.47602
[7]	validation_0-mlogloss:1.44728	validation_1-mlogloss:1.44711
[8]	validation_0-mlogloss:1.42119	validation_1-mlogloss:1.42003
[9]	validation_0-mlogloss:1.39727	validation_1-mlogloss:1.39654
[10]	validation_0-mlogloss:1.37494	validation_1-mlogloss:1.37476
[11]	validation_0-mlogloss:1.35457	validation_1-mlogloss:1.35468
[12]	validation_0-mlogloss:1.33573	validation_1-mlogloss:1.33505
[13]	validation_0-mlogloss:1.31830	validation_1-mlogloss:1.31852
[14]	validation_0-mlogloss:1.30150	validation_1-mlogloss:1.30289
[15]	validation_0-mlogloss:1.28586	

[32m[I 2021-07-31 09:47:05,762][0m Trial 8 finished with value: 0.6358304218461248 and parameters: {'learning_rate': 0.19586207635474406, 'n_estimators': 4846, 'reg_lambda': 0.0872988038646662, 'max_depth': 4, 'gamma': 7, 'colsample_bytree': 0.9644319244715236, 'subsample': 0.6048316723407505, 'min_child_weight': 3}. Best is trial 5 with value: 0.5186562176549009.[0m




[0]	validation_0-mlogloss:1.75757	validation_1-mlogloss:1.75436
[1]	validation_0-mlogloss:1.63517	validation_1-mlogloss:1.63409
[2]	validation_0-mlogloss:1.55430	validation_1-mlogloss:1.55290
[3]	validation_0-mlogloss:1.48744	validation_1-mlogloss:1.48818
[4]	validation_0-mlogloss:1.43412	validation_1-mlogloss:1.43600
[5]	validation_0-mlogloss:1.38714	validation_1-mlogloss:1.38920
[6]	validation_0-mlogloss:1.34799	validation_1-mlogloss:1.35201
[7]	validation_0-mlogloss:1.31354	validation_1-mlogloss:1.31839
[8]	validation_0-mlogloss:1.28288	validation_1-mlogloss:1.28898
[9]	validation_0-mlogloss:1.25528	validation_1-mlogloss:1.26277
[10]	validation_0-mlogloss:1.23046	validation_1-mlogloss:1.23935
[11]	validation_0-mlogloss:1.20715	validation_1-mlogloss:1.21708
[12]	validation_0-mlogloss:1.18610	validation_1-mlogloss:1.19748
[13]	validation_0-mlogloss:1.16641	validation_1-mlogloss:1.17923
[14]	validation_0-mlogloss:1.14818	validation_1-mlogloss:1.16170
[15]	validation_0-mlogloss:1.13029	

[32m[I 2021-07-31 09:52:15,289][0m Trial 9 finished with value: 0.5365885396584725 and parameters: {'learning_rate': 0.22166499713334187, 'n_estimators': 7343, 'reg_lambda': 0.03239049993442917, 'max_depth': 6, 'gamma': 2, 'colsample_bytree': 0.9007957566921302, 'subsample': 0.7250594402660778, 'min_child_weight': 3}. Best is trial 5 with value: 0.5186562176549009.[0m




[0]	validation_0-mlogloss:1.66023	validation_1-mlogloss:1.66210
[1]	validation_0-mlogloss:1.50651	validation_1-mlogloss:1.50850
[2]	validation_0-mlogloss:1.40557	validation_1-mlogloss:1.41176
[3]	validation_0-mlogloss:1.33217	validation_1-mlogloss:1.34188
[4]	validation_0-mlogloss:1.26713	validation_1-mlogloss:1.28076
[5]	validation_0-mlogloss:1.21304	validation_1-mlogloss:1.22730
[6]	validation_0-mlogloss:1.16665	validation_1-mlogloss:1.18380
[7]	validation_0-mlogloss:1.12751	validation_1-mlogloss:1.14794
[8]	validation_0-mlogloss:1.09384	validation_1-mlogloss:1.11729
[9]	validation_0-mlogloss:1.06403	validation_1-mlogloss:1.08806
[10]	validation_0-mlogloss:1.03684	validation_1-mlogloss:1.06304
[11]	validation_0-mlogloss:1.01258	validation_1-mlogloss:1.03938
[12]	validation_0-mlogloss:0.98982	validation_1-mlogloss:1.01940
[13]	validation_0-mlogloss:0.97018	validation_1-mlogloss:1.00143
[14]	validation_0-mlogloss:0.95138	validation_1-mlogloss:0.98473
[15]	validation_0-mlogloss:0.93473	

[32m[I 2021-07-31 09:54:47,051][0m Trial 10 finished with value: 0.5879818419749312 and parameters: {'learning_rate': 0.2954979646344742, 'n_estimators': 8622, 'reg_lambda': 0.06365707724165476, 'max_depth': 9, 'gamma': 0, 'colsample_bytree': 0.7090156552481182, 'subsample': 0.9938108764326694, 'min_child_weight': 8}. Best is trial 5 with value: 0.5186562176549009.[0m




[0]	validation_0-mlogloss:1.66121	validation_1-mlogloss:1.66050
[1]	validation_0-mlogloss:1.51044	validation_1-mlogloss:1.51399
[2]	validation_0-mlogloss:1.40965	validation_1-mlogloss:1.41504
[3]	validation_0-mlogloss:1.33503	validation_1-mlogloss:1.34229
[4]	validation_0-mlogloss:1.27191	validation_1-mlogloss:1.28185
[5]	validation_0-mlogloss:1.21964	validation_1-mlogloss:1.23495
[6]	validation_0-mlogloss:1.17247	validation_1-mlogloss:1.19151
[7]	validation_0-mlogloss:1.13343	validation_1-mlogloss:1.15503
[8]	validation_0-mlogloss:1.10046	validation_1-mlogloss:1.12543
[9]	validation_0-mlogloss:1.07134	validation_1-mlogloss:1.09957
[10]	validation_0-mlogloss:1.04428	validation_1-mlogloss:1.07617
[11]	validation_0-mlogloss:1.01899	validation_1-mlogloss:1.05320
[12]	validation_0-mlogloss:0.99559	validation_1-mlogloss:1.03192
[13]	validation_0-mlogloss:0.97412	validation_1-mlogloss:1.01125
[14]	validation_0-mlogloss:0.95456	validation_1-mlogloss:0.99403
[15]	validation_0-mlogloss:0.93628	

[32m[I 2021-07-31 09:58:55,589][0m Trial 11 finished with value: 0.5349734739332677 and parameters: {'learning_rate': 0.29801108798231546, 'n_estimators': 7883, 'reg_lambda': 0.054145141866170886, 'max_depth': 8, 'gamma': 1, 'colsample_bytree': 0.8217162064051583, 'subsample': 0.5015108707980034, 'min_child_weight': 1}. Best is trial 5 with value: 0.5186562176549009.[0m




[0]	validation_0-mlogloss:1.66121	validation_1-mlogloss:1.66165
[1]	validation_0-mlogloss:1.51096	validation_1-mlogloss:1.51212
[2]	validation_0-mlogloss:1.41000	validation_1-mlogloss:1.41462
[3]	validation_0-mlogloss:1.33382	validation_1-mlogloss:1.34311
[4]	validation_0-mlogloss:1.27103	validation_1-mlogloss:1.28373
[5]	validation_0-mlogloss:1.21703	validation_1-mlogloss:1.23286
[6]	validation_0-mlogloss:1.17231	validation_1-mlogloss:1.19156
[7]	validation_0-mlogloss:1.13374	validation_1-mlogloss:1.15628
[8]	validation_0-mlogloss:1.10101	validation_1-mlogloss:1.12718
[9]	validation_0-mlogloss:1.07175	validation_1-mlogloss:1.10151
[10]	validation_0-mlogloss:1.04339	validation_1-mlogloss:1.07556
[11]	validation_0-mlogloss:1.01842	validation_1-mlogloss:1.05369
[12]	validation_0-mlogloss:0.99517	validation_1-mlogloss:1.03334
[13]	validation_0-mlogloss:0.97445	validation_1-mlogloss:1.01477
[14]	validation_0-mlogloss:0.95425	validation_1-mlogloss:0.99783
[15]	validation_0-mlogloss:0.93617	

[32m[I 2021-07-31 10:03:28,634][0m Trial 12 finished with value: 0.5264319007590665 and parameters: {'learning_rate': 0.29871183617703023, 'n_estimators': 8382, 'reg_lambda': 0.04244287920634283, 'max_depth': 8, 'gamma': 0, 'colsample_bytree': 0.8053217364205824, 'subsample': 0.8722947286406789, 'min_child_weight': 2}. Best is trial 5 with value: 0.5186562176549009.[0m




[0]	validation_0-mlogloss:1.70740	validation_1-mlogloss:1.70830
[1]	validation_0-mlogloss:1.56784	validation_1-mlogloss:1.56800
[2]	validation_0-mlogloss:1.47477	validation_1-mlogloss:1.47611
[3]	validation_0-mlogloss:1.40583	validation_1-mlogloss:1.40997
[4]	validation_0-mlogloss:1.34272	validation_1-mlogloss:1.34956
[5]	validation_0-mlogloss:1.29067	validation_1-mlogloss:1.29912
[6]	validation_0-mlogloss:1.24503	validation_1-mlogloss:1.25642
[7]	validation_0-mlogloss:1.20583	validation_1-mlogloss:1.22070
[8]	validation_0-mlogloss:1.17273	validation_1-mlogloss:1.18951
[9]	validation_0-mlogloss:1.14313	validation_1-mlogloss:1.16101
[10]	validation_0-mlogloss:1.11661	validation_1-mlogloss:1.13645
[11]	validation_0-mlogloss:1.09195	validation_1-mlogloss:1.11257
[12]	validation_0-mlogloss:1.06977	validation_1-mlogloss:1.09133
[13]	validation_0-mlogloss:1.04962	validation_1-mlogloss:1.07160
[14]	validation_0-mlogloss:1.03061	validation_1-mlogloss:1.05527
[15]	validation_0-mlogloss:1.01299	

[32m[I 2021-07-31 10:06:16,168][0m Trial 13 finished with value: 0.5824153281878233 and parameters: {'learning_rate': 0.2546050497749313, 'n_estimators': 9856, 'reg_lambda': 0.026642839550198246, 'max_depth': 8, 'gamma': 0, 'colsample_bytree': 0.7085354059932923, 'subsample': 0.8848001584478115, 'min_child_weight': 7}. Best is trial 5 with value: 0.5186562176549009.[0m




[0]	validation_0-mlogloss:1.67202	validation_1-mlogloss:1.67347
[1]	validation_0-mlogloss:1.51872	validation_1-mlogloss:1.51971
[2]	validation_0-mlogloss:1.41202	validation_1-mlogloss:1.41676
[3]	validation_0-mlogloss:1.33765	validation_1-mlogloss:1.34601
[4]	validation_0-mlogloss:1.27239	validation_1-mlogloss:1.28318
[5]	validation_0-mlogloss:1.21753	validation_1-mlogloss:1.23172
[6]	validation_0-mlogloss:1.17005	validation_1-mlogloss:1.18690
[7]	validation_0-mlogloss:1.12946	validation_1-mlogloss:1.14968
[8]	validation_0-mlogloss:1.09454	validation_1-mlogloss:1.11652
[9]	validation_0-mlogloss:1.06400	validation_1-mlogloss:1.08935
[10]	validation_0-mlogloss:1.03658	validation_1-mlogloss:1.06352
[11]	validation_0-mlogloss:1.01181	validation_1-mlogloss:1.04097
[12]	validation_0-mlogloss:0.98826	validation_1-mlogloss:1.02043
[13]	validation_0-mlogloss:0.96784	validation_1-mlogloss:1.00126
[14]	validation_0-mlogloss:0.94837	validation_1-mlogloss:0.98429
[15]	validation_0-mlogloss:0.93078	

[32m[I 2021-07-31 10:12:36,599][0m Trial 14 finished with value: 0.5305482458370053 and parameters: {'learning_rate': 0.2616558161316559, 'n_estimators': 8700, 'reg_lambda': 0.04347944465913808, 'max_depth': 10, 'gamma': 2, 'colsample_bytree': 0.7599643132120011, 'subsample': 0.8832054364940808, 'min_child_weight': 3}. Best is trial 5 with value: 0.5186562176549009.[0m




[0]	validation_0-mlogloss:1.72531	validation_1-mlogloss:1.72670
[1]	validation_0-mlogloss:1.58881	validation_1-mlogloss:1.59148
[2]	validation_0-mlogloss:1.49705	validation_1-mlogloss:1.49962
[3]	validation_0-mlogloss:1.42400	validation_1-mlogloss:1.42858
[4]	validation_0-mlogloss:1.36363	validation_1-mlogloss:1.37218
[5]	validation_0-mlogloss:1.31186	validation_1-mlogloss:1.32244
[6]	validation_0-mlogloss:1.26775	validation_1-mlogloss:1.28230
[7]	validation_0-mlogloss:1.23003	validation_1-mlogloss:1.24766
[8]	validation_0-mlogloss:1.19592	validation_1-mlogloss:1.21537
[9]	validation_0-mlogloss:1.16576	validation_1-mlogloss:1.18801
[10]	validation_0-mlogloss:1.13821	validation_1-mlogloss:1.16318
[11]	validation_0-mlogloss:1.11252	validation_1-mlogloss:1.13938
[12]	validation_0-mlogloss:1.08880	validation_1-mlogloss:1.11712
[13]	validation_0-mlogloss:1.06777	validation_1-mlogloss:1.09830
[14]	validation_0-mlogloss:1.04799	validation_1-mlogloss:1.08052
[15]	validation_0-mlogloss:1.02934	

[32m[I 2021-07-31 10:19:12,203][0m Trial 15 finished with value: 0.5225418781161577 and parameters: {'learning_rate': 0.22139372976120336, 'n_estimators': 5834, 'reg_lambda': 0.020931776499439537, 'max_depth': 8, 'gamma': 0, 'colsample_bytree': 0.8758472941049056, 'subsample': 0.8895388078214407, 'min_child_weight': 2}. Best is trial 5 with value: 0.5186562176549009.[0m




[0]	validation_0-mlogloss:1.81983	validation_1-mlogloss:1.81775
[1]	validation_0-mlogloss:1.73385	validation_1-mlogloss:1.73313
[2]	validation_0-mlogloss:1.67305	validation_1-mlogloss:1.67116
[3]	validation_0-mlogloss:1.62177	validation_1-mlogloss:1.61989
[4]	validation_0-mlogloss:1.57969	validation_1-mlogloss:1.57855
[5]	validation_0-mlogloss:1.54317	validation_1-mlogloss:1.54182
[6]	validation_0-mlogloss:1.51174	validation_1-mlogloss:1.50906
[7]	validation_0-mlogloss:1.48390	validation_1-mlogloss:1.48021
[8]	validation_0-mlogloss:1.45888	validation_1-mlogloss:1.45628
[9]	validation_0-mlogloss:1.43556	validation_1-mlogloss:1.43406
[10]	validation_0-mlogloss:1.41449	validation_1-mlogloss:1.41296
[11]	validation_0-mlogloss:1.39464	validation_1-mlogloss:1.39373
[12]	validation_0-mlogloss:1.37644	validation_1-mlogloss:1.37609
[13]	validation_0-mlogloss:1.35913	validation_1-mlogloss:1.35912
[14]	validation_0-mlogloss:1.34292	validation_1-mlogloss:1.34251
[15]	validation_0-mlogloss:1.32771	

[32m[I 2021-07-31 10:24:17,402][0m Trial 16 finished with value: 0.5625256270174461 and parameters: {'learning_rate': 0.21889762421938125, 'n_estimators': 3103, 'reg_lambda': 0.01953578850513057, 'max_depth': 3, 'gamma': 1, 'colsample_bytree': 0.8985067214390453, 'subsample': 0.9401686413931176, 'min_child_weight': 6}. Best is trial 5 with value: 0.5186562176549009.[0m




[0]	validation_0-mlogloss:1.78340	validation_1-mlogloss:1.78304
[1]	validation_0-mlogloss:1.67332	validation_1-mlogloss:1.67336
[2]	validation_0-mlogloss:1.59728	validation_1-mlogloss:1.59824
[3]	validation_0-mlogloss:1.53440	validation_1-mlogloss:1.53654
[4]	validation_0-mlogloss:1.47647	validation_1-mlogloss:1.47864
[5]	validation_0-mlogloss:1.42916	validation_1-mlogloss:1.43184
[6]	validation_0-mlogloss:1.38599	validation_1-mlogloss:1.39048
[7]	validation_0-mlogloss:1.34773	validation_1-mlogloss:1.35431
[8]	validation_0-mlogloss:1.31471	validation_1-mlogloss:1.32169
[9]	validation_0-mlogloss:1.28600	validation_1-mlogloss:1.29407
[10]	validation_0-mlogloss:1.25912	validation_1-mlogloss:1.26851
[11]	validation_0-mlogloss:1.23464	validation_1-mlogloss:1.24476
[12]	validation_0-mlogloss:1.21170	validation_1-mlogloss:1.22256
[13]	validation_0-mlogloss:1.19229	validation_1-mlogloss:1.20423
[14]	validation_0-mlogloss:1.17309	validation_1-mlogloss:1.18669
[15]	validation_0-mlogloss:1.15537	

[32m[I 2021-07-31 10:33:06,848][0m Trial 17 finished with value: 0.54753770542231 and parameters: {'learning_rate': 0.17920843668206923, 'n_estimators': 6009, 'reg_lambda': 0.017217398337706417, 'max_depth': 7, 'gamma': 3, 'colsample_bytree': 0.6608552731334815, 'subsample': 0.8152255818292785, 'min_child_weight': 2}. Best is trial 5 with value: 0.5186562176549009.[0m




[0]	validation_0-mlogloss:1.68776	validation_1-mlogloss:1.68663
[1]	validation_0-mlogloss:1.53779	validation_1-mlogloss:1.53578
[2]	validation_0-mlogloss:1.44019	validation_1-mlogloss:1.43992
[3]	validation_0-mlogloss:1.36345	validation_1-mlogloss:1.36476
[4]	validation_0-mlogloss:1.30065	validation_1-mlogloss:1.30542
[5]	validation_0-mlogloss:1.24855	validation_1-mlogloss:1.25530
[6]	validation_0-mlogloss:1.20527	validation_1-mlogloss:1.21298
[7]	validation_0-mlogloss:1.16761	validation_1-mlogloss:1.17691
[8]	validation_0-mlogloss:1.13455	validation_1-mlogloss:1.14438
[9]	validation_0-mlogloss:1.10541	validation_1-mlogloss:1.11680
[10]	validation_0-mlogloss:1.07895	validation_1-mlogloss:1.09105
[11]	validation_0-mlogloss:1.05439	validation_1-mlogloss:1.06846
[12]	validation_0-mlogloss:1.03296	validation_1-mlogloss:1.04745
[13]	validation_0-mlogloss:1.01283	validation_1-mlogloss:1.02890
[14]	validation_0-mlogloss:0.99458	validation_1-mlogloss:1.01244
[15]	validation_0-mlogloss:0.97772	

[32m[I 2021-07-31 10:42:43,048][0m Trial 18 finished with value: 0.6323224672454864 and parameters: {'learning_rate': 0.23881375072617522, 'n_estimators': 3305, 'reg_lambda': 0.0233640474714529, 'max_depth': 10, 'gamma': 6, 'colsample_bytree': 0.8852391665690571, 'subsample': 0.9416778750825452, 'min_child_weight': 4}. Best is trial 5 with value: 0.5186562176549009.[0m




[0]	validation_0-mlogloss:1.75812	validation_1-mlogloss:1.75907
[1]	validation_0-mlogloss:1.63769	validation_1-mlogloss:1.63826
[2]	validation_0-mlogloss:1.55125	validation_1-mlogloss:1.55177
[3]	validation_0-mlogloss:1.48447	validation_1-mlogloss:1.48681
[4]	validation_0-mlogloss:1.42795	validation_1-mlogloss:1.43183
[5]	validation_0-mlogloss:1.37895	validation_1-mlogloss:1.38364
[6]	validation_0-mlogloss:1.33748	validation_1-mlogloss:1.34504
[7]	validation_0-mlogloss:1.30054	validation_1-mlogloss:1.31107
[8]	validation_0-mlogloss:1.26834	validation_1-mlogloss:1.27988
[9]	validation_0-mlogloss:1.23894	validation_1-mlogloss:1.25265
[10]	validation_0-mlogloss:1.21191	validation_1-mlogloss:1.22663
[11]	validation_0-mlogloss:1.18786	validation_1-mlogloss:1.20450
[12]	validation_0-mlogloss:1.16511	validation_1-mlogloss:1.18380
[13]	validation_0-mlogloss:1.14451	validation_1-mlogloss:1.16428
[14]	validation_0-mlogloss:1.12563	validation_1-mlogloss:1.14663
[15]	validation_0-mlogloss:1.10789	

[32m[I 2021-07-31 10:49:14,343][0m Trial 19 finished with value: 0.5208340073649566 and parameters: {'learning_rate': 0.20180319158502422, 'n_estimators': 5894, 'reg_lambda': 0.06118795664644994, 'max_depth': 7, 'gamma': 1, 'colsample_bytree': 0.8511524643801492, 'subsample': 0.815975767751384, 'min_child_weight': 2}. Best is trial 5 with value: 0.5186562176549009.[0m


Best Score: 0.5186562176549009
Best trial: {'learning_rate': 0.22174263767666802, 'n_estimators': 8842, 'reg_lambda': 0.03682855362340644, 'max_depth': 4, 'gamma': 1, 'colsample_bytree': 0.7013365325453821, 'subsample': 0.9769213047446981, 'min_child_weight': 2}


In [None]:
# Render the Optuna study diagnostics one after another, in the order listed.
study_plots = (
    optuna.visualization.plot_optimization_history,
    # plot_parallel_coordinate: interactively visualizes the hyperparameters and scores
    optuna.visualization.plot_parallel_coordinate,
    # plot_slice: shows the evolution of the search. You can see where in the
    # hyperparameter space the search went and which parts were explored more.
    optuna.visualization.plot_slice,
    # Visualize parameter importances.
    optuna.visualization.plot_param_importances,
    # Visualize empirical distribution function
    optuna.visualization.plot_edf,
)
for make_plot in study_plots:
    make_plot(study).show()

In [None]:
# Take a private copy of the study's best hyperparameters and display it.
best_params = dict(study.best_params)
best_params

{'colsample_bytree': 0.7013365325453821,
 'gamma': 1,
 'learning_rate': 0.22174263767666802,
 'max_depth': 4,
 'min_child_weight': 2,
 'n_estimators': 8842,
 'reg_lambda': 0.03682855362340644,
 'subsample': 0.9769213047446981}

In [None]:
# Pinned copy of the best hyperparameters reported by the Optuna run above,
# so the model can be rebuilt without repeating the search.
best_params = dict(
    colsample_bytree=0.7013365325453821,
    gamma=1,
    learning_rate=0.22174263767666802,
    max_depth=4,
    min_child_weight=2,
    n_estimators=8842,
    reg_lambda=0.03682855362340644,
    subsample=0.9769213047446981,
)

In [None]:
from sklearn.model_selection import train_test_split, KFold

In [None]:
# 5-fold training: fit the tuned XGBoost classifier on each training split and
# collect that fold's class probabilities for the test matrix.
# Uses names defined in earlier cells: best_params, rand_num, train,
# tfidf_matrix_train and tfidf_matrix_test.
cls_model = XGBClassifier(**best_params)

folds = KFold(n_splits = 5, shuffle = True, random_state = rand_num)
best_result_list = []  # one predict_proba array per fold, averaged in a later cell
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    print(n_fold)

    # KFold.split yields positional row indices, so the labels are selected by
    # position with .iloc; plain [] indexing on the Series would look the
    # numbers up as index labels and only works when `train` has a default
    # 0..n-1 index.
    X_train, y_train = tfidf_matrix_train[trn_idx], train.topic_idx.iloc[trn_idx]
    X_valid, y_valid = tfidf_matrix_train[val_idx], train.topic_idx.iloc[val_idx]

    cls_model.fit(
        X_train,
        y_train,
        # Both splits are passed as evaluation sets; they appear in the log as
        # validation_0 (train) and validation_1 (held-out fold).
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=100,
        verbose=100
    )
    xgbm_pred = cls_model.predict_proba(tfidf_matrix_test)
    best_result_list.append(xgbm_pred)

0






[0]	validation_0-mlogloss:1.81182	validation_1-mlogloss:1.81167
[100]	validation_0-mlogloss:0.75402	validation_1-mlogloss:0.81162
[200]	validation_0-mlogloss:0.58610	validation_1-mlogloss:0.68295
[300]	validation_0-mlogloss:0.49570	validation_1-mlogloss:0.62309
[400]	validation_0-mlogloss:0.43520	validation_1-mlogloss:0.58911
[500]	validation_0-mlogloss:0.39037	validation_1-mlogloss:0.56547
[600]	validation_0-mlogloss:0.35493	validation_1-mlogloss:0.54987
[700]	validation_0-mlogloss:0.32594	validation_1-mlogloss:0.53807
[800]	validation_0-mlogloss:0.30169	validation_1-mlogloss:0.53015
[900]	validation_0-mlogloss:0.28076	validation_1-mlogloss:0.52436
[1000]	validation_0-mlogloss:0.26313	validation_1-mlogloss:0.52156
[1100]	validation_0-mlogloss:0.24983	validation_1-mlogloss:0.51977
[1200]	validation_0-mlogloss:0.23955	validation_1-mlogloss:0.51910
[1300]	validation_0-mlogloss:0.23283	validation_1-mlogloss:0.51841
[1400]	validation_0-mlogloss:0.22801	validation_1-mlogloss:0.51842
[1458]	

In [None]:
# Work on a copy: a plain assignment would make `submission_1` another name for
# the same DataFrame, so the probability columns added later would also appear
# in `submission`.
submission_1 = submission.copy()
# Average the per-fold test probabilities over however many folds were collected
# (same left-to-right sum and division as writing the five terms out by hand).
a = sum(best_result_list) / len(best_result_list)
a

array([[0.11555467, 0.13150184, 0.4925018 , ..., 0.0804616 , 0.03499027,
        0.06398706],
       [0.00470573, 0.01379676, 0.02814108, ..., 0.00793905, 0.00547685,
        0.00860378],
       [0.13398163, 0.06805144, 0.5565732 , ..., 0.04488104, 0.07398963,
        0.04321815],
       ...,
       [0.01678685, 0.01876858, 0.09996491, ..., 0.0997593 , 0.04125392,
        0.00679419],
       [0.3419889 , 0.03967629, 0.48923892, ..., 0.03551904, 0.00884889,
        0.00868185],
       [0.01059138, 0.00751798, 0.8353736 , ..., 0.01767996, 0.00344386,
        0.06810734]], dtype=float32)

In [None]:
# Add one probability column per class to submission_1, named by the class
# index as a string ('0', '1', ...). The number of classes is taken from the
# second axis of `a` instead of being hard-coded, and each column is assigned
# as a whole rather than built row by row.
for class_idx in range(a.shape[1]):
    submission_1[str(class_idx)] = a[:, class_idx]

In [None]:
# Predicted class for every test row = column with the highest averaged probability.
pred = np.argmax(a, axis = 1)
# Reload the sample submission so `submission` is a fresh frame.
# NOTE(review): `error_bad_lines` is deprecated in newer pandas releases; kept
# as-is to stay compatible with the pandas version this notebook runs on.
submission = pd.read_csv("/content/drive/MyDrive/DACON/topic_classification/sample_submission.csv",error_bad_lines=False)
# Bracket assignment always writes the column; attribute-style assignment would
# only set a Python attribute if the 'topic_idx' column were missing.
submission['topic_idx'] = pred
submission

Unnamed: 0,index,topic_idx
0,45654,2
1,45655,3
2,45656,2
3,45657,2
4,45658,3
...,...,...
9126,54780,3
9127,54781,2
9128,54782,3
9129,54783,2


In [None]:
# Store the predicted class next to the per-class probabilities.
# Bracket assignment always writes the column; attribute-style assignment would
# only set a Python attribute if the 'topic_idx' column were missing.
submission_1['topic_idx'] = pred
submission_1

Unnamed: 0,index,topic_idx,0,1,2,3,4,5,6
0,45654,2,0.115555,0.131502,0.492502,0.081003,0.080462,0.034990,0.063987
1,45655,3,0.004706,0.013797,0.028141,0.931337,0.007939,0.005477,0.008604
2,45656,2,0.133982,0.068051,0.556573,0.079305,0.044881,0.073990,0.043218
3,45657,2,0.157513,0.051998,0.598129,0.116318,0.042828,0.018335,0.014880
4,45658,3,0.000013,0.000007,0.000022,0.999943,0.000007,0.000004,0.000004
...,...,...,...,...,...,...,...,...,...
9126,54780,3,0.001167,0.001927,0.003874,0.990690,0.000921,0.000795,0.000626
9127,54781,2,0.012579,0.016145,0.760913,0.013347,0.023326,0.007805,0.165885
9128,54782,3,0.016787,0.018769,0.099965,0.716672,0.099759,0.041254,0.006794
9129,54783,2,0.341989,0.039676,0.489239,0.076046,0.035519,0.008849,0.008682


In [None]:
# Disabled: earlier fixed-parameter XGBClassifier baseline, kept for reference only.
# xgb_clf = XGBClassifier(booster='gbtree', 
#                     colsample_bylevel=0.8, 
#                     colsample_bytree=0.7, 
#                     gamma=0, 
#                     max_depth=5, learning_rate=0.15,
#                     n_estimators=10000, 
#                     nthread=4,
#                     objective = 'multi:softmax',
#                     silent= False,
#                     random_state = SEED)

# xgb_clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
#              early_stopping_rounds=10)

# xgb_clf.score(X_valid, y_valid)

array([2, 3, 2, ..., 3, 2, 2])

In [None]:
# Confusion table for the most recently trained fold: true validation labels
# vs. the model's predictions on that same validation split.
# `pred` holds the test-set predictions (argmax of the fold-averaged
# probabilities), so it cannot be compared with `y_valid`; predict on X_valid
# instead and leave `pred` untouched for the submission cells.
valid_pred = cls_model.predict(X_valid)
crosstab = pd.crosstab(y_valid, valid_pred, rownames=['real'], colnames=['pred'])
crosstab

In [None]:
# pred = xgb_clf.predict(X_test)

In [None]:
# submission = pd.read_csv('/content/drive/MyDrive/공민표/xgboost/sample_submission.csv')

In [None]:
# submission['topic_idx'] = pred
# submission.head()

Unnamed: 0,index,topic_idx
0,45654,2
1,45655,3
2,45656,2
3,45657,2
4,45658,3


In [None]:
# Write both result frames to CSV without the index column:
# `submission` (index + predicted topic) and `submission_1` (same plus the
# per-class probability columns).
csv_outputs = (
    (submission, '/content/xgboost_baseline_1.csv'),
    (submission_1, '/content/_xgboost_.csv'),
)
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path, index=False)

In [None]:
!pip install /content/dacon_submit_api-0.0.4-py3-none-any.whl

Processing ./dacon_submit_api-0.0.4-py3-none-any.whl
Installing collected packages: dacon-submit-api
Successfully installed dacon-submit-api-0.0.4


In [None]:
import os
from getpass import getpass

from dacon_submit_api import dacon_submit_api 

# SECURITY: the personal token must not live in the notebook. It is read from
# the DACON_TOKEN environment variable, falling back to an interactive prompt.
# The token that was previously hard-coded in this cell should be treated as
# leaked and reissued.
dacon_token = os.environ.get('DACON_TOKEN') or getpass('DACON personal token: ')

# Upload the prediction file to the competition; the response is kept in `result`.
result = dacon_submit_api.post_submission_file(
'/content/xgboost_baseline_1.csv', # file path
dacon_token,  # personal token
'235747', # competition id
'Healthy Guys',  # team name
'xgboost') # note

{'isSubmitted': True, 'detail': 'Success'}
