# [Dacon] 소설 작가 분류 AI 경진대회

## 1. 라이브러리 및 데이터

## Library & Data

In [1]:
import pandas as pd
import numpy as np

import re

import nltk
import nltk.data
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn import metrics, preprocessing, pipeline, model_selection, naive_bayes
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier, LogisticRegression
import xgboost as xgb

import time

from keras import backend as K
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import GlobalAveragePooling1D, Conv1D, MaxPooling1D, Flatten
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.preprocessing import sequence, text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

In [2]:
pd.set_option('display.max_columns', 200)
train = pd.read_csv('open/train.csv')
test = pd.read_csv('open/test_x.csv')

## 2. 데이터 전처리
## Data Cleansing & Pre-Processing

In [3]:
X_train=train['text'].str.replace('[^a-zA-Z0-9]', ' ')
Y_train = LabelEncoder().fit_transform(train['author'])
y_train=train['author']
X_test=test['text'].str.replace('[^a-zA-Z0-9]', ' ')

In [4]:
# 구두점 비율(문장 안에 각 부호가 얼마나 있는지)
punctuations = [{"id":1, "p" : "[;:]"},
                {"id":2, "p" : "[,.]"},
                {"id":3, "p" : "[?]"},
                {"id":4, "p" : "[!]"},
                {"id":5, "p" : "[‘’\']"},
                {"id":6, "p" : "[“”\"]"},
                {"id":7, "p" : "[;:,.?!\'“”‘’\"]"}]

for p in punctuations:
    punctuation = p["p"]
    _train =  [sentence.split() for sentence in train['text']]
    train['punc_' + str(p["id"])] = [len([word for word in sentence if bool(re.search(punctuation, word))]) * 100 / len(sentence) for sentence in _train]

    _test =  [sentence.split() for sentence in test['text']]
    test['punc_' + str(p["id"])] = [len([word for word in sentence if bool(re.search(punctuation, word))]) * 100 / len(sentence) for sentence in _test]

## Pipeline
   - TfidfVectorizer
   - CountVectorizer
           analyzer : word, char, char_wb

### TfidfVectorizer - word

In [5]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_MNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
                  'vect__max_df': (0.25, 0.3),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
                  'clf__alpha': [0.024, 0.031],
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_MNB_0"] = pred_train[ : , 0]
train["tfidf_MNB_1"] = pred_train[ : , 1]
train["tfidf_MNB_2"] = pred_train[ : , 2]
train["tfidf_MNB_3"] = pred_train[ : , 3]
train["tfidf_MNB_4"] = pred_train[ : , 4]
test["tfidf_MNB_0"] = pred_full_test[ : , 0]
test["tfidf_MNB_1"] = pred_full_test[ : , 1]
test["tfidf_MNB_2"] = pred_full_test[ : , 2]
test["tfidf_MNB_3"] = pred_full_test[ : , 3]
test["tfidf_MNB_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/10 20:40
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   21.6s remaining:  1.1min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   23.0s finished


	clf__alpha: 0.031
	vect__analyzer: 'word'
	vect__max_df: 0.3
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   18.9s remaining:   57.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   19.8s finished


	clf__alpha: 0.031
	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   16.5s remaining:   49.8s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   18.7s finished


	clf__alpha: 0.031
	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   19.3s remaining:   58.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   20.4s finished


	clf__alpha: 0.031
	vect__analyzer: 'word'
	vect__max_df: 0.3
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   18.6s remaining:   55.9s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   19.9s finished


	clf__alpha: 0.031
	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
cv score :  [0.6029955683225479, 0.604635802543013, 0.6211772682884821, 0.6004100043878932, 0.6154952568908586]
Mean cv score :  0.608942780086559
2020/12/10 20:40
2020/12/10 20:44


In [6]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CMNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(MultinomialNB(alpha = 0.05), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
                  'vect__max_df': (0.4, 0.5),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CMNB_0"] = pred_train[ : , 0]
train["tfidf_CMNB_1"] = pred_train[ : , 1]
train["tfidf_CMNB_2"] = pred_train[ : , 2]
train["tfidf_CMNB_3"] = pred_train[ : , 3]
train["tfidf_CMNB_4"] = pred_train[ : , 4]
test["tfidf_CMNB_0"] = pred_full_test[ : , 0]
test["tfidf_CMNB_1"] = pred_full_test[ : , 1]
test["tfidf_CMNB_2"] = pred_full_test[ : , 2]
test["tfidf_CMNB_3"] = pred_full_test[ : , 3]
test["tfidf_CMNB_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/10 20:44
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.1s finished


	vect__analyzer: 'word'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.5s finished


	vect__analyzer: 'word'
	vect__max_df: 0.4
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.5s finished


	vect__analyzer: 'word'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.1s finished


	vect__analyzer: 'word'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.2s finished


	vect__analyzer: 'word'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
cv score :  [0.5976665473926792, 0.5975338255728783, 0.6173897988183089, 0.5963371295574684, 0.6096579691870107]
Mean cv score :  0.6037170541056691
2020/12/10 20:44
2020/12/10 20:48


In [7]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CBNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(BernoulliNB(alpha = 0.02), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
                  'vect__max_df': (0.03, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CBNB_0"] = pred_train[ : , 0]
train["tfidf_CBNB_1"] = pred_train[ : , 1]
train["tfidf_CBNB_2"] = pred_train[ : , 2]
train["tfidf_CBNB_3"] = pred_train[ : , 3]
train["tfidf_CBNB_4"] = pred_train[ : , 4]
test["tfidf_CBNB_0"] = pred_full_test[ : , 0]
test["tfidf_CBNB_1"] = pred_full_test[ : , 1]
test["tfidf_CBNB_2"] = pred_full_test[ : , 2]
test["tfidf_CBNB_3"] = pred_full_test[ : , 3]
test["tfidf_CBNB_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/10 20:48
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   18.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   18.6s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.0s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   18.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   18.5s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.7s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.8s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
cv score :  [0.6292165710596017, 0.6394984741232733, 0.646819786619428, 0.6312645279907801, 0.6562403089836737]
Mean cv score :  0.6406079337553514
2020/12/10 20:48
2020/12/10 20:52


In [8]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CH_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(SGDClassifier(loss='modified_huber', alpha=0.00001, max_iter=10000, tol=1e-4), method='sigmoid')),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CH_0"] = pred_train[ : , 0]
train["tfidf_CH_1"] = pred_train[ : , 1]
train["tfidf_CH_2"] = pred_train[ : , 2]
train["tfidf_CH_3"] = pred_train[ : , 3]
train["tfidf_CH_4"] = pred_train[ : , 4]
test["tfidf_CH_0"] = pred_full_test[ : , 0]
test["tfidf_CH_1"] = pred_full_test[ : , 1]
test["tfidf_CH_2"] = pred_full_test[ : , 2]
test["tfidf_CH_3"] = pred_full_test[ : , 3]
test["tfidf_CH_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/10 20:52
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   18.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   18.6s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   21.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   21.8s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   20.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   20.8s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   21.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   21.7s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   20.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   20.1s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
cv score :  [0.6034359823604672, 0.6184589677348814, 0.6177396133835682, 0.6024721411834132, 0.6200243325185175]
Mean cv score :  0.6124262074361695
2020/12/10 20:52
2020/12/10 20:57


In [9]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_L_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', LogisticRegression(C=50, max_iter=200)),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_L_0"] = pred_train[ : , 0]
train["tfidf_L_1"] = pred_train[ : , 1]
train["tfidf_L_2"] = pred_train[ : , 2]
train["tfidf_L_3"] = pred_train[ : , 3]
train["tfidf_L_4"] = pred_train[ : , 4]
test["tfidf_L_0"] = pred_full_test[ : , 0]
test["tfidf_L_1"] = pred_full_test[ : , 1]
test["tfidf_L_2"] = pred_full_test[ : , 2]
test["tfidf_L_3"] = pred_full_test[ : , 3]
test["tfidf_L_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/10 20:57
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.5min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.4min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.5min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.5min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.5min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
cv score :  [0.5620624641494352, 0.5857273048883577, 0.5885432314650478, 0.5693716851561413, 0.5916510341517979]
Mean cv score :  0.579471143962156
2020/12/10 20:57
2020/12/10 21:26


### CountVectorizer - word

In [10]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_MNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
                  'vect__max_df': (0.25, 0.3),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
                  'clf__alpha': [0.024, 0.031],
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_MNB_0"] = pred_train[ : , 0]
train["count_MNB_1"] = pred_train[ : , 1]
train["count_MNB_2"] = pred_train[ : , 2]
train["count_MNB_3"] = pred_train[ : , 3]
train["count_MNB_4"] = pred_train[ : , 4]
test["count_MNB_0"] = pred_full_test[ : , 0]
test["count_MNB_1"] = pred_full_test[ : , 1]
test["count_MNB_2"] = pred_full_test[ : , 2]
test["count_MNB_3"] = pred_full_test[ : , 3]
test["count_MNB_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/10 21:26
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   15.9s remaining:   47.9s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   20.5s finished


	clf__alpha: 0.024
	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   18.0s remaining:   54.3s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   18.7s finished


	clf__alpha: 0.024
	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   17.6s remaining:   52.9s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   18.4s finished


	clf__alpha: 0.031
	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   18.3s remaining:   55.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   19.4s finished


	clf__alpha: 0.024
	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   17.7s remaining:   53.4s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   19.6s finished


	clf__alpha: 0.024
	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
cv score :  [0.5776704326928397, 0.5804705854660563, 0.6017163249986536, 0.5748083523087922, 0.5919857145779784]
Mean cv score :  0.5853302820088641
2020/12/10 21:26
2020/12/10 21:30


In [11]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CMNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(MultinomialNB(alpha = 0.05), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
                  'vect__max_df': (0.4, 0.5),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CMNB_0"] = pred_train[ : , 0]
train["count_CMNB_1"] = pred_train[ : , 1]
train["count_CMNB_2"] = pred_train[ : , 2]
train["count_CMNB_3"] = pred_train[ : , 3]
train["count_CMNB_4"] = pred_train[ : , 4]
test["count_CMNB_0"] = pred_full_test[ : , 0]
test["count_CMNB_1"] = pred_full_test[ : , 1]
test["count_CMNB_2"] = pred_full_test[ : , 2]
test["count_CMNB_3"] = pred_full_test[ : , 3]
test["count_CMNB_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/10 21:30
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   14.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   14.1s finished


	vect__analyzer: 'word'
	vect__max_df: 0.4
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.0s finished


	vect__analyzer: 'word'
	vect__max_df: 0.4
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.5s finished


	vect__analyzer: 'word'
	vect__max_df: 0.4
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.8s finished


	vect__analyzer: 'word'
	vect__max_df: 0.4
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.8s finished


	vect__analyzer: 'word'
	vect__max_df: 0.4
	vect__ngram_range: (1, 2)
cv score :  [0.5721619114226169, 0.5751870347960057, 0.5952297376162479, 0.5675133432744751, 0.5877951662222627]
Mean cv score :  0.5795774386663217
2020/12/10 21:30
2020/12/10 21:33


In [12]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CBNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(BernoulliNB(alpha = 0.02), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
                  'vect__max_df': (0.03, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CBNB_0"] = pred_train[ : , 0]
train["count_CBNB_1"] = pred_train[ : , 1]
train["count_CBNB_2"] = pred_train[ : , 2]
train["count_CBNB_3"] = pred_train[ : , 3]
train["count_CBNB_4"] = pred_train[ : , 4]
test["count_CBNB_0"] = pred_full_test[ : , 0]
test["count_CBNB_1"] = pred_full_test[ : , 1]
test["count_CBNB_2"] = pred_full_test[ : , 2]
test["count_CBNB_3"] = pred_full_test[ : , 3]
test["count_CBNB_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/10 21:33
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.1s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   18.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   18.1s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed: 102.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed: 102.5min finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   10.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   10.3s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   10.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   10.4s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
cv score :  [0.6292165710596003, 0.6394984741232734, 0.6468197866192531, 0.6312645279907797, 0.6562403089836758]
Mean cv score :  0.6406079337553165
2020/12/10 21:33
2020/12/10 23:18


In [13]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CH_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(SGDClassifier(loss='modified_huber', alpha=0.00001, max_iter=10000, tol=1e-4), method='sigmoid')),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CH_0"] = pred_train[ : , 0]
train["count_CH_1"] = pred_train[ : , 1]
train["count_CH_2"] = pred_train[ : , 2]
train["count_CH_3"] = pred_train[ : , 3]
train["count_CH_4"] = pred_train[ : , 4]
test["count_CH_0"] = pred_full_test[ : , 0]
test["count_CH_1"] = pred_full_test[ : , 1]
test["count_CH_2"] = pred_full_test[ : , 2]
test["count_CH_3"] = pred_full_test[ : , 3]
test["count_CH_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/10 23:18
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   10.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   10.5s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   11.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   11.2s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    9.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    9.6s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    9.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    9.5s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    9.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    9.1s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
cv score :  [0.6112053600471076, 0.6304643525606525, 0.6301438865457136, 0.6108571808403607, 0.6309156687101501]
Mean cv score :  0.622717289740797
2020/12/10 23:18
2020/12/10 23:21


In [14]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_L_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', LogisticRegression(C=50, max_iter=200)),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_L_0"] = pred_train[ : , 0]
train["count_L_1"] = pred_train[ : , 1]
train["count_L_2"] = pred_train[ : , 2]
train["count_L_3"] = pred_train[ : , 3]
train["count_L_4"] = pred_train[ : , 4]
test["count_L_0"] = pred_full_test[ : , 0]
test["count_L_1"] = pred_full_test[ : , 1]
test["count_L_2"] = pred_full_test[ : , 2]
test["count_L_3"] = pred_full_test[ : , 3]
test["count_L_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/10 23:21
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.5min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.5min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.5min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.5min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.5min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
cv score :  [0.59456263122513, 0.6247646427582598, 0.6199121419343707, 0.5983970572232508, 0.6207081495507819]
Mean cv score :  0.6116689245383586
2020/12/10 23:21
2020/12/10 23:40


### TfidfVectorizer - char

In [15]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_MNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),
    ])
    parameters = {'vect__ngram_range': [(1, 3)],
#                   'vect__max_df': (0.25, 0.3),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': [0.024, 0.031],
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_MNB_0_char"] = pred_train[ : , 0]
train["tfidf_MNB_1_char"] = pred_train[ : , 1]
train["tfidf_MNB_2_char"] = pred_train[ : , 2]
train["tfidf_MNB_3_char"] = pred_train[ : , 3]
train["tfidf_MNB_4_char"] = pred_train[ : , 4]
test["tfidf_MNB_0_char"] = pred_full_test[ : , 0]
test["tfidf_MNB_1_char"] = pred_full_test[ : , 1]
test["tfidf_MNB_2_char"] = pred_full_test[ : , 2]
test["tfidf_MNB_3_char"] = pred_full_test[ : , 3]
test["tfidf_MNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/10 23:40
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   21.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   21.6s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   19.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   19.9s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   18.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   18.4s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   17.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   17.9s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   13.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   13.7s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
cv score :  [1.0115280239076807, 1.0125400559034543, 1.0284658391939439, 1.012186660775046, 1.032666145731224]
Mean cv score :  1.01947734510227
2020/12/10 23:40
2020/12/10 23:44


In [16]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CMNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(MultinomialNB(alpha = 0.05), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 6), (1, 7)],
#                   'vect__max_df': (0.4, 0.5),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CMNB_0_char"] = pred_train[ : , 0]
train["tfidf_CMNB_1_char"] = pred_train[ : , 1]
train["tfidf_CMNB_2_char"] = pred_train[ : , 2]
train["tfidf_CMNB_3_char"] = pred_train[ : , 3]
train["tfidf_CMNB_4_char"] = pred_train[ : , 4]
test["tfidf_CMNB_0_char"] = pred_full_test[ : , 0]
test["tfidf_CMNB_1_char"] = pred_full_test[ : , 1]
test["tfidf_CMNB_2_char"] = pred_full_test[ : , 2]
test["tfidf_CMNB_3_char"] = pred_full_test[ : , 3]
test["tfidf_CMNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/10 23:44
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  3.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  3.3min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 6)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  3.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  3.1min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  3.4min finished
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  3.4min remaining:    0.0s


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.7min finished
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.7min remaining:    0.0s


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.5min finished
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.5min remaining:    0.0s


	vect__analyzer: 'char'
	vect__ngram_range: (1, 6)
cv score :  [0.583538896506643, 0.5725063199060331, 0.5955298726762939, 0.5699324625848107, 0.6007632148029367]
Mean cv score :  0.5844541532953433
2020/12/10 23:44
2020/12/11 00:16


In [17]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CBNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(BernoulliNB(alpha = 0.02), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.03, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CBNB_0_char"] = pred_train[ : , 0]
train["tfidf_CBNB_1_char"] = pred_train[ : , 1]
train["tfidf_CBNB_2_char"] = pred_train[ : , 2]
train["tfidf_CBNB_3_char"] = pred_train[ : , 3]
train["tfidf_CBNB_4_char"] = pred_train[ : , 4]
test["tfidf_CBNB_0_char"] = pred_full_test[ : , 0]
test["tfidf_CBNB_1_char"] = pred_full_test[ : , 1]
test["tfidf_CBNB_2_char"] = pred_full_test[ : , 2]
test["tfidf_CBNB_3_char"] = pred_full_test[ : , 3]
test["tfidf_CBNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/11 00:16
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.9min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.8min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.9min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.8min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.8min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]


cv score :  [0.775734358442237, 0.78177261076148, 0.813803751371879, 0.7902246807326629, 0.8351873041845946]
Mean cv score :  0.7993445410985707
2020/12/11 00:16
2020/12/11 00:42


  proba /= np.sum(proba, axis=1)[:, np.newaxis]


In [18]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CH_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(SGDClassifier(loss='modified_huber', alpha=0.00001, max_iter=10000, tol=1e-4), method='sigmoid')),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CH_0_char"] = pred_train[ : , 0]
train["tfidf_CH_1_char"] = pred_train[ : , 1]
train["tfidf_CH_2_char"] = pred_train[ : , 2]
train["tfidf_CH_3_char"] = pred_train[ : , 3]
train["tfidf_CH_4_char"] = pred_train[ : , 4]
test["tfidf_CH_0_char"] = pred_full_test[ : , 0]
test["tfidf_CH_1_char"] = pred_full_test[ : , 1]
test["tfidf_CH_2_char"] = pred_full_test[ : , 2]
test["tfidf_CH_3_char"] = pred_full_test[ : , 3]
test["tfidf_CH_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


2020/12/11 00:42
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.3min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.3min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.3min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.3min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.3min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
cv score :  [0.5902080924675737, 0.5987877920592308, 0.6051677872017293, 0.585456933590771, 0.5980923984285439]
Mean cv score :  0.5955426007495698
2020/12/11 00:42
2020/12/11 01:13


In [19]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_L_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', LogisticRegression(C=50, max_iter=200)),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_L_0_char"] = pred_train[ : , 0]
train["tfidf_L_1_char"] = pred_train[ : , 1]
train["tfidf_L_2_char"] = pred_train[ : , 2]
train["tfidf_L_3_char"] = pred_train[ : , 3]
train["tfidf_L_4_char"] = pred_train[ : , 4]
test["tfidf_L_0_char"] = pred_full_test[ : , 0]
test["tfidf_L_1_char"] = pred_full_test[ : , 1]
test["tfidf_L_2_char"] = pred_full_test[ : , 2]
test["tfidf_L_3_char"] = pred_full_test[ : , 3]
test["tfidf_L_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/11 01:13
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.2min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.1min finished
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.1min remaining:    0.0s
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.1min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.1min finished
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.1min remaining:    0.0s
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.1min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
cv score :  [0.550177760163995, 0.5637414305437569, 0.5709449401238513, 0.5457623095916745, 0.5677299802712953]
Mean cv score :  0.5596712841389146
2020/12/11 01:13
2020/12/11 03:12


### CountVectorizer - char

In [20]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_MNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),
    ])
    parameters = {'vect__ngram_range': [(1, 3)],
#                   'vect__max_df': (0.25, 0.3),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': [0.024, 0.031],
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_MNB_0_char"] = pred_train[ : , 0]
train["count_MNB_1_char"] = pred_train[ : , 1]
train["count_MNB_2_char"] = pred_train[ : , 2]
train["count_MNB_3_char"] = pred_train[ : , 3]
train["count_MNB_4_char"] = pred_train[ : , 4]
test["count_MNB_0_char"] = pred_full_test[ : , 0]
test["count_MNB_1_char"] = pred_full_test[ : , 1]
test["count_MNB_2_char"] = pred_full_test[ : , 2]
test["count_MNB_3_char"] = pred_full_test[ : , 3]
test["count_MNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/11 03:12
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   20.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   20.9s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   19.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   19.7s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   20.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   20.1s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   19.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   19.6s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   18.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   18.3s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
cv score :  [1.2025769972789655, 1.196017341161219, 1.2158048511371249, 1.204632802204555, 1.215416507101536]
Mean cv score :  1.20688969977668
2020/12/11 03:12
2020/12/11 03:16


In [21]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CMNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(MultinomialNB(alpha = 0.05), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 6), (1, 7)],
#                   'vect__max_df': (0.4, 0.5),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CMNB_0_char"] = pred_train[ : , 0]
train["count_CMNB_1_char"] = pred_train[ : , 1]
train["count_CMNB_2_char"] = pred_train[ : , 2]
train["count_CMNB_3_char"] = pred_train[ : , 3]
train["count_CMNB_4_char"] = pred_train[ : , 4]
test["count_CMNB_0_char"] = pred_full_test[ : , 0]
test["count_CMNB_1_char"] = pred_full_test[ : , 1]
test["count_CMNB_2_char"] = pred_full_test[ : , 2]
test["count_CMNB_3_char"] = pred_full_test[ : , 3]
test["count_CMNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/11 03:16
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.2min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 6)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.2min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 6)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.2min finished
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.2min remaining:    0.0s


	vect__analyzer: 'char'
	vect__ngram_range: (1, 6)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.2min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 6)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  2.1min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 6)
cv score :  [0.6062053488691869, 0.608204380895722, 0.6301709027666109, 0.6004675728066964, 0.6262398088058964]
Mean cv score :  0.6142576028288225
2020/12/11 03:16
2020/12/11 03:38


In [22]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CBNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(BernoulliNB(alpha = 0.02), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.03, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CBNB_0_char"] = pred_train[ : , 0]
train["count_CBNB_1_char"] = pred_train[ : , 1]
train["count_CBNB_2_char"] = pred_train[ : , 2]
train["count_CBNB_3_char"] = pred_train[ : , 3]
train["count_CBNB_4_char"] = pred_train[ : , 4]
test["count_CBNB_0_char"] = pred_full_test[ : , 0]
test["count_CBNB_1_char"] = pred_full_test[ : , 1]
test["count_CBNB_2_char"] = pred_full_test[ : , 2]
test["count_CBNB_3_char"] = pred_full_test[ : , 3]
test["count_CBNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/11 03:38
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  3.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  3.1min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]


cv score :  [0.7757343584422367, 0.7817726107614854, 0.813803751371872, 0.7902246807326817, 0.8351873041845956]
Mean cv score :  0.7993445410985742
2020/12/11 03:38
2020/12/11 04:04


  proba /= np.sum(proba, axis=1)[:, np.newaxis]


In [23]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CH_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(SGDClassifier(loss='modified_huber', alpha=0.00001, max_iter=10000, tol=1e-4), method='sigmoid')),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CH_0_char"] = pred_train[ : , 0]
train["count_CH_1_char"] = pred_train[ : , 1]
train["count_CH_2_char"] = pred_train[ : , 2]
train["count_CH_3_char"] = pred_train[ : , 3]
train["count_CH_4_char"] = pred_train[ : , 4]
test["count_CH_0_char"] = pred_full_test[ : , 0]
test["count_CH_1_char"] = pred_full_test[ : , 1]
test["count_CH_2_char"] = pred_full_test[ : , 2]
test["count_CH_3_char"] = pred_full_test[ : , 3]
test["count_CH_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


2020/12/11 04:04
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.2min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.3min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.3min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.2min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.3min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
cv score :  [0.6035258738327683, 0.619852610467435, 0.623098286127428, 0.6024450777603059, 0.6164943895087571]
Mean cv score :  0.6130832475393388
2020/12/11 04:04
2020/12/11 04:34


In [24]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_L_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', LogisticRegression(C=50, max_iter=200)),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_L_0_char"] = pred_train[ : , 0]
train["count_L_1_char"] = pred_train[ : , 1]
train["count_L_2_char"] = pred_train[ : , 2]
train["count_L_3_char"] = pred_train[ : , 3]
train["count_L_4_char"] = pred_train[ : , 4]
test["count_L_0_char"] = pred_full_test[ : , 0]
test["count_L_1_char"] = pred_full_test[ : , 1]
test["count_L_2_char"] = pred_full_test[ : , 2]
test["count_L_3_char"] = pred_full_test[ : , 3]
test["count_L_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/11 04:34
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.6min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.8min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.7min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.5min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.6min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)
cv score :  [0.5886968390280568, 0.6193664081052572, 0.6113992355748873, 0.5914337779896759, 0.6181004671659751]
Mean cv score :  0.6057993455727704
2020/12/11 04:34
2020/12/11 06:41


### TfidfVectorizer - char_wb

In [25]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_MNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),
    ])
    parameters = {'vect__ngram_range': [(1, 4)],
#                   'vect__max_df': (0.25, 0.3),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': [0.024, 0.031],
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_MNB_0_char_wb"] = pred_train[ : , 0]
train["tfidf_MNB_1_char_wb"] = pred_train[ : , 1]
train["tfidf_MNB_2_char_wb"] = pred_train[ : , 2]
train["tfidf_MNB_3_char_wb"] = pred_train[ : , 3]
train["tfidf_MNB_4_char_wb"] = pred_train[ : , 4]
test["tfidf_MNB_0_char_wb"] = pred_full_test[ : , 0]
test["tfidf_MNB_1_char_wb"] = pred_full_test[ : , 1]
test["tfidf_MNB_2_char_wb"] = pred_full_test[ : , 2]
test["tfidf_MNB_3_char_wb"] = pred_full_test[ : , 3]
test["tfidf_MNB_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/11 06:41
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   32.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   32.5s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 4)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   31.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   31.5s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 4)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   30.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   30.9s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 4)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   31.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   31.7s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 4)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   28.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   28.7s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 4)
cv score :  [0.9033411197664454, 0.892041555707428, 0.9261146817577017, 0.8996159895761203, 0.9281333671317467]
Mean cv score :  0.9098493427878885
2020/12/11 06:41
2020/12/11 06:48


In [26]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CMNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(MultinomialNB(alpha = 0.05), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 6), (1, 7)],
#                   'vect__max_df': (0.4, 0.5),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CMNB_0_char_wb"] = pred_train[ : , 0]
train["tfidf_CMNB_1_char_wb"] = pred_train[ : , 1]
train["tfidf_CMNB_2_char_wb"] = pred_train[ : , 2]
train["tfidf_CMNB_3_char_wb"] = pred_train[ : , 3]
train["tfidf_CMNB_4_char_wb"] = pred_train[ : , 4]
test["tfidf_CMNB_0_char_wb"] = pred_full_test[ : , 0]
test["tfidf_CMNB_1_char_wb"] = pred_full_test[ : , 1]
test["tfidf_CMNB_2_char_wb"] = pred_full_test[ : , 2]
test["tfidf_CMNB_3_char_wb"] = pred_full_test[ : , 3]
test["tfidf_CMNB_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/11 06:48
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 6)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 6)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 6)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 6)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 6)
cv score :  [0.6937506845908086, 0.7037523543429575, 0.7205482551194616, 0.6907853135222449, 0.7125186653595897]
Mean cv score :  0.7042710545870124
2020/12/11 06:48
2020/12/11 07:00


In [27]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CBNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(BernoulliNB(alpha = 0.02), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.03, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CBNB_0_char_wb"] = pred_train[ : , 0]
train["tfidf_CBNB_1_char_wb"] = pred_train[ : , 1]
train["tfidf_CBNB_2_char_wb"] = pred_train[ : , 2]
train["tfidf_CBNB_3_char_wb"] = pred_train[ : , 3]
train["tfidf_CBNB_4_char_wb"] = pred_train[ : , 4]
test["tfidf_CBNB_0_char_wb"] = pred_full_test[ : , 0]
test["tfidf_CBNB_1_char_wb"] = pred_full_test[ : , 1]
test["tfidf_CBNB_2_char_wb"] = pred_full_test[ : , 2]
test["tfidf_CBNB_3_char_wb"] = pred_full_test[ : , 3]
test["tfidf_CBNB_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/11 07:00
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   45.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   45.7s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   45.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   45.6s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   44.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   44.7s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   47.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   47.3s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   47.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   47.1s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]


cv score :  [0.8701149893515928, 0.8763124922320249, 0.8968884931320447, 0.8804419941372349, 0.9200480018935118]
Mean cv score :  0.8887611941492818
2020/12/11 07:00
2020/12/11 07:11


  proba /= np.sum(proba, axis=1)[:, np.newaxis]


In [28]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CH_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(SGDClassifier(loss='modified_huber', alpha=0.00001, max_iter=10000, tol=1e-4), method='sigmoid')),
    ])
    parameters = {'vect__ngram_range': [(1, 5)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CH_0_char_wb"] = pred_train[ : , 0]
train["tfidf_CH_1_char_wb"] = pred_train[ : , 1]
train["tfidf_CH_2_char_wb"] = pred_train[ : , 2]
train["tfidf_CH_3_char_wb"] = pred_train[ : , 3]
train["tfidf_CH_4_char_wb"] = pred_train[ : , 4]
test["tfidf_CH_0_char_wb"] = pred_full_test[ : , 0]
test["tfidf_CH_1_char_wb"] = pred_full_test[ : , 1]
test["tfidf_CH_2_char_wb"] = pred_full_test[ : , 2]
test["tfidf_CH_3_char_wb"] = pred_full_test[ : , 3]
test["tfidf_CH_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


2020/12/11 07:11
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.4min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 5)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.4min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 5)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.4min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 5)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.4min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 5)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.4min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 5)
cv score :  [0.6817908383675271, 0.6904015298317142, 0.700992613589556, 0.6792673219766172, 0.7008159298128009]
Mean cv score :  0.6906536467156431
2020/12/11 07:11
2020/12/11 07:31


In [29]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_L_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', LogisticRegression(C=50, max_iter=200)),
    ])
    parameters = {'vect__ngram_range': [(1, 5)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_L_0_char_wb"] = pred_train[ : , 0]
train["tfidf_L_1_char_wb"] = pred_train[ : , 1]
train["tfidf_L_2_char_wb"] = pred_train[ : , 2]
train["tfidf_L_3_char_wb"] = pred_train[ : , 3]
train["tfidf_L_4_char_wb"] = pred_train[ : , 4]
test["tfidf_L_0_char_wb"] = pred_full_test[ : , 0]
test["tfidf_L_1_char_wb"] = pred_full_test[ : , 1]
test["tfidf_L_2_char_wb"] = pred_full_test[ : , 2]
test["tfidf_L_3_char_wb"] = pred_full_test[ : , 3]
test["tfidf_L_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/11 07:31
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 5)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.6min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 5)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.6min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 5)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.6min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 5)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.6min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 5)
cv score :  [0.7531322283836422, 0.7595884902535756, 0.7691192904545809, 0.7397663325723934, 0.7664605038631621]
Mean cv score :  0.757613369105471
2020/12/11 07:31
2020/12/11 07:53


### CountVectorizer - char_wb

In [30]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_MNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),
    ])
    parameters = {'vect__ngram_range': [(1, 4)],
#                   'vect__max_df': (0.25, 0.3),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': [0.024, 0.031],
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_MNB_0_char_wb"] = pred_train[ : , 0]
train["count_MNB_1_char_wb"] = pred_train[ : , 1]
train["count_MNB_2_char_wb"] = pred_train[ : , 2]
train["count_MNB_3_char_wb"] = pred_train[ : , 3]
train["count_MNB_4_char_wb"] = pred_train[ : , 4]
test["count_MNB_0_char_wb"] = pred_full_test[ : , 0]
test["count_MNB_1_char_wb"] = pred_full_test[ : , 1]
test["count_MNB_2_char_wb"] = pred_full_test[ : , 2]
test["count_MNB_3_char_wb"] = pred_full_test[ : , 3]
test["count_MNB_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/11 07:53
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   32.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   32.2s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 4)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   28.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   28.7s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 4)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   30.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   30.2s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 4)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   29.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   29.1s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 4)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   30.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   30.5s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 4)
cv score :  [1.1986562546767974, 1.175329222155078, 1.2152358670264753, 1.2004725140495378, 1.2106382239947202]
Mean cv score :  1.2000664163805217
2020/12/11 07:53
2020/12/11 08:00


In [31]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CMNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(MultinomialNB(alpha = 0.05), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 6), (1, 7)],
#                   'vect__max_df': (0.4, 0.5),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CMNB_0_char_wb"] = pred_train[ : , 0]
train["count_CMNB_1_char_wb"] = pred_train[ : , 1]
train["count_CMNB_2_char_wb"] = pred_train[ : , 2]
train["count_CMNB_3_char_wb"] = pred_train[ : , 3]
train["count_CMNB_4_char_wb"] = pred_train[ : , 4]
test["count_CMNB_0_char_wb"] = pred_full_test[ : , 0]
test["count_CMNB_1_char_wb"] = pred_full_test[ : , 1]
test["count_CMNB_2_char_wb"] = pred_full_test[ : , 2]
test["count_CMNB_3_char_wb"] = pred_full_test[ : , 3]
test["count_CMNB_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/11 08:00
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 6)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 6)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 6)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 6)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 6)
cv score :  [0.708077687275612, 0.7115430477051486, 0.7358107130375974, 0.7060687560408367, 0.7375644805273464]
Mean cv score :  0.7198129369173082
2020/12/11 08:00
2020/12/11 08:12


In [32]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CBNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(BernoulliNB(alpha = 0.02), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.03, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CBNB_0_char_wb"] = pred_train[ : , 0]
train["count_CBNB_1_char_wb"] = pred_train[ : , 1]
train["count_CBNB_2_char_wb"] = pred_train[ : , 2]
train["count_CBNB_3_char_wb"] = pred_train[ : , 3]
train["count_CBNB_4_char_wb"] = pred_train[ : , 4]
test["count_CBNB_0_char_wb"] = pred_full_test[ : , 0]
test["count_CBNB_1_char_wb"] = pred_full_test[ : , 1]
test["count_CBNB_2_char_wb"] = pred_full_test[ : , 2]
test["count_CBNB_3_char_wb"] = pred_full_test[ : , 3]
test["count_CBNB_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/11 08:12
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   45.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   45.3s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   46.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   46.1s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   45.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   45.0s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   46.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   46.3s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   45.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   45.8s finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]


cv score :  [0.870114989351585, 0.8763124922320114, 0.8968884931320346, 0.8804419941372403, 0.9200480018935181]
Mean cv score :  0.8887611941492779
2020/12/11 08:12
2020/12/11 08:23


  proba /= np.sum(proba, axis=1)[:, np.newaxis]


In [33]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CH_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(SGDClassifier(loss='modified_huber', alpha=0.00001, max_iter=10000, tol=1e-4), method='sigmoid')),
    ])
    parameters = {'vect__ngram_range': [(1, 6)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CH_0_char_wb"] = pred_train[ : , 0]
train["count_CH_1_char_wb"] = pred_train[ : , 1]
train["count_CH_2_char_wb"] = pred_train[ : , 2]
train["count_CH_3_char_wb"] = pred_train[ : , 3]
train["count_CH_4_char_wb"] = pred_train[ : , 4]
test["count_CH_0_char_wb"] = pred_full_test[ : , 0]
test["count_CH_1_char_wb"] = pred_full_test[ : , 1]
test["count_CH_2_char_wb"] = pred_full_test[ : , 2]
test["count_CH_3_char_wb"] = pred_full_test[ : , 3]
test["count_CH_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


2020/12/11 08:23
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 6)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 6)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.7min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 6)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.6min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 6)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.6min finished


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 6)
cv score :  [0.6656350773335611, 0.6802320699742936, 0.690574878949303, 0.6684665117459598, 0.6870881450584853]
Mean cv score :  0.6783993366123207
2020/12/11 08:23
2020/12/11 08:46


In [34]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_L_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', LogisticRegression(C=50, max_iter=200)),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_L_0_char_wb"] = pred_train[ : , 0]
train["count_L_1_char_wb"] = pred_train[ : , 1]
train["count_L_2_char_wb"] = pred_train[ : , 2]
train["count_L_3_char_wb"] = pred_train[ : , 3]
train["count_L_4_char_wb"] = pred_train[ : , 4]
test["count_L_0_char_wb"] = pred_full_test[ : , 0]
test["count_L_1_char_wb"] = pred_full_test[ : , 1]
test["count_L_2_char_wb"] = pred_full_test[ : , 2]
test["count_L_3_char_wb"] = pred_full_test[ : , 3]
test["count_L_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/11 08:46
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.7min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.6min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.6min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.6min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 7)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.6min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char_wb'
	vect__ngram_range: (1, 7)
cv score :  [0.6918741310827037, 0.7141652313957813, 0.7106886108970111, 0.696486717150301, 0.7197985019280598]
Mean cv score :  0.7066026384907713
2020/12/11 08:46
2020/12/11 09:20


### Keras

In [35]:
def preprocessFastText(text):
    text = text.replace("' ", " ' ")
    signs = set(';:,.?!\'“”‘’\"')
    prods = set(text) & signs
    if not prods:
        return text

    for sign in prods:
        text = text.replace(sign, ' {} '.format(sign) )
    return text

def create_docs(df, n_gram_max=2):
    def add_ngram(q, n_gram_max):
            ngrams = []
            for n in range(2, n_gram_max+1):
                for w_index in range(len(q)-n+1):
                    ngrams.append('--'.join(q[w_index:w_index+n]))
            return q + ngrams
        
    docs = []
    for doc in df.text:
        doc = preprocessFastText(doc).split()
        docs.append(' '.join(add_ngram(doc, n_gram_max)))
    
    return docs
docs = create_docs(train)
tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts(docs)
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= 2])

tokenizer = Tokenizer(num_words=num_words, lower=False, filters='')
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)

maxlen = max([max(len(l) for l in docs)])

docs = pad_sequences(sequences=docs, maxlen=maxlen)

docs_test = create_docs(test)
docs_test = tokenizer.texts_to_sequences(docs_test)
docs_test = pad_sequences(sequences=docs_test, maxlen=maxlen)

xtrain_pad = docs
xtest_pad = docs_test

In [36]:
input_dim = np.max(docs) + 1
embedding_dims = 20

def initFastText(embedding_dims,input_dim):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(5, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [37]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))

ytrain_enc = np_utils.to_categorical(Y_train)
earlyStopping=EarlyStopping(monitor='val_loss', patience=0, verbose=0, mode='auto')
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([xtrain_pad.shape[0], 5])

kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=32143233)

for dev_index, val_index in kf.split(xtrain_pad):
    dev_X, val_X = xtrain_pad[dev_index], xtrain_pad[val_index]
    dev_y, val_y = ytrain_enc[dev_index], ytrain_enc[val_index]
    
    model = initFastText(embedding_dims,input_dim)
    model.fit(dev_X, dev_y,
              batch_size=32, 
              epochs=40, 
              verbose=1, 
              validation_data=(val_X, val_y),
              callbacks=[earlyStopping])
    
    pred_val_y = model.predict(val_X)
    pred_test_y = model.predict(xtest_pad)
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print('')
    print('')    
    print('')    
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5 

train["ff_0"] = pred_train[:,0]
train["ff_1"] = pred_train[:,1]
train["ff_2"] = pred_train[:,2]
train["ff_3"] = pred_train[:,3]
train["ff_4"] = pred_train[:,4]
test["ff_0"] = pred_full_test[:,0]
test["ff_1"] = pred_full_test[:,1]
test["ff_2"] = pred_full_test[:,2]
test["ff_3"] = pred_full_test[:,3]
test["ff_4"] = pred_full_test[:,4]

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/11 09:21
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40



Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40



Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40



Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Ep

Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40



cv score :  [0.4740999388400997, 0.46920848253241726, 0.4839351044227665, 0.44849345774334637, 0.47972955347481894]
Mean cv score :  0.47109330740268973
2020/12/11 09:21
2020/12/11 12:25


In [38]:
max_len = 70
nb_words = 10000

texts_1 = []
for text in train['text']:
    texts_1.append(text)

test_texts_1 = []
for text in test['text']:
    test_texts_1.append(text)

tokenizer = Tokenizer(num_words=nb_words)
tokenizer.fit_on_texts(texts_1)
sequences_1 = tokenizer.texts_to_sequences(texts_1)
word_index = tokenizer.word_index

test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)

xtrain_pad = pad_sequences(sequences_1, maxlen=max_len)
xtest_pad = pad_sequences(test_sequences_1, maxlen=max_len)
del test_sequences_1
del sequences_1
nb_words_cnt = min(nb_words, len(word_index)) + 1

In [39]:
def initNN(nb_words_cnt, max_len):
    model = Sequential()
    model.add(Embedding(nb_words_cnt,32,input_length=max_len))
    model.add(Dropout(0.3))
    model.add(Conv1D(64, 5, padding='valid', activation='relu'))
    model.add(Dropout(0.3))
    model.add(MaxPooling1D())
    model.add(Flatten())
    model.add(Dense(800, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(5, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
    return model

In [40]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))

ytrain_enc = np_utils.to_categorical(Y_train)
earlyStopping=EarlyStopping(monitor='val_loss', patience=0, verbose=0, mode='auto')
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([xtrain_pad.shape[0], 5])

kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=32143233)

for dev_index, val_index in kf.split(xtrain_pad):
    dev_X, val_X = xtrain_pad[dev_index], xtrain_pad[val_index]
    dev_y, val_y = ytrain_enc[dev_index], ytrain_enc[val_index]
    
    model = initNN(nb_words_cnt, max_len)
    model.fit(dev_X, dev_y,
              batch_size=32,
              epochs=3,
              verbose=1,
              validation_data=(val_X, val_y),
              callbacks=[earlyStopping])
    
    pred_val_y = model.predict(val_X)
    pred_test_y = model.predict(xtest_pad)
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print('')
    print('')
    print('')
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5 

train["nn_0"] = pred_train[:,0]
train["nn_1"] = pred_train[:,1]
train["nn_2"] = pred_train[:,2]
train["nn_3"] = pred_train[:,3]
train["nn_4"] = pred_train[:,4]

test["nn_0"] = pred_full_test[:,0]
test["nn_1"] = pred_full_test[:,1]
test["nn_2"] = pred_full_test[:,2]
test["nn_3"] = pred_full_test[:,3]
test["nn_4"] = pred_full_test[:,4]

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/11 12:25
Epoch 1/3
Epoch 2/3
Epoch 3/3



Epoch 1/3
Epoch 2/3
Epoch 3/3



Epoch 1/3
Epoch 2/3
Epoch 3/3



Epoch 1/3
Epoch 2/3
Epoch 3/3



Epoch 1/3
Epoch 2/3
Epoch 3/3



cv score :  [0.6514716043131098, 0.6403169269187412, 0.6709669889555318, 0.6373779642953998, 0.6743653679029192]
Mean cv score :  0.6548997704771403
2020/12/11 12:25
2020/12/11 12:35


## 모델 학습 및 검증
## Model Tuning & Evaluation

In [41]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# Final Model
# XGBoost
def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, seed_val=0, child=1, colsample=0.3):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 5
#     param['silent'] = 1
    param['num_class'] = 5
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = child
    param['subsample'] = 0.8
    param['colsample_bytree'] = colsample
    param['seed'] = seed_val
    num_rounds = 2000

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest, ntree_limit = model.best_ntree_limit)
    if test_X2 is not None:
        xgtest2 = xgb.DMatrix(test_X2)
        pred_test_y2 = model.predict(xgtest2, ntree_limit = model.best_ntree_limit)
    return pred_test_y, pred_test_y2, model

def do(train, test, Y_train):
    drop_columns=['index', "text"]
    x_train = train.drop(drop_columns+['author'],axis=1)
    x_test = test.drop(drop_columns,axis=1)
    y_train = Y_train
    
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=32143233)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([x_train.shape[0], 5])
    for dev_index, val_index in kf.split(x_train):
        dev_X, val_X = x_train.loc[dev_index], x_train.loc[val_index]
        dev_y, val_y = y_train[dev_index], y_train[val_index]
        pred_val_y, pred_test_y, model = runXGB(dev_X, dev_y, val_X, val_y, x_test, seed_val=0, colsample=0.7)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("cv score : ", cv_scores)
    print("Mean cv score : ", np.mean(cv_scores))
    return pred_full_test/5
result = do(train, test, Y_train)

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/11 12:35
[0]	train-mlogloss:1.42714	test-mlogloss:1.42988
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
[20]	train-mlogloss:0.45890	test-mlogloss:0.48008
[40]	train-mlogloss:0.33915	test-mlogloss:0.37481
[60]	train-mlogloss:0.30332	test-mlogloss:0.35502
[80]	train-mlogloss:0.28099	test-mlogloss:0.34956
[100]	train-mlogloss:0.26147	test-mlogloss:0.34749
[120]	train-mlogloss:0.24328	test-mlogloss:0.34703
[140]	train-mlogloss:0.22628	test-mlogloss:0.34653
[160]	train-mlogloss:0.21135	test-mlogloss:0.34641
[180]	train-mlogloss:0.19691	test-mlogloss:0.34603
[200]	train-mlogloss:0.18366	test-mlogloss:0.34636
[220]	train-mlogloss:0.17220	test-mlogloss:0.34668
Stopping. Best iteration:
[187]	train-mlogloss:0.19212	test-mlogloss:0.34597

[0]	train-mlogloss:1.42719	test-mlogloss:1.42844
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will trai

## 결과 및 결언
## Conclusion & Discussion

In [43]:
sample_submission=pd.read_csv('open/sample_submission.csv', encoding='utf-8')
sample_submission[['0', '1', '2', '3', '4']] = result
sample_submission.to_csv("sub_4_1210.csv", index=False)
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.009786,0.527903,0.446004,0.014256,0.002052
1,1,0.004383,0.988308,0.002804,0.001649,0.002856
2,2,0.999346,0.000363,0.000054,0.000057,0.000180
3,3,0.000260,0.002859,0.996193,0.000219,0.000469
4,4,0.995207,0.001464,0.001163,0.001277,0.000889
...,...,...,...,...,...,...
19612,19612,0.000182,0.999574,0.000105,0.000084,0.000055
19613,19613,0.000662,0.000085,0.000369,0.000090,0.998794
19614,19614,0.000124,0.999625,0.000082,0.000117,0.000052
19615,19615,0.000133,0.999480,0.000206,0.000125,0.000057
