In [24]:
import pandas as pd
import numpy as np

import re

import nltk
import nltk.data
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn import metrics, preprocessing, pipeline, model_selection, naive_bayes
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier, LogisticRegression
import xgboost as xgb

import time

from keras import backend as K
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import GlobalAveragePooling1D, Conv1D, MaxPooling1D, Flatten
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.preprocessing import sequence, text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

In [25]:
pd.set_option('display.max_columns', 200)
train = pd.read_csv('open/new_train.csv')
test = pd.read_csv('open/new_test.csv')

Y_train = LabelEncoder().fit_transform(train['author'])

In [26]:
X_train=train['text'].str.replace('[^a-zA-Z0-9]', ' ')
y_train=train['author']
X_test=test['text'].str.replace('[^a-zA-Z0-9]', ' ')

In [27]:
# train = train[['punc_1','punc_2','punc_3','punc_4','punc_5','punc_6','punc_7', 'index', 'text', 'author']]

# test = test[['punc_1','punc_2','punc_3','punc_4','punc_5','punc_6','punc_7', 'index', 'text']]

columns = ['punc_1','punc_2','punc_3','punc_4','punc_5','punc_6','punc_7',
          'tfidf_MNB_0', 'tfidf_MNB_1', 'tfidf_MNB_2', 'tfidf_MNB_3', 'tfidf_MNB_4',
           'tfidf_CMNB_0', 'tfidf_CMNB_1', 'tfidf_CMNB_2', 'tfidf_CMNB_3', 'tfidf_CMNB_4',
           'tfidf_CBNB_0', 'tfidf_CBNB_1', 'tfidf_CBNB_2', 'tfidf_CBNB_3', 'tfidf_CBNB_4',
           'tfidf_CH_0', 'tfidf_CH_1', 'tfidf_CH_2', 'tfidf_CH_3', 'tfidf_CH_4',
           'count_MNB_0', 'count_MNB_1', 'count_MNB_2', 'count_MNB_3', 'count_MNB_4',
           'count_CMNB_0', 'count_CMNB_1', 'count_CMNB_2', 'count_CMNB_3', 'count_CMNB_4',
           'count_CBNB_0', 'count_CBNB_1', 'count_CBNB_2', 'count_CBNB_3', 'count_CBNB_4',
           'count_CH_0', 'count_CH_1', 'count_CH_2', 'count_CH_3', 'count_CH_4',
           'ff_0', 'ff_1', 'ff_2', 'ff_3', 'ff_4', 
           'nn_0', 'nn_1', 'nn_2', 'nn_3', 'nn_4'
          ]

### TfidfVectorizer - word

In [28]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_MNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
                  'vect__max_df': (0.25, 0.3),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
                  'clf__alpha': [0.024, 0.031],
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_MNB_0"] = pred_train[ : , 0]
train["tfidf_MNB_1"] = pred_train[ : , 1]
train["tfidf_MNB_2"] = pred_train[ : , 2]
train["tfidf_MNB_3"] = pred_train[ : , 3]
train["tfidf_MNB_4"] = pred_train[ : , 4]
test["tfidf_MNB_0"] = pred_full_test[ : , 0]
test["tfidf_MNB_1"] = pred_full_test[ : , 1]
test["tfidf_MNB_2"] = pred_full_test[ : , 2]
test["tfidf_MNB_3"] = pred_full_test[ : , 3]
test["tfidf_MNB_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/01 22:35
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   20.8s remaining:  1.0min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   22.4s finished


	clf__alpha: 0.031
	vect__analyzer: 'word'
	vect__max_df: 0.3
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   17.8s remaining:   53.7s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   19.4s finished


	clf__alpha: 0.031
	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   18.8s remaining:   56.7s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   20.1s finished


	clf__alpha: 0.031
	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   19.5s remaining:   58.7s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   20.7s finished


	clf__alpha: 0.031
	vect__analyzer: 'word'
	vect__max_df: 0.3
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   19.6s remaining:   59.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   20.8s finished


	clf__alpha: 0.031
	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
cv score :  [0.6029955683225479, 0.604635802543013, 0.6211772682884821, 0.6004100043878932, 0.6154952568908586]
Mean cv score :  0.608942780086559
2020/12/01 22:35
2020/12/01 22:39


In [29]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CMNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(MultinomialNB(alpha = 0.05), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
                  'vect__max_df': (0.4, 0.5),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CMNB_0"] = pred_train[ : , 0]
train["tfidf_CMNB_1"] = pred_train[ : , 1]
train["tfidf_CMNB_2"] = pred_train[ : , 2]
train["tfidf_CMNB_3"] = pred_train[ : , 3]
train["tfidf_CMNB_4"] = pred_train[ : , 4]
test["tfidf_CMNB_0"] = pred_full_test[ : , 0]
test["tfidf_CMNB_1"] = pred_full_test[ : , 1]
test["tfidf_CMNB_2"] = pred_full_test[ : , 2]
test["tfidf_CMNB_3"] = pred_full_test[ : , 3]
test["tfidf_CMNB_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/01 22:39
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   18.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   18.2s finished


	vect__analyzer: 'word'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.8s finished


	vect__analyzer: 'word'
	vect__max_df: 0.4
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   18.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   18.5s finished


	vect__analyzer: 'word'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.3s finished


	vect__analyzer: 'word'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.8s finished


	vect__analyzer: 'word'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
cv score :  [0.5976665473926792, 0.5975338255728783, 0.6173897988183089, 0.5963371295574684, 0.6096579691870107]
Mean cv score :  0.6037170541056691
2020/12/01 22:39
2020/12/01 22:43


In [30]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CBNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(BernoulliNB(alpha = 0.02), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
                  'vect__max_df': (0.03, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CBNB_0"] = pred_train[ : , 0]
train["tfidf_CBNB_1"] = pred_train[ : , 1]
train["tfidf_CBNB_2"] = pred_train[ : , 2]
train["tfidf_CBNB_3"] = pred_train[ : , 3]
train["tfidf_CBNB_4"] = pred_train[ : , 4]
test["tfidf_CBNB_0"] = pred_full_test[ : , 0]
test["tfidf_CBNB_1"] = pred_full_test[ : , 1]
test["tfidf_CBNB_2"] = pred_full_test[ : , 2]
test["tfidf_CBNB_3"] = pred_full_test[ : , 3]
test["tfidf_CBNB_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/01 22:43
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.3s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   21.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   21.0s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.3s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   20.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   20.1s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.6s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
cv score :  [0.6292165710596017, 0.6394984741232733, 0.646819786619428, 0.6312645279907801, 0.6562403089836737]
Mean cv score :  0.6406079337553514
2020/12/01 22:43
2020/12/01 22:47


In [31]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CH_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(SGDClassifier(loss='modified_huber', alpha=0.00001, max_iter=10000, tol=1e-4), method='sigmoid')),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CH_0"] = pred_train[ : , 0]
train["tfidf_CH_1"] = pred_train[ : , 1]
train["tfidf_CH_2"] = pred_train[ : , 2]
train["tfidf_CH_3"] = pred_train[ : , 3]
train["tfidf_CH_4"] = pred_train[ : , 4]
test["tfidf_CH_0"] = pred_full_test[ : , 0]
test["tfidf_CH_1"] = pred_full_test[ : , 1]
test["tfidf_CH_2"] = pred_full_test[ : , 2]
test["tfidf_CH_3"] = pred_full_test[ : , 3]
test["tfidf_CH_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/01 22:47
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   18.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   18.5s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   19.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   19.5s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   17.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   17.9s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   18.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   18.6s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   17.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   17.4s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
cv score :  [0.6026105354437366, 0.6178484475316551, 0.6170057605498341, 0.601472768549464, 0.6213367185277744]
Mean cv score :  0.6120548461204928
2020/12/01 22:47
2020/12/01 22:51


In [32]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_L_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', LogisticRegression(C=50, max_iter=200)),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_L_0"] = pred_train[ : , 0]
train["tfidf_L_1"] = pred_train[ : , 1]
train["tfidf_L_2"] = pred_train[ : , 2]
train["tfidf_L_3"] = pred_train[ : , 3]
train["tfidf_L_4"] = pred_train[ : , 4]
test["tfidf_L_0"] = pred_full_test[ : , 0]
test["tfidf_L_1"] = pred_full_test[ : , 1]
test["tfidf_L_2"] = pred_full_test[ : , 2]
test["tfidf_L_3"] = pred_full_test[ : , 3]
test["tfidf_L_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/01 22:51
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.2min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.1min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.7min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.7min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  2.6min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
cv score :  [0.5620624641494352, 0.5857273048883577, 0.5885432314650478, 0.5693716851561413, 0.5916510341517979]
Mean cv score :  0.579471143962156
2020/12/01 22:51
2020/12/01 23:22


### CountVectorizer - word

In [33]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_MNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
                  'vect__max_df': (0.25, 0.3),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
                  'clf__alpha': [0.024, 0.031],
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_MNB_0"] = pred_train[ : , 0]
train["count_MNB_1"] = pred_train[ : , 1]
train["count_MNB_2"] = pred_train[ : , 2]
train["count_MNB_3"] = pred_train[ : , 3]
train["count_MNB_4"] = pred_train[ : , 4]
test["count_MNB_0"] = pred_full_test[ : , 0]
test["count_MNB_1"] = pred_full_test[ : , 1]
test["count_MNB_2"] = pred_full_test[ : , 2]
test["count_MNB_3"] = pred_full_test[ : , 3]
test["count_MNB_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/01 23:22
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   15.7s remaining:   47.2s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   21.2s finished


	clf__alpha: 0.024
	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   18.7s remaining:   56.2s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   19.4s finished


	clf__alpha: 0.024
	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   10.0s remaining:   30.1s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   10.6s finished


	clf__alpha: 0.031
	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   11.0s remaining:   33.3s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   11.8s finished


	clf__alpha: 0.024
	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   11.2s remaining:   33.9s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   11.8s finished


	clf__alpha: 0.024
	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
cv score :  [0.5776704326928397, 0.5804705854660563, 0.6017163249986536, 0.5748083523087922, 0.5919857145779784]
Mean cv score :  0.5853302820088641
2020/12/01 23:22
2020/12/01 23:24


In [34]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CMNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(MultinomialNB(alpha = 0.05), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
                  'vect__max_df': (0.4, 0.5),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CMNB_0"] = pred_train[ : , 0]
train["count_CMNB_1"] = pred_train[ : , 1]
train["count_CMNB_2"] = pred_train[ : , 2]
train["count_CMNB_3"] = pred_train[ : , 3]
train["count_CMNB_4"] = pred_train[ : , 4]
test["count_CMNB_0"] = pred_full_test[ : , 0]
test["count_CMNB_1"] = pred_full_test[ : , 1]
test["count_CMNB_2"] = pred_full_test[ : , 2]
test["count_CMNB_3"] = pred_full_test[ : , 3]
test["count_CMNB_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/01 23:24
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    7.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    7.5s finished


	vect__analyzer: 'word'
	vect__max_df: 0.4
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    7.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    7.7s finished


	vect__analyzer: 'word'
	vect__max_df: 0.4
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    9.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    9.8s finished


	vect__analyzer: 'word'
	vect__max_df: 0.4
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   11.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   11.4s finished


	vect__analyzer: 'word'
	vect__max_df: 0.4
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    7.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    7.4s finished


	vect__analyzer: 'word'
	vect__max_df: 0.4
	vect__ngram_range: (1, 2)
cv score :  [0.5721619114226169, 0.5751870347960057, 0.5952297376162479, 0.5675133432744751, 0.5877951662222627]
Mean cv score :  0.5795774386663217
2020/12/01 23:24
2020/12/01 23:26


In [35]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CBNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(BernoulliNB(alpha = 0.02), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
                  'vect__max_df': (0.03, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CBNB_0"] = pred_train[ : , 0]
train["count_CBNB_1"] = pred_train[ : , 1]
train["count_CBNB_2"] = pred_train[ : , 2]
train["count_CBNB_3"] = pred_train[ : , 3]
train["count_CBNB_4"] = pred_train[ : , 4]
test["count_CBNB_0"] = pred_full_test[ : , 0]
test["count_CBNB_1"] = pred_full_test[ : , 1]
test["count_CBNB_2"] = pred_full_test[ : , 2]
test["count_CBNB_3"] = pred_full_test[ : , 3]
test["count_CBNB_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/01 23:26
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    8.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    8.4s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    8.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    8.1s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    9.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    9.4s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   10.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   10.9s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   11.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   11.0s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
cv score :  [0.6292165710596003, 0.6394984741232734, 0.6468197866192531, 0.6312645279907797, 0.6562403089836758]
Mean cv score :  0.6406079337553165
2020/12/01 23:26
2020/12/01 23:28


In [36]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CH_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(SGDClassifier(loss='modified_huber', alpha=0.00001, max_iter=10000, tol=1e-4), method='sigmoid')),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CH_0"] = pred_train[ : , 0]
train["count_CH_1"] = pred_train[ : , 1]
train["count_CH_2"] = pred_train[ : , 2]
train["count_CH_3"] = pred_train[ : , 3]
train["count_CH_4"] = pred_train[ : , 4]
test["count_CH_0"] = pred_full_test[ : , 0]
test["count_CH_1"] = pred_full_test[ : , 1]
test["count_CH_2"] = pred_full_test[ : , 2]
test["count_CH_3"] = pred_full_test[ : , 3]
test["count_CH_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/01 23:28
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    8.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    8.9s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    8.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    8.6s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    8.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    8.6s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    9.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    9.4s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    9.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    9.4s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
cv score :  [0.6122273500581314, 0.6310874332279297, 0.6303188097453891, 0.6110604158565444, 0.6305159249029815]
Mean cv score :  0.6230419867581952
2020/12/01 23:28
2020/12/01 23:30


In [37]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_L_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', LogisticRegression(C=50, max_iter=200)),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_L_0"] = pred_train[ : , 0]
train["count_L_1"] = pred_train[ : , 1]
train["count_L_2"] = pred_train[ : , 2]
train["count_L_3"] = pred_train[ : , 3]
train["count_L_4"] = pred_train[ : , 4]
test["count_L_0"] = pred_full_test[ : , 0]
test["count_L_1"] = pred_full_test[ : , 1]
test["count_L_2"] = pred_full_test[ : , 2]
test["count_L_3"] = pred_full_test[ : , 3]
test["count_L_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/01 23:30
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.3min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.3min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.3min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.3min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.3min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
cv score :  [0.59456263122513, 0.6247646427582598, 0.6199121419343707, 0.5983970572232508, 0.6207081495507819]
Mean cv score :  0.6116689245383586
2020/12/01 23:30
2020/12/01 23:46


### TfidfVectorizer - char

In [38]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_MNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),
    ])
    parameters = {'vect__ngram_range': [(1, 2), (1, 3)],
#                   'vect__max_df': (0.25, 0.3),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': [0.024, 0.031],
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_MNB_0_char"] = pred_train[ : , 0]
train["tfidf_MNB_1_char"] = pred_train[ : , 1]
train["tfidf_MNB_2_char"] = pred_train[ : , 2]
train["tfidf_MNB_3_char"] = pred_train[ : , 3]
train["tfidf_MNB_4_char"] = pred_train[ : , 4]
test["tfidf_MNB_0_char"] = pred_full_test[ : , 0]
test["tfidf_MNB_1_char"] = pred_full_test[ : , 1]
test["tfidf_MNB_2_char"] = pred_full_test[ : , 2]
test["tfidf_MNB_3_char"] = pred_full_test[ : , 3]
test["tfidf_MNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/01 23:46
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.8s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   15.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   15.2s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   13.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   13.9s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.4s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   32.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   32.2s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
cv score :  [1.0115280239076807, 1.0125400559034543, 1.0284658391939439, 1.012186660775046, 1.032666145731224]
Mean cv score :  1.01947734510227
2020/12/01 23:46
2020/12/01 23:51


In [39]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CMNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(MultinomialNB(alpha = 0.05), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 2), (1, 3)],
#                   'vect__max_df': (0.4, 0.5),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CMNB_0_char"] = pred_train[ : , 0]
train["tfidf_CMNB_1_char"] = pred_train[ : , 1]
train["tfidf_CMNB_2_char"] = pred_train[ : , 2]
train["tfidf_CMNB_3_char"] = pred_train[ : , 3]
train["tfidf_CMNB_4_char"] = pred_train[ : , 4]
test["tfidf_CMNB_0_char"] = pred_full_test[ : , 0]
test["tfidf_CMNB_1_char"] = pred_full_test[ : , 1]
test["tfidf_CMNB_2_char"] = pred_full_test[ : , 2]
test["tfidf_CMNB_3_char"] = pred_full_test[ : , 3]
test["tfidf_CMNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/01 23:51
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   28.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   28.0s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   25.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   25.8s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   25.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   25.7s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   26.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   26.0s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   26.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   26.0s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
cv score :  [0.8708013456746512, 0.8832552137613162, 0.8938444246641268, 0.8692223806167206, 0.8929140616467663]
Mean cv score :  0.8820074852727162
2020/12/01 23:51
2020/12/01 23:56


In [40]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CBNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(BernoulliNB(alpha = 0.02), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 2), (1, 3)],
#                   'vect__max_df': (0.03, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CBNB_0_char"] = pred_train[ : , 0]
train["tfidf_CBNB_1_char"] = pred_train[ : , 1]
train["tfidf_CBNB_2_char"] = pred_train[ : , 2]
train["tfidf_CBNB_3_char"] = pred_train[ : , 3]
train["tfidf_CBNB_4_char"] = pred_train[ : , 4]
test["tfidf_CBNB_0_char"] = pred_full_test[ : , 0]
test["tfidf_CBNB_1_char"] = pred_full_test[ : , 1]
test["tfidf_CBNB_2_char"] = pred_full_test[ : , 2]
test["tfidf_CBNB_3_char"] = pred_full_test[ : , 3]
test["tfidf_CBNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/01 23:56
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   27.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   27.3s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   23.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   23.0s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.7s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   24.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   24.9s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.3s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
cv score :  [1.2502839254949032, 1.242559166405652, 1.2661564218326058, 1.2522426310801658, 1.2723725930825402]
Mean cv score :  1.2567229475791735
2020/12/01 23:56
2020/12/02 00:00


In [41]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CH_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(SGDClassifier(loss='modified_huber', alpha=0.00001, max_iter=10000, tol=1e-4), method='sigmoid')),
    ])
    parameters = {'vect__ngram_range': [(1, 2), (1, 3)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CH_0_char"] = pred_train[ : , 0]
train["tfidf_CH_1_char"] = pred_train[ : , 1]
train["tfidf_CH_2_char"] = pred_train[ : , 2]
train["tfidf_CH_3_char"] = pred_train[ : , 3]
train["tfidf_CH_4_char"] = pred_train[ : , 4]
test["tfidf_CH_0_char"] = pred_full_test[ : , 0]
test["tfidf_CH_1_char"] = pred_full_test[ : , 1]
test["tfidf_CH_2_char"] = pred_full_test[ : , 2]
test["tfidf_CH_3_char"] = pred_full_test[ : , 3]
test["tfidf_CH_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/02 00:00
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.3min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.0min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
cv score :  [0.7518715154701759, 0.7696286784047797, 0.7732840873435437, 0.7469738449370563, 0.7654980656858875]
Mean cv score :  0.7614512383682887
2020/12/02 00:00
2020/12/02 00:14


In [42]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_L_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', LogisticRegression(C=50, max_iter=200)),
    ])
    parameters = {'vect__ngram_range': [(1, 2), (1, 3)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_L_0_char"] = pred_train[ : , 0]
train["tfidf_L_1_char"] = pred_train[ : , 1]
train["tfidf_L_2_char"] = pred_train[ : , 2]
train["tfidf_L_3_char"] = pred_train[ : , 3]
train["tfidf_L_4_char"] = pred_train[ : , 4]
test["tfidf_L_0_char"] = pred_full_test[ : , 0]
test["tfidf_L_1_char"] = pred_full_test[ : , 1]
test["tfidf_L_2_char"] = pred_full_test[ : , 2]
test["tfidf_L_3_char"] = pred_full_test[ : , 3]
test["tfidf_L_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/02 00:14
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   42.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   42.0s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   36.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   36.1s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   36.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   36.7s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   38.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   38.2s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   37.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   37.9s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
cv score :  [0.8095727079575089, 0.8458538955991773, 0.8720832585088008, 0.7842694820340493, 0.8464996491968897]
Mean cv score :  0.8316557986592852
2020/12/02 00:14
2020/12/02 00:23


### CountVectorizer - char

In [43]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_MNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),
    ])
    parameters = {'vect__ngram_range': [(1, 2), (1, 3)],
#                   'vect__max_df': (0.25, 0.3),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': [0.024, 0.031],
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_MNB_0_char"] = pred_train[ : , 0]
train["count_MNB_1_char"] = pred_train[ : , 1]
train["count_MNB_2_char"] = pred_train[ : , 2]
train["count_MNB_3_char"] = pred_train[ : , 3]
train["count_MNB_4_char"] = pred_train[ : , 4]
test["count_MNB_0_char"] = pred_full_test[ : , 0]
test["count_MNB_1_char"] = pred_full_test[ : , 1]
test["count_MNB_2_char"] = pred_full_test[ : , 2]
test["count_MNB_3_char"] = pred_full_test[ : , 3]
test["count_MNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/02 00:23
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   14.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   14.5s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.7s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   14.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   14.1s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   17.7s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   18.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   18.2s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
cv score :  [1.2025769972789655, 1.196017341161219, 1.2158048511371249, 1.204632802204555, 1.215416507101536]
Mean cv score :  1.20688969977668
2020/12/02 00:23
2020/12/02 00:26


In [44]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CMNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(MultinomialNB(alpha = 0.05), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 2), (1, 3)],
#                   'vect__max_df': (0.4, 0.5),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CMNB_0_char"] = pred_train[ : , 0]
train["count_CMNB_1_char"] = pred_train[ : , 1]
train["count_CMNB_2_char"] = pred_train[ : , 2]
train["count_CMNB_3_char"] = pred_train[ : , 3]
train["count_CMNB_4_char"] = pred_train[ : , 4]
test["count_CMNB_0_char"] = pred_full_test[ : , 0]
test["count_CMNB_1_char"] = pred_full_test[ : , 1]
test["count_CMNB_2_char"] = pred_full_test[ : , 2]
test["count_CMNB_3_char"] = pred_full_test[ : , 3]
test["count_CMNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/02 00:26
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   15.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   15.3s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   20.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   20.6s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   15.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   15.5s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   19.0s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   15.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   15.5s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
cv score :  [0.899373766481415, 0.9028902679565609, 0.9204983140389096, 0.8936078919693751, 0.9246590782329285]
Mean cv score :  0.9082058637358378
2020/12/02 00:26
2020/12/02 00:30


In [45]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CBNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(BernoulliNB(alpha = 0.02), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 2), (1, 3)],
#                   'vect__max_df': (0.03, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CBNB_0_char"] = pred_train[ : , 0]
train["count_CBNB_1_char"] = pred_train[ : , 1]
train["count_CBNB_2_char"] = pred_train[ : , 2]
train["count_CBNB_3_char"] = pred_train[ : , 3]
train["count_CBNB_4_char"] = pred_train[ : , 4]
test["count_CBNB_0_char"] = pred_full_test[ : , 0]
test["count_CBNB_1_char"] = pred_full_test[ : , 1]
test["count_CBNB_2_char"] = pred_full_test[ : , 2]
test["count_CBNB_3_char"] = pred_full_test[ : , 3]
test["count_CBNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/02 00:30
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   15.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   15.6s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   21.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   21.7s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.3s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   15.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   15.8s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   20.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   20.9s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
cv score :  [1.2502839254949023, 1.2425591664056512, 1.2661564218326065, 1.2522426310801653, 1.2723725930825422]
Mean cv score :  1.2567229475791735
2020/12/02 00:30
2020/12/02 00:34


In [46]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CH_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(SGDClassifier(loss='modified_huber', alpha=0.00001, max_iter=10000, tol=1e-4), method='sigmoid')),
    ])
    parameters = {'vect__ngram_range': [(1, 2), (1, 3)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CH_0_char"] = pred_train[ : , 0]
train["count_CH_1_char"] = pred_train[ : , 1]
train["count_CH_2_char"] = pred_train[ : , 2]
train["count_CH_3_char"] = pred_train[ : , 3]
train["count_CH_4_char"] = pred_train[ : , 4]
test["count_CH_0_char"] = pred_full_test[ : , 0]
test["count_CH_1_char"] = pred_full_test[ : , 1]
test["count_CH_2_char"] = pred_full_test[ : , 2]
test["count_CH_3_char"] = pred_full_test[ : , 3]
test["count_CH_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/02 00:34
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   59.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   59.2s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   58.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   58.3s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.0min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   57.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   57.6s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   58.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   58.0s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
cv score :  [0.7693733031820273, 0.7891490830442921, 0.794005424334621, 0.7662423414060131, 0.7873459511333336]
Mean cv score :  0.7812232206200573
2020/12/02 00:34
2020/12/02 00:46


In [47]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_L_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', LogisticRegression(C=50, max_iter=200)),
    ])
    parameters = {'vect__ngram_range': [(1, 2), (1, 3)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_L_0_char"] = pred_train[ : , 0]
train["count_L_1_char"] = pred_train[ : , 1]
train["count_L_2_char"] = pred_train[ : , 2]
train["count_L_3_char"] = pred_train[ : , 3]
train["count_L_4_char"] = pred_train[ : , 4]
test["count_L_0_char"] = pred_full_test[ : , 0]
test["count_L_1_char"] = pred_full_test[ : , 1]
test["count_L_2_char"] = pred_full_test[ : , 2]
test["count_L_3_char"] = pred_full_test[ : , 3]
test["count_L_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/02 00:46
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   38.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   38.4s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   37.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   37.7s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   38.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   38.9s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   39.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   39.1s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   34.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   34.9s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


	vect__analyzer: 'char'
	vect__ngram_range: (1, 3)
cv score :  [0.7988854018201208, 0.8270402219678563, 0.8227781506143519, 0.812051650233377, 0.8311840983155193]
Mean cv score :  0.8183879045902451
2020/12/02 00:46
2020/12/02 00:55


In [48]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# Final Model
# XGBoost
def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, seed_val=0, child=1, colsample=0.3):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 5
#     param['silent'] = 1
    param['num_class'] = 5
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = child
    param['subsample'] = 0.8
    param['colsample_bytree'] = colsample
    param['seed'] = seed_val
    num_rounds = 2000

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest, ntree_limit = model.best_ntree_limit)
    if test_X2 is not None:
        xgtest2 = xgb.DMatrix(test_X2)
        pred_test_y2 = model.predict(xgtest2, ntree_limit = model.best_ntree_limit)
    return pred_test_y, pred_test_y2, model

def do(train, test, Y_train):
    drop_columns=['index', "text"]
    x_train = train.drop(drop_columns+['author'],axis=1)
    x_test = test.drop(drop_columns,axis=1)
    y_train = Y_train
    
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=32143233)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([x_train.shape[0], 5])
    for dev_index, val_index in kf.split(x_train):
        dev_X, val_X = x_train.loc[dev_index], x_train.loc[val_index]
        dev_y, val_y = y_train[dev_index], y_train[val_index]
        pred_val_y, pred_test_y, model = runXGB(dev_X, dev_y, val_X, val_y, x_test, seed_val=0, colsample=0.7)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("cv score : ", cv_scores)
    print("Mean cv score : ", np.mean(cv_scores))
    return pred_full_test/5
result = do(train, test, Y_train)

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2020/12/02 00:55
[0]	train-mlogloss:1.42802	test-mlogloss:1.42744
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
[20]	train-mlogloss:0.46950	test-mlogloss:0.48200
[40]	train-mlogloss:0.35241	test-mlogloss:0.38115
[60]	train-mlogloss:0.31889	test-mlogloss:0.36436
[80]	train-mlogloss:0.29810	test-mlogloss:0.35896
[100]	train-mlogloss:0.27932	test-mlogloss:0.35714
[120]	train-mlogloss:0.26241	test-mlogloss:0.35603
[140]	train-mlogloss:0.24639	test-mlogloss:0.35555
[160]	train-mlogloss:0.23160	test-mlogloss:0.35489
[180]	train-mlogloss:0.21767	test-mlogloss:0.35511
[200]	train-mlogloss:0.20495	test-mlogloss:0.35518
Stopping. Best iteration:
[157]	train-mlogloss:0.23374	test-mlogloss:0.35475

[0]	train-mlogloss:1.42771	test-mlogloss:1.43243
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.

In [49]:
# # cv score :  [0.3641794996825376, 0.3822802325154725, 0.37811282920527145, 0.3630294105994838, 0.3801573158655308]
# # Mean cv score :  0.3735518575736593

# cv score :  [0.35475335016523574, 0.3753240263740126, 0.37369690713018755, 0.3574915867155482, 0.37240149473934475]
# Mean cv score :  0.3667334730248658

In [51]:
train.to_csv('open/new_train.csv', index=False)
test.to_csv('open/new_test.csv', index=False)

In [50]:
sample_submission=pd.read_csv('open/sample_submission.csv', encoding='utf-8')
sample_submission[['0', '1', '2', '3', '4']] = result
sample_submission.to_csv("kg_8_1201.csv", index=False)
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.005643,0.569645,0.409466,0.013175,0.002072
1,1,0.002242,0.994056,0.001584,0.000525,0.001594
2,2,0.998047,0.000647,0.000165,0.000141,0.001000
3,3,0.000277,0.004009,0.994443,0.000250,0.001020
4,4,0.987226,0.004116,0.003481,0.004380,0.000798
...,...,...,...,...,...,...
19612,19612,0.000185,0.999471,0.000119,0.000154,0.000070
19613,19613,0.000428,0.000189,0.000704,0.000133,0.998547
19614,19614,0.000415,0.999104,0.000151,0.000237,0.000093
19615,19615,0.000282,0.998725,0.000415,0.000467,0.000110
