In [1]:
import pandas as pd
import numpy as np

import re

import nltk
import nltk.data
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn import metrics, preprocessing, naive_bayes, model_selection
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import xgboost as xgb

from tqdm import tqdm

from keras import backend as K
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import GlobalAveragePooling1D, Conv1D, MaxPooling1D, Flatten
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.preprocessing import sequence, text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

In [2]:
# Show up to 200 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 200)
# Load the train and test CSV files from the local 'open/' directory.
X_train = pd.read_csv('open/train.csv')
X_test = pd.read_csv( 'open/test_x.csv')

# authors = [0,1,2,3,4]
# Encode the 'author' column of the training frame as integer class labels.
Y_train = LabelEncoder().fit_transform(X_train['author'])

In [3]:
# Lowercase the text and split it into words
def clean(X_train,X_test):
    """Add a 'words' column holding lowercase alphabetic tokens of 'text'.

    Every character outside a-z / A-Z is replaced by a space before the text
    is lowercased and split on whitespace. Both frames are modified in place
    and returned as ``(X_train, X_test)``.
    """
    non_letters = re.compile("[^a-zA-Z]")

    def to_words(raw):
        return non_letters.sub(" ", raw).lower().split()

    for frame in (X_train, X_test):
        frame['words'] = [to_words(raw) for raw in frame['text']]
    return X_train, X_test
# Add the 'words' token-list column to both frames (clean assigns it on the frames it receives).
X_train,X_test = clean(X_train,X_test)

In [4]:
# Check which special (non-alphanumeric ASCII) characters occur in the training text
p = re.compile('[a-z]|[A-Z]|[0-9]')
char = {}
# The loop variable is deliberately NOT called `text`: that name is bound to the
# `keras.preprocessing.text` module by the import cell and would be clobbered.
for sample_text in X_train.text:
    # Spaces are stripped so that ' ' is not reported as a special character.
    for c in sample_text.replace(' ', ''):
        char[c] = '_'
char_list = list(char.keys())
x = ' '.join(char_list)
y = p.findall(x)
# Everything that is not an ASCII letter or digit counts as "special".
sp_char_list = list(set(char_list) - set(y))
print(sp_char_list)

['ì', 'ô', 'ù', '?', 'ü', 'ç', ';', '—', 'ï', 'î', ')', '*', '’', '”', '‘', '(', '“', 'Æ', 'ê', 'Œ', 'æ', 'ö', 'ñ', '/', '[', '‐', '-', 'ë', 'à', 'â', '!', 'ä', '"', '}', "'", 'è', ':', '_', 'º', 'Ê', '£', ']', 'œ', '#', ',', '{', '&', 'é', '.']


In [5]:
# Punctuation ratio: for each text, the percentage of whitespace-separated
# tokens that contain at least one mark from the given group.
punctuations = [{"id":1, "p" : "[;:]"},
                {"id":2, "p" : "[,.]"},
                {"id":3, "p" : "[?]"},
                {"id":4, "p" : "[!]"},
                {"id":5, "p" : "[‘’\']"},
                {"id":6, "p" : "[“”\"]"},
                {"id":7, "p" : "[;:,.?!\'“”‘’\"]"}]


def _punctuation_share(token_lists, pattern):
    """Return, per token list, the percentage of tokens matching ``pattern``.

    An empty token list yields 0.0 instead of raising ZeroDivisionError.
    """
    matcher = re.compile(pattern)
    return [
        sum(1 for word in tokens if matcher.search(word)) * 100 / len(tokens) if tokens else 0.0
        for tokens in token_lists
    ]


# Tokenise once: the split does not depend on the punctuation group, so it is
# hoisted out of the loop instead of being recomputed for every group.
_train = [sentence.split() for sentence in X_train['text']]
_test = [sentence.split() for sentence in X_test['text']]

for p in punctuations:
    punctuation = p["p"]
    X_train['punc_' + str(p["id"])] = _punctuation_share(_train, punctuation)
    X_test['punc_' + str(p["id"])] = _punctuation_share(_test, punctuation)

In [6]:
# tfidf - words - nb (the learning unit is the word)
def tfidfWords(X_train, X_test):
    """Build word-level TF-IDF matrices (uni- and bi-grams, English stop words).

    The vectorizer is fitted on the train and test texts together. Returns
    ``(train_tfidf, test_tfidf, full_tfidf)`` where ``full_tfidf`` is the
    matrix produced by fitting on the concatenated texts.
    """
    train_docs = X_train['text'].values.tolist()
    test_docs = X_test['text'].values.tolist()
    vectorizer = TfidfVectorizer(stop_words = 'english', ngram_range = (1, 2), analyzer = 'word')
    full_tfidf = vectorizer.fit_transform(train_docs + test_docs)
    return vectorizer.transform(train_docs), vectorizer.transform(test_docs), full_tfidf
    
def runMNB(train_X, train_y, test_X, test_y, test_X2):
    """Fit a MultinomialNB on the training fold and score two feature sets.

    ``test_y`` is accepted but not used. Returns
    ``(proba for test_X, proba for test_X2, fitted model)``.
    """
    classifier = naive_bayes.MultinomialNB()
    classifier.fit(train_X, train_y)
    first_proba = classifier.predict_proba(test_X)
    second_proba = classifier.predict_proba(test_X2)
    return first_proba, second_proba, classifier

def do_tfidf_MNB(X_train, X_test, Y_train):
    """Run 5-fold CV of MultinomialNB on the word TF-IDF features.

    Returns ``(pred_train, pred_full_test)``: out-of-fold class probabilities
    for the training rows (5 columns) and the test probabilities averaged over
    the 5 folds. Per-fold debug values and the mean log-loss are printed.
    """
    train_matrix, test_matrix, _unused_full = tfidfWords(X_train, X_test)
    fold_losses = []
    test_sum = 0
    oof_pred = np.zeros((X_train.shape[0], 5))
    splitter = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
    for fit_rows, held_rows in splitter.split(X_train):
        held_y = Y_train[held_rows]
        held_pred, test_pred, _ = runMNB(
            train_matrix[fit_rows], Y_train[fit_rows],
            train_matrix[held_rows], held_y,
            test_matrix,
        )
        print('pred_val_y     : ', held_pred[0])
        print('pred_test_y    : ', test_pred[0])
        # Running sum of the test predictions; divided by the fold count below.
        test_sum = test_sum + test_pred
        print('pred_full_test : ', test_sum[0])
        oof_pred[held_rows, : ] = held_pred
        print('pred_train     : ', oof_pred[0])
        print('')
        fold_losses.append(metrics.log_loss(held_y, held_pred))
    print("Mean cv score : ", np.mean(fold_losses))
    return oof_pred, test_sum / 5

# Attach the word-TF-IDF Naive Bayes class probabilities as stacking features.
pred_train, pred_test = do_tfidf_MNB(X_train, X_test, Y_train)
for class_id in range(5):
    column_name = "tfidf_words_nb_{}".format(class_id)
    X_train[column_name] = pred_train[ : , class_id]
    X_test[column_name] = pred_test[ : , class_id]

pred_val_y     :  [0.07813081 0.0178482  0.07129607 0.81539852 0.0173264 ]
pred_test_y    :  [0.19020287 0.13951587 0.16309699 0.46061922 0.04656505]
pred_full_test :  [0.19020287 0.13951587 0.16309699 0.46061922 0.04656505]
pred_train     :  [0.07813081 0.0178482  0.07129607 0.81539852 0.0173264 ]

pred_val_y     :  [0.22769558 0.08564251 0.31930272 0.21442673 0.15293246]
pred_test_y    :  [0.18826039 0.15068576 0.15663496 0.45675144 0.04766744]
pred_full_test :  [0.37846326 0.29020163 0.31973195 0.91737066 0.0942325 ]
pred_train     :  [0.07813081 0.0178482  0.07129607 0.81539852 0.0173264 ]

pred_val_y     :  [0.23746063 0.05438223 0.38674226 0.23436174 0.08705313]
pred_test_y    :  [0.19612146 0.16123402 0.17670903 0.41781048 0.04812501]
pred_full_test :  [0.57458472 0.45143565 0.49644098 1.33518114 0.14235751]
pred_train     :  [0.07813081 0.0178482  0.07129607 0.81539852 0.0173264 ]

pred_val_y     :  [0.15653621 0.05095289 0.37101161 0.19084719 0.23065209]
pred_test_y    :  [0.2

In [7]:
# tfidf - chars - nb (the learning unit is the character)
def tfidfWords(X_train, X_test):
    """Build character-level TF-IDF matrices (1- to 3-character n-grams).

    The vectorizer is fitted on the train and test texts together. Returns
    ``(train_tfidf, test_tfidf)``.

    NOTE: this redefines the word-level ``tfidfWords`` from the earlier cell,
    which returns three values; re-running that cell's ``do_tfidf_MNB`` after
    this definition would fail on unpacking.
    """
    train_docs = X_train['text'].values.tolist()
    test_docs = X_test['text'].values.tolist()
    # `stop_words` is omitted on purpose: scikit-learn only applies it when
    # analyzer == 'word', so passing it here had no effect on the features.
    tfidf_vec = TfidfVectorizer(ngram_range = (1, 3), analyzer = 'char')
    # Only fitting is needed; the matrix for the concatenated texts was never used.
    tfidf_vec.fit(train_docs + test_docs)
    return tfidf_vec.transform(train_docs), tfidf_vec.transform(test_docs)
    
def runMNB(train_X, train_y, test_X, test_y, test_X2):
    """Fit a MultinomialNB on the training fold and score two feature sets.

    ``test_y`` is accepted but not used. Returns
    ``(proba for test_X, proba for test_X2, fitted model)``.
    """
    classifier = naive_bayes.MultinomialNB()
    classifier.fit(train_X, train_y)
    first_proba = classifier.predict_proba(test_X)
    second_proba = classifier.predict_proba(test_X2)
    return first_proba, second_proba, classifier

def do(X_train, X_test, Y_train):
    """Run 5-fold CV of MultinomialNB on the character TF-IDF features.

    Returns ``(pred_train, pred_full_test)``: out-of-fold class probabilities
    for the training rows (5 columns) and the test probabilities averaged over
    the 5 folds. The mean fold log-loss is printed.
    """
    train_matrix, test_matrix = tfidfWords(X_train, X_test)
    oof_pred = np.zeros((X_train.shape[0], 5))
    fold_losses = []
    fold_test_preds = []
    splitter = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
    for fit_rows, held_rows in splitter.split(X_train):
        held_y = Y_train[held_rows]
        held_pred, test_pred, _ = runMNB(
            train_matrix[fit_rows], Y_train[fit_rows],
            train_matrix[held_rows], held_y,
            test_matrix,
        )
        fold_test_preds.append(test_pred)
        oof_pred[held_rows, : ] = held_pred
        fold_losses.append(metrics.log_loss(held_y, held_pred))
    print("Mean cv score : ", np.mean(fold_losses))
    # sum() starts from 0 and adds the folds in order, as the running total did.
    return oof_pred, sum(fold_test_preds) / 5
# Attach the char-TF-IDF Naive Bayes class probabilities as stacking features.
pred_train, pred_test = do(X_train, X_test, Y_train)
for class_id in range(5):
    column_name = "tfidf_chars_nb_{}".format(class_id)
    X_train[column_name] = pred_train[:, class_id]
    X_test[column_name] = pred_test[:, class_id]

Mean cv score :  1.0896401622640728


In [8]:
# count - words - nb
def countWords(X_train, X_test):
    """Build word-level count matrices (uni- and bi-grams, English stop words).

    The vectorizer is fitted on the train and test texts together. Returns
    ``(train_count, test_count)``.
    """
    train_docs = X_train['text'].values.tolist()
    test_docs = X_test['text'].values.tolist()
    vectorizer = CountVectorizer(stop_words = 'english', ngram_range=(1, 2), analyzer = 'word')
    vectorizer.fit(train_docs + test_docs)
    return vectorizer.transform(train_docs), vectorizer.transform(test_docs)
    
def runMNB(train_X, train_y, test_X, test_y, test_X2):
    """Fit a MultinomialNB on the training fold and score two feature sets.

    ``test_y`` is accepted but not used. Returns
    ``(proba for test_X, proba for test_X2, fitted model)``.
    """
    classifier = naive_bayes.MultinomialNB()
    classifier.fit(train_X, train_y)
    first_proba = classifier.predict_proba(test_X)
    second_proba = classifier.predict_proba(test_X2)
    return first_proba, second_proba, classifier

def do_count_MNB(X_train, X_test, Y_train):
    """Run 5-fold CV of MultinomialNB on the word count features.

    Returns ``(pred_train, pred_full_test)``: out-of-fold class probabilities
    for the training rows (5 columns) and the test probabilities averaged over
    the 5 folds. The mean fold log-loss is printed.
    """
    train_matrix, test_matrix = countWords(X_train, X_test)
    oof_pred = np.zeros((X_train.shape[0], 5))
    fold_losses = []
    fold_test_preds = []
    splitter = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
    for fit_rows, held_rows in splitter.split(X_train):
        held_y = Y_train[held_rows]
        held_pred, test_pred, _ = runMNB(
            train_matrix[fit_rows], Y_train[fit_rows],
            train_matrix[held_rows], held_y,
            test_matrix,
        )
        fold_test_preds.append(test_pred)
        oof_pred[held_rows, : ] = held_pred
        fold_losses.append(metrics.log_loss(held_y, held_pred))
    print("Mean cv score : ", np.mean(fold_losses))
    # sum() starts from 0 and adds the folds in order, as the running total did.
    return oof_pred, sum(fold_test_preds) / 5

# Attach the word-count Naive Bayes class probabilities as stacking features.
pred_train, pred_test = do_count_MNB(X_train, X_test, Y_train)
for class_id in range(5):
    column_name = "count_words_nb_{}".format(class_id)
    X_train[column_name] = pred_train[ : , class_id]
    X_test[column_name] = pred_test[ : , class_id]

Mean cv score :  1.1484245877084394


In [9]:
# Mean cv score :  1.1792103306062116

In [10]:
# count - chars - nb
def countChars(X_train,X_test):
    """Build character-level count matrices (1- to 3-character n-grams).

    The vectorizer is fitted on the train and test texts together. Returns
    ``(train_count, test_count)``.
    """
    train_docs = X_train['text'].values.tolist()
    test_docs = X_test['text'].values.tolist()
    vectorizer = CountVectorizer(ngram_range = (1, 3), analyzer = 'char')
    vectorizer.fit(train_docs + test_docs)
    return vectorizer.transform(train_docs), vectorizer.transform(test_docs)
    
def runMNB(train_X, train_y, test_X, test_y, test_X2):
    """Fit a MultinomialNB on the training fold and score two feature sets.

    ``test_y`` is accepted but not used. Returns
    ``(proba for test_X, proba for test_X2, fitted model)``.
    """
    classifier = naive_bayes.MultinomialNB()
    classifier.fit(train_X, train_y)
    first_proba = classifier.predict_proba(test_X)
    second_proba = classifier.predict_proba(test_X2)
    return first_proba, second_proba, classifier

def do_count_chars_MNB(X_train, X_test, Y_train):
    """Run 5-fold CV of MultinomialNB on the character count features.

    Returns ``(pred_train, pred_full_test)``: out-of-fold class probabilities
    for the training rows (5 columns) and the test probabilities averaged over
    the 5 folds. The mean fold log-loss is printed.
    """
    train_matrix, test_matrix = countChars(X_train, X_test)
    oof_pred = np.zeros((X_train.shape[0], 5))
    fold_losses = []
    fold_test_preds = []
    splitter = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
    for fit_rows, held_rows in splitter.split(X_train):
        held_y = Y_train[held_rows]
        held_pred, test_pred, _ = runMNB(
            train_matrix[fit_rows], Y_train[fit_rows],
            train_matrix[held_rows], held_y,
            test_matrix,
        )
        fold_test_preds.append(test_pred)
        oof_pred[held_rows, : ] = held_pred
        fold_losses.append(metrics.log_loss(held_y, held_pred))
    print("Mean cv score : ", np.mean(fold_losses))
    # sum() starts from 0 and adds the folds in order, as the running total did.
    return oof_pred, sum(fold_test_preds) / 5

# Attach the char-count Naive Bayes class probabilities as stacking features.
pred_train, pred_test = do_count_chars_MNB(X_train, X_test, Y_train)
for class_id in range(5):
    column_name = "count_chars_nb_{}".format(class_id)
    X_train[column_name] = pred_train[ : , class_id]
    X_test[column_name] = pred_test[ : , class_id]

Mean cv score :  3.264365333331928


In [11]:
# Mean cv score :  5.806984089402464   --> (1, 7)
# Mean cv score :  4.383101848616254   --> (1, 5)
# Mean cv score :  3.264365333331928   --> (1, 3)

In [12]:
# Shared Keras callback passed to every model.fit call below; it monitors
# 'val_loss' with patience=0 (presumably stopping at the first epoch without
# improvement, per Keras EarlyStopping semantics).
earlyStopping=EarlyStopping(monitor='val_loss', patience=0, verbose=0, mode='auto')

# Fast Text
def doAddFastText(X_train,X_test,pred_train,pred_test):
    """Store the five fastText class-probability columns as ``ff_0`` .. ``ff_4``.

    Column ``k`` of ``pred_train`` / ``pred_test`` becomes ``ff_k`` on the
    matching frame. Both frames are modified in place and returned as
    ``(X_train, X_test)``.
    """
    for frame, proba in ((X_train, pred_train), (X_test, pred_test)):
        for class_id in range(5):
            frame["ff_{}".format(class_id)] = proba[:, class_id]
    return X_train, X_test

def initFastText(embedding_dims,input_dim):
    """Build and compile the fastText-style classifier.

    Architecture: Embedding -> GlobalAveragePooling1D -> Dense(5, softmax),
    compiled with categorical cross-entropy, the 'adam' optimizer and an
    accuracy metric. Returns the compiled model.
    """
    stack = (
        Embedding(input_dim=input_dim, output_dim=embedding_dims),
        GlobalAveragePooling1D(),
        Dense(5, activation='softmax'),
    )
    network = Sequential()
    for layer in stack:
        network.add(layer)

    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

def preprocessFastText(text):
    """Surround punctuation marks with spaces so they split off as tokens.

    First ``"' "`` is rewritten to ``" ' "``; then every occurrence of each
    mark in ``;:,.?!'“”‘’"`` is replaced by the mark padded with one space on
    each side. Text without any of these marks is returned unchanged.
    """
    text = text.replace("' ", " ' ")
    # Each mark is a distinct single character, so the replacement order
    # cannot influence the result.
    for sign in ';:,.?!\'“”‘’\"':
        if sign in text:
            text = text.replace(sign, ' {} '.format(sign) )
    return text

def create_docs(df, n_gram_max=2):
    """Turn every text in ``df.text`` into a token string with appended n-grams.

    Each text is run through ``preprocessFastText`` and split on whitespace;
    all n-grams for n = 2 .. ``n_gram_max`` (tokens joined with ``--``) are
    appended after the unigrams. Returns a list with one space-joined string
    per document.
    """
    def with_ngrams(tokens, max_n):
        joined = [
            '--'.join(tokens[start:start + size])
            for size in range(2, max_n + 1)
            for start in range(len(tokens) - size + 1)
        ]
        return tokens + joined

    return [
        ' '.join(with_ngrams(preprocessFastText(raw).split(), n_gram_max))
        for raw in df.text
    ]

def doFastText(X_train,X_test,Y_train):
    """Train the fastText-style model with 5-fold CV and attach its predictions.

    Documents are built with ``create_docs``, tokenized, converted to integer
    sequences and padded to the longest training sequence. A fresh
    ``initFastText`` model is trained per fold; its predictions on the
    validation fold fill the out-of-fold matrix and its test predictions are
    summed and divided by 5. The result is handed to ``doAddFastText``, whose
    return value ``(X_train, X_test)`` is returned.
    """
    min_count = 2

    # First pass: count tokens to find how many occur at least min_count times.
    docs = create_docs(X_train)
    tokenizer = Tokenizer(lower=False, filters='')
    tokenizer.fit_on_texts(docs)
    num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

    # Second pass: refit with num_words set to that count
    # (presumably limiting the vocabulary to the most frequent tokens — confirm against Keras Tokenizer docs).
    tokenizer = Tokenizer(num_words=num_words, lower=False, filters='')
    tokenizer.fit_on_texts(docs)
    docs = tokenizer.texts_to_sequences(docs)

    # Length of the longest training sequence; also used to pad the test set.
    maxlen = max([max(len(l) for l in docs)])

    docs = pad_sequences(sequences=docs, maxlen=maxlen)
    # Embedding input size: largest token id present in the padded training data, plus one.
    input_dim = np.max(docs) + 1
    embedding_dims = 20

    # we need to binarize the labels for the neural net
    ytrain_enc = np_utils.to_categorical(Y_train)

    # The test documents reuse the tokenizer fitted on the training documents.
    docs_test = create_docs(X_test)
    docs_test = tokenizer.texts_to_sequences(docs_test)
    docs_test = pad_sequences(sequences=docs_test, maxlen=maxlen)
    xtrain_pad = docs
    xtest_pad = docs_test
    
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=32143233)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([xtrain_pad.shape[0], 5])
    for dev_index, val_index in kf.split(xtrain_pad):
        dev_X, val_X = xtrain_pad[dev_index], xtrain_pad[val_index]
        dev_y, val_y = ytrain_enc[dev_index], ytrain_enc[val_index]
        # A new model per fold; the module-level earlyStopping callback is shared.
        model = initFastText(embedding_dims,input_dim)
        model.fit(dev_X, y=dev_y, batch_size=32, epochs=40, verbose=1,validation_data=(val_X, val_y),callbacks=[earlyStopping])
        pred_val_y = model.predict(val_X)
        pred_test_y = model.predict(docs_test)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        print('#############################################################################################################')
    # Divide the summed test predictions by the number of folds (5).
    return doAddFastText(X_train,X_test,pred_train,pred_full_test/5)

# Train the fastText-style model and add its ff_0..ff_4 probability columns to both frames.
X_train,X_test = doFastText(X_train,X_test,Y_train)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
####################################################
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
####################################################
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25


Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
####################################################
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
####################################################
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25


Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
####################################################


In [13]:
# NN
def doAddNN(X_train,X_test,pred_train,pred_test):
    """Store the five CNN class-probability columns as ``nn_0`` .. ``nn_4``.

    Column ``k`` of ``pred_train`` / ``pred_test`` becomes ``nn_k`` on the
    matching frame. Both frames are modified in place and returned as
    ``(X_train, X_test)``.
    """
    for frame, proba in ((X_train, pred_train), (X_test, pred_test)):
        for class_id in range(5):
            frame["nn_{}".format(class_id)] = proba[:, class_id]
    return X_train, X_test

def initNN(nb_words_cnt,max_len):
    """Build and compile the 1-D convolutional text classifier.

    Architecture: Embedding(32) -> Dropout -> Conv1D(64, 5) -> Dropout ->
    MaxPooling1D -> Flatten -> Dense(800) -> Dropout -> Dense(5, softmax),
    compiled with categorical cross-entropy, the 'adam' optimizer and an
    accuracy metric. Returns the compiled model.
    """
    stack = (
        Embedding(nb_words_cnt,32,input_length=max_len),
        Dropout(0.3),
        Conv1D(64, 5, padding='valid', activation='relu'),
        Dropout(0.3),
        MaxPooling1D(),
        Flatten(),
        Dense(800, activation='relu'),
        Dropout(0.5),
        Dense(5, activation='softmax'),
    )
    network = Sequential()
    for layer in stack:
        network.add(layer)

    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
    return network

def doNN(X_train,X_test,Y_train):
    """Train the convolutional model with 5-fold CV and attach its predictions.

    The 'text' columns of both frames are tokenized with one Tokenizer fitted
    on train and test texts together, converted to integer sequences and
    padded to ``max_len``. A fresh ``initNN`` model is trained per fold; its
    predictions on the validation fold fill the out-of-fold matrix and its
    test predictions are summed and divided by 5. The result is handed to
    ``doAddNN``, whose return value ``(X_train, X_test)`` is returned.
    """
    max_len = 70
    nb_words = 10000
    
    print('Processing text dataset')
    # Copy the training texts into a plain list.
    texts_1 = []
    for text in X_train['text']:
        texts_1.append(text)
    print('Found %s texts.' % len(texts_1))
    
    # Same for the test texts.
    test_texts_1 = []
    for text in X_test['text']:
        test_texts_1.append(text)
    print('Found %s texts.' % len(test_texts_1))
    
    # The tokenizer is fitted on train and test texts together.
    tokenizer = Tokenizer(num_words=nb_words)
    tokenizer.fit_on_texts(texts_1 + test_texts_1)
    sequences_1 = tokenizer.texts_to_sequences(texts_1)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)

    xtrain_pad = pad_sequences(sequences_1, maxlen=max_len)
    xtest_pad = pad_sequences(test_sequences_1, maxlen=max_len)
    # The unpadded sequences are no longer needed.
    del test_sequences_1
    del sequences_1
    # Embedding input size: the smaller of nb_words and the vocabulary size, plus one.
    nb_words_cnt = min(nb_words, len(word_index)) + 1

    # we need to binarize the labels for the neural net
    ytrain_enc = np_utils.to_categorical(Y_train)
    
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=32143233)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros([xtrain_pad.shape[0], 5])
    for dev_index, val_index in kf.split(xtrain_pad):
        dev_X, val_X = xtrain_pad[dev_index], xtrain_pad[val_index]
        dev_y, val_y = ytrain_enc[dev_index], ytrain_enc[val_index]
        # A new model per fold; the module-level earlyStopping callback is shared.
        model = initNN(nb_words_cnt,max_len)
        model.fit(dev_X, y=dev_y, batch_size=32, epochs=4, verbose=1,validation_data=(val_X, val_y),callbacks=[earlyStopping])
        pred_val_y = model.predict(val_X)
        pred_test_y = model.predict(xtest_pad)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index,:] = pred_val_y
        print('#############################################################################################################')
    # Divide the summed test predictions by the number of folds (5).
    return doAddNN(X_train,X_test,pred_train,pred_full_test/5)

# Train the convolutional model and add its nn_0..nn_4 probability columns to both frames.
X_train,X_test = doNN(X_train,X_test,Y_train)

Processing text dataset
Found 54879 texts.
Found 19617 texts.
Found 52997 unique tokens.
Epoch 1/4
Epoch 2/4
Epoch 3/4
####################################################
Epoch 1/4
Epoch 2/4
Epoch 3/4
####################################################
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
####################################################
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
####################################################
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
####################################################


In [14]:
# Final Model
# XGBoost
def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, seed_val=0, child=1, colsample=0.3):
    """Train a 5-class XGBoost model and predict class probabilities.

    Parameters
    ----------
    train_X, train_y : training features and integer labels.
    test_X : features to predict; used as the early-stopping evaluation set
        when ``test_y`` is given.
    test_y : optional labels for ``test_X``. When provided, training uses a
        watchlist with early stopping after 50 rounds without improvement.
    test_X2 : optional second feature set to predict.
    seed_val, child, colsample : values for the ``seed``,
        ``min_child_weight`` and ``colsample_bytree`` parameters.

    Returns
    -------
    (pred_test_y, pred_test_y2, model), where ``pred_test_y2`` is ``None``
    when ``test_X2`` is not supplied.
    """
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 5
    # NOTE(review): the notebook's captured output reports that `silent`
    # "might not be used" by the installed XGBoost; kept to preserve behaviour.
    param['silent'] = 1
    param['num_class'] = 5
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = child
    param['subsample'] = 0.8
    param['colsample_bytree'] = colsample
    param['seed'] = seed_val
    num_rounds = 2000

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest, ntree_limit = model.best_ntree_limit)
    # Bind the second prediction up front: it used to be undefined when
    # test_X2 was None, so the return statement raised UnboundLocalError.
    pred_test_y2 = None
    if test_X2 is not None:
        xgtest2 = xgb.DMatrix(test_X2)
        pred_test_y2 = model.predict(xgtest2, ntree_limit = model.best_ntree_limit)
    return pred_test_y, pred_test_y2, model

def do(X_train,X_test,Y_train):
    """Run 5-fold CV of the final XGBoost model on the stacked features.

    The 'text' and 'words' columns (plus 'author' on the training frame) are
    dropped; every remaining column is used as a feature. Each fold trains
    with early stopping on its validation part and predicts the test set.
    The per-fold log-loss values are printed.

    Returns the test class probabilities averaged over the folds.
    """
    n_splits = 5
    drop_columns = ["text", "words"]
    x_train = X_train.drop(drop_columns + ['author'], axis=1)
    x_test = X_test.drop(drop_columns, axis=1)
    # KFold yields positional indices, so the labels are indexed as an array.
    y_train = np.asarray(Y_train)

    kf = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=32143233)
    cv_scores = []
    pred_full_test = 0
    for dev_index, val_index in kf.split(x_train):
        # .iloc (positional) instead of .loc (label based): .loc only worked
        # while the frame carried a default 0..n-1 index.
        dev_X, val_X = x_train.iloc[dev_index], x_train.iloc[val_index]
        dev_y, val_y = y_train[dev_index], y_train[val_index]
        pred_val_y, pred_test_y, model = runXGB(dev_X, dev_y, val_X, val_y, x_test, seed_val=0, colsample=0.7)
        pred_full_test = pred_full_test + pred_test_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print("cv scores : ", cv_scores)
    # Average over the same fold count that configured the splitter.
    return pred_full_test / n_splits
# Fold-averaged XGBoost class probabilities for the test set.
result = do(X_train,X_test,Y_train)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-mlogloss:1.43253	test-mlogloss:1.43250
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
[20]	train-mlogloss:0.49874	test-mlogloss:0.50836
[40]	train-mlogloss:0.38348	test-mlogloss:0.40617
[60]	train-mlogloss:0.35097	test-mlogloss:0.38739
[80]	train-mlogloss:0.33159	test-mlogloss:0.38140
[100]	train-mlogloss:0.31583	test-mlogloss:0.37877
[120]	train-mlogloss:0.30048	test-mlogloss:0.37678
[140]	train-mlogloss:0.28624	test-mlogloss:0.37583
[160]	train-mlogloss:0.27303	test-mlogloss:0.37525
[180]	train-mlogloss:0.26085	test-mlogloss:0.37489
[200]	train-mlogloss:0.24930	test-mlogloss:0.37444
[220]	train-

##### cv scores :  [0.37971865148208395, 0.3913440671935568, 0.39145263387349694, 0.38940780721717677, 0.38413344802020066]
##### cv scores :  [0.37639503734384844, 0.38710317050095433, 0.3945025033210634, 0.3729838736870731, 0.3990754037926082]
##### cv scores :  [0.374254669870907, 0.39632741185780007, 0.39629713994992327, 0.37429750360830943, 0.40502376285223635]

In [15]:
# Load the sample submission, overwrite its five class-probability columns
# ('0'..'4') with the averaged test predictions and write it to CSV.
sample_submission=pd.read_csv('open/sample_submission.csv', encoding='utf-8')
sample_submission[['0', '1', '2', '3', '4']] = result
sample_submission.to_csv("kg_4_1123.csv", index=False)
# Display the submission frame as the cell output.
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.001733,0.789531,0.197901,0.007342,0.003493
1,1,0.002087,0.995005,0.000231,0.000491,0.002186
2,2,0.999280,0.000339,0.000053,0.000043,0.000284
3,3,0.000296,0.014466,0.983473,0.000505,0.001260
4,4,0.983480,0.005917,0.002239,0.005519,0.002845
...,...,...,...,...,...,...
19612,19612,0.000301,0.999509,0.000070,0.000091,0.000029
19613,19613,0.002426,0.000237,0.000513,0.000272,0.996552
19614,19614,0.000349,0.999222,0.000136,0.000189,0.000104
19615,19615,0.000135,0.999317,0.000245,0.000252,0.000051
