# Kernel 2위
- https://www.kaggle.com/abhishek/approaching-almost-any-nlp-problem-on-kaggle

해당 커널에서는 기본적인 모델로부터 다른 특성을 사용하면서 개선시키는 작업을 수행했습니다.

# 해당 Kernel에서 다루고자 하는 것

- 데이터 표현
    - tfidf - [TF/IDF](https://ko.wikipedia.org/wiki/Tf-idf) 
    - count features - 단어 개수를 Feature로 사용
    

- 모델
    - logistic regression
    - naive bayes
    - svm
    - xgboost
    - grid search
    - LSTM
    - GRU
    - Ensembling

In [1]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

Using TensorFlow backend.


In [2]:
# Directory that holds the competition CSV files (train/test/sample_submission).
# The path is machine-specific; point it at your local copy of the data.
base_path = '/home/lifesailor/.kaggle/spooky'

In [3]:
# Read the training set, the test set and the sample submission from base_path.
train, test, sample = (
    pd.read_csv(os.path.join(base_path, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [4]:
# Preview the first rows of the training data (columns: id, text, author).
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [5]:
# Preview the first rows of the test data (columns: id, text; no author label).
test.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [6]:
# Preview the submission format: one probability column per author (EAP, HPL, MWS).
sample.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.403494,0.287808,0.308698
1,id24541,0.403494,0.287808,0.308698
2,id00134,0.403494,0.287808,0.308698
3,id27757,0.403494,0.287808,0.308698
4,id04081,0.403494,0.287808,0.308698


## Loss function

In [7]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Compute the multi-class logarithmic loss.

    Only the predicted probability of each sample's true class contributes
    to the loss, because the one-hot mask zeroes out every other class.

    :param actual: true classes, either a 1-D sequence of integer class
        indices or a 2-D one-hot array with one row per sample
    :param predicted: 2-D array-like of predicted class probabilities,
        one row per sample and one column per class
    :param eps: probabilities are clipped to [eps, 1 - eps] so that log(0)
        can never occur
    :return: mean negative log-likelihood of the true classes
    """
    # Accept plain lists as well as arrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted, dtype=float)

    # A 1-D vector of class indices is expanded to a one-hot matrix.
    if actual.ndim == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)

    # Number of samples.
    rows = actual.shape[0]

    # Sum of log-probabilities assigned to the true classes.
    log_likelihood = np.sum(actual * np.log(clipped))
    return -1.0 / rows * log_likelihood

- multiclass logloss는 실제 값에 해당하는 클래스에 대한 log 값만 계산된다.

<img src="./evaluation.png">

## 1. Label Encoder

- 3명의 저자를 0,1,2로 encoding한다.

In [8]:
# Encode the author names as integer class ids.
authors = train['author'].values
lbl_enc = preprocessing.LabelEncoder()
lbl_enc.fit(authors)
y = lbl_enc.transform(authors)

- 데이터를 train, validation으로 나눈다.

In [9]:
# Hold out 10% of the labelled sentences for validation; the seed is fixed
# so the split is the same on every run.
split = train_test_split(train.text.values, y,
                         test_size=0.1, shuffle=True, random_state=42)
xtrain, xvalid, ytrain, yvalid = split
print(xtrain.shape, xvalid.shape)

(17621,) (1958,)


## 2. Baseline

- tf-idf를 사용해서 logistic regression으로 baseline을 만든다.

In [10]:
# TF-IDF settings:
#   min_df=3             - drop terms that appear in fewer than 3 documents
#   max_features=None    - no cap on the vocabulary size
#   strip_accents        - normalise accented characters ('unicode' method)
#   analyzer / token_pattern - word tokens of one or more word characters
#   ngram_range=(1, 3)   - unigrams, bigrams and trigrams
#   use_idf, smooth_idf, sublinear_tf - boolean switches; real booleans are
#     passed instead of the integer 1 so scikit-learn's parameter type
#     validation accepts them
#   stop_words='english' - remove English stop words
tfv = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
                      stop_words='english')

In [11]:
# The vocabulary is learned on train + validation text together so that every
# term from either split gets a column; each split is then vectorised on its own.
all_text = list(xtrain) + list(xvalid)
tfv.fit(all_text)
xtrain_tfv, xvalid_tfv = tfv.transform(xtrain), tfv.transform(xvalid)

In [12]:
# Baseline: logistic regression on the TF-IDF features.
clf = LogisticRegression(C=1.0).fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)
score = multiclass_logloss(yvalid, predictions)
print ("logloss: %0.3f " % score)

logloss: 0.631 




## 3. Count vector

- count vectorizer: 단어 개수 기반 모델

In [13]:
# Raw term counts, configured with hyperparameters similar to the TF-IDF vectoriser.
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)
# Vocabulary is built from train + validation text, then each split is transformed.
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv, xvalid_ctv = ctv.transform(xtrain), ctv.transform(xvalid)

In [14]:
# Logistic regression on the count features.
clf = LogisticRegression(C=1.0)
predictions = clf.fit(xtrain_ctv, ytrain).predict_proba(xvalid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.538 


Count가 TF-IDF 보다 성능이 좋다.

## 4. Naive Bayes

- Naive Bayes 모델

In [15]:
# Multinomial Naive Bayes on the TF-IDF features.
clf = MultinomialNB().fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

nb_tfv_loss = multiclass_logloss(yvalid, predictions)
print ("logloss: %0.3f " % nb_tfv_loss)

logloss: 0.581 


In [16]:
# Multinomial Naive Bayes on the count features.
clf = MultinomialNB()
predictions = clf.fit(xtrain_ctv, ytrain).predict_proba(xvalid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.502 


## 5.  SVM

- svm을 넣기 전에 standardization 해야 한다.

In [17]:
# Dimensionality reduction: project the sparse TF-IDF matrix onto 120 SVD
# components (the original note suggests 120-200 components).
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(xtrain_tfv)
xtrain_svd, xvalid_svd = (svd.transform(m) for m in (xtrain_tfv, xvalid_tfv))

# Standardisation: the scaler is fitted on the training projection only and
# then applied to both splits.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl, xvalid_svd_scl = (scl.transform(m) for m in (xtrain_svd, xvalid_svd))

In [18]:
# SVM on the scaled SVD features; probability=True is required for predict_proba.
clf = SVC(C=1.0, probability=True).fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

svc_loss = multiclass_logloss(yvalid, predictions)
print ("logloss: %0.3f " % svc_loss)

logloss: 0.702 


## 6. Xgboost

- Xgboost: GradientBoostingTree

In [19]:
# XGBoost on the TF-IDF features.
#   max_depth        - maximum depth of a tree
#   n_estimators     - number of trees (200)
#   colsample_bytree - fraction of the features used for each tree
#   subsample        - fraction of the training rows used for each tree
xgb_params = dict(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                  subsample=0.8, nthread=10, learning_rate=0.1)
clf = xgb.XGBClassifier(**xgb_params)
# The sparse matrices are converted to CSC format before being handed to XGBoost.
clf.fit(xtrain_tfv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_tfv.tocsc())

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.779 


In [20]:
# Same XGBoost configuration, this time on the count features (CSC format).
clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1,
).fit(xtrain_ctv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_ctv.tocsc())

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.768 


- XGBoost의 성능이 좋지 않은데, 이는 hyperparameter 튜닝을 충분히 하지 않았기 때문이다.

## 7. Grid Search

- Grid Search: 주어진 parameter 조합을 모두 시도해서 최선의 parameter를 찾는 방법이다.

In [21]:
# Scoring function for the grid search: wraps multiclass_logloss so that it
# receives predicted probabilities (needs_proba=True). Log loss is a
# lower-is-better metric, hence greater_is_better=False, which makes the
# reported scores negative.
mll_scorer = metrics.make_scorer(multiclass_logloss, 
                                 greater_is_better=False, 
                                 needs_proba=True)

In [22]:
# Pipeline: truncated SVD -> standardisation -> logistic regression.
svd = TruncatedSVD()
scl = preprocessing.StandardScaler()
# The parameter grid searched over this pipeline includes penalty='l1'.
# The solver is pinned to 'liblinear', which supports both 'l1' and 'l2',
# so the search does not depend on the library's default solver.
lr_model = LogisticRegression(solver='liblinear')
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('lr', lr_model)])

In [23]:
# Search space; keys follow the "<pipeline step>__<parameter>" convention.
param_grid = dict(
    svd__n_components=[120, 180],
    lr__C=[0.1, 1.0, 10],
    lr__penalty=['l1', 'l2'],
)

In [24]:
# Exhaustive search over param_grid with 2-fold cross-validation, scored by
# the (negated) multi-class log loss; refit=True retrains the best
# configuration on all of the data passed to fit().
# NOTE(review): the `iid` argument is not accepted by newer scikit-learn
# releases - confirm against the installed version before re-running.
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)

# Only the training split is used for the search.
model.fit(xtrain_tfv, ytrain)  

Fitting 2 folds for each of 12 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done  12 out of  24 | elapsed:    8.8s remaining:    8.8s
[Parallel(n_jobs=-1)]: Done  15 out of  24 | elapsed:   10.2s remaining:    6.1s
[Parallel(n_jobs=-1)]: Done  18 out of  24 | elapsed:   11.6s remaining:    3.9s
[Parallel(n_jobs=-1)]: Done  21 out of  24 | elapsed:   12.2s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   13.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   13.3s finished


Best score: -0.738
Best parameters set:
	lr__C: 0.1
	lr__penalty: 'l2'
	svd__n_components: 180


In [25]:
# Report the best cross-validated score and the winning value of every searched parameter.
best_parameters = model.best_estimator_.get_params()
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
for param_name in sorted(param_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: -0.738
Best parameters set:
	lr__C: 0.1
	lr__penalty: 'l2'
	svd__n_components: 180


In [26]:
# Single-step pipeline wrapping Multinomial Naive Bayes.
nb_model = MultinomialNB()
clf = pipeline.Pipeline(steps=[('nb', nb_model)])

# Candidate values for the smoothing parameter alpha.
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# 2-fold grid search scored with the multi-class log loss scorer.
model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=mll_scorer,
    verbose=10,
    n_jobs=-1,
    iid=True,
    refit=True,
    cv=2,
)

In [27]:
# Run the grid search. The full data set could be used here, but only the
# training split is.
model.fit(xtrain_tfv, ytrain)

best_parameters = model.best_estimator_.get_params()
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
for param_name in sorted(param_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
Best score: -0.491
Best parameters set:
	nb__alpha: 0.1


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0197s.) Setting batch_size=20.
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.1s finished


### 여태까지 가장 좋은 성능이다.

## 8. Word Vectors

- 미리 단어별로 Embedding 된 정보를 불러온다. 여기에서는 Glove Embedding을 사용한다. http://www-nlp.stanford.edu/data/glove.840B.300d.zip 에서 다운 받을 수 있다.

In [64]:
# Load the pre-trained GloVe vectors into a {token: float32 vector} dictionary.
embeddings_index = {}
# The context manager closes the file even if parsing fails part-way, and the
# explicit encoding avoids depending on the platform default.
with open('./glove.840B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            # Blank line: nothing to parse.
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # The remaining fields are not all numeric (e.g. the token itself
            # contains whitespace); such lines are skipped on purpose.
            continue
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))


0it [00:00, ?it/s][A
2256it [00:00, 22559.70it/s][A
4955it [00:00, 23726.25it/s][A
7628it [00:00, 24553.04it/s][A
10360it [00:00, 25320.24it/s][A
13095it [00:00, 25895.22it/s][A
15825it [00:00, 26299.52it/s][A
18555it [00:00, 26590.17it/s][A
21272it [00:00, 26760.53it/s][A
23972it [00:00, 26830.77it/s][A
26686it [00:01, 26922.15it/s][A
29383it [00:01, 26936.38it/s][A
32094it [00:01, 26986.47it/s][A
34776it [00:01, 26934.77it/s][A
37481it [00:01, 26967.13it/s][A
40164it [00:01, 25257.16it/s][A
42702it [00:01, 23391.29it/s][A
45322it [00:01, 24167.16it/s][A
48041it [00:01, 24998.46it/s][A
50753it [00:01, 25598.01it/s][A
53466it [00:02, 26036.76it/s][A
56170it [00:02, 26328.46it/s][A
58863it [00:02, 26503.16it/s][A
61532it [00:02, 26556.37it/s][A
64239it [00:02, 26707.49it/s][A
66949it [00:02, 26822.70it/s][A
69659it [00:02, 26904.12it/s][A
72369it [00:02, 26962.05it/s][A
75078it [00:02, 26998.22it/s][A
77785it [00:02, 27018.89it/s][A
80488it [00:03, 27015.7

649580it [00:24, 26711.71it/s][A
652252it [00:24, 25893.86it/s][A
654847it [00:24, 23710.11it/s][A
657258it [00:25, 23564.88it/s][A
659960it [00:25, 24503.62it/s][A
662654it [00:25, 25185.67it/s][A
665349it [00:25, 25688.43it/s][A
668040it [00:25, 26042.29it/s][A
670752it [00:25, 26355.96it/s][A
673445it [00:25, 26525.39it/s][A
676147it [00:25, 26670.68it/s][A
678844it [00:25, 26757.13it/s][A
681550it [00:25, 26844.79it/s][A
684248it [00:26, 26882.92it/s][A
686950it [00:26, 26921.29it/s][A
689652it [00:26, 26950.53it/s][A
692349it [00:26, 26925.52it/s][A
695043it [00:26, 26904.17it/s][A
697734it [00:26, 26805.74it/s][A
700416it [00:26, 25574.56it/s][A
703054it [00:26, 25808.59it/s][A
705726it [00:26, 26073.38it/s][A
708445it [00:26, 26396.96it/s][A
711174it [00:27, 26658.18it/s][A
713897it [00:27, 26827.02it/s][A
716584it [00:27, 26794.23it/s][A
719266it [00:27, 26378.77it/s][A
721908it [00:27, 24114.96it/s][A
724360it [00:27, 23152.47it/s][A
727020it [00:2

1285289it [00:48, 26247.90it/s][A
1287972it [00:48, 26419.27it/s][A
1290651it [00:49, 26528.72it/s][A
1293310it [00:49, 26413.09it/s][A
1295956it [00:49, 24584.95it/s][A
1298650it [00:49, 25246.89it/s][A
1301352it [00:49, 25753.11it/s][A
1304059it [00:49, 26134.10it/s][A
1306767it [00:49, 26409.04it/s][A
1309468it [00:49, 26583.72it/s][A
1312135it [00:49, 26588.64it/s][A
1314833it [00:49, 26702.40it/s][A
1317539it [00:50, 26807.84it/s][A
1320243it [00:50, 26874.34it/s][A
1322953it [00:50, 26940.08it/s][A
1325657it [00:50, 26967.58it/s][A
1328358it [00:50, 26977.72it/s][A
1331059it [00:50, 26986.80it/s][A
1333759it [00:50, 25725.13it/s][A
1336345it [00:50, 23573.76it/s][A
1338747it [00:50, 23061.36it/s][A
1341369it [00:51, 23924.72it/s][A
1344046it [00:51, 24712.38it/s][A
1346729it [00:51, 25310.00it/s][A
1349393it [00:51, 25694.69it/s][A
1352078it [00:51, 26028.24it/s][A
1354757it [00:51, 26251.09it/s][A
1357443it [00:51, 26429.14it/s][A
1360122it [00:51, 26

1911047it [01:12, 26815.75it/s][A
1913730it [01:12, 26774.22it/s][A
1916426it [01:12, 26828.45it/s][A
1919117it [01:13, 26850.02it/s][A
1921823it [01:13, 26910.00it/s][A
1924518it [01:13, 26921.01it/s][A
1927216it [01:13, 26937.23it/s][A
1929910it [01:13, 26931.32it/s][A
1932604it [01:13, 26877.06it/s][A
1935292it [01:13, 26733.05it/s][A
1937980it [01:13, 26774.70it/s][A
1940675it [01:13, 26824.58it/s][A
1943358it [01:13, 26256.23it/s][A
1945987it [01:14, 23941.05it/s][A
1948423it [01:14, 23059.23it/s][A
1951105it [01:14, 24069.75it/s][A
1953759it [01:14, 24758.91it/s][A
1956416it [01:14, 25273.85it/s][A
1959072it [01:14, 25644.29it/s][A
1961711it [01:14, 25861.39it/s][A
1964372it [01:14, 26080.42it/s][A
1967025it [01:14, 26210.71it/s][A
1969653it [01:14, 25937.83it/s][A
1972328it [01:15, 26173.39it/s][A
1975033it [01:15, 26429.39it/s][A
1977735it [01:15, 26603.53it/s][A
1980399it [01:15, 26592.43it/s][A
1983061it [01:15, 26391.36it/s][A
1985751it [01:15, 26

Found 2195884 word vectors.


In [65]:
# Every token is embedded as a 300-dimensional vector.
print("embedding 차원: ", len(embeddings_index[',']))

embedding 차원:  300


In [66]:
# Turn a whole sentence into a single 300-dimensional vector.
def sent2vec(s, dim=300):
    """Embed a sentence as the L2-normalised sum of its word vectors.

    The text is lower-cased and tokenised with ``word_tokenize``; stop words
    and non-alphabetic tokens are dropped, and tokens missing from
    ``embeddings_index`` are ignored.

    :param s: the sentence (converted with ``str``)
    :param dim: length of the zero vector returned when no token can be
        embedded; defaults to 300
    :return: 1-D float array, or an all-zero vector of length ``dim`` when no
        token has an embedding or the summed vector has zero norm
    """
    tokens = word_tokenize(str(s).lower())
    tokens = [w for w in tokens if w not in stop_words and w.isalpha()]

    # Keep only the tokens that have a pre-trained vector.
    vectors = [embeddings_index[w] for w in tokens if w in embeddings_index]
    if not vectors:
        return np.zeros(dim)

    # Built-in float (float64) replaces the np.float alias, which newer NumPy
    # releases no longer provide.
    v = np.sum(np.array(vectors, dtype=float), axis=0)

    # Normalise to unit length, guarding against division by zero.
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        return np.zeros(dim)
    return v / norm

In [67]:
# Convert every training and validation sentence into its sentence vector.
xtrain_glove = list(map(sent2vec, tqdm(xtrain)))
xvalid_glove = list(map(sent2vec, tqdm(xvalid)))


  0%|          | 0/17621 [00:00<?, ?it/s][A
  3%|▎         | 510/17621 [00:00<00:03, 5088.62it/s][A
  6%|▌         | 1054/17621 [00:00<00:03, 5187.94it/s][A
  9%|▉         | 1628/17621 [00:00<00:02, 5339.63it/s][A
 13%|█▎        | 2203/17621 [00:00<00:02, 5456.00it/s][A
 16%|█▌        | 2765/17621 [00:00<00:02, 5503.23it/s][A
 19%|█▉        | 3331/17621 [00:00<00:02, 5548.65it/s][A
 22%|██▏       | 3894/17621 [00:00<00:02, 5572.13it/s][A
 25%|██▌       | 4452/17621 [00:00<00:02, 5572.81it/s][A
 29%|██▊       | 5024/17621 [00:00<00:02, 5614.40it/s][A
 32%|███▏      | 5590/17621 [00:01<00:02, 5626.46it/s][A
 35%|███▍      | 6160/17621 [00:01<00:02, 5647.88it/s][A
 38%|███▊      | 6715/17621 [00:01<00:01, 5605.98it/s][A
 41%|████▏     | 7269/17621 [00:01<00:01, 5582.00it/s][A
 44%|████▍     | 7828/17621 [00:01<00:01, 5583.51it/s][A
 48%|████▊     | 8384/17621 [00:01<00:01, 5459.99it/s][A
 51%|█████     | 8929/17621 [00:01<00:01, 5043.56it/s][A
 54%|█████▍    | 9475/17621

In [69]:
# Convert the lists of per-sentence vectors into NumPy arrays.
xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)

### 1. Xgboost

In [70]:
# XGBoost with default hyperparameters on the GloVe sentence vectors.
clf = xgb.XGBClassifier(nthread=10, silent=False).fit(xtrain_glove, ytrain)
predictions = clf.predict_proba(xvalid_glove)

score = multiclass_logloss(yvalid, predictions)
print ("logloss: %0.3f " % score)

[19:46:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_

[19:46:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:14] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_

[19:46:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_

[19:46:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[19:46:22] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_

- 성능이 별로 좋지 않다.

### 2. Xgboost - parameter tuning

In [71]:
# XGBoost on the GloVe sentence vectors with hand-picked hyperparameters
# (deeper trees, 200 estimators, row and column subsampling).
xgb_params = dict(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                  subsample=0.8, nthread=10, learning_rate=0.1, silent=False)
clf = xgb.XGBClassifier(**xgb_params)
clf.fit(xtrain_glove, ytrain)
predictions = clf.predict_proba(xvalid_glove)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

[19:46:59] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 246 extra nodes, 0 pruned nodes, max_depth=7
[19:47:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 250 extra nodes, 0 pruned nodes, max_depth=7
[19:47:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 254 extra nodes, 0 pruned nodes, max_depth=7
[19:47:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 244 extra nodes, 0 pruned nodes, max_depth=7
[19:47:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 252 extra nodes, 0 pruned nodes, max_depth=7
[19:47:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 242 extra nodes, 0 pruned nodes, max_depth=7
[19:47:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 246 extra nodes, 0 pruned nodes, max_depth=7
[19:47:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 242 extra nodes, 0 pruned nodes, max_depth=7
[19:47:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 246 extra nodes, 0 pruned no

[19:47:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 248 extra nodes, 0 pruned nodes, max_depth=7
[19:47:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 238 extra nodes, 0 pruned nodes, max_depth=7
[19:47:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 246 extra nodes, 0 pruned nodes, max_depth=7
[19:47:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 230 extra nodes, 0 pruned nodes, max_depth=7
[19:47:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 236 extra nodes, 0 pruned nodes, max_depth=7
[19:47:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 232 extra nodes, 0 pruned nodes, max_depth=7
[19:47:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 234 extra nodes, 0 pruned nodes, max_depth=7
[19:47:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 206 extra nodes, 0 pruned nodes, max_depth=7
[19:47:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 226 extra nodes, 0 pruned no

[19:47:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 184 extra nodes, 0 pruned nodes, max_depth=7
[19:47:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 244 extra nodes, 0 pruned nodes, max_depth=7
[19:47:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 216 extra nodes, 0 pruned nodes, max_depth=7
[19:47:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 216 extra nodes, 0 pruned nodes, max_depth=7
[19:47:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 228 extra nodes, 0 pruned nodes, max_depth=7
[19:47:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 220 extra nodes, 0 pruned nodes, max_depth=7
[19:47:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 236 extra nodes, 0 pruned nodes, max_depth=7
[19:47:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 234 extra nodes, 0 pruned nodes, max_depth=7
[19:47:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 240 extra nodes, 0 pruned no

[19:47:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 216 extra nodes, 0 pruned nodes, max_depth=7
[19:47:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 142 extra nodes, 0 pruned nodes, max_depth=7
[19:47:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 208 extra nodes, 0 pruned nodes, max_depth=7
[19:47:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 224 extra nodes, 0 pruned nodes, max_depth=7
[19:47:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 166 extra nodes, 0 pruned nodes, max_depth=7
[19:47:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 192 extra nodes, 0 pruned nodes, max_depth=7
[19:47:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 214 extra nodes, 0 pruned nodes, max_depth=7
[19:47:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 200 extra nodes, 0 pruned nodes, max_depth=7
[19:47:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 184 extra nodes, 0 pruned no

[19:47:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 196 extra nodes, 0 pruned nodes, max_depth=7
[19:47:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 98 extra nodes, 0 pruned nodes, max_depth=7
[19:47:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 208 extra nodes, 0 pruned nodes, max_depth=7
[19:47:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 210 extra nodes, 0 pruned nodes, max_depth=7
[19:47:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 182 extra nodes, 0 pruned nodes, max_depth=7
[19:47:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 220 extra nodes, 0 pruned nodes, max_depth=7
[19:47:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 210 extra nodes, 0 pruned nodes, max_depth=7
[19:47:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 230 extra nodes, 0 pruned nodes, max_depth=7
[19:47:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 228 extra nodes, 0 pruned nod

[19:47:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 216 extra nodes, 0 pruned nodes, max_depth=7
[19:47:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 182 extra nodes, 0 pruned nodes, max_depth=7
[19:47:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 198 extra nodes, 0 pruned nodes, max_depth=7
[19:47:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 188 extra nodes, 0 pruned nodes, max_depth=7
[19:47:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 156 extra nodes, 0 pruned nodes, max_depth=7
[19:47:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 178 extra nodes, 0 pruned nodes, max_depth=7
[19:47:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 220 extra nodes, 0 pruned nodes, max_depth=7
[19:47:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 224 extra nodes, 0 pruned nodes, max_depth=7
[19:47:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 196 extra nodes, 0 pruned no

[19:47:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 198 extra nodes, 0 pruned nodes, max_depth=7
[19:47:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 172 extra nodes, 0 pruned nodes, max_depth=7
[19:47:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=7
[19:47:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 150 extra nodes, 0 pruned nodes, max_depth=7
[19:47:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 178 extra nodes, 0 pruned nodes, max_depth=7
[19:47:52] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 150 extra nodes, 0 pruned nodes, max_depth=7
[19:47:52] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 216 extra nodes, 0 pruned nodes, max_depth=7
[19:47:52] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 170 extra nodes, 0 pruned nodes, max_depth=7
[19:47:52] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 124 extra nodes, 0 pruned no

[19:48:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 214 extra nodes, 0 pruned nodes, max_depth=7
[19:48:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 218 extra nodes, 0 pruned nodes, max_depth=7
[19:48:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 166 extra nodes, 0 pruned nodes, max_depth=7
[19:48:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 210 extra nodes, 0 pruned nodes, max_depth=7
[19:48:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 144 extra nodes, 0 pruned nodes, max_depth=7
[19:48:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 236 extra nodes, 0 pruned nodes, max_depth=7
[19:48:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 144 extra nodes, 0 pruned nodes, max_depth=7
[19:48:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 188 extra nodes, 0 pruned nodes, max_depth=7
[19:48:01] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 210 extra nodes, 0 pruned no

[19:48:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 186 extra nodes, 0 pruned nodes, max_depth=7
[19:48:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 204 extra nodes, 0 pruned nodes, max_depth=7
[19:48:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 224 extra nodes, 0 pruned nodes, max_depth=7
[19:48:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 186 extra nodes, 0 pruned nodes, max_depth=7
[19:48:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 208 extra nodes, 0 pruned nodes, max_depth=7
[19:48:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 170 extra nodes, 0 pruned nodes, max_depth=7
[19:48:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 176 extra nodes, 0 pruned nodes, max_depth=7
[19:48:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 222 extra nodes, 0 pruned nodes, max_depth=7
[19:48:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 168 extra nodes, 0 pruned no

- 좀 더 개선되지만 썩 마음에 들지 않는다.

### 3. Deep Learning

In [72]:
# Standardise the GloVe sentence features; the scaler is fitted on the
# training split only and reused unchanged for the validation split.
scl = preprocessing.StandardScaler()
xtrain_glove_scl = scl.fit_transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)

In [73]:
# Convert the integer class labels to categorical (one-hot) targets, as
# needed by the categorical cross-entropy loss used for the networks.
ytrain_enc = np_utils.to_categorical(ytrain)
yvalid_enc = np_utils.to_categorical(yvalid)

In [74]:
# Three-layer feed-forward network on the 300-d sentence vectors: two hidden
# blocks (Dense -> Dropout -> BatchNormalization) and a 3-way softmax output.
model = Sequential([
    Dense(300, input_dim=300, activation='relu'),
    Dropout(0.2),
    BatchNormalization(),

    Dense(300, activation='relu'),
    Dropout(0.3),
    BatchNormalization(),

    Dense(3),
    Activation('softmax'),
])

# Compile with categorical cross-entropy and the Adam optimiser.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [75]:
# Train for 5 epochs on the scaled sentence vectors, reporting the loss on the
# validation split after every epoch.
model.fit(xtrain_glove_scl, y=ytrain_enc, batch_size=64, 
          epochs=5, verbose=1, 
          validation_data=(xvalid_glove_scl, yvalid_enc))

Train on 17621 samples, validate on 1958 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f1721cc9978>

### 4. Deep Learning - LSTM

- 문장 벡터로 미리 변환하지 않은 원본 텍스트를 토큰 시퀀스로 만들어 입력으로 사용한다.

In [84]:
# Every text is represented by exactly 70 token ids.
max_len = 70

# Build the word -> integer id mapping from train + validation text.
token = text.Tokenizer(num_words=None)
token.fit_on_texts(list(xtrain) + list(xvalid))

# word_index maps each word to its id; its length is the vocabulary size.
word_index = token.word_index

# Texts -> id sequences -> fixed-length sequences (zero-padded up to max_len).
xtrain_seq, xvalid_seq = (token.texts_to_sequences(part) for part in (xtrain, xvalid))
xtrain_pad, xvalid_pad = (
    sequence.pad_sequences(seq, maxlen=max_len) for seq in (xtrain_seq, xvalid_seq)
)

In [86]:
# Embedding matrix for the data set's vocabulary: row i holds the GloVe vector
# of the word with id i; words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]


  0%|          | 0/25943 [00:00<?, ?it/s][A
100%|██████████| 25943/25943 [00:00<00:00, 628525.38it/s][A

In [87]:
# Sanity check: (len(word_index) + 1) rows by 300 columns.
embedding_matrix.shape

(25944, 300)

### LSTM

In [90]:
# A simple LSTM with frozen GloVe embeddings and two dense layers.
# This cell only builds and compiles the model; it does not train it.
model = Sequential()
# The missing comma after the embedding size (300) was a SyntaxError.
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))

# SpatialDropout1D works like Dropout but drops entire 1D feature maps.
# LSTM dropout: fraction of the units to drop for the linear transformation of the inputs.
# LSTM recurrent_dropout: fraction of the units to drop for the linear
# transformation of the recurrent state.
model.add(SpatialDropout1D(0.3))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

# 3-way softmax output, one unit per author.
model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

### GRU

In [91]:
# Two stacked GRU layers on top of frozen GloVe embeddings, followed by two
# dense blocks and a 3-way softmax.
model = Sequential()
model.add(Embedding(len(word_index) + 1, 300,
                    weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=False))
model.add(SpatialDropout1D(0.3))
# The first GRU returns the full sequence so the second GRU can consume it.
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3))

# Two identical Dense(1024, relu) + Dropout(0.8) blocks.
for _ in range(2):
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train with early stopping: stop once val_loss has not improved for 3 epochs.
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3,
                          verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc,
          batch_size=512, epochs=100, verbose=1,
          validation_data=(xvalid_pad, yvalid_enc),
          callbacks=[earlystop])

Train on 17621 samples, validate on 1958 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100


<keras.callbacks.History at 0x7f171447f710>

- 가장 좋은 성능이 나왔다.

### 기존 커널에서는 Ensemble이 있지만 여기에서는 생략했다.