# Kernel 2위
- https://www.kaggle.com/abhishek/approaching-almost-any-nlp-problem-on-kaggle

- 기본적인 모델로부터 다른 특성을 사용하면서 개선시킨다.

# 해당 Kernel에서 다루고자 하는 것

1. tfidf
2. count features
3. logistic regression
4. naive bayes
5. svm
6. xgboost
7. grid search
8. LSTM
9. GRU
10. Ensembling

In [2]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

Using TensorFlow backend.


In [3]:
# Directory holding the competition CSV files. Set the SPOOKY_DATA_DIR
# environment variable to point somewhere else; when it is unset the
# original hard-coded location is used, so existing behavior is unchanged.
base_path = os.environ.get('SPOOKY_DATA_DIR', '/Users/lifesailor/.kaggle/spooky')

In [4]:
# Load the training set, the test set and the sample submission from base_path.
train, test, sample = (
    pd.read_csv(os.path.join(base_path, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [5]:
# Display the first rows of the training frame (columns: id, text, author).
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [6]:
# Display the first rows of the test frame (columns: id, text).
test.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [7]:
# Display the first rows of the sample submission (one probability column per author).
sample.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.403494,0.287808,0.308698
1,id24541,0.403494,0.287808,0.308698
2,id00134,0.403494,0.287808,0.308698
3,id27757,0.403494,0.287808,0.308698
4,id04081,0.403494,0.287808,0.308698


## Loss function

- multiclass_logloss

In [8]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Compute the multi-class logarithmic loss (lower is better).

    :param actual: true classes, either a 1-D sequence of integer class
        indices or a 2-D one-hot array with one column per class
    :param predicted: 2-D array of predicted probabilities, one row per
        sample and one column per class
    :param eps: probabilities are clipped to [eps, 1 - eps] so that log(0)
        is never evaluated
    :return: -(1 / rows) * sum(actual * log(clipped predicted))
    """
    # Accept plain Python lists as well as arrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # Expand a 1-D vector of class indices into a one-hot matrix.
    if actual.ndim == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    # Keep every probability strictly inside (0, 1) before taking the log.
    clipped = np.clip(predicted, eps, 1 - eps)

    # Number of samples.
    n_rows = actual.shape[0]

    # Only the entries selected by the one-hot mask contribute to the sum.
    log_likelihood = np.sum(actual * np.log(clipped))
    return -1.0 / n_rows * log_likelihood

<img src="./evaluation.png">

- Loss가 작을 수록 좋은 지표이다.

## 1. Label Encoder

- 3명의 저자를 0,1,2로 encoding한다.

In [9]:
# Map the author names in the training frame to integer class labels.
lbl_enc = preprocessing.LabelEncoder()
lbl_enc.fit(train.author.values)
y = lbl_enc.transform(train.author.values)

- 데이터를 train, validation으로 나눈다.

In [10]:
# Hold out 10% of the rows for validation, shuffling with a fixed seed.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train.text.values,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)
print(xtrain.shape, xvalid.shape)

(17621,) (1958,)


## 2. Baseline

- tf-idf를 사용해서 logistic regression으로 baseline을 만든다.

In [15]:
# TF-IDF settings:
#   min_df=3             drop terms that appear in fewer than 3 documents
#   max_features=None    no cap on the vocabulary size
#   strip_accents        'unicode' accent stripping
#   analyzer='word'      word-level features
#   token_pattern        a token is any run of one or more \w characters
#   ngram_range=(1, 3)   unigrams through trigrams
#   use_idf, smooth_idf, sublinear_tf   all enabled
#   stop_words           built-in English stop-word list
# The three flags are passed as real booleans instead of the integer 1:
# the meaning is the same, and scikit-learn releases that validate
# parameter types reject integers for boolean options.
tfv = TfidfVectorizer(min_df=3, max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words='english')

In [16]:
# Fit on the training and validation texts together so that the vocabulary
# covers the whole word set, then vectorize each split.
tfv.fit([*xtrain, *xvalid])
xtrain_tfv, xvalid_tfv = tfv.transform(xtrain), tfv.transform(xvalid)

In [17]:
# Baseline: logistic regression on the TF-IDF features, scored on the
# validation split with multiclass_logloss.
clf = LogisticRegression(C=1.0).fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)
print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.631 




## 3. Count vector

- count vectorizer

In [18]:
# Raw count features, using hyperparameters similar to the TF-IDF vectorizer.
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)
# Build the vocabulary from both splits, then vectorize each split.
ctv.fit([*xtrain, *xvalid])
xtrain_ctv, xvalid_ctv = ctv.transform(xtrain), ctv.transform(xvalid)

In [19]:
# Logistic regression on the count features.
clf = LogisticRegression(C=1.0).fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)
print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))



logloss: 0.538 


## 4. Naive Bayes

- Naive Bayes 모델

In [20]:
# Multinomial naive Bayes on the TF-IDF features.
clf = MultinomialNB().fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)
print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.581 


In [21]:
# Multinomial naive Bayes on the count features.
clf = MultinomialNB().fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)
print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.502 


## 5.  SVM

- svm을 넣기 전에 standardization 해야 한다.

In [22]:
# Reduce the TF-IDF matrix to 120 SVD components (the original note suggests
# a value in the 120-200 range). The decomposition is fitted on the training
# split only and then applied to both splits.
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(xtrain_tfv)
xtrain_svd, xvalid_svd = svd.transform(xtrain_tfv), svd.transform(xvalid_tfv)

# Standardize the SVD features with statistics from the training split.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl, xvalid_svd_scl = scl.transform(xtrain_svd), scl.transform(xvalid_svd)

In [23]:
# SVM on the standardized SVD features; probability=True enables predict_proba.
clf = SVC(C=1.0, probability=True).fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)
print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.703 


## 6. Xgboost

- Xgboost tf-idf

In [24]:
# Gradient-boosted trees on the TF-IDF features (converted to CSC format).
clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1,
)
clf.fit(xtrain_tfv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_tfv.tocsc())
print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.779 


- XGBoost count

In [25]:
# Gradient-boosted trees on the count features (converted to CSC format).
clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1,
)
clf.fit(xtrain_ctv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_ctv.tocsc())
print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.768 


- XGBoost의 성능이 좋지 않은데, 이는 hyperparameter 튜닝을 충분히 하지 않았기 때문이다.

## 7. Grid Search

- Grid Search를 한다.

In [34]:
# Scoring function for model selection: wraps multiclass_logloss, is fed
# predicted probabilities, and is flagged as "lower is better".
mll_scorer = metrics.make_scorer(
    multiclass_logloss,
    needs_proba=True,
    greater_is_better=False,
)

In [35]:
# Fresh, unfitted steps for the pipeline: SVD -> standardization -> logistic regression.
svd, scl, lr_model = TruncatedSVD(), preprocessing.StandardScaler(), LogisticRegression()
clf = pipeline.Pipeline(steps=[('svd', svd), ('scl', scl), ('lr', lr_model)])



In [36]:
# Search space, keyed as <pipeline step>__<parameter>.
# NOTE(review): the 'l1' penalty needs a LogisticRegression solver that
# supports it — confirm against the default of the installed scikit-learn.
param_grid = {
    'svd__n_components': [120, 180],
    'lr__C': [0.1, 1.0, 10],
    'lr__penalty': ['l1', 'l2'],
}

In [45]:
# Exhaustive search over param_grid, scoring each candidate with the
# log-loss scorer.
# Fix: the keyword was misspelled as `scori3ng`, which GridSearchCV rejects
# with a TypeError; the intended argument is `scoring`.
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                     verbose=1)

model.fit(xtrain_tfv, ytrain)
# mll_scorer is built with greater_is_better=False, so the reported best
# score is the negated log loss.
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
# Report only the parameters that were part of the search, in sorted order.
for param_name in sorted(param_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 12 candidates, totalling 36 fits






[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  1.0min finished


Best score: -0.728
Best parameters set:
	lr__C: 1.0
	lr__penalty: 'l2'
	svd__n_components: 180
