Reference
- https://www.youtube.com/watch?v=PhunzHqhKoQ
- https://github.com/rickiepark/introduction_to_ml_with_python/blob/master/07-konlpy.ipynb
- https://github.com/justmarkham/pycon-2016-tutorial/blob/master/exercise_solution.ipynb

In [1]:
import konlpy
import pandas as pd
import numpy as np

# Read data

## Read train data

In [2]:
df_train = pd.read_csv('./datasets/naver_train.txt', delimiter='\t', keep_default_na=False)
print(df_train.shape)
df_train.head()

(150000, 3)


Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


### `as_matrix()`: pandas dataframe => numpy array

In [6]:
X_train = df_train['document'].as_matrix()
y_train = df_train['label'].as_matrix()

In [7]:
X_train

array(['아 더빙.. 진짜 짜증나네요 목소리', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나',
       '너무재밓었다그래서보는것을추천한다', ..., '이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?',
       '청춘 영화의 최고봉.방황과 우울했던 날들의 자화상', '한국 영화 최초로 수간하는 내용이 담긴 영화'], dtype=object)

In [8]:
y_train

array([0, 1, 0, ..., 0, 1, 0])

### Positve/Negative ratio
#### - `np.bincount`: count positive/negative values

In [9]:
np.bincount(y_train)

array([75173, 74827])

## Read test data

In [10]:
df_test = pd.read_csv('./datasets/naver_test.txt', delimiter='\t', keep_default_na=False)
print(df_test.shape)
X_test = df_test['document'].as_matrix()
y_test = df_test['label'].as_matrix()

(50000, 3)


### Positve/Negative ratio

In [11]:
np.bincount(y_test)

array([24827, 25173])

# Tokenizer

## Let's use Twitter POS-Tagger as tokenizer

In [35]:
from konlpy.tag import Twitter
twitter = Twitter()

def twitter_tokenizer(text):
    # TODO:
    tokens = twitter.morphs(text)
    return tokens

In [38]:
assert twitter_tokenizer('이 영화 좋아요') == ['이', '영화', '좋', '아요']

# Vectorization
- Convert text of variable length into fixed-sized vector

# [Count Vectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)
- Encode text into frequencies of vocabulary terms

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

## Vectorize train data

In [47]:
X_train_tf = vectorizer.fit_transform(X_train) # term frequency matrcies

In [49]:
vectorizer.get_feature_names()

['00',
 '000',
 '0000000000000000',
 '00000000000000000000000000000을달라',
 '000000000000000001개짜리',
 '000000000점',
 '00000001',
 '0000000을',
 '0000643652',
 '0001',
 '0001이라도떨어뜨리기위해',
 '0001점도',
 '0002027146',
 '0007과',
 '000달러',
 '000명',
 '000명도',
 '000원',
 '000원아깝다',
 '000원으로',
 '000원이나',
 '000점',
 '001점도',
 '001점은',
 '005',
 '007',
 '007과',
 '007급',
 '007나를',
 '007년에',
 '007로',
 '007마니아들에겐',
 '007문레이커',
 '007스러운',
 '007시리즈',
 '007시리즈는',
 '007시리즈들',
 '007시리즈라',
 '007시리즈인데',
 '007시리즈중',
 '007씨리즈가',
 '007에',
 '007영화',
 '007영화였다',
 '007운운하길레',
 '007은',
 '007을',
 '007의',
 '007이',
 '007이다',
 '007이라',
 '007이란',
 '007인듯',
 '007인지',
 '007중',
 '007중에',
 '007중에서',
 '007처럼',
 '007팬으로서',
 '007하는',
 '009',
 '009의',
 '00년',
 '00년대',
 '00년도초반',
 '00년에',
 '00시59분',
 '00에',
 '00학번',
 '00화',
 '01',
 '010',
 '01410',
 '01내려지네',
 '01년',
 '01년도',
 '01임',
 '01줄이러왔습니다',
 '02',
 '02년',
 '02년도',
 '03',
 '03년도거라곤',
 '03년도이런특수효과를',
 '04',
 '047',
 '04까진',
 '04년',
 '04년도',
 '04년에',
 '04점',
 '05',
 '05년',
 '05년도에

In [50]:
len(vectorizer.get_feature_names())

293366

In [51]:
vectorizer.vocabulary_

{'더빙': 71119,
 '진짜': 246232,
 '짜증나네요': 248358,
 '목소리': 99567,
 '포스터보고': 273335,
 '초딩영화줄': 255126,
 '오버연기조차': 190112,
 '가볍지': 16352,
 '않구나': 167602,
 '너무재밓었다그래서보는것을추천한다': 57394,
 '교도소': 33783,
 '이야기구먼': 208071,
 '솔직히': 145795,
 '재미는': 222295,
 '없다': 177352,
 '평점': 271982,
 '조정': 234711,
 '사이몬페그의': 133947,
 '익살스런': 210575,
 '연기가': 181881,
 '돋보였던': 74028,
 '영화': 185057,
 '스파이더맨에서': 150442,
 '늙어보이기만': 63331,
 '했던': 283593,
 '커스틴': 261359,
 '던스트가': 71680,
 '너무나도': 56734,
 '이뻐보였다': 207059,
 '걸음마': 25696,
 '3세부터': 5282,
 '초등학교': 254957,
 '1학년생인': 3039,
 '8살용영화': 7674,
 'ㅋㅋㅋ': 13347,
 '별반개도': 117898,
 '아까움': 160393,
 '원작의': 198609,
 '긴장감을': 43582,
 '제대로': 232430,
 '살려내지못했다': 134933,
 '반개도': 110754,
 '아깝다': 160496,
 '욕나온다': 194727,
 '이응경': 209070,
 '길용우': 43801,
 '연기생활이몇년인지': 182309,
 '정말': 230402,
 '발로해도': 112228,
 '그것보단': 36866,
 '낫겟다': 53960,
 '납치': 53930,
 '감금만반복반복': 18635,
 '이드라마는': 204973,
 '가족도없다': 17264,
 '연기못하는사람만모엿네': 182260,
 '액션이': 171312,
 '없는데도': 177305,
 '재미': 222156,
 '있는': 2148

In [52]:
len(vectorizer.vocabulary_)

293366

In [27]:
X_train_tf.shape

(150000, 293366)

In [18]:
X_train[0]

'아 더빙.. 진짜 짜증나네요 목소리'

## sparse matrix: only stored elements are non-zero

In [28]:
X_train_tf[0]

<1x293366 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

##  `np.nonzero`: return indices of non-zero elements

In [29]:
X_train_tf[0].nonzero()

(array([0, 0, 0, 0], dtype=int32),
 array([ 99567, 248358, 246232,  71119], dtype=int32))

In [53]:
print(vectorizer.vocabulary_['목소리'])
print(vectorizer.vocabulary_['짜증나네요'])
print(vectorizer.vocabulary_['진짜'])
print(vectorizer.vocabulary_['더빙'])

99567
248358
246232
71119


## Return to text

In [22]:
vectorizer.inverse_transform(X_train_tf[0])

[array(['목소리', '짜증나네요', '진짜', '더빙'],
       dtype='<U140')]

# Classifier

## [Logistic Regression (aka. Maximum-entropy Classifier)](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression)

In [36]:
from sklearn.linear_model import LogisticRegression

# Pipelining - connect Vectorizer and Classifier

In [91]:
from sklearn.pipeline import make_pipeline

In [92]:
pipeline = make_pipeline(CountVectorizer(tokenizer=twitter_tokenizer), LogisticRegression())

# Train model

In [93]:
pipeline.fit(X_train[:1000], y_train[:1000])

Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

# Test model

In [94]:
pipeline.predict(['이 영화 좋아요'])

array([1])

In [95]:
pipeline.predict_proba(['이 영화 좋아요']) # confidence

array([[ 0.22430691,  0.77569309]])

In [96]:
pipeline.predict(['이거 정말 별로에요'])

array([0])

In [97]:
pipeline.predict_proba(['이거 정말 별로에요'])

array([[ 0.68974387,  0.31025613]])

# Calculate Accuracy

In [98]:
test_score = pipeline.score(X_test[:1000], y_test[:1000])
test_score

0.71799999999999997

# Search Best Hyperparameter

## Useful Hyperparameters
### Vectorizer
#### - min_df: threshold to add to vocabulary => ignore too rare terms
#### - ngram_range: lower and upper boundary of n-grams; default: (1, 1)
### Logistic Regression
#### - C : inverse of regularization constant => smaller values makes regularization stronger

In [99]:
from sklearn.model_selection import GridSearchCV

In [100]:
param_grid = {'countvectorizer__min_df': [1, 3],
              'countvectorizer__ngram_range': [(1, 1), (1, 2)],
              'logisticregression__C': [0.1, 1, 10]}
pipeline = make_pipeline(CountVectorizer(tokenizer=twitter_tokenizer), LogisticRegression())
grid = GridSearchCV(pipeline, param_grid)

In [101]:
grid.fit(X_train[:1000], y_train[:1000])

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'countvectorizer__min_df': [1, 3], 'countvectorizer__ngram_range': [(1, 1), (1, 2)], 'logisticregression__C': [0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

### Cross-validation score

In [102]:
grid.best_score_

0.71499999999999997

In [103]:
grid.best_params_

{'countvectorizer__min_df': 1,
 'countvectorizer__ngram_range': (1, 1),
 'logisticregression__C': 0.1}

In [104]:
grid.best_estimator_

Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [105]:
grid.best_estimator_.score(X_test[:1000], y_test[:1000])

0.72299999999999998

# Let's Upgrade our vectorizer

# [TF-IDF Vectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)
- Encode text into tf-idf features

In [106]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [107]:
tfidf_param_grid = {
    'tfidfvectorizer__min_df': [1, 3],
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],
    'logisticregression__C': [0.1, 1, 10]}
tfidf_pipeline = make_pipeline(TfidfVectorizer(tokenizer=twitter_tokenizer), LogisticRegression())
tfidf_grid = GridSearchCV(pipeline, param_grid)

In [108]:
tfidf_grid.fit(X_train[:1000], y_train[:1000])

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_i...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'tfidfvectorizer__min_df': [1, 3], 'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)], 'logisticregression__C': [0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [109]:
tfidf_grid.best_score_

0.751

In [110]:
tfidf_grid.best_params_

{'logisticregression__C': 10,
 'tfidfvectorizer__min_df': 1,
 'tfidfvectorizer__ngram_range': (1, 2)}

In [111]:
tfidf_grid.best_estimator_

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_i...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [112]:
tfidf_grid.best_estimator_.score(X_test[:1000], y_test[:1000])

0.745

# Let's make our tokenizer faster with mecab (multiprocessing)

In [116]:
from konlpy.tag import Mecab
mecab = Mecab()
def mecab_tokenizer(text):
    # TODO:
    tokens = mecab.morphs(text)
    return tokens

In [117]:
assert mecab_tokenizer('이 영화 좋아요') == ['이', '영화', '좋', '아요']

In [137]:
mecab_param_grid = {
    'tfidfvectorizer__min_df': [1, 3],
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],
    'logisticregression__C': [0.1, 1, 10]}
mecab_pipe = make_pipeline(TfidfVectorizer(tokenizer=mecab_tokenizer), LogisticRegression())
mecab_grid = GridSearchCV(mecab_pipe, mecab_param_grid)

In [138]:
mecab_grid.fit(X_train[:1000], y_train[:1000])

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_i...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'tfidfvectorizer__min_df': [1, 3], 'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)], 'logisticregression__C': [0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [139]:
mecab_grid.best_score_

0.74199999999999999

In [140]:
mecab_grid.best_params_

{'logisticregression__C': 10,
 'tfidfvectorizer__min_df': 1,
 'tfidfvectorizer__ngram_range': (1, 2)}

In [141]:
mecab_grid.best_estimator_

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_i...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [142]:
mecab_grid.best_estimator_.score(X_test[:1000], y_test[:1000])

0.752

# Much faster! Now we can search little further in hyperparemter combinations!

In [131]:
mecab_param_grid = {
    'tfidfvectorizer__min_df': [1, 3, 5, 7],
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'logisticregression__C': [0.1, 1, 10, 10]}
mecab_pipe = make_pipeline(TfidfVectorizer(tokenizer=mecab_tokenizer), LogisticRegression())
mecab_grid = GridSearchCV(mecab_pipe, mecab_param_grid)

In [132]:
mecab_grid.fit(X_train[:1000], y_train[:1000])

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_i...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'tfidfvectorizer__min_df': [1, 3, 5, 7], 'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)], 'logisticregression__C': [0.1, 1, 10, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [133]:
mecab_grid.best_score_

0.745

In [134]:
mecab_grid.best_params_

{'logisticregression__C': 1,
 'tfidfvectorizer__min_df': 1,
 'tfidfvectorizer__ngram_range': (1, 3)}

In [135]:
mecab_grid.best_estimator_

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_i...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

## Hmm worse than before...
- The model with best cross-validation score can **overfit** in training set.

In [136]:
mecab_grid.best_estimator_.score(X_test[:1000], y_test[:1000])

0.747

## Let's try with larger training data

In [146]:
mecab_param_grid = {
    'tfidfvectorizer__min_df': [1, 3, 5, 7],
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'logisticregression__C': [0.1, 1, 10, 10]}
mecab_pipe = make_pipeline(TfidfVectorizer(tokenizer=mecab_tokenizer), LogisticRegression())
mecab_grid = GridSearchCV(mecab_pipe, mecab_param_grid)

In [147]:
mecab_grid.fit(X_train[:2000], y_train[:2000])

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_i...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'tfidfvectorizer__min_df': [1, 3, 5, 7], 'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)], 'logisticregression__C': [0.1, 1, 10, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [148]:
mecab_grid.best_score_

0.77200000000000002

In [149]:
mecab_grid.best_params_

{'logisticregression__C': 10,
 'tfidfvectorizer__min_df': 1,
 'tfidfvectorizer__ngram_range': (1, 2)}

In [150]:
mecab_grid.best_estimator_

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_i...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

# Yayy!

In [151]:
mecab_grid.best_estimator_.score(X_test[:1000], y_test[:1000])

0.79600000000000004

# If you have time,
- Gather more data
- Try different tokenization (ex, char-level, positional-encoding)
- Try different classifier (ex, SVM, Random Foreset)
- [Ensemble features](http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.FeatureUnion.html#sklearn.pipeline.FeatureUnion) (word-level occurence + char_level + word_vector + query length etc.)
- Ensemble models (http://blog.kaggle.com/2017/06/15/stacking-made-easy-an-introduction-to-stacknet-by-competitions-grandmaster-marios-michailidis-kazanova)
- Feature hashing (https://msdn.microsoft.com/en-us/library/azure/dn906018.aspx)