Reference
- https://www.youtube.com/watch?v=PhunzHqhKoQ
- https://github.com/rickiepark/introduction_to_ml_with_python/blob/master/07-konlpy.ipynb
- https://github.com/justmarkham/pycon-2016-tutorial/blob/master/exercise_solution.ipynb

In [None]:
import konlpy
import pandas as pd
import numpy as np

# Read data

## Read train data

In [None]:
df_train = pd.read_csv('./datasets/naver_train.txt', delimiter='\t', keep_default_na=False)
print(df_train.shape)
df_train.head()

### `as_matrix()`: pandas DataFrame => numpy array (removed in pandas 1.0; use `.values` or `.to_numpy()` there)

In [None]:
# TODO
X_train = 
y_train = 

In [None]:
X_train

In [None]:
y_train

### Positive/Negative ratio
#### - `np.bincount`: count positive/negative values

In [None]:
# TODO

## Read test data

In [None]:
df_test = # TODO
print(df_test.shape)
X_test = # TODO
y_test = # TODO

### Positive/Negative ratio

In [None]:
# TODO

# Tokenizer

## Let's use Twitter POS-Tagger as tokenizer

In [None]:
from konlpy.tag import Twitter
twitter = Twitter()

def twitter_tokenizer(text):
    """Split ``text`` into a list of tokens (exercise placeholder).

    The body is intentionally left as a TODO: ``tokens`` must be assigned
    before the ``return``; as written, calling this function raises
    ``NameError``.  The check in the following cell compares the result
    with a list of strings, and the function is later passed as the
    ``tokenizer=`` argument of ``CountVectorizer``.

    NOTE(review): presumably the tokens should come from the module-level
    ``twitter`` tagger created above -- confirm which tagger method is
    intended.
    """
    # TODO:
    return tokens

In [None]:
assert twitter_tokenizer('이 영화 좋아요') == ['이', '영화', '좋', '아요']

# Vectorization
- Convert variable-length text into a fixed-size vector

# [Count Vectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)
- Encode text into frequencies of vocabulary terms

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

## Vectorize train data

In [None]:
X_train_tf = vectorizer.fit_transform(X_train) # term-frequency matrix

In [None]:
vectorizer.get_feature_names()

In [None]:
len(vectorizer.get_feature_names())

In [None]:
vectorizer.vocabulary_

In [None]:
len(vectorizer.vocabulary_)

In [None]:
X_train_tf.shape

In [None]:
X_train[0]

## Sparse matrix: only the non-zero elements are stored

In [None]:
X_train_tf[0]

##  `np.nonzero`: return indices of non-zero elements

In [None]:
X_train_tf[0].nonzero()

In [None]:
print(vectorizer.vocabulary_['목소리'])
print(vectorizer.vocabulary_['짜증나네요'])
print(vectorizer.vocabulary_['진짜'])
print(vectorizer.vocabulary_['더빙'])

## Return to text

In [None]:
vectorizer.inverse_transform(X_train_tf[0])

# Classifier

## [Logistic Regression (aka. Maximum-entropy Classifier)](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression)

In [None]:
from sklearn.linear_model import LogisticRegression

# Pipelining - connect Vectorizer and Classifier

In [None]:
from sklearn.pipeline import make_pipeline

In [None]:
pipeline = make_pipeline(CountVectorizer(tokenizer=twitter_tokenizer), LogisticRegression())

# Train model

In [None]:
pipeline.fit(X_train[:1000], y_train[:1000])

# Test model

In [None]:
pipeline.predict(['이 영화 좋아요'])

In [None]:
pipeline.predict_proba(['이 영화 좋아요']) # confidence

In [None]:
pipeline.predict(['이거 정말 별로에요'])

In [None]:
pipeline.predict_proba(['이거 정말 별로에요'])

# Calculate Accuracy

In [None]:
test_score = pipeline.score(X_test[:1000], y_test[:1000])
test_score

# Search Best Hyperparameter

## Useful Hyperparameters
### Vectorizer
#### - min_df: threshold to add to vocabulary => ignore too rare terms
#### - ngram_range: lower and upper boundary of n-grams; default: (1, 1)
### Logistic Regression
#### - C: inverse of regularization strength => smaller values make regularization stronger

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    'countvectorizer__min_df': [1, 3],
    'countvectorizer__ngram_range': [(1, 1), (1, 2)],
    'logisticregression__C': [0.1, 1, 10]}
pipeline = make_pipeline(
    CountVectorizer(tokenizer=twitter_tokenizer),
    LogisticRegression())
grid = GridSearchCV(pipeline, param_grid)

In [None]:
grid.fit(X_train[:1000], y_train[:1000])

### Cross-validation score

In [None]:
grid.best_score_

In [None]:
grid.best_params_

In [None]:
grid.best_estimator_

In [None]:
grid.best_estimator_.score(X_test[:1000], y_test[:1000])

# Let's Upgrade our vectorizer

# [TF-IDF Vectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)
- Encode text into tf-idf features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf_param_grid = {
    # TODO
}
tfidf_pipeline = make_pipeline(
    # TODO
)
tfidf_grid = GridSearchCV(
    # TODO
)

In [None]:
tfidf_grid.fit(X_train[:1000], y_train[:1000])

In [None]:
tfidf_grid.best_score_

In [None]:
tfidf_grid.best_params_

In [None]:
tfidf_grid.best_estimator_

In [None]:
tfidf_grid.best_estimator_.score(X_test[:1000], y_test[:1000])

# Let's make our tokenizer faster with mecab (multiprocessing)

In [None]:
from konlpy.tag import Mecab
mecab = Mecab()
def mecab_tokenizer(text):
    """Split ``text`` into a list of tokens (exercise placeholder).

    The body is intentionally left as a TODO: ``tokens`` must be assigned
    before the ``return``; as written, calling this function raises
    ``NameError``.  The check in the following cell compares the result
    with a list of strings, and the function is later passed as the
    ``tokenizer=`` argument of ``TfidfVectorizer``.

    NOTE(review): presumably the tokens should come from the module-level
    ``mecab`` tagger created above -- confirm which tagger method is
    intended.
    """
    # TODO:
    return tokens

In [None]:
assert mecab_tokenizer('이 영화 좋아요') == ['이', '영화', '좋', '아요']

In [None]:
mecab_param_grid = {
    # TODO
}
mecab_pipe = make_pipeline(
    # TODO
)
mecab_grid = GridSearchCV(
    # TODO
)

In [None]:
mecab_grid.fit(X_train[:1000], y_train[:1000])

In [None]:
mecab_grid.best_score_

In [None]:
mecab_grid.best_params_

In [None]:
mecab_grid.best_estimator_

In [None]:
mecab_grid.best_estimator_.score(X_test[:1000], y_test[:1000])

# Save model

In [None]:
# Persist the best pipeline found by the grid search so it can be reloaded
# later without retraining.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23, so prefer the standalone `joblib` package (a scikit-learn
# dependency) and fall back to the legacy location only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# NOTE(review): the target path starts with `.katalk_bot/` (a dot-prefixed
# directory relative to the working directory); the directory must already
# exist -- confirm this is not meant to be `./katalk_bot/`.
joblib.dump(mecab_grid.best_estimator_, '.katalk_bot/sentiment_engine.pkl')

In [None]:
engine = joblib.load('.katalk_bot/sentiment_engine.pkl')
engine

In [None]:
mecab_grid.best_estimator_.predict_proba(['난 기분이 좋아'])[:, 1][0]

# Much faster! Now we can search a little further in hyperparameter combinations!

In [None]:
mecab_param_grid = {
    # TODO
}
mecab_pipe = make_pipeline(TfidfVectorizer(tokenizer=mecab_tokenizer), LogisticRegression())
mecab_grid = GridSearchCV(mecab_pipe, mecab_param_grid)

In [None]:
mecab_grid.fit(X_train[:1000], y_train[:1000])

In [None]:
mecab_grid.best_score_

In [None]:
mecab_grid.best_params_

In [None]:
mecab_grid.best_estimator_

## Hmm worse than before...
- The model with the best cross-validation score can still **overfit** to the training set.

In [None]:
mecab_grid.best_estimator_.score(X_test[:1000], y_test[:1000])

## Let's try with larger training data

In [None]:
mecab_param_grid = {
    # TODO
}
mecab_pipe = make_pipeline(
    # TODO
)
mecab_grid = GridSearchCV(
    # TODO
)

In [None]:
mecab_grid.fit(
    # TODO
)

In [None]:
mecab_grid.best_score_

In [None]:
mecab_grid.best_params_

In [None]:
mecab_grid.best_estimator_

# Yayy!

In [None]:
mecab_grid.best_estimator_.score(X_test[:1000], y_test[:1000])

# If you have time,
- Gather more data
- Try different tokenization (ex, char-level, positional-encoding)
- Try different classifier (ex, SVM, Random Forest)
- [Ensemble features](http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.FeatureUnion.html#sklearn.pipeline.FeatureUnion) (word-level occurrence + char-level + word_vector + query length etc.)
- [Ensemble models](http://blog.kaggle.com/2017/06/15/stacking-made-easy-an-introduction-to-stacknet-by-competitions-grandmaster-marios-michailidis-kazanova)
- [Feature hashing](https://msdn.microsoft.com/en-us/library/azure/dn906018.aspx)