# Preparing data

## Reading data -> Tokenization

In [1]:
import pandas as pd
df = pd.read_csv('data/sentiment.csv')

import jieba
df['token_text'] = df['text'].apply(lambda x:list(jieba.cut(x)))
df

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\user\AppData\Local\Temp\jieba.cache
Loading model cost 1.533 seconds.
Prefix dict has been built successfully.


Unnamed: 0,tag,text,token_text
0,P,店家很給力，快遞也是相當快，第三次光顧啦,"[店家, 很, 給力, ，, 快遞, 也, 是, 相當快, ，, 第三次, 光顧, 啦]"
1,N,這樣的配置用Vista系統還是有點卡。 指紋收集器。 沒送原裝滑鼠還需要自己買，不太好。,"[這樣, 的, 配置, 用, Vista, 系統, 還是, 有點, 卡, 。, , 指紋,..."
2,P,不錯，在同等檔次酒店中應該是值得推薦的！,"[不錯, ，, 在, 同等, 檔次, 酒店, 中應, 該, 是, 值得, 推薦, 的, ！]"
3,N,哎！ 不會是蒙牛乾的吧 嚴懲真凶！,"[哎, ！, , 不會, 是, 蒙牛, 乾, 的, 吧, , 嚴懲, 真凶, ！]"
4,N,空尤其是三立電視臺女主播做的序尤其無趣像是硬湊那麼多字,"[空, 尤其, 是, 三立, 電視, 臺, 女主播, 做, 的, 序, 尤其, 無趣, 像是..."
...,...,...,...
6383,P,價效比高、記憶體大、功能全，螢幕超清晰,"[價效, 比高, 、, 記憶體, 大, 、, 功能, 全, ，, 螢幕超, 清晰]"
6384,N,你太狠了… 告訴你他們不會喧譁的人，肯定是蒙牛喝多了,"[你, 太狠, 了, …, , 告訴, 你, 他們, 不會, 喧, 譁, 的, 人, ，,..."
6385,N,醫生居然買了蒙牛，我是喝呢還是不喝呢還是不喝呢？,"[ , 醫生, 居然, 買, 了, 蒙牛, ，, 我, 是, 喝, 呢, 還是, 不, 喝,..."
6386,N,我只想說 夾蒙牛是不對的 販賣毒品是犯罪行為,"[我, 只, 想, 說, , 夾, 蒙牛, 是, 不, 對, 的, , 販賣, 毒品, ..."


## Removing Punctuation

In [2]:
import unicodedata # for removing Chinese puctuation
def remove_punc_by_unicode(words):
    out = []
    for word in words:
        if word != " " and not unicodedata.category(word[0]).startswith('P'):
            out.append(word)
    return out
df['cleaned'] = df['token_text'].apply(remove_punc_by_unicode)

## tokenized text

In [3]:
documents = [" ".join(doc) for doc in df['cleaned']]
y = df.iloc[:, 0]
documents[:5]

['店家 很 給力 快遞 也 是 相當快 第三次 光顧 啦',
 '這樣 的 配置 用 Vista 系統 還是 有點 卡 指紋 收集器 沒送 原裝 滑鼠 還 需要 自己 買 不太好',
 '不錯 在 同等 檔次 酒店 中應 該 是 值得 推薦 的',
 '哎 不會 是 蒙牛 乾 的 吧 嚴懲 真凶',
 '空 尤其 是 三立 電視 臺 女主播 做 的 序 尤其 無趣 像是 硬 湊 那麼 多字']

# Feature selection

## Vectorization (not neccesary)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_vect = count_vect.fit_transform(documents)
print(X_vect.shape)
print("Frequency of 推薦: ", count_vect.vocabulary_.get(u'推薦'))

(6388, 12240)
Frequency of 推薦:  5830


## tfidf vectorization
- https://towardsdatascience.com/clustering-documents-with-python-97314ad6a78d
- https://blog.csdn.net/blmoistawinde/article/details/80816179
- https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

**Parameters**
- **lowercasebool**, default=True: Convert all characters to lowercase before tokenizing.
- **analyzer{‘word’, ‘char’, ‘char_wb’}** or callable, default=’word’ Whether the feature should be made of word or character n-grams. Option ‘char_wb’ creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space.
- **stop_words{‘english’}**, list, default=None
- **token_pattern**, str, default=r”(?u)\b\w\w+\b”: the default setting limits on at least 2 characters
- **ngram_range**, tuple (min_n, max_n), default=(1, 1): (1, 2) means unigrams and bigrams, and (2, 2) means only bigrams.
- **max_df(min_df)** float or int, default=1.0: When building the vocabulary ignore terms that have a document frequency strictly higher(lower for min_df) than the given threshold (corpus-specific stop words). If float in range [0.0, 1.0], the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.
- **max_features**, int, default=None: If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus.
- **use_idf**, default=True: Enable inverse-document-frequency reweighting.
- **smooth_idf**, default=True: Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every term in the collection exactly once. Prevents zero divisions.




In [5]:
with open("data/stopwords_zh-tw.txt", encoding="utf-8") as fin:
    stopwords = fin.read().split("\n")[1:]

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_df=0.05,
                              # token_pattern=r"(?u)\b\w+\b", # default=r”(?u)\b\w\w+\b” means at least 2 characters
                              # max_features = 2000,
                              stop_words=stopwords).fit(documents)

X_tfidf = tfidf.transform(documents)
print(type(X_tfidf))
print(X_tfidf.shape)


# Show transform(X_tfidf) result
tfidf.inverse_transform(X_tfidf)[:10]

<class 'scipy.sparse.csr.csr_matrix'>
(6388, 11959)


[array(['給力', '第三次', '相當快', '快遞', '店家', '光顧'], dtype='<U24'),
 array(['需要', '配置', '系統', '滑鼠', '沒送', '有點', '收集器', '指紋', '原裝', '不太好',
        'vista'], dtype='<U24'),
 array(['檔次', '推薦', '同等', '值得', '中應'], dtype='<U24'),
 array(['真凶', '嚴懲'], dtype='<U24'),
 array(['電視', '無趣', '尤其', '女主播', '多字', '像是', '三立'], dtype='<U24'),
 array(['本書', '明明', '只到', '原因', '信的過', '以後怎麼'], dtype='<U24'),
 array(['感覺還', '一下'], dtype='<U24'),
 array(['顯示', '還不錯', '速度', '硬碟', '玩遊戲', '溫度', '散熱', '以下', 'cpu', '56'],
       dtype='<U24'),
 array(['配置', '還不錯', '速度', '貼紙', '白色', '方便', '好看', '外觀', '執行', '主流',
        'vista'], dtype='<U24'),
 array(['重新', '還要', '花灑', '水超級', '水壓', '標配', '本來', '換一個', '問題', '售後還',
        '修理', '一下'], dtype='<U24')]

## w2v then Doc2Vec
把文字向量化，再對每個文件中的文字向量取平均，得出文章向量(Doc2Vec)。

In [6]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
model = Doc2Vec(tagged_documents, vector_size=100, window=2, min_count=1, workers=4)

In [7]:
tagged_documents[:5]

[TaggedDocument(words='店家 很 給力 快遞 也 是 相當快 第三次 光顧 啦', tags=[0]),
 TaggedDocument(words='這樣 的 配置 用 Vista 系統 還是 有點 卡 指紋 收集器 沒送 原裝 滑鼠 還 需要 自己 買 不太好', tags=[1]),
 TaggedDocument(words='不錯 在 同等 檔次 酒店 中應 該 是 值得 推薦 的', tags=[2]),
 TaggedDocument(words='哎 不會 是 蒙牛 乾 的 吧 嚴懲 真凶', tags=[3]),
 TaggedDocument(words='空 尤其 是 三立 電視 臺 女主播 做 的 序 尤其 無趣 像是 硬 湊 那麼 多字', tags=[4])]

In [8]:
import scipy
doc_vector = [model.infer_vector(doc) for doc in df['cleaned']]
X_w2v = scipy.sparse.csr_matrix(doc_vector)
X_w2v.shape

(6388, 100)

## with chi.square feature selector
想看哪些字有鑑別度，現在有一群已經標註過正負面的文本(Yn)。若A這個字(Xn, 也就是feature)在正負面的文章中都很多，那就對model沒有鑑別度。
- https://towardsdatascience.com/using-the-chi-squared-test-for-feature-selection-with-implementation-b15a4dad93f1

In [38]:
from sklearn.feature_selection import SelectKBest, chi2
selector = SelectKBest(score_func=chi2, k=2000)
y = df.iloc[:, 0]
fit = selector.fit(X_tfidf, y)
fit.scores_

array([0.15789817, 0.01763186, 0.45339518, ..., 0.32014909, 0.00125132,
       0.33394599])

In [39]:
X_chi2 = selector.fit_transform(X_tfidf, y)
X_chi2

<6388x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 25329 stored elements in Compressed Sparse Row format>

# Pipeline

- https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
- Pipeline只是幫忙設計整個 Vectorization -> Feature selection -> Training -> Evaluating 的流程。所以必須要自己做`train_test_split()`。但若用了`GridSearchCV`，那Grid會自己跑nfold，看要設計多少個幾個folds，所以不用特地做trian-test-split

## train_test_split()

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(documents, y, test_size=0.3)

## Modeling
- `LogisticRegression()` https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [6]:
with open("../data/stopwords_zh-tw.txt", encoding="utf-8") as fin:
    stopwords = fin.read().split("\n")[1:]

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.05, stop_words=stopwords)),
    ('clf', LogisticRegression())
])

## Using pipeline without Grid

In [9]:
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.05,
                                 stop_words=['?', '、', '。', '“', '”', '《', '》',
                                             '！', '，', '：', '；', '？', '人民',
                                             '末##末', '啊', '阿', '哎', '哎呀', '哎喲',
                                             '唉', '我', '我們', '按', '按照', '依照',
                                             '吧', '吧噠', '把', '罷了', '被', ...])),
                ('clf', LogisticRegression())])

In [49]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
cm = confusion_matrix(y_test, predicted)
cr = classification_report(y_test, predicted)
accuracy = accuracy_score(y_test, predicted)
print(accuracy)
print(cm)
print(cr)

0.8142931664058425
[[867 131]
 [225 694]]
              precision    recall  f1-score   support

           N       0.79      0.87      0.83       998
           P       0.84      0.76      0.80       919

    accuracy                           0.81      1917
   macro avg       0.82      0.81      0.81      1917
weighted avg       0.82      0.81      0.81      1917



# GridSearchCV + Pipeline

Notes. 在Pipeline的寫法中，不能把不同的models passthrough到parameters才給，必須要在Pipeline時就要給estimator（也就是ML models）。如果要這麼做的話，就是寫for-loop。例如第五節。

- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
- Selecting dimensionality reduction with Pipeline and GridSearchCV https://scikit-learn.org/stable/auto_examples/compose/plot_compare_reduction.html
- https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py

## Import library

In [42]:
with open("../data/stopwords_zh-tw.txt", encoding="utf-8") as fin:
    stopwords = fin.read().split("\n")[1:]

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression

## Designing Pipeline

In [43]:
from sklearn.pipeline import Pipeline
pipe= Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords)),
    ('clf', LogisticRegression())
])

## Designing parameters for GridSearchCV

In [44]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_df': [0.01, 0.05, 0.1, 0.2, 1.0],
#     'tfidf__token_pattern': [r"(?u)\b\w+\b", r"(?u)\b\w\w+\b"],
    'tfidf__use_idf': (True, False),
#     'clf__penalty': ('l1', 'l2', 'none'),
}

## Initialize Grid
- `cv` for cross-validation folds, which means that the GridSearchCV will splitting data into 5 folds as training-testing data automatically. No need to split data into train and test manually. 

In [46]:
grid = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1, verbose = 3)
grid.fit(documents, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf',
                                        TfidfVectorizer(stop_words=['?', '、',
                                                                    '。', '“',
                                                                    '”', '《',
                                                                    '》', '！',
                                                                    '，', '：',
                                                                    '；', '？',
                                                                    '人民',
                                                                    '末##末', '啊',
                                                                    '阿', '哎',
                                                                    '哎呀', '哎喲',
                                                                    '唉', '我',
                                                                    '我們',

In [47]:
print(grid.best_score_)
print("Params: ")
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, grid.best_params_[param_name]))

0.8498727333110295
Params: 
tfidf__max_df: 1.0
tfidf__ngram_range: (1, 1)
tfidf__use_idf: True


In [48]:
import pandas as pd
pd.DataFrame(grid.cv_results_).filter(regex='(param_.*)|(.*test_score)')

Unnamed: 0,param_tfidf__max_df,param_tfidf__ngram_range,param_tfidf__use_idf,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.01,"(1, 1)",True,0.822379,0.805947,0.805164,0.779953,0.810493,0.804787,0.013858,19
1,0.01,"(1, 1)",False,0.819249,0.805164,0.79421,0.76899,0.796398,0.796802,0.016457,20
2,0.01,"(1, 2)",True,0.840376,0.830203,0.837246,0.797181,0.847298,0.830461,0.017524,13
3,0.01,"(1, 2)",False,0.838811,0.820814,0.823944,0.790916,0.829287,0.820754,0.01612,18
4,0.05,"(1, 1)",True,0.837246,0.838028,0.834898,0.818324,0.835552,0.83281,0.00733,12
5,0.05,"(1, 1)",False,0.831768,0.827074,0.826291,0.810493,0.824589,0.824043,0.007181,15
6,0.05,"(1, 2)",True,0.837246,0.843505,0.832551,0.81989,0.847298,0.836098,0.009559,9
7,0.05,"(1, 2)",False,0.825509,0.833333,0.820814,0.811276,0.830854,0.824357,0.007846,14
8,0.1,"(1, 1)",True,0.836463,0.836463,0.834898,0.820673,0.839468,0.833593,0.006627,11
9,0.1,"(1, 1)",False,0.833333,0.827074,0.825509,0.811276,0.821457,0.82373,0.007305,17


# Multi-Models

Classification of text documents using sparse features https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py
> The example design a benchmark for reporting all model training and testing results

In [49]:
from sklearn.pipeline import Pipeline
pipe= Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords)),
    ('clf', LogisticRegression())
])
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, label in enumerate(target_names):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s" % (label, " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=target_names))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time

In [None]:
results = []
for clf, name in (
        ((tol=1e-2, solver="sag"), "Ridge Classifier"),
        (Perceptron(max_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50),
         "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

# (In-Complete) Multiple models with different parameters
- different models in pipeline for gridsearchv https://stackoverflow.com/questions/50265993/alternate-different-models-in-pipeline-for-gridsearchcv
- plot grid search stats. https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_stats.html

In [65]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC

models = {
#     'tfidf': TfidfVectorizer(max_df=0.05, stop_words=stopwords),
    'RF': RandomForestClassifier(),
    'KNN': KNeighborsClassifier(),
    'LR': LogisticRegression(),
    'NB': GaussianNB(),
    'GB': GradientBoostingClassifier(),
    'SVM': svm.SVC()
}

params = {
#     'tfidf': {
#         'ngram_range': [(1, 1), (1, 2)]
#     },
    'RF':{ 
            "n_estimators": [100, 200, 500, 1000],
            "max_features": ["auto", "sqrt", "log2"],
            "bootstrap": [True],
            "criterion": ['gini', 'entropy'],
            "oob_score": [True, False]
            },
    'KNN': {
        'n_neighbors': range(3, 15),
        'weights': ['uniform', 'distance'],
        'algorithm': ['ball_tree', 'kd_tree', 'brute']
        },
    'LR': {
        'solver': ['newton-cg', 'sag', 'lbfgs'],
        'multi_class': ['ovr', 'multinomial']
        }  
}


In [68]:
for name in models.keys():
    text_clf = Pipeline([
        ('tfidf', TfidfVectorizer(max_df=0.05, stop_words=stopwords)),
        ('clf', models[name])
    ])
    gscv = GridSearchCV(text_clf, param_grid=params[name], cv=5)
    gscv.fit(X_train, y_train)
    print("best parameters are: {}".format(gscv.best_estimator_))
    y_pred = gscv.predict(X_test)
    print(grid.best_score_)
    print(accuracy_score(y_test, y_pred))


TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.05,
                                 stop_words=['?', '、', '。', '“', '”', '《', '》',
                                             '！', '，', '：', '；', '？', '人民',
                                             '末##末', '啊', '阿', '哎', '哎呀', '哎喲',
                                             '唉', '我', '我們', '按', '按照', '依照',
                                             '吧', '吧噠', '把', '罷了', '被', ...])),
                ('clf',
                 TfidfVectorizer(max_df=0.05,
                                 stop_words=['?', '、', '。', '“', '”', '《', '》',
                                             '！', '，', '：', '；', '？', '人民',
                                             '末##末', '啊', '阿', '哎', '哎呀', '哎喲',
                                             '唉', '我', '我們', '按', '按照', '依照',
                                             '吧', '吧噠', '把', '罷了', '被', ...]))]) does not.