### author  : lzw
### description: using gensim doc2vec to handle sentimentclassification ( doc2vec as text representation )
### reference : tutorial on gensim official
       (https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb)
### opinion: 
  - Use an external metric to evaluate doc2vec embedding.(sentiment classification) 
  
   - Imbalanced dataset of sentiment classification, might a bad metric to evaluate doc2vec.
  
  - It might be useful to throw away stopwords for a small dataset.
  
### requirement
  - python35

In [1]:
import datetime
import multiprocessing
from collections import OrderedDict
from random import shuffle
import gensim
import jieba
import numpy as np
from gensim.models.doc2vec import Doc2Vec
from sklearn import linear_model
from sklearn.metrics import classification_report
from data_helper import load_text_label, elapsed_timer, split_file_classes
import sys ; sys.path.insert(0,"../")
import utils

jieba.load_userdict('../dataset/digital forum comments/digital_selfdict.txt')
cores = multiprocessing.cpu_count()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\William\AppData\Local\Temp\jieba.cache
Loading model cost 1.211 seconds.
Prefix dict has been built succesfully.


In [4]:
"""
    load corpus and specific split portions.
"""
corpus_path = '../dataset/digital forum comments/split_20180403/classes/'
label_index, class_name_count = split_file_classes('../dataset/digital forum comments/split_20180403/all.txt')
split = [0.7, 0.3]
corpus, train_len_total, dev_len_total = load_text_label(corpus_path, class_name_count, label_index, split, shuffle=True)
corpus_train = [d for d in corpus if d.split == 'train']
corpus_dev = [d for d in corpus if d.split == 'dev']
corpus_unk = [d for d in corpus if d.split == 'UNK']# sentences with unknown label

corpus_len = len(corpus)
print ('train texts num: %d' %(train_len_total))
print ('dev texts num: %d' %(dev_len_total))
print ('unk texts num %d' %(len(corpus_unk)))

train texts num: 74346
dev texts num: 31864
unk texts num 245907


In [24]:
"""
    specific doc2vec model parameters
"""
simple_models = [
    # PV-DM w/ concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, vector_size=100, window=5, negative=20, hs=0, min_count=3, workers=cores),
    # PV-DBOW
    Doc2Vec(dm=0, vector_size=100, negative=20, hs=0, min_count=3, workers=cores),
    # PV-DM w/ average
    Doc2Vec(dm=1, dm_mean=1, vector_size=100, window=5, negative=20, hs=0, min_count=3, workers=cores),
]
models_by_name = OrderedDict((str(model), model) for model in simple_models)

# throw stop words away
stop_words_file = '../dataset/stopwords.txt'
stop_words_file = open(stop_words_file, encoding='utf-8')
stop_words = set(stop_words_file.read().split('\n'))
stop_words_file.close()
def my_trim(word, count, min_count):
    if word in stop_words:
        return gensim.utils.RULE_DISCARD
    else:
        return gensim.utils.RULE_DEFAULT
    
simple_models[0].build_vocab(corpus, trim_rule=my_trim)
# simple_models[0].build_vocab_from_freq(word2count)# PV-DM w/ concat requires one special NULL word so it serves as template
print(simple_models[0])
for model in simple_models[1:]:
    model.reset_from(simple_models[0])
    print(model)

Doc2Vec(dm/c,d100,n20,w5,mc3,s0.001,t4)
Doc2Vec(dbow,d100,n20,mc3,s0.001,t4)
Doc2Vec(dm/m,d100,n20,w5,mc3,s0.001,t4)


In [25]:
alpha, min_alpha, epoches = 0.01, 0.025, 30
alpha_delta = (alpha - min_alpha) / epoches
print("START %s" % datetime.datetime.now())
for epoch in range(1, epoches + 1):
    shuffle(corpus)
    for name, train_model in models_by_name.items():
        train_model.alpha, train_model.min_alpha = alpha, alpha
        train_model.train(corpus, total_examples=corpus_len, epochs=1)  # Train
    alpha -= alpha_delta
    print('[complete epoch %d ]' % (epoch))

print("END %s" % str(datetime.datetime.now()))

START 2018-04-05 21:36:42.325323
[complete epoch 1 ]
[complete epoch 2 ]
[complete epoch 3 ]
[complete epoch 4 ]
[complete epoch 5 ]
[complete epoch 6 ]
[complete epoch 7 ]
[complete epoch 8 ]
[complete epoch 9 ]
[complete epoch 10 ]
[complete epoch 11 ]
[complete epoch 12 ]
[complete epoch 13 ]
[complete epoch 14 ]
[complete epoch 15 ]
[complete epoch 16 ]
[complete epoch 17 ]
[complete epoch 18 ]
[complete epoch 19 ]
[complete epoch 20 ]
END 2018-04-05 22:04:23.550819


## word vectors similarity

In [26]:
word_models = simple_models[:]

In [37]:
import random
from IPython.display import HTML
# pick a random word with a suitable number of occurences
# while True:
#     word = random.choice(word_models[0].wv.index2word)
#     if word_models[0].wv.vocab[word].count > 2000:
#         break
word='蓝屏'
# word='华为'
# or uncomment below line, to just pick a word from the relevant domain:
#word = 'comedy/drama'
similars_per_model = [str(model.most_similar(word, topn=20)).replace('), ','),<br>\n') for model in word_models]
similar_table = ("<table><tr><th>" +
    "</th><th>".join([str(model) for model in word_models]) + 
    "</th></tr><tr><td>" +
    "</td><td>".join(similars_per_model) +
    "</td></tr></table>")
print("most similar words for '%s' (%d occurences)" % (word, simple_models[0].wv.vocab[word].count))
HTML(similar_table)

most similar words for '蓝屏' (304 occurences)




"Doc2Vec(dm/c,d100,n20,w5,mc3,s0.001,t4)","Doc2Vec(dbow,d100,n20,mc3,s0.001,t4)","Doc2Vec(dm/m,d100,n20,w5,mc3,s0.001,t4)"
"[('死机', 0.6453672647476196), ('掉帧', 0.5336572527885437), ('黑屏', 0.5276408195495605), ('不好受', 0.5066460967063904), ('重启', 0.5064347982406616), ('卡住', 0.49836763739585876), ('啪', 0.4920652210712433), ('卡死', 0.48987439274787903), ('扫射', 0.4890672564506531), ('driverpower', 0.4881700277328491), ('浓', 0.4871233105659485), ('消失', 0.4867590665817261), ('花屏', 0.48554563522338867), ('闭幕', 0.48437780141830444), ('脱胎换骨', 0.4842427372932434), ('闪退', 0.48232191801071167), ('失火', 0.48080918192863464), ('开不了机', 0.478066623210907), ('抖动', 0.4774223268032074), ('贡酒', 0.47672390937805176)]","[('炮火', 0.3934079706668854), ('山东地区', 0.38848552107810974), ('告别', 0.3857855796813965), ('主动出击', 0.3735121488571167), ('实质', 0.37349337339401245), ('Pay', 0.37048235535621643), ('言论', 0.3557182252407074), ('B型', 0.353630393743515), ('长沙地区', 0.35167157649993896), ('主城', 0.3503320813179016), ('摔', 0.3464968800544739), ('GPS', 0.34526538848876953), ('九千', 0.34388646483421326), ('白板', 0.34240207076072693), ('潍坊', 0.33825549483299255), ('备', 0.33548614382743835), ('回溯', 0.3300383687019348), ('logo', 0.3257746696472168), ('肺', 0.32479560375213623), ('秀才', 0.324740469455719)]","[('死机', 0.5976219773292542), ('重装系统', 0.5648543238639832), ('FPS', 0.5384677648544312), ('卡死', 0.5288370251655579), ('开不开机', 0.504610002040863), ('闪退', 0.5013353824615479), ('重启', 0.5004960894584656), ('开机', 0.4987723231315613), ('花屏', 0.4918825924396515), ('win', 0.49185311794281006), ('莫名其妙', 0.48777681589126587), ('进不了', 0.47796082496643066), ('自动', 0.4706798195838928), ('卡住', 0.46173620223999023), ('重装', 0.4606199264526367), ('黑屏', 0.4562179446220398), ('光盘', 0.4546934962272644), ('卡机', 0.45238950848579407), ('lol', 0.44898971915245056), ('报错', 0.4460287094116211)]"


## doc vectors similarity

In [31]:
import random

doc_id = np.random.randint(simple_models[0].docvecs.count)  # pick random doc, re-run cell for more examples
model = random.choice(simple_models)  # and a random model
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)  # get *all* similar documents
print(u'TARGET (%d): « %s »\n' % (doc_id, ' '.join(corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: « %s »\n' % (label, sims[index], ' '.join(corpus[sims[index][0]].words)))

TARGET (331550): « 联想拯救者 R 可以 装 固态 吗 »

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/c,d100,n20,w5,mc3,s0.001,t4):

MOST (211464, 0.8575758934020996): « 长虹 华为 签署 战略 合作 协议 共推 智慧 城市 发展 »

MEDIAN (263460, 0.009524062275886536): « 华为 最 经典 的 外观设计 为什么 没有 延续 使用 下去 »

LEAST (65436, -0.7224101424217224): « 任正非 计划 未来 几年 华为 每年 研发 经费 要 提高 到 亿美元 »



## sentiment classificatioin
### 0：负面； 1：中立

In [77]:
infer_steps = 20
infer_alpha = 0.01
infer_min_alpha=0.025

In [46]:
sentiment_clf_models = {}

for name, train_model in models_by_name.items():
    
    print ('-'*10 + ' model: ' + name + '   ' + '-'*10)
    dev_labels = [doc.label for doc in corpus if doc.split == 'dev']
    clf = linear_model.SGDClassifier(max_iter=1000, verbose=1, tol=10, learning_rate='constant', eta0=0.01,
                                     class_weight={1: 1, 0: 4}, loss='log', penalty='elasticnet')

    clf.fit([train_model.docvecs[doc.tags[0]] for doc in corpus if doc.split == 'train'],
            [doc.label for doc in corpus if doc.split == 'train'])

    classifedLabels_dev = clf.predict(
        [train_model.docvecs[doc.tags[0]] for doc in corpus if doc.split == 'dev'])
    print(
        ' [clf  ' + name + ' ]' + '\n' + str(classification_report(dev_labels, classifedLabels_dev)))

    classifedLabels_dev_infer = clf.predict(np.array( # infer dev texts to evaluate doc2vec model 
        [train_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha, min_alpha=infer_min_alpha) for doc in corpus if
         doc.split == 'dev']))
    print(
        ' [clf  ' + name + ' infer ' + str(infer_steps) +  ' ]' + '\n' + str(
            classification_report(dev_labels, classifedLabels_dev_infer)) + '\n')
    
    sentiment_clf_models[name] = clf
    print ('-'*70)


---------- model: Doc2Vec(dm/c,d100,n20,w5,mc3,s0.001,t4)   ----------
-- Epoch 1
Norm: 2.81, NNZs: 99, Bias: -0.110767, T: 74346, Avg. loss: 0.703230
Total training time: 0.07 seconds.
-- Epoch 2
Norm: 3.78, NNZs: 94, Bias: -0.223693, T: 148692, Avg. loss: 0.699638
Total training time: 0.17 seconds.
Convergence after 2 epochs took 0.17 seconds
 [clf  Doc2Vec(dm/c,d100,n20,w5,mc3,s0.001,t4) ]
             precision    recall  f1-score   support

          0       0.21      0.89      0.34      6620
          1       0.79      0.11      0.20     25244

avg / total       0.67      0.27      0.22     31864

 [clf  Doc2Vec(dm/c,d100,n20,w5,mc3,s0.001,t4) infer 20 ]
             precision    recall  f1-score   support

          0       0.21      0.80      0.33      6620
          1       0.79      0.20      0.32     25244

avg / total       0.67      0.32      0.32     31864

------------------------------
---------- model: Doc2Vec(dbow,d100,n20,mc3,s0.001,t4)   ----------
-- Epoch 1
Norm: 

### test texts 

#### using unseen texts to evaluate sentiment classification logistic regression 

In [78]:
# for unseen text, what is a good infer parameters？
# small learning rate might only inferring noises.
test_infer_steps = 30
test_infer_alpha = 0.05
test_infer_min_alpha=0.025

In [72]:
test_filepath = '../dataset/digital forum comments/测试集3_联想_杨欣标注15000_20171116.txt'
test_texts, test_labels = utils.load_data_label(test_filepath)
test_labels = [1-l for l in test_labels]
negative_len = len([l for l in test_labels if l==0])

In [75]:
print ('len of negative: %d , len of positive: %d, portion: %d : %f \n' % (negative_len, len(test_labels)-negative_len, 
                                                                       1,negative_len/(len(test_labels)-negative_len)))

for name, clf_model in sentiment_clf_models.items():
    
    print ('-'*10 + ' model: ' + name + '   ' + '-'*10)
    classifedLabels_test_infer = clf_model.predict(np.array( # infer dev texts to evaluate doc2vec model 
        [models_by_name[name].infer_vector(jieba.lcut(''.join(utils.accept_sentence(doc)), HMM=False), 
                                           steps=test_infer_steps, alpha=test_infer_steps, min_alpha=test_infer_alpha) 
                                           for doc in test_texts]))
    
    print(
        ' [clf  ' + name + ' infer ' + str(infer_steps) +  ' ]' + '\n' + str(
            classification_report(test_labels, classifedLabels_test_infer)) + '\n')
    
    print ('-'*70)


len of negative: 1251 , len of positive: 13737, portion:1 : 0.091068 

---------- model: Doc2Vec(dbow,d100,n20,mc3,s0.001,t4)   ----------
 [clf  Doc2Vec(dbow,d100,n20,mc3,s0.001,t4) infer 30 ]
             precision    recall  f1-score   support

          0       0.11      0.35      0.16      1251
          1       0.93      0.74      0.82     13737

avg / total       0.86      0.71      0.77     14988


----------------------------------------------------------------------
---------- model: Doc2Vec(dm/c,d100,n20,w5,mc3,s0.001,t4)   ----------
 [clf  Doc2Vec(dm/c,d100,n20,w5,mc3,s0.001,t4) infer 30 ]
             precision    recall  f1-score   support

          0       0.08      0.73      0.15      1251
          1       0.91      0.24      0.38     13737

avg / total       0.84      0.28      0.36     14988


----------------------------------------------------------------------
---------- model: Doc2Vec(dm/m,d100,n20,w5,mc3,s0.001,t4)   ----------
 [clf  Doc2Vec(dm/m,d100,n20,w5,