In [6]:
import os
import gensim
from gensim import corpora, models
import smart_open 
import numpy as np
from numpy import random
random.seed(555)
from scipy.cluster.vq import vq, kmeans, whiten
from sklearn.decomposition import TruncatedSVD
from collections import defaultdict



In [7]:
# 檔案路徑
amazon_corpus_path = "C:" + os.sep + "temp" + os.sep + "BLACK-DECKER.csv"

In [8]:
# 斷詞 & 標籤
def read_corpus(fname, tokens_only=False):
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.LabeledSentence(gensim.utils.simple_preprocess(line), ['SENT_%s' % i])

In [9]:
# 訓練組與測試組
train_corpus = list(read_corpus(amazon_corpus_path))
test_corpus = list(read_corpus(amazon_corpus_path, tokens_only=True))

In [50]:
train_corpus[:2]

[LabeledSentence(words=['ve', 'owned', 'some', 'type', 'of', 'dustbuster', 'handvac', 'for', 'the', 'last', 'twenty', 'years', 'and', 'don', 'think', 've', 'ever', 'ever', 'had', 'one', 'that', 'didn', 'hate', 'too', 'little', 'suction', 'power', 'not', 'enough', 'charge', 'time', 'the', 'suction', 'nozzle', 'is', 'awkward', 'to', 'use', 'and', 'the', 'batteries', 'are', 'always', 'dead', 'after', 'couple', 'minutes', 'can', 'keep', 'them', 'plugged', 'into', 'the', 'wall', 'all', 'the', 'time', 'but', 'that', 'kills', 'the', 'battery', 'and', 'wastes', 'electricity', 'have', 'found', 'it', 'much', 'more', 'effective', 'to', 'just', 'use', 'the', 'suction', 'hose', 'on', 'my', 'upright', 'even', 'though', 'that', 'means', 'lugging', 'my', 'vacuum', 'around', 'so', 'decided', 'to', 'see', 'what', 'is', 'new', 'in', 'handvac', 'technology', 'and', 'if', 'they', 'have', 'improved', 'at', 'all', 'over', 'the', 'last', 'ten', 'years', 'fortunately', 'can', 'say', 'that', 'they', 'have', 'th

In [10]:
# 建模
model = models.Doc2Vec(alpha=0.025, min_alpha=0.025)  # use fixed learning rate
model.build_vocab(train_corpus)

In [9]:
# 學習（學習速率遞減）
epoch = 10
for _ in range(epoch):
    model.train(train_corpus)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
print ('done training')

done training


In [10]:
# 儲存模型
model.save("C:" + os.sep + "temp" + os.sep +"try.d2v")

In [11]:
# 載入模型
model = gensim.models.Doc2Vec.load("C:" + os.sep + "temp" + os.sep +"try.d2v")

In [62]:
# 把每篇文章的向量塞進矩陣裡面
lenx=len(train_corpus)
leny=len(model.infer_vector(train_corpus[0].words))
print(lenx,leny)
a = np.zeros((lenx,leny))
for i_sent in range(len(train_corpus)):
    a[i_sent,:] = model.infer_vector(test_corpus[i_sent])
print(a[:2])

9362 100
[[ 0.71777129 -0.2968137   1.74602664 -0.30613655 -2.69307899 -0.43241048
   1.12778926  0.26941639  0.38615209 -0.41628286 -0.76238769 -0.09876788
  -1.12781191 -0.85713768  0.62653118 -1.49333346 -0.08015751 -1.22722721
  -0.1390937   1.40933192 -0.89079672  0.27064511  1.13283479 -0.26122171
   2.10870814 -0.77381921  1.96802855  1.11911893 -1.14450192 -0.45961979
   0.34697917 -0.52431965 -0.84373212 -0.51649266  0.47457075 -0.69516867
  -0.10425612  0.79353291 -0.60305548 -0.57055169 -0.22832471  1.01737738
   2.06921506  0.14518255  0.92332911  0.16651233 -0.20290974  2.01038456
  -1.28398776 -0.95935118  1.339203   -1.23555207 -0.53434384 -1.69657874
  -1.3845191  -0.3434324  -0.25409126 -0.57073379  0.04967006  0.74818397
   0.76053321 -2.11183715  0.38309515  0.24115428  0.80337095  0.87139839
  -0.26672438  0.00822973 -0.80505943  0.87872058 -0.78241342  0.0992297
   0.70013189  1.19461274  0.63134563  1.00228322  1.96896958 -0.0401099
  -0.30581975 -1.25995636  0.21

In [63]:
# 正規化（？）
sim_matrix = whiten(a)

In [64]:
# Kmeans分7群（7是隨便挑的，之後要用elbow method或輪廓係數找最佳群組數）
centroid, destortion = kmeans(sim_matrix, 7, iter=100, thresh=1e-05)
labels, dist = vq(sim_matrix, centroid)
print(labels[0])

2


In [65]:
# 按標籤將屬於該群的文章INDEX存起來

cluster0 = []
cluster1 = []
cluster2 = []
cluster3 = []
cluster4 = []
cluster5 = []
cluster6 = []

for i in range(len(labels)):
    if (labels[i] == 0):
        cluster0.append(i);
    elif (labels[i] == 1):
        cluster1.append(i);
    elif (labels[i] == 2):
        cluster2.append(i);
    elif (labels[i] == 3):
        cluster3.append(i);
    elif (labels[i] == 4):
        cluster4.append(i);
    elif (labels[i] == 5):
        cluster5.append(i);
    elif (labels[i] == 6):
        cluster6.append(i);

In [66]:
# 標籤為0的群組文章INDEX
print(cluster0)

[59, 911, 2336, 2417, 2588, 2984, 3043, 3142, 3160, 3260, 3303, 3306, 3322, 3343, 3362, 3367, 3372, 3390, 3396, 3397, 3400, 3420, 3425, 3429, 3437, 3440, 3456, 3458, 3467, 3469, 3483, 3484, 3488, 3510, 3527, 3531, 3544, 3558, 3559, 3562, 3569, 3571, 3581, 3583, 3585, 3595, 3601, 3602, 3604, 3666, 3767, 3829, 3904, 4112, 4257, 4301, 4337, 4528, 4549, 4550, 4605, 4704, 4721, 4733, 4737, 4759, 4770, 4804, 4841, 4846, 4848, 4898, 4903, 4907, 4940, 5023, 5024, 5026, 5037, 5038, 5125, 5180, 5190, 5202, 5225, 5253, 5267, 5298, 5326, 5371, 5392, 5400, 5457, 5478, 5484, 5489, 5503, 5538, 5561, 5628, 5641, 5664, 5689, 5751, 5815, 5820, 5848, 5849, 5854, 5856, 5896, 5906, 5910, 5949, 5953, 5974, 5987, 5988, 5989, 6000, 6027, 6028, 6035, 6039, 6066, 6079, 6081, 6114, 6124, 6125, 6157, 6158, 6178, 6179, 6185, 6194, 6205, 6225, 6228, 6233, 6248, 6288, 6294, 6304, 6308, 6319, 6331, 6341, 6345, 6354, 6360, 6386, 6387, 6391, 6397, 6428, 6450, 6451, 6456, 6463, 6466, 6468, 6480, 6483, 6488, 6499, 6501, 

In [67]:
# 按文章INDEX將文章分群

corpus_clu0 = []
corpus_clu1 = []
corpus_clu2 = []
corpus_clu3 = []
corpus_clu4 = []
corpus_clu5 = []
corpus_clu6 = []

for j in range(len(cluster0)):
    corpus_clu0.append(train_corpus[cluster0[j]]);
for j in range(len(cluster1)):
    corpus_clu1.append(train_corpus[cluster1[j]]);    
for j in range(len(cluster2)):
    corpus_clu2.append(train_corpus[cluster2[j]]);
for j in range(len(cluster3)):
    corpus_clu3.append(train_corpus[cluster3[j]]); 
for j in range(len(cluster4)):
    corpus_clu4.append(train_corpus[cluster4[j]]);     
for j in range(len(cluster5)):
    corpus_clu5.append(train_corpus[cluster5[j]]);
for j in range(len(cluster6)):
    corpus_clu6.append(train_corpus[cluster6[j]]);

In [68]:
# 看看第0群的第0篇文章內容與標籤
print(corpus_clu0[0])

LabeledSentence(['have', 'it', 'in', 'order', 'but', 'it', 'funny', 'how', 'the', 'first', 'picture', 'shows', 'dirt', 'on', 'narrow', 'location', 'but', 'that', 'the', 'physical', 'size', 'of', 'the', 'vacuum', 'is', 'not', 'able', 'to', 'physically', 'fit', 'to', 'clean', 'the', 'whole', 'mess'], ['SENT_59'])


In [69]:
# 看看第0群的第0篇文章標籤
print(corpus_clu0[0].tags)

['SENT_59']


In [70]:
# 看看第0群的第0篇文章內容
print(corpus_clu0[0].words)

['have', 'it', 'in', 'order', 'but', 'it', 'funny', 'how', 'the', 'first', 'picture', 'shows', 'dirt', 'on', 'narrow', 'location', 'but', 'that', 'the', 'physical', 'size', 'of', 'the', 'vacuum', 'is', 'not', 'able', 'to', 'physically', 'fit', 'to', 'clean', 'the', 'whole', 'mess']


In [71]:
# 只挑文章內容存進去各個群組
corpus_clu0_wordlist = []
corpus_clu1_wordlist = []
corpus_clu2_wordlist = []
corpus_clu3_wordlist = []
corpus_clu4_wordlist = []
corpus_clu5_wordlist = []
corpus_clu6_wordlist = []

for i in range(len(corpus_clu0)):
    corpus_clu0_wordlist.append(corpus_clu0[i].words)
for i in range(len(corpus_clu1)):
    corpus_clu1_wordlist.append(corpus_clu1[i].words)
for i in range(len(corpus_clu2)):
    corpus_clu2_wordlist.append(corpus_clu2[i].words)
for i in range(len(corpus_clu3)):
    corpus_clu3_wordlist.append(corpus_clu3[i].words)
for i in range(len(corpus_clu4)):
    corpus_clu4_wordlist.append(corpus_clu4[i].words)
for i in range(len(corpus_clu5)):
    corpus_clu5_wordlist.append(corpus_clu5[i].words)
for i in range(len(corpus_clu6)):
    corpus_clu6_wordlist.append(corpus_clu6[i].words)

In [72]:
# 看看第0個群組的前2篇文章
corpus_clu0_wordlist[:2]

[['have',
  'it',
  'in',
  'order',
  'but',
  'it',
  'funny',
  'how',
  'the',
  'first',
  'picture',
  'shows',
  'dirt',
  'on',
  'narrow',
  'location',
  'but',
  'that',
  'the',
  'physical',
  'size',
  'of',
  'the',
  'vacuum',
  'is',
  'not',
  'able',
  'to',
  'physically',
  'fit',
  'to',
  'clean',
  'the',
  'whole',
  'mess'],
 ['power',
  'is',
  'great',
  'holds',
  'good',
  'charge',
  'my',
  'previous',
  'one',
  'had',
  'fixture',
  'to',
  'hold',
  'it',
  'on',
  'the',
  'wall',
  'for',
  'this',
  'one',
  'the',
  'recharging',
  'base',
  'sits',
  'on',
  'the',
  'floor',
  'and',
  'if',
  'knocked',
  'over',
  'or',
  'set',
  'slightly',
  'off',
  'the',
  'base',
  'it',
  'doesn',
  'charge']]

In [73]:
# 把只出現一次的字移除
from collections import defaultdict
from pprint import pprint  # pretty-printer

def remove_less_than_one(corpus_clu_wordlist):
    frequency = defaultdict(int)
    for text in corpus_clu_wordlist:
         for token in text:
            frequency[token] += 1

    corpus_clu_wordlist = [[token for token in text if frequency[token] > 1]
                             for text in corpus_clu_wordlist]
    return corpus_clu_wordlist;

In [74]:
remove_less_than_one(corpus_clu0_wordlist)
remove_less_than_one(corpus_clu1_wordlist)
remove_less_than_one(corpus_clu2_wordlist)
remove_less_than_one(corpus_clu3_wordlist)
remove_less_than_one(corpus_clu4_wordlist)
remove_less_than_one(corpus_clu5_wordlist)
remove_less_than_one(corpus_clu6_wordlist)

[['we',
  'have',
  'been',
  'buying',
  'new',
  'handheld',
  'vacuum',
  'every',
  'months',
  'for',
  'the',
  'past',
  'years',
  'or',
  'so',
  'typically',
  'after',
  'year',
  'the',
  'performance',
  'is',
  'so',
  'poor',
  'due',
  'to',
  'battery',
  'recharging',
  'issues',
  'that',
  'the',
  'vacs',
  'become',
  'useless',
  'after',
  'reading',
  'the',
  'reviews',
  'on',
  'amazon',
  'about',
  'this',
  'one',
  'decided',
  'to',
  'take',
  'chance',
  'on',
  'it',
  'even',
  'though',
  'typically',
  'try',
  'to',
  'find',
  'higher',
  'voltage',
  'items',
  'won',
  'go',
  'into',
  'much',
  'detail',
  'about',
  'the',
  'product',
  'because',
  'others',
  'have',
  'done',
  'good',
  'job',
  'explaining',
  'the',
  'features',
  'for',
  'us',
  'it',
  'works',
  'very',
  'well',
  'it',
  'sits',
  'on',
  'its',
  'charger',
  'inside',
  'floor',
  'level',
  'kitchen',
  'cabinet',
  'where',
  'it',
  'is',
  'easy',
  'to'

In [75]:
print(corpus_clu0_wordlist)

[['have', 'it', 'in', 'order', 'but', 'it', 'funny', 'how', 'the', 'first', 'picture', 'shows', 'dirt', 'on', 'narrow', 'location', 'but', 'that', 'the', 'physical', 'size', 'of', 'the', 'vacuum', 'is', 'not', 'able', 'to', 'physically', 'fit', 'to', 'clean', 'the', 'whole', 'mess'], ['power', 'is', 'great', 'holds', 'good', 'charge', 'my', 'previous', 'one', 'had', 'fixture', 'to', 'hold', 'it', 'on', 'the', 'wall', 'for', 'this', 'one', 'the', 'recharging', 'base', 'sits', 'on', 'the', 'floor', 'and', 'if', 'knocked', 'over', 'or', 'set', 'slightly', 'off', 'the', 'base', 'it', 'doesn', 'charge'], ['are', 'all', 'hand', 'vacuums', 'same', 'of', 'course', 'not', 'after', 'buying', 'and', 'working', 'this', 'about', 'months', 'concluded', 'it', 'is', 'the', 'best', 'the', 'charge', 'last', 'long', 'sucks', 'powerful', 'easy', 'to', 'empty', 'and', 'clean', 'don', 'like', 'the', 'platform', 'recharge', 'system', 'prefer', 'the', 'old', 'way', 'but', 'still', 'very', 'pleased', 'with', '

In [83]:
# 建立字典印出，有8585個不重複的字
dictionary = corpora.Dictionary(test_corpus)
print(dictionary)

Dictionary(8585 unique tokens: ['lesson', 'blocks', 'settles', 'mud', 'eject']...)


In [84]:
# 看看字的ID號碼
print(dictionary.token2id)



In [85]:
# 顯示各篇文章出現的字的ID跟次數
corpus0 = [dictionary.doc2bow(text) for text in corpus_clu0_wordlist]
corpus1 = [dictionary.doc2bow(text) for text in corpus_clu1_wordlist]
corpus2 = [dictionary.doc2bow(text) for text in corpus_clu2_wordlist]
corpus3 = [dictionary.doc2bow(text) for text in corpus_clu3_wordlist]
corpus4 = [dictionary.doc2bow(text) for text in corpus_clu4_wordlist]
corpus5 = [dictionary.doc2bow(text) for text in corpus_clu5_wordlist]
corpus6 = [dictionary.doc2bow(text) for text in corpus_clu6_wordlist]

#corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  # store to disk, for later use
print(corpus0)

[[(26, 2), (29, 1), (39, 2), (44, 1), (53, 1), (55, 1), (70, 1), (76, 1), (80, 2), (81, 1), (88, 4), (109, 1), (185, 1), (297, 1), (314, 1), (324, 1), (404, 1), (476, 1), (556, 1), (628, 1), (658, 1), (808, 1), (881, 1), (1044, 1), (1516, 1), (1909, 1), (1910, 1), (1911, 1), (1912, 1)], [(26, 1), (32, 1), (38, 1), (39, 2), (51, 1), (70, 2), (71, 1), (88, 4), (101, 1), (134, 1), (142, 1), (147, 1), (149, 2), (164, 1), (174, 1), (185, 1), (225, 2), (229, 1), (254, 1), (304, 1), (316, 1), (333, 1), (368, 2), (410, 1), (457, 1), (527, 1), (662, 1), (763, 1), (874, 1), (970, 1), (987, 1), (5407, 1)], [(26, 1), (28, 1), (39, 1), (44, 1), (55, 1), (80, 1), (81, 1), (86, 1), (88, 4), (93, 1), (111, 1), (121, 1), (135, 1), (142, 2), (147, 2), (149, 1), (154, 1), (185, 1), (198, 1), (204, 1), (224, 1), (226, 1), (235, 1), (246, 1), (287, 1), (303, 1), (314, 1), (318, 1), (330, 1), (331, 1), (334, 1), (360, 1), (373, 1), (380, 1), (430, 1), (784, 1), (895, 1), (976, 1), (1115, 1), (1256, 1), (129

In [86]:
# TF-IDF一下
from gensim import corpora, models, similarities
tfidf0 = models.TfidfModel(corpus0)
tfidf1 = models.TfidfModel(corpus1)
tfidf2 = models.TfidfModel(corpus2)
tfidf3 = models.TfidfModel(corpus3)
tfidf4 = models.TfidfModel(corpus4)
tfidf5 = models.TfidfModel(corpus5)
tfidf6 = models.TfidfModel(corpus6)

corpus_tfidf0 = tfidf0[corpus0]
corpus_tfidf1 = tfidf1[corpus1]
corpus_tfidf2 = tfidf2[corpus2]
corpus_tfidf3 = tfidf3[corpus3]
corpus_tfidf4 = tfidf4[corpus4]
corpus_tfidf5 = tfidf5[corpus5]
corpus_tfidf6 = tfidf6[corpus6]

In [87]:
# 第0個群組有1564篇文章
print(tfidf0)

TfidfModel(num_docs=838, num_nnz=4795)


In [88]:
# 第0個群組每篇文章出現的字的ID與TF-IDF值
for doc in corpus_tfidf0:
    print(doc)

[(26, 0.1451522066522173), (29, 0.1266039513433135), (39, 0.1050863147189037), (44, 0.0928867090119253), (53, 0.21000643692961443), (55, 0.11467580544502594), (70, 0.13557164430982171), (76, 0.11357153688767754), (80, 0.20658967426980632), (81, 0.11250125127198199), (88, 0.2876543112344899), (109, 0.1507126763537977), (185, 0.090010563953751), (297, 0.1717950118477987), (314, 0.09804898506210055), (324, 0.17813643600634171), (404, 0.23411516194009865), (476, 0.23411516194009865), (556, 0.1858977119191302), (628, 0.23411516194009865), (658, 0.23411516194009865), (808, 0.23411516194009865), (881, 0.1959037368582829), (1044, 0.15769231177646717), (1516, 0.21000643692961443), (1909, 0.23411516194009865), (1910, 0.23411516194009865), (1911, 0.1858977119191302), (1912, 0.23411516194009865)]
[(26, 0.06591857567851515), (32, 0.17793315860140763), (38, 0.2126393855907733), (39, 0.0954465708698683), (51, 0.11095197850388017), (70, 0.24627086012436325), (71, 0.2126393855907733), (88, 0.2612672989

In [89]:
# 看一下各群組前100個TF-IDF值較高的字
from collections import Counter

def get_top100_word(corpus_tfidf):
    d = {}
    for doc in corpus_tfidf:
        for id, value in doc:
            word = dictionary.get(id)
            d[word] = value
    d_val = Counter(d)
    return d_val.most_common(100)  

In [90]:
# 第0群
get_top100_word(corpus_tfidf0)

[('okie', 1.0),
 ('wonderful', 1.0),
 ('sucks', 1.0),
 ('terrific', 1.0),
 ('great', 1.0),
 ('love', 1.0),
 ('good', 1.0),
 ('cool', 1.0),
 ('goooooood', 1.0),
 ('horrible', 1.0),
 ('like', 1.0),
 ('magnifica', 1.0),
 ('aaa', 1.0),
 ('beautiful', 1.0),
 ('name', 1.0),
 ('functional', 1.0),
 ('excelent', 1.0),
 ('satisfied', 1.0),
 ('flawless', 1.0),
 ('loooooove', 1.0),
 ('fab', 1.0),
 ('powerfull', 0.9857771268624808),
 ('products', 0.9857771268624808),
 ('prod', 0.9857771268624808),
 ('worked', 0.9857771268624808),
 ('absolutely', 0.9857771268624808),
 ('cleaner', 0.9724289811193982),
 ('work', 0.9724289811193982),
 ('loved', 0.9700973242343978),
 ('luv', 0.9700973242343978),
 ('handheld', 0.96943031376534),
 ('item', 0.9666765971741552),
 ('okay', 0.9658630484766407),
 ('stuff', 0.9656575229402051),
 ('value', 0.9519979768679041),
 ('surprisingly', 0.9505831539740577),
 ('convenience', 0.9456124470177245),
 ('helpful', 0.9456124470177245),
 ('thanks', 0.9436829429599554),
 ('fine', 

In [91]:
# 第1群
get_top100_word(corpus_tfidf1)

[('units', 0.685567811018989),
 ('rarely', 0.6789245846715205),
 ('recharger', 0.6671123798704861),
 ('unplugged', 0.6457825018992069),
 ('lear', 0.6437777581700289),
 ('promising', 0.6423393057519056),
 ('expensive', 0.6274670906408097),
 ('sue', 0.6203969808425336),
 ('minivan', 0.6192272344284491),
 ('fathers', 0.6012512423063152),
 ('gadget', 0.5951981821971336),
 ('thank', 0.594955869963961),
 ('liter', 0.5863187971665441),
 ('ease', 0.5834572495297763),
 ('device', 0.5806580987526649),
 ('shop', 0.568315773198723),
 ('cooler', 0.5658238699755499),
 ('lower', 0.5655014971029219),
 ('banister', 0.5568334569439122),
 ('totally', 0.5548171270553434),
 ('heard', 0.5546811365821203),
 ('held', 0.5521448991970239),
 ('cobwebs', 0.5509028115803579),
 ('reattach', 0.5499210255457946),
 ('gifts', 0.5448369621228709),
 ('homes', 0.5430132072792752),
 ('kid', 0.5401250039722817),
 ('rabbit', 0.5369642818912941),
 ('delighted', 0.5362350250034739),
 ('bust', 0.5231930898050932),
 ('beet', 0.5

In [92]:
# 第2群
get_top100_word(corpus_tfidf2)

[('value', 0.8346208912748677),
 ('vacume', 0.7316251607327761),
 ('type', 0.6446479206789933),
 ('sears', 0.632797606146837),
 ('version', 0.6016493035964722),
 ('market', 0.5958965201323219),
 ('refreshinig', 0.5781894806842379),
 ('okay', 0.5765441338162045),
 ('diminishing', 0.5714139967305767),
 ('nosle', 0.5617464510604265),
 ('thinner', 0.5596928924632877),
 ('jury', 0.5579429148509577),
 ('costly', 0.5547510490342431),
 ('snorkel', 0.5436300839051376),
 ('sun', 0.5203359587182717),
 ('saver', 0.5110696651299343),
 ('shock', 0.5068060862886188),
 ('tiring', 0.5048508235093695),
 ('intertwined', 0.5044900120825452),
 ('eaten', 0.5042714389253192),
 ('quicker', 0.5042714389253192),
 ('lighter', 0.5014081442427689),
 ('spout', 0.4985018664744567),
 ('champ', 0.49678031461006433),
 ('waited', 0.4954484776102656),
 ('price', 0.4953013332497273),
 ('nowhere', 0.49481054805693403),
 ('dwindled', 0.4936836393147584),
 ('pants', 0.4851961183852254),
 ('gone', 0.48388869736615825),
 ('net

In [93]:
# 第3群
get_top100_word(corpus_tfidf3)

[('terrific', 1.0),
 ('sweet', 1.0),
 ('excellent', 1.0),
 ('thanks', 1.0),
 ('outstanding', 1.0),
 ('like', 1.0),
 ('fine', 1.0),
 ('nice', 1.0),
 ('fantastic', 1.0),
 ('perfect', 1.0),
 ('amazing', 1.0),
 ('vacc', 0.9872071758096075),
 ('vacume', 0.9846405686725631),
 ('stuff', 0.9775733825750403),
 ('pickup', 0.9775733825750403),
 ('performance', 0.9721611104953328),
 ('wonderfully', 0.9680330804010474),
 ('wonders', 0.9680330804010474),
 ('scam', 0.9596244206630317),
 ('soo', 0.958550487389006),
 ('low', 0.9528448679241986),
 ('mild', 0.9492459649204018),
 ('suctions', 0.9486201727137978),
 ('deal', 0.9430685880796237),
 ('reliable', 0.9321521735441144),
 ('absolutely', 0.9300481668889337),
 ('handling', 0.9290320284184058),
 ('substantial', 0.9258415906799033),
 ('medium', 0.9258365945038114),
 ('useful', 0.9256196339683916),
 ('mini', 0.9188951777909024),
 ('bad', 0.918774052623176),
 ('extremely', 0.9187354632234482),
 ('daily', 0.9143785402911384),
 ('dependable', 0.90029798826

In [94]:
# 第4群
get_top100_word(corpus_tfidf4)

[('win', 0.8543330144397026),
 ('dont', 0.807738174543703),
 ('reasonable', 0.7759143300926981),
 ('complaints', 0.7718141612380178),
 ('wicked', 0.7663174915040077),
 ('stayed', 0.7618982144039957),
 ('tad', 0.7599390706268272),
 ('clog', 0.7432931156667428),
 ('dad', 0.7428137782802501),
 ('sh', 0.7407214574832928),
 ('strongest', 0.7265570421312121),
 ('likes', 0.720094431553784),
 ('moths', 0.7169311572384105),
 ('supposedly', 0.7146719722327157),
 ('effective', 0.7146331328940123),
 ('input', 0.7105801780264135),
 ('frustrating', 0.6986161739696377),
 ('overcharging', 0.6926032404234269),
 ('rave', 0.6898949577553914),
 ('looking', 0.6896939192658461),
 ('unavoidable', 0.6889326948111658),
 ('weird', 0.6758232145251766),
 ('goodi', 0.6722263607162777),
 ('mounting', 0.6708494194248165),
 ('collected', 0.670708983984788),
 ('worries', 0.6672522057532239),
 ('grabbing', 0.6586049950731803),
 ('sale', 0.6576955103433022),
 ('decent', 0.6570063528267011),
 ('tools', 0.6567365166473522

In [95]:
# 第5群
get_top100_word(corpus_tfidf5)

[('outstanding', 0.9229908642392328),
 ('experience', 0.9122564097309335),
 ('neded', 0.9032716967471295),
 ('advertized', 0.8933532480352355),
 ('van', 0.877507154842853),
 ('described', 0.8545361363480172),
 ('powerfull', 0.8411997179897919),
 ('hr', 0.836660439436862),
 ('gud', 0.8366107017404749),
 ('neat', 0.835097442026527),
 ('suv', 0.8293099538424115),
 ('powerhouse', 0.8275445828149594),
 ('awhile', 0.8214528499619163),
 ('loses', 0.8092932712456169),
 ('excellent', 0.8073958658178326),
 ('poor', 0.8072414177705178),
 ('imo', 0.7994586382940285),
 ('transaction', 0.7976188909529914),
 ('deliver', 0.7959148399427705),
 ('wanted', 0.7910717156118758),
 ('dispensing', 0.7903393785426276),
 ('effecient', 0.7873344371917465),
 ('news', 0.7868323592579363),
 ('drain', 0.7833984072071166),
 ('tho', 0.7817739710965269),
 ('garbage', 0.7807587752924132),
 ('workshop', 0.7785667258345405),
 ('convient', 0.7710762957247763),
 ('piercingly', 0.7701851260584716),
 ('stove', 0.7684212134830

In [96]:
# 第6群
get_top100_word(corpus_tfidf6)

[('repeat', 0.6521550107610949),
 ('doesnt', 0.6445329065898524),
 ('god', 0.6097580503035002),
 ('depleted', 0.5900148929828237),
 ('remains', 0.5819648736715688),
 ('van', 0.5807018888293853),
 ('attachmt', 0.5765513171118385),
 ('buddy', 0.5756031402948714),
 ('ch', 0.5594293306986394),
 ('chord', 0.5435930066408104),
 ('shop', 0.5415824958514153),
 ('shape', 0.5343063430445164),
 ('placing', 0.5316893884075758),
 ('sweeper', 0.5252766547817657),
 ('sweeps', 0.5252766547817657),
 ('champ', 0.5252766547817657),
 ('happened', 0.5224059357022494),
 ('slowing', 0.5194940526126444),
 ('family', 0.5152054685436476),
 ('soda', 0.5030481199431515),
 ('lid', 0.502402949325884),
 ('reasonable', 0.5013338652502458),
 ('adjustable', 0.49832307261414105),
 ('upper', 0.4924153483020839),
 ('wonder', 0.48717269840374355),
 ('feb', 0.48569333851265306),
 ('noisey', 0.48402728407082146),
 ('collapses', 0.48402728407082146),
 ('version', 0.4807715085092224),
 ('relatively', 0.47649363249965626),
 ('g

In [97]:
# 試一下NLTK
import nltk
from nltk.corpus import treebank

In [33]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [98]:
print(corpus_clu0_wordlist[1])

['power', 'is', 'great', 'holds', 'good', 'charge', 'my', 'previous', 'one', 'had', 'fixture', 'to', 'hold', 'it', 'on', 'the', 'wall', 'for', 'this', 'one', 'the', 'recharging', 'base', 'sits', 'on', 'the', 'floor', 'and', 'if', 'knocked', 'over', 'or', 'set', 'slightly', 'off', 'the', 'base', 'it', 'doesn', 'charge']


In [99]:
# 標籤詞性
tagged = nltk.pos_tag(corpus_clu0_wordlist[1])

In [100]:
tagged[0:]

[('power', 'NN'),
 ('is', 'VBZ'),
 ('great', 'JJ'),
 ('holds', 'VBZ'),
 ('good', 'JJ'),
 ('charge', 'NN'),
 ('my', 'PRP$'),
 ('previous', 'JJ'),
 ('one', 'CD'),
 ('had', 'VBD'),
 ('fixture', 'NN'),
 ('to', 'TO'),
 ('hold', 'VB'),
 ('it', 'PRP'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('wall', 'NN'),
 ('for', 'IN'),
 ('this', 'DT'),
 ('one', 'CD'),
 ('the', 'DT'),
 ('recharging', 'NN'),
 ('base', 'NN'),
 ('sits', 'VBZ'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('floor', 'NN'),
 ('and', 'CC'),
 ('if', 'IN'),
 ('knocked', 'VBN'),
 ('over', 'IN'),
 ('or', 'CC'),
 ('set', 'VBN'),
 ('slightly', 'RB'),
 ('off', 'IN'),
 ('the', 'DT'),
 ('base', 'NN'),
 ('it', 'PRP'),
 ('doesn', 'VBZ'),
 ('charge', 'NN')]

In [101]:
entities = nltk.chunk.ne_chunk(tagged)

In [102]:
print(entities)

(S
  power/NN
  is/VBZ
  great/JJ
  holds/VBZ
  good/JJ
  charge/NN
  my/PRP$
  previous/JJ
  one/CD
  had/VBD
  fixture/NN
  to/TO
  hold/VB
  it/PRP
  on/IN
  the/DT
  wall/NN
  for/IN
  this/DT
  one/CD
  the/DT
  recharging/NN
  base/NN
  sits/VBZ
  on/IN
  the/DT
  floor/NN
  and/CC
  if/IN
  knocked/VBN
  over/IN
  or/CC
  set/VBN
  slightly/RB
  off/IN
  the/DT
  base/NN
  it/PRP
  doesn/VBZ
  charge/NN)


In [3]:
# 試一下StanfordDependencyParser
from nltk.parse.stanford import StanfordDependencyParser
import os

In [4]:
# 載入環境變數
os.environ["JAVA_HOME"] = "C:/Program Files/Java/jdk1.8.0_111" 
os.environ["CLASSPATH"] = "C:/Users/lisrba/Documents/StanfordNLP/StanfordNLP/jars"
os.environ["STANFORD_MODELS"] = "C:/Users/lisrba/Documents/StanfordNLP/StanfordNLP/models"

In [5]:
# 簡單測試
eng_parser = StanfordDependencyParser()
res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
for row in res[0].triples():
    print(row)

(('fox', 'NN'), 'det', ('the', 'DT'))
(('fox', 'NN'), 'amod', ('quick', 'JJ'))
(('fox', 'NN'), 'amod', ('brown', 'JJ'))
(('fox', 'NN'), 'dep', ('jumps', 'NNS'))
(('jumps', 'NNS'), 'nmod', ('dog', 'NN'))
(('dog', 'NN'), 'case', ('over', 'IN'))
(('dog', 'NN'), 'det', ('the', 'DT'))
(('dog', 'NN'), 'amod', ('lazy', 'JJ'))


In [121]:
# 拿評論測試
abc = list(eng_parser.parse(corpus_clu0_wordlist[111]))

In [122]:
for row in abc[0].triples():
    print(row)

(('light', 'JJ'), 'dep', ('motor', 'NN'))
(('motor', 'NN'), 'amod', ('great', 'JJ'))
(('motor', 'NN'), 'amod', ('powerful', 'JJ'))
(('motor', 'NN'), 'cc', ('and', 'CC'))
(('motor', 'NN'), 'conj', ('suction', 'NN'))
(('suction', 'NN'), 'amod', ('great', 'JJ'))
(('light', 'JJ'), 'advmod', ('very', 'RB'))
(('light', 'JJ'), 'cc', ('and', 'CC'))
(('light', 'JJ'), 'conj', ('great', 'JJ'))
(('great', 'JJ'), 'advmod', ('works', 'RB'))


In [40]:
#python2
#filetest = open("C:"+os.sep+"temp"+os.sep+"file.txt", "w",encoding='utf-8')
#filetest.write(str(corpus_clu0[0].words))
#filetest.close() 

In [41]:
# python3 I/O
with open("C:"+os.sep+"temp"+os.sep+"file.txt", "a",encoding='utf-8') as filetest:
    filetest.write(str(corpus_clu0[0].words))

##### NLTK 7. Extracting Information from Text
http://www.nltk.org/book/ch07.html

##### 4.2. Feature extraction
http://scikit-learn.org/stable/modules/feature_extraction.html

#####  sklearn.feature_extraction.text.TfidfVectorizer
http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

##### python之最強王者（10）———文件(File)、輸入輸出的基本操作
http://www.zendei.com/article/17118.html

##### Clustering text documents using k-means
http://scikit-learn.org/stable/auto_examples/text/document_clustering.html#sphx-glr-auto-examples-text-document-clustering-py

##### 自然語言處理 -- TF-IDF
http://cpmarkchang.logdown.com/posts/193915-natural-language-processing-tf-idf

##### Python TF-IDF计算100份文档关键词权重
http://www.cnblogs.com/chenbjin/p/3851165.html

