In [1]:
import os
import gensim
from gensim import corpora, models
import smart_open 
import numpy as np
from numpy import random
random.seed(555)
from scipy.cluster.vq import vq, kmeans, whiten
from sklearn.decomposition import TruncatedSVD
from collections import defaultdict



In [2]:
# 檔案路徑
amazon_corpus_path = "C:" + os.sep + "temp" + os.sep + "BLACK-DECKER.csv"

In [3]:
# 斷詞 & 標籤
def read_corpus(fname, tokens_only=False):
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.LabeledSentence(gensim.utils.simple_preprocess(line), ['SENT_%s' % i])

In [4]:
# 訓練組與測試組
train_corpus = list(read_corpus(amazon_corpus_path))
test_corpus = list(read_corpus(amazon_corpus_path, tokens_only=True))

In [5]:
train_corpus[:2]

[LabeledSentence(words=['ve', 'owned', 'some', 'type', 'of', 'dustbuster', 'handvac', 'for', 'the', 'last', 'twenty', 'years', 'and', 'don', 'think', 've', 'ever', 'ever', 'had', 'one', 'that', 'didn', 'hate', 'too', 'little', 'suction', 'power', 'not', 'enough', 'charge', 'time', 'the', 'suction', 'nozzle', 'is', 'awkward', 'to', 'use', 'and', 'the', 'batteries', 'are', 'always', 'dead', 'after', 'couple', 'minutes', 'can', 'keep', 'them', 'plugged', 'into', 'the', 'wall', 'all', 'the', 'time', 'but', 'that', 'kills', 'the', 'battery', 'and', 'wastes', 'electricity', 'have', 'found', 'it', 'much', 'more', 'effective', 'to', 'just', 'use', 'the', 'suction', 'hose', 'on', 'my', 'upright', 'even', 'though', 'that', 'means', 'lugging', 'my', 'vacuum', 'around', 'so', 'decided', 'to', 'see', 'what', 'is', 'new', 'in', 'handvac', 'technology', 'and', 'if', 'they', 'have', 'improved', 'at', 'all', 'over', 'the', 'last', 'ten', 'years', 'fortunately', 'can', 'say', 'that', 'they', 'have', 'th

In [6]:
# 建模
model = models.Doc2Vec(alpha=0.025, min_alpha=0.025)  # use fixed learning rate
model.build_vocab(train_corpus)

In [9]:
# 學習（學習速率遞減）
epoch = 10
for _ in range(epoch):
    model.train(train_corpus)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
print ('done training')

done training


In [10]:
# 儲存模型
model.save("C:" + os.sep + "temp" + os.sep +"try.d2v")

In [7]:
# 載入模型
model = gensim.models.Doc2Vec.load("C:" + os.sep + "temp" + os.sep +"try.d2v")

In [8]:
# 把每篇文章的向量塞進矩陣裡面
lenx=len(train_corpus)
leny=len(model.infer_vector(train_corpus[0].words))
print(lenx,leny)
a = np.zeros((lenx,leny))
for i_sent in range(len(train_corpus)):
    a[i_sent,:] = model.infer_vector(test_corpus[i_sent])
print(a[:2])

9362 100
[[ 0.91893691 -0.25788108  1.84998441  0.11234858 -2.46256018 -0.49450001
   0.14411953 -0.5181275   0.27060199  0.22007766 -0.90371591 -0.42633575
  -1.03870964 -0.90186334  0.76243496 -1.53511012 -0.0610662  -1.20328557
  -0.59060353  1.63943064 -0.74973261  0.97232139  0.61992282 -0.95465875
   1.57215083 -0.81540394  2.1525743   1.29551339 -1.51895964  0.07195573
   0.50540882 -0.33654842 -1.16719294 -0.58698267  0.38133293 -0.58228415
   0.11161171  0.5160718  -0.54661995 -0.56649339 -0.04003856  0.79407054
   1.61801136  0.39664221  0.56881511  0.29280883  0.1439618   1.09199226
  -1.74520838 -0.37423113  1.63765132 -0.9377467  -0.86362076 -1.78516603
  -1.48999083 -0.31300369 -0.51769322 -0.28051028 -0.93538976  0.95480072
   0.10389411 -1.4914149   0.39116448 -0.52631855  0.70351726  1.34742332
   0.01329369  0.34379762 -0.09115305  1.05434239 -1.30771983 -0.06083123
   0.3133426   0.92537606  0.80035371  0.17099653  1.33099747 -0.21025081
  -0.40435165 -1.55230629  0.

In [9]:
# 正規化（？）
sim_matrix = whiten(a)

In [10]:
# Kmeans分7群（7是隨便挑的，之後要用elbow method或輪廓係數找最佳群組數）
centroid, destortion = kmeans(sim_matrix, 7, iter=100, thresh=1e-05)
labels, dist = vq(sim_matrix, centroid)
print(labels[0])

5


In [11]:
# 按標籤將屬於該群的文章INDEX存起來

cluster0 = []
cluster1 = []
cluster2 = []
cluster3 = []
cluster4 = []
cluster5 = []
cluster6 = []

for i in range(len(labels)):
    if (labels[i] == 0):
        cluster0.append(i);
    elif (labels[i] == 1):
        cluster1.append(i);
    elif (labels[i] == 2):
        cluster2.append(i);
    elif (labels[i] == 3):
        cluster3.append(i);
    elif (labels[i] == 4):
        cluster4.append(i);
    elif (labels[i] == 5):
        cluster5.append(i);
    elif (labels[i] == 6):
        cluster6.append(i);

In [12]:
# 標籤為0的群組文章INDEX
print(cluster0)

[240, 554, 732, 812, 829, 850, 923, 1465, 1814, 2176, 2195, 2208, 2282, 2588, 2685, 2702, 2714, 2761, 2870, 2894, 2931, 2990, 3055, 3098, 3164, 3193, 3198, 3201, 3212, 3252, 3259, 3260, 3279, 3303, 3304, 3306, 3334, 3336, 3346, 3350, 3356, 3362, 3372, 3383, 3396, 3400, 3402, 3404, 3412, 3418, 3419, 3420, 3425, 3437, 3440, 3458, 3459, 3461, 3462, 3463, 3469, 3473, 3478, 3480, 3483, 3484, 3486, 3488, 3497, 3504, 3507, 3508, 3514, 3520, 3521, 3527, 3528, 3542, 3543, 3544, 3547, 3551, 3556, 3557, 3561, 3562, 3564, 3566, 3567, 3568, 3570, 3583, 3585, 3587, 3595, 3597, 3600, 3601, 3604, 3605, 3666, 3742, 3767, 3794, 3795, 3829, 3855, 3904, 3923, 3963, 3970, 4030, 4033, 4101, 4171, 4196, 4197, 4198, 4211, 4276, 4301, 4312, 4337, 4414, 4415, 4438, 4452, 4549, 4550, 4558, 4568, 4578, 4583, 4592, 4593, 4622, 4647, 4685, 4690, 4703, 4704, 4706, 4733, 4745, 4759, 4769, 4790, 4818, 4830, 4847, 4848, 4859, 4869, 4898, 4903, 4940, 4942, 4958, 4968, 4969, 4986, 4987, 5023, 5024, 5036, 5037, 5038, 5061

In [13]:
# 按文章INDEX將文章分群

corpus_clu0 = []
corpus_clu1 = []
corpus_clu2 = []
corpus_clu3 = []
corpus_clu4 = []
corpus_clu5 = []
corpus_clu6 = []

for j in range(len(cluster0)):
    corpus_clu0.append(train_corpus[cluster0[j]]);
for j in range(len(cluster1)):
    corpus_clu1.append(train_corpus[cluster1[j]]);    
for j in range(len(cluster2)):
    corpus_clu2.append(train_corpus[cluster2[j]]);
for j in range(len(cluster3)):
    corpus_clu3.append(train_corpus[cluster3[j]]); 
for j in range(len(cluster4)):
    corpus_clu4.append(train_corpus[cluster4[j]]);     
for j in range(len(cluster5)):
    corpus_clu5.append(train_corpus[cluster5[j]]);
for j in range(len(cluster6)):
    corpus_clu6.append(train_corpus[cluster6[j]]);

In [14]:
# 看看第0群的第0篇文章內容與標籤
print(corpus_clu0[0])

LabeledSentence(['went', 'from', 'dyson', 'to', 'this', 'black', 'and', 'decker', 'unit', 'looks', 'nice', 'and', 'holds', 'charge', 'well', 'the', 'whole', 'point', 'is', 'to', 'pick', 'stuff', 'up', 'and', 'this', 'unit', 'did', 'not', 'way', 'underpowered'], ['SENT_240'])


In [15]:
# 看看第0群的第0篇文章標籤
print(corpus_clu0[0].tags)

['SENT_240']


In [16]:
# 看看第0群的第0篇文章內容
print(corpus_clu0[0].words)

['went', 'from', 'dyson', 'to', 'this', 'black', 'and', 'decker', 'unit', 'looks', 'nice', 'and', 'holds', 'charge', 'well', 'the', 'whole', 'point', 'is', 'to', 'pick', 'stuff', 'up', 'and', 'this', 'unit', 'did', 'not', 'way', 'underpowered']


In [17]:
# 只挑文章內容存進去各個群組
corpus_clu0_wordlist = []
corpus_clu1_wordlist = []
corpus_clu2_wordlist = []
corpus_clu3_wordlist = []
corpus_clu4_wordlist = []
corpus_clu5_wordlist = []
corpus_clu6_wordlist = []

for i in range(len(corpus_clu0)):
    corpus_clu0_wordlist.append(corpus_clu0[i].words)
for i in range(len(corpus_clu1)):
    corpus_clu1_wordlist.append(corpus_clu1[i].words)
for i in range(len(corpus_clu2)):
    corpus_clu2_wordlist.append(corpus_clu2[i].words)
for i in range(len(corpus_clu3)):
    corpus_clu3_wordlist.append(corpus_clu3[i].words)
for i in range(len(corpus_clu4)):
    corpus_clu4_wordlist.append(corpus_clu4[i].words)
for i in range(len(corpus_clu5)):
    corpus_clu5_wordlist.append(corpus_clu5[i].words)
for i in range(len(corpus_clu6)):
    corpus_clu6_wordlist.append(corpus_clu6[i].words)

In [18]:
# 看看第0個群組的前2篇文章
corpus_clu0_wordlist[:2]

[['went',
  'from',
  'dyson',
  'to',
  'this',
  'black',
  'and',
  'decker',
  'unit',
  'looks',
  'nice',
  'and',
  'holds',
  'charge',
  'well',
  'the',
  'whole',
  'point',
  'is',
  'to',
  'pick',
  'stuff',
  'up',
  'and',
  'this',
  'unit',
  'did',
  'not',
  'way',
  'underpowered'],
 ['worked',
  'great',
  'until',
  'last',
  'month',
  'but',
  'doesn',
  'charge',
  'or',
  'turn',
  'on',
  'now']]

In [19]:
# 把只出現一次的字移除
from collections import defaultdict
from pprint import pprint  # pretty-printer

def remove_less_than_one(corpus_clu_wordlist):
    frequency = defaultdict(int)
    for text in corpus_clu_wordlist:
         for token in text:
            frequency[token] += 1

    corpus_clu_wordlist = [[token for token in text if frequency[token] > 1]
                             for text in corpus_clu_wordlist]
    return corpus_clu_wordlist;

In [20]:
remove_less_than_one(corpus_clu0_wordlist)
remove_less_than_one(corpus_clu1_wordlist)
remove_less_than_one(corpus_clu2_wordlist)
remove_less_than_one(corpus_clu3_wordlist)
remove_less_than_one(corpus_clu4_wordlist)
remove_less_than_one(corpus_clu5_wordlist)
remove_less_than_one(corpus_clu6_wordlist)

[['this',
  'dustbuster',
  'replaced',
  'an',
  'older',
  'one',
  'that',
  'used',
  'nicad',
  'batteries',
  'and',
  'was',
  'about',
  'to',
  'the',
  'end',
  'of',
  'its',
  'useful',
  'life',
  'the',
  'old',
  'one',
  'hung',
  'on',
  'the',
  'wall',
  'on',
  'charger',
  'bracket',
  'that',
  'was',
  'mounted',
  'on',
  'our',
  'laundry',
  'room',
  'wall',
  'that',
  'was',
  'very',
  'handy',
  'to',
  'get',
  'to',
  'and',
  'had',
  'been',
  'located',
  'there',
  'for',
  'over',
  'ten',
  'years',
  'we',
  'were',
  'reluctant',
  'to',
  'clear',
  'off',
  'shelf',
  'in',
  'different',
  'location',
  'so',
  'came',
  'up',
  'with',
  'mount',
  'that',
  'attached',
  'to',
  'one',
  'of',
  'the',
  'shelves',
  'that',
  'put',
  'it',
  'back',
  'in',
  'the',
  'old',
  'location',
  'it',
  'may',
  'even',
  'be',
  'better',
  'now',
  'since',
  'it',
  'simply',
  'sits',
  'on',
  'the',
  'shelf',
  'rather',
  'than',
  'ha

In [21]:
print(corpus_clu0_wordlist)

[['went', 'from', 'dyson', 'to', 'this', 'black', 'and', 'decker', 'unit', 'looks', 'nice', 'and', 'holds', 'charge', 'well', 'the', 'whole', 'point', 'is', 'to', 'pick', 'stuff', 'up', 'and', 'this', 'unit', 'did', 'not', 'way', 'underpowered'], ['worked', 'great', 'until', 'last', 'month', 'but', 'doesn', 'charge', 'or', 'turn', 'on', 'now'], ['pretty', 'decent', 'suction', 'strength', 'and', 'battery', 'lasts', 'after', 'solid', 'minute', 'before', 'needing', 'recharge', 'we', 'can', 'have', 'product', 'sitting', 'for', 'weeks', 'and', 'it', 'still', 'has', 'solid', 'charge', 'filter', 'is', 'easy', 'to', 'clean', 'recommend'], ['have', 'needed', 'this', 'for', 'long', 'time', 'live', 'in', 'century', 'home', 'with', 'many', 'steps', 'and', 'was', 'dragging', 'my', 'huge', 'vacuum', 'up', 'and', 'down', 'in', 'order', 'to', 'clean', 'them', 'this', 'is', 'the', 'perfect', 'answer', 'and', 'is', 'so', 'light', 'yet', 'does', 'heavy', 'job', 'well'], ['this', 'is', 'how', 'hand', 'vac

In [22]:
# 建立字典印出，有8585個不重複的字
dictionary = corpora.Dictionary(test_corpus)
print(dictionary)

Dictionary(8585 unique tokens: ['pea', 'deafening', 'breath', 'results', 'unwieldy']...)


In [23]:
# 看看字的ID號碼
print(dictionary.token2id)



In [24]:
# 顯示各篇文章出現的字的ID跟次數
corpus0 = [dictionary.doc2bow(text) for text in corpus_clu0_wordlist]
corpus1 = [dictionary.doc2bow(text) for text in corpus_clu1_wordlist]
corpus2 = [dictionary.doc2bow(text) for text in corpus_clu2_wordlist]
corpus3 = [dictionary.doc2bow(text) for text in corpus_clu3_wordlist]
corpus4 = [dictionary.doc2bow(text) for text in corpus_clu4_wordlist]
corpus5 = [dictionary.doc2bow(text) for text in corpus_clu5_wordlist]
corpus6 = [dictionary.doc2bow(text) for text in corpus_clu6_wordlist]

#corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  # store to disk, for later use
print(corpus0)

[[(30, 1), (31, 1), (55, 3), (69, 1), (80, 1), (95, 1), (124, 2), (143, 1), (186, 1), (247, 1), (252, 1), (261, 1), (320, 1), (325, 2), (333, 1), (365, 1), (477, 2), (569, 1), (717, 1), (1427, 1), (1541, 1), (1593, 1), (2000, 1), (2343, 1), (3312, 1)], [(103, 1), (210, 1), (219, 1), (251, 1), (261, 1), (271, 1), (339, 1), (393, 1), (522, 1), (669, 1), (782, 1), (1138, 1)], [(9, 1), (34, 1), (55, 2), (79, 1), (86, 1), (124, 1), (138, 1), (160, 1), (176, 1), (235, 1), (242, 1), (247, 1), (261, 1), (269, 1), (282, 1), (297, 1), (319, 1), (340, 1), (341, 1), (352, 1), (524, 1), (577, 2), (643, 1), (705, 1), (815, 1), (834, 1), (1261, 1), (1269, 1), (1422, 1), (3122, 1)], [(26, 1), (32, 1), (42, 1), (55, 3), (69, 1), (74, 2), (79, 1), (80, 1), (117, 1), (124, 1), (231, 1), (247, 2), (254, 1), (282, 1), (291, 1), (325, 2), (326, 1), (333, 1), (335, 1), (340, 1), (361, 1), (403, 1), (420, 1), (437, 1), (440, 1), (495, 1), (563, 1), (617, 1), (735, 1), (904, 1), (1029, 1), (1339, 1), (2521, 1)

In [25]:
# TF-IDF一下
from gensim import corpora, models, similarities
tfidf0 = models.TfidfModel(corpus0)
tfidf1 = models.TfidfModel(corpus1)
tfidf2 = models.TfidfModel(corpus2)
tfidf3 = models.TfidfModel(corpus3)
tfidf4 = models.TfidfModel(corpus4)
tfidf5 = models.TfidfModel(corpus5)
tfidf6 = models.TfidfModel(corpus6)

corpus_tfidf0 = tfidf0[corpus0]
corpus_tfidf1 = tfidf1[corpus1]
corpus_tfidf2 = tfidf2[corpus2]
corpus_tfidf3 = tfidf3[corpus3]
corpus_tfidf4 = tfidf4[corpus4]
corpus_tfidf5 = tfidf5[corpus5]
corpus_tfidf6 = tfidf6[corpus6]

In [26]:
# 第0個群組有1564篇文章
print(tfidf0)

TfidfModel(num_docs=1542, num_nnz=9917)


In [27]:
# 第0個群組每篇文章出現的字的ID與TF-IDF值
for doc in corpus_tfidf0:
    print(doc)

[(30, 0.21108366977943274), (31, 0.19339914748450515), (55, 0.18132449102277587), (69, 0.10295334012585879), (80, 0.11777427378532825), (95, 0.1645396026249568), (124, 0.1610151926657854), (143, 0.20585907550432664), (186, 0.21108366977943274), (247, 0.0896376379346136), (252, 0.12329416257193955), (261, 0.12702330079163307), (320, 0.22424859591875987), (325, 0.1734761288754894), (333, 0.07618711682613243), (365, 0.13415687547494834), (477, 0.3290792052499136), (569, 0.2442353314500058), (717, 0.22424859591875987), (1427, 0.21711502123544463), (1541, 0.28722000614801024), (1593, 0.17000797548963748), (2000, 0.22424859591875987), (2343, 0.26009969593344906), (3312, 0.28722000614801024)]
[(103, 0.2118527279620139), (210, 0.3591731376119851), (219, 0.32541756318651244), (251, 0.07026995031635225), (261, 0.1958257266169068), (271, 0.3102584789166987), (339, 0.17847297528845257), (393, 0.3457132979928422), (522, 0.2794458879691016), (669, 0.3591731376119851), (782, 0.28360748251041495), (11

In [28]:
# 看一下各群組前100個TF-IDF值較高的字
from collections import Counter

def get_top100_word(corpus_tfidf):
    d = {}
    for doc in corpus_tfidf:
        for id, value in doc:
            word = dictionary.get(id)
            d[word] = value
    d_val = Counter(d)
    return d_val.most_common(100)  

In [29]:
# 第0群
get_top100_word(corpus_tfidf0)

[('goooooood', 1.0),
 ('outstanding', 1.0),
 ('flawless', 1.0),
 ('sweet', 1.0),
 ('functional', 1.0),
 ('thanks', 1.0),
 ('name', 1.0),
 ('perfect', 1.0),
 ('satisfied', 1.0),
 ('okie', 1.0),
 ('cool', 1.0),
 ('amazing', 1.0),
 ('aaa', 1.0),
 ('wonderful', 1.0),
 ('ok', 1.0),
 ('fine', 1.0),
 ('great', 1.0),
 ('fantastic', 1.0),
 ('beautiful', 1.0),
 ('terrific', 1.0),
 ('magnifica', 1.0),
 ('excelent', 1.0),
 ('fab', 1.0),
 ('horrible', 1.0),
 ('loooooove', 1.0),
 ('good', 1.0),
 ('vacc', 0.987640604597916),
 ('prod', 0.987640604597916),
 ('powerfull', 0.987640604597916),
 ('products', 0.9849896105616027),
 ('pickup', 0.9813942277320086),
 ('vacume', 0.979961356950193),
 ('worked', 0.9774703462137233),
 ('luv', 0.9766257090450606),
 ('performance', 0.9752978958546612),
 ('cleaner', 0.9724079589191468),
 ('okay', 0.9711144737255563),
 ('wonders', 0.9674557969527721),
 ('wonderfully', 0.9674557969527721),
 ('handheld', 0.9674057457774122),
 ('helpful', 0.9607196835141572),
 ('deal', 0.

In [30]:
# 第1群
get_top100_word(corpus_tfidf1)

[('excepted', 0.8804789858972784),
 ('powerfull', 0.8582406068678217),
 ('boat', 0.8394615887223588),
 ('cars', 0.8270571700836487),
 ('win', 0.8254004799766426),
 ('convient', 0.7976282223110767),
 ('reasonable', 0.7877707581729749),
 ('heaven', 0.7482336433788526),
 ('stayed', 0.7435831614208315),
 ('lasting', 0.7429707988994024),
 ('extremely', 0.7388446743940255),
 ('related', 0.7329145963619361),
 ('yo', 0.7277937623700327),
 ('hoped', 0.7271941205119491),
 ('auto', 0.724247762634741),
 ('expected', 0.719051149216724),
 ('certainly', 0.7178468882604375),
 ('kid', 0.7120269057199887),
 ('effortlessly', 0.699122763084307),
 ('likes', 0.698618775280372),
 ('input', 0.6955598618117359),
 ('didnt', 0.6913923691219939),
 ('fabric', 0.6909017999344793),
 ('retired', 0.6908883013613264),
 ('anticipated', 0.6907901248702439),
 ('mounting', 0.681046935242166),
 ('ups', 0.6803297661321536),
 ('periods', 0.6774146864027536),
 ('saver', 0.6705058563911201),
 ('supposedly', 0.669167133531965),


In [31]:
# 第2群
get_top100_word(corpus_tfidf2)

[('surprisingly', 0.9618103486693386),
 ('terrible', 0.961647289869093),
 ('wonderful', 0.936000152255378),
 ('neded', 0.89914569152472),
 ('friend', 0.873222706915547),
 ('stairway', 0.8607488444243313),
 ('reassemble', 0.8423119392013964),
 ('pos', 0.8421922644701987),
 ('deliver', 0.840005937075474),
 ('秋charge', 0.827987501625858),
 ('promised', 0.8216723861798478),
 ('stronger', 0.8167449667086971),
 ('asked', 0.8160001216626976),
 ('term', 0.8123391440529835),
 ('comes', 0.8068251952392379),
 ('garbage', 0.7905651174868441),
 ('replicable', 0.7857492429120695),
 ('exceeds', 0.7839577661830431),
 ('rv', 0.7829301111013791),
 ('waste', 0.7668956964706415),
 ('truck', 0.7629027528048666),
 ('shipping', 0.762585657076571),
 ('wicked', 0.7557486069351925),
 ('wath', 0.7521977240976199),
 ('aftwr', 0.7507159481493757),
 ('equipment', 0.7502293882528042),
 ('parakeet', 0.7499497231557141),
 ('frustrating', 0.7492508416547161),
 ('job', 0.7467663794907666),
 ('mess', 0.740231343269594),


In [32]:
# 第3群
get_top100_word(corpus_tfidf3)

[('vaccum', 0.8604345283750235),
 ('potent', 0.8281382404419279),
 ('adore', 0.8104152455985515),
 ('grit', 0.7936667135927897),
 ('noisey', 0.7820464298075162),
 ('thinks', 0.7808524155237481),
 ('minuets', 0.7800858551293288),
 ('lightweight', 0.7789424006886289),
 ('garbage', 0.7475471483227575),
 ('tools', 0.7415814207667214),
 ('winner', 0.739120560603869),
 ('helper', 0.7254736232239102),
 ('snout', 0.720949247748061),
 ('fuss', 0.7194313328644346),
 ('wireless', 0.7120505448451728),
 ('unavoidable', 0.7034641416738354),
 ('plenty', 0.698747376804005),
 ('nothinggood', 0.6982311889939895),
 ('guy', 0.6935264574258817),
 ('goodi', 0.684660561554242),
 ('bettery', 0.6788073703987212),
 ('day', 0.67645417952003),
 ('every', 0.6663048985505658),
 ('average', 0.6660018085417762),
 ('cars', 0.664058488633396),
 ('critique', 0.6535404377579844),
 ('alright', 0.6507403550819609),
 ('lil', 0.6483935283613066),
 ('versatility', 0.645931823324942),
 ('grabbing', 0.6447928622903728),
 ('give

In [33]:
# 第4群
get_top100_word(corpus_tfidf4)

[('nice', 1.0),
 ('excellent', 1.0),
 ('fantastic', 1.0),
 ('loved', 0.98674171597762),
 ('absolutely', 0.9718983713072512),
 ('ergonomic', 0.9474798211566522),
 ('useful', 0.9433332411048242),
 ('satisfied', 0.9427779894443394),
 ('sleek', 0.9284623247747702),
 ('extremely', 0.9280694269115839),
 ('nothing', 0.917086574626626),
 ('love', 0.9134980587393524),
 ('greatvsuction', 0.9063789917782651),
 ('lost', 0.9044789239470482),
 ('outstanding', 0.9030872440806503),
 ('ship', 0.9017842206044019),
 ('pretty', 0.9006076705511321),
 ('terrible', 0.8908090356255728),
 ('experience', 0.8844507155972754),
 ('advertised', 0.8807923310191728),
 ('bulky', 0.8784305721552584),
 ('awesome', 0.8752824020122452),
 ('detailing', 0.8745473186628706),
 ('champ', 0.8722743672711579),
 ('thank', 0.8678916010415612),
 ('powerhouse', 0.8608025784250505),
 ('complaints', 0.8606403737035164),
 ('investment', 0.8579629048705004),
 ('advertized', 0.8567714828573267),
 ('van', 0.8502318776712683),
 ('unused', 

In [34]:
# 第5群
get_top100_word(corpus_tfidf5)

[('recharger', 0.6884026896519645),
 ('attachmt', 0.5968024835589804),
 ('stink', 0.5960122583370295),
 ('min', 0.5914437772678641),
 ('mountable', 0.5872403678987289),
 ('picks', 0.5725109559554181),
 ('handy', 0.5644761392996149),
 ('nine', 0.5443780013675074),
 ('van', 0.530630107888652),
 ('volt', 0.5254846659063424),
 ('litter', 0.5216038705321384),
 ('fur', 0.5201260309223099),
 ('cage', 0.5130800009491199),
 ('overnight', 0.5118226333371005),
 ('spout', 0.5114045456585579),
 ('blemish', 0.4953908839348756),
 ('platform', 0.4929357750052917),
 ('net', 0.4923700398347856),
 ('category', 0.4803508214070327),
 ('folding', 0.47242464358575564),
 ('equals', 0.46519406799368834),
 ('vehicle', 0.462118436775721),
 ('change', 0.4608590525199945),
 ('price', 0.459552000162939),
 ('beware', 0.4586921769410948),
 ('stove', 0.45856579668571085),
 ('pellet', 0.45856579668571085),
 ('absolutely', 0.4461282998331951),
 ('tray', 0.4426416252773846),
 ('air', 0.43427819307506),
 ('agitator', 0.43

In [35]:
# 第6群
get_top100_word(corpus_tfidf6)

[('conviniet', 0.7995443128381331),
 ('nimh', 0.6103297150527794),
 ('cd', 0.5810101589175123),
 ('chord', 0.5642986569906727),
 ('procedure', 0.5589495111835322),
 ('satisfactory', 0.5241094621095366),
 ('brief', 0.5241094621095366),
 ('pricy', 0.5219375122969369),
 ('dyson', 0.5100928444857641),
 ('replied', 0.504969027778966),
 ('allow', 0.5021895175293652),
 ('cannister', 0.5010558318549484),
 ('soda', 0.49503370411423653),
 ('upper', 0.4947498220824641),
 ('shock', 0.4945192287530431),
 ('dad', 0.49412877162484314),
 ('portable', 0.4937795395269972),
 ('lid', 0.49056613951022443),
 ('workhorse', 0.4864913992997028),
 ('excellent', 0.4845052248586074),
 ('inlet', 0.47565890854991427),
 ('durable', 0.47565890854991427),
 ('feb', 0.4713212994625085),
 ('grandchildren', 0.4653010719305335),
 ('hearth', 0.4596997401103869),
 ('wireless', 0.45900643682057035),
 ('cheaper', 0.4530704282071458),
 ('handful', 0.44405111741788544),
 ('stink', 0.44401956792465314),
 ('baking', 0.439187211719

In [36]:
# 試一下NLTK
import nltk
from nltk.corpus import treebank

In [33]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [37]:
print(corpus_clu0_wordlist[1])

['worked', 'great', 'until', 'last', 'month', 'but', 'doesn', 'charge', 'or', 'turn', 'on', 'now']


In [38]:
# 標籤詞性
tagged = nltk.pos_tag(corpus_clu0_wordlist[1])

In [39]:
tagged[0:]

[('worked', 'VBN'),
 ('great', 'JJ'),
 ('until', 'IN'),
 ('last', 'JJ'),
 ('month', 'NN'),
 ('but', 'CC'),
 ('doesn', 'JJ'),
 ('charge', 'NN'),
 ('or', 'CC'),
 ('turn', 'VB'),
 ('on', 'IN'),
 ('now', 'RB')]

In [40]:
entities = nltk.chunk.ne_chunk(tagged)

In [41]:
print(entities)

(S
  worked/VBN
  great/JJ
  until/IN
  last/JJ
  month/NN
  but/CC
  doesn/JJ
  charge/NN
  or/CC
  turn/VB
  on/IN
  now/RB)


In [42]:
# 試一下StanfordDependencyParser
from nltk.parse.stanford import StanfordDependencyParser
import os

In [43]:
# 載入環境變數
os.environ["JAVA_HOME"] = "C:/Program Files/Java/jdk1.8.0_111" 
os.environ["CLASSPATH"] = "C:/Users/lisrba/Documents/StanfordNLP/StanfordNLP/jars"
os.environ["STANFORD_MODELS"] = "C:/Users/lisrba/Documents/StanfordNLP/StanfordNLP/models"

In [44]:
# 簡單測試
eng_parser = StanfordDependencyParser()
res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
for row in res[0].triples():
    print(row)

(('fox', 'NN'), 'det', ('the', 'DT'))
(('fox', 'NN'), 'amod', ('quick', 'JJ'))
(('fox', 'NN'), 'amod', ('brown', 'JJ'))
(('fox', 'NN'), 'dep', ('jumps', 'NNS'))
(('jumps', 'NNS'), 'nmod', ('dog', 'NN'))
(('dog', 'NN'), 'case', ('over', 'IN'))
(('dog', 'NN'), 'det', ('the', 'DT'))
(('dog', 'NN'), 'amod', ('lazy', 'JJ'))


In [60]:
# 拿評論測試
eng_parser = StanfordDependencyParser()
res = list(eng_parser.parse(corpus_clu0_wordlist[111]))
for row in res[0].triples():
    print(row)

(('been', 'VBN'), 'nsubj', ('it', 'PRP'))
(('been', 'VBN'), 'aux', ('has', 'VBZ'))
(('been', 'VBN'), 'advcl', ('charges', 'VBZ'))
(('charges', 'VBZ'), 'mark', ('while', 'IN'))
(('charges', 'VBZ'), 'advcl', ('bought', 'VBD'))
(('bought', 'VBD'), 'nsubj', ('that', 'WDT'))
(('bought', 'VBD'), 'dobj', ('this', 'DT'))
(('bought', 'VBD'), 'cc', ('but', 'CC'))
(('bought', 'VBD'), 'conj', ('love', 'VBP'))
(('love', 'VBP'), 'dobj', ('it', 'PRP'))
(('charges', 'VBZ'), 'nsubj', ('it', 'PRP'))
(('charges', 'VBZ'), 'advmod', ('well', 'RB'))
(('charges', 'VBZ'), 'cc', ('and', 'CC'))
(('charges', 'VBZ'), 'conj', ('stays', 'VBZ'))
(('stays', 'VBZ'), 'nsubj', ('charge', 'NN'))
(('charge', 'NN'), 'amod', ('quick', 'JJ'))
(('stays', 'VBZ'), 'xcomp', ('good', 'JJ'))
(('good', 'JJ'), 'cc', ('but', 'CC'))
(('good', 'JJ'), 'conj', ('not', 'RB'))
(('good', 'JJ'), 'dep', ('long', 'JJ'))
(('long', 'JJ'), 'advmod', ('too', 'RB'))
(('been', 'VBN'), 'cc', ('and', 'CC'))
(('been', 'VBN'), 'conj', ('sucks', 'VBZ'))


In [70]:
review_list = []
def stanford_parser(corpus_clu_wordlist):
    for i in range(len(corpus_clu_wordlist)):
         review_list.append(eng_parser.parse(corpus_clu_wordlist[i]))
    return review_list;

In [72]:
review_list = stanford_parser(corpus_clu0_wordlist)

StopIteration: 

In [73]:
for i in range(len(review_list)):
    for row in review_list[i].triples():
        print(row)

AttributeError: 'list_iterator' object has no attribute 'triples'

http://stackoverflow.com/questions/7443330/how-do-i-do-dependency-parsing-in-nltk

http://www.nltk.org/_modules/nltk/parse/stanford.html