In [43]:
from __future__ import print_function, unicode_literals, division
import cPickle as pickle
from pprint import pprint
from pymongo import MongoClient
import numpy as np
from collections import Counter
from spacy.en import English
import math
from pprint import pprint

# 데이터 통계

In [2]:
# Open a client with pymongo's default connection settings and select the
# 'amazon' database.
client = MongoClient()
db = client['amazon']

# Read every document of the 'headphone_detail' collection into a list so the
# following cells can iterate over the items repeatedly.
headphone_cursor = db['headphone_detail'].find()
headphones = list(headphone_cursor)

In [3]:
# One 'review_count' value per headphone document, collected into a NumPy
# array for the summary statistics computed in the next cells.
review_counts_per_item = (item['review_count'] for item in headphones)
review_count_list = np.array(list(review_counts_per_item))

## Mean

In [4]:
# Arithmetic mean of the per-item review counts.
review_count_list.mean()

344.20022350530826

## Median

In [5]:
# Median of the per-item review counts.
np.median(review_count_list)

66.0

## Max, Min

In [6]:
# Largest per-item review count.
review_count_list.max()

23484

In [7]:
# Smallest per-item review count.
review_count_list.min()

10

# 단어 사용 통계

## 아이템 별 단어 통계 로드

In [8]:
def _load_pickle(path):
    """Return the object unpickled from the binary file at ``path``.

    NOTE(review): unpickling can execute arbitrary code, so this must only be
    pointed at files produced by a trusted source.
    """
    with open(path, 'rb') as fin:
        return pickle.load(fin)


# Per-item word statistics, stored as one pickle per part of speech.
item_noun_counter = _load_pickle('./output/item_noun_counter.pickle')
item_verb_counter = _load_pickle('./output/item_verb_counter.pickle')
item_adjective_counter = _load_pickle('./output/item_adjective_counter.pickle')

## 전체 단어 통계 계산

In [24]:
def _sum_counters(counters):
    """Return one ``Counter`` holding the summed counts of all ``counters``.

    ``Counter.update`` adds counts in place. The previous ``total += c`` form
    had to rebuild or rescan the whole accumulated counter on every step,
    which makes the aggregation quadratic in the number of items.

    Unlike ``+``, ``update`` keeps entries whose count is zero or negative.
    NOTE(review): this assumes the per-item counters only hold positive word
    counts -- confirm against the code that writes the pickles.
    """
    total = Counter()
    for counter in counters:
        total.update(counter)
    return total


# Word frequencies over all items, one Counter per part of speech.
noun_counter = _sum_counters(item_noun_counter.values())
verb_counter = _sum_counters(item_verb_counter.values())
adjective_counter = _sum_counters(item_adjective_counter.values())

## 출현 경향이 일반적인 상황과 현저히 다른 단어 추출

- headphone에서 100번 이상 출현한 단어만 고려
- 점수 = log(headphone에서의 상대 빈도) − 일반 영어에서의 로그 확률 (spaCy `nlp.vocab[word].prob`)
- 점수 내림차순으로 정렬

In [31]:
# Instantiate spaCy's English pipeline; the score functions in this notebook
# read per-word entries from its vocabulary via nlp.vocab[word].prob.
nlp = English()

In [46]:
# Total number of word occurrences per part of speech. The score functions
# divide a word's raw count by these totals to get its relative frequency.
noun_total_count, verb_total_count, adjective_total_count = (
    sum(pos_counter.values())
    for pos_counter in (noun_counter, verb_counter, adjective_counter)
)

In [37]:
def noun_score(word, counter=None, total_count=None, vocab=None):
    """Score how over-represented ``word`` is among headphone nouns.

    The score is ``log(count / total_count) - vocab[word].prob``. Both terms
    are log-probabilities (spaCy documents ``Lexeme.prob`` as a smoothed log
    probability), so the result is a log-ratio: positive values mean the word
    is relatively more frequent here than in spaCy's general English model.

    Args:
        word: The word to score.
        counter: Mapping of word -> occurrence count. Defaults to the
            notebook-level ``noun_counter``.
        total_count: Total number of occurrences used as the denominator.
            Defaults to ``noun_total_count`` when ``counter`` is not given,
            otherwise to the sum of ``counter``'s values.
        vocab: Object supporting ``vocab[word].prob``. Defaults to
            ``nlp.vocab``.

    Returns:
        The log-ratio score as a float.

    Raises:
        ValueError: If ``word`` has no positive count, since the log of a
            zero frequency is undefined.
    """
    if total_count is None:
        total_count = noun_total_count if counter is None else sum(counter.values())
    if counter is None:
        counter = noun_counter
    if vocab is None:
        vocab = nlp.vocab

    count = counter.get(word, 0)
    if count <= 0:
        # math.log(0) would raise an opaque "math domain error"; say why.
        raise ValueError('noun_score: %r has no occurrences in the counter' % (word,))

    log_prob_generic = vocab[word].prob
    # True division: the notebook enables it via `from __future__ import division`.
    log_prob_headphone = math.log(count / total_count)

    return log_prob_headphone - log_prob_generic

In [47]:
def verb_score(word, counter=None, total_count=None, vocab=None):
    """Score how over-represented ``word`` is among headphone verbs.

    The score is ``log(count / total_count) - vocab[word].prob``. Both terms
    are log-probabilities (spaCy documents ``Lexeme.prob`` as a smoothed log
    probability), so the result is a log-ratio: positive values mean the word
    is relatively more frequent here than in spaCy's general English model.

    Args:
        word: The word to score.
        counter: Mapping of word -> occurrence count. Defaults to the
            notebook-level ``verb_counter``.
        total_count: Total number of occurrences used as the denominator.
            Defaults to ``verb_total_count`` when ``counter`` is not given,
            otherwise to the sum of ``counter``'s values.
        vocab: Object supporting ``vocab[word].prob``. Defaults to
            ``nlp.vocab``.

    Returns:
        The log-ratio score as a float.

    Raises:
        ValueError: If ``word`` has no positive count, since the log of a
            zero frequency is undefined.
    """
    if total_count is None:
        total_count = verb_total_count if counter is None else sum(counter.values())
    if counter is None:
        counter = verb_counter
    if vocab is None:
        vocab = nlp.vocab

    count = counter.get(word, 0)
    if count <= 0:
        # math.log(0) would raise an opaque "math domain error"; say why.
        raise ValueError('verb_score: %r has no occurrences in the counter' % (word,))

    log_prob_generic = vocab[word].prob
    # True division: the notebook enables it via `from __future__ import division`.
    log_prob_headphone = math.log(count / total_count)

    return log_prob_headphone - log_prob_generic

In [48]:
def adjective_score(word, counter=None, total_count=None, vocab=None):
    """Score how over-represented ``word`` is among headphone adjectives.

    The score is ``log(count / total_count) - vocab[word].prob``. Both terms
    are log-probabilities (spaCy documents ``Lexeme.prob`` as a smoothed log
    probability), so the result is a log-ratio: positive values mean the word
    is relatively more frequent here than in spaCy's general English model.

    Args:
        word: The word to score.
        counter: Mapping of word -> occurrence count. Defaults to the
            notebook-level ``adjective_counter``.
        total_count: Total number of occurrences used as the denominator.
            Defaults to ``adjective_total_count`` when ``counter`` is not
            given, otherwise to the sum of ``counter``'s values.
        vocab: Object supporting ``vocab[word].prob``. Defaults to
            ``nlp.vocab``.

    Returns:
        The log-ratio score as a float.

    Raises:
        ValueError: If ``word`` has no positive count, since the log of a
            zero frequency is undefined.
    """
    if total_count is None:
        total_count = adjective_total_count if counter is None else sum(counter.values())
    if counter is None:
        counter = adjective_counter
    if vocab is None:
        vocab = nlp.vocab

    count = counter.get(word, 0)
    if count <= 0:
        # math.log(0) would raise an opaque "math domain error"; say why.
        raise ValueError('adjective_score: %r has no occurrences in the counter' % (word,))

    log_prob_generic = vocab[word].prob
    # True division: the notebook enables it via `from __future__ import division`.
    log_prob_headphone = math.log(count / total_count)

    return log_prob_headphone - log_prob_generic

### 형용사

In [61]:
# Keep only adjectives that occur at least 100 times in the headphone data.
target_adjectives = [
    word for word, count in adjective_counter.most_common() if count >= 100
]

# Build (word, score, raw count) triples and order them by score, highest first.
target_adjectives_with_score = [
    (word, adjective_score(word), adjective_counter[word])
    for word in target_adjectives
]
target_adjectives_with_score.sort(key=lambda item: item[1], reverse=True)

# Print the top 100 as tab-separated rows: word, score, raw count.
for row in target_adjectives_with_score[:100]:
    print('%s\t%s\t%s' % row)

the-ear	10.1864513437	786
circumaural	9.06861341078	148
skullcandy	8.71561507002	286
sibilant	8.44130454899	147
tinny	8.36388119133	696
foldable	7.7700201081	232
detachable	7.67511088889	993
earbud	7.6530868457	293
treble	7.52197962322	1272
punchy	7.49346205337	460
rubberized	7.31851041395	148
bassy	7.29367619762	213
crisp	7.20189771365	2786
muffled	7.18613264818	611
velour	7.15417202146	124
aural	7.04959289411	199
pliable	6.88580773278	163
snug	6.86219729901	833
rubbery	6.83743609411	297
airy	6.82491303992	273
sturdy	6.81182597596	1763
durable	6.74818592869	2471
overpowering	6.71488505661	539
stylish	6.68025834413	971
muddy	6.67284697119	1114
rechargeable	6.60796125546	309
adjustable	6.50140857601	1025
lightweight	6.43907093142	1369
shrill	6.4330706215	207
portable	6.41079888909	2513
bluetooth	6.31630909379	986
flimsy	6.31534798358	735
retractable	6.29886929732	168
sleek	6.28948291113	508
padded	6.22374659836	539
bulky	6.14634388458	896
spacious	6.14415081794	174
defective	6.098260737

### 동사

In [62]:
# Keep only verbs that occur at least 100 times in the headphone data.
target_verbs = [
    word for word, count in verb_counter.most_common() if count >= 100
]

# Build (word, score, raw count) triples and order them by score, highest first.
target_verbs_with_score = [
    (word, verb_score(word), verb_counter[word])
    for word in target_verbs
]
target_verbs_with_score.sort(key=lambda item: item[1], reverse=True)

# Print the top 100 as tab-separated rows: word, score, raw count.
for row in target_verbs_with_score[:100]:
    print('%s\t%s\t%s' % row)

tangle	8.3103264372	603
untangle	7.58334256075	165
distort	6.13106991266	194
overpower	5.70226509443	119
isolate	5.70155388322	278
disturb	5.69198906845	149
fold	5.65951001265	804
drown	5.64609740754	485
earbuds	5.58434479872	130
adjust	5.5810265543	1569
loosen	5.47267819856	173
withstand	5.41945614209	182
crank	5.36420961939	294
recharge	5.16397724304	153
listen	5.14154119591	5074
beware	5.11579639783	119
disappoint	5.05710214471	284
pause	4.99910852334	462
plug	4.97466223304	1040
surround	4.96486413491	217
wear	4.95271711988	4774
hear	4.90680001504	9254
slip	4.82630238496	574
replace	4.78691708343	1891
customize	4.78451064731	132
recommend	4.76979642914	4658
connect	4.7353149212	918
attach	4.71483783412	244
reproduce	4.66528441307	215
deliver	4.65546888938	598
compare	4.63131497907	1422
skip	4.63015899028	789
fit	4.58615694654	4104
seal	4.58431867589	300
emphasize	4.52065541522	140
cancel	4.46470973404	363
leak	4.45519973134	281
purchase	4.41851972376	1161
sound	4.36475856808	5141
fa

### 명사

In [63]:
# Keep only nouns that occur at least 100 times in the headphone data.
target_nouns = [
    word for word, count in noun_counter.most_common() if count >= 100
]

# Build (word, score, raw count) triples and order them by score, highest first.
target_nouns_with_score = [
    (word, noun_score(word), noun_counter[word])
    for word in target_nouns
]
target_nouns_with_score.sort(key=lambda item: item[1], reverse=True)

# Print the top 100 as tab-separated rows: word, score, raw count.
for row in target_nouns_with_score[:100]:
    print('%s\t%s\t%s' % row)

earcup	9.68878237715	324
earphone	9.54631687582	1828
hesh	9.39204076977	195
earbud	9.27550963483	2634
the-ear	9.01887560126	434
skullcandy	8.48241186367	402
sibilance	8.40552281725	335
boomy	8.33843284322	520
soundstage	8.29129944839	1811
earpiece	8.28150399891	1202
headphone	8.22745256605	12215
multifunction	8.20385658712	188
headband	8.17660769423	3560
audiophile	8.14410568011	3348
pause/play	8.10000017291	114
basshead	7.93461255847	152
khz	7.93385324235	111
play/pause	7.86941173335	246
earplug	7.73005156902	109
treble	7.72476630912	2765
klipsch	7.69770533546	142
tangle	7.66476566678	1039
bose	7.64414188605	384
cord	7.46724710017	12911
pleather	7.45711726834	240
ear	7.39828972194	38756
earbuds	7.28463155006	2339
bass	7.27577586476	29311
headset	7.15631609335	7778
cancellation	7.09161259115	1828
sennheiser	7.08799925499	258
armature	6.98942505752	154
the-go	6.95023116798	147
tangling	6.926995433	135
cancelation	6.91610593413	139
equalizer	6.91289189622	827
winder	6.73851822334	133
mic