In [14]:
from __future__ import unicode_literals
from pymongo import MongoClient
from pprint import pprint
import re
from collections import defaultdict, Counter
import cPickle as pickle

# 데이터 로드

In [5]:
# Open a MongoDB connection with MongoClient's default arguments, then select
# the 'headphone_merged' collection of the 'amazon' database using
# attribute-style access.
client = MongoClient()
db = client.amazon
collection = db.headphone_merged

# 처리 & DB 업데이트

In [6]:
import spacy.en
from spacy.parts_of_speech import ADV

# Instantiate spaCy's English pipeline once; the counting cell further down
# calls `nlp(text)` on each document's text.
nlp = spacy.en.English()

# 품사 태그별 집계: JJ(형용사), NN(명사), VB(동사)

In [7]:
# Re-create the MongoDB handles (same 'amazon' database and
# 'headphone_merged' collection as the data-load cell at the top of the
# notebook) -- presumably so this section can be run on its own.
client = MongoClient()
db = client.amazon
collection = db.headphone_merged

In [8]:
# Query every document in the collection, requesting only the fields named in
# the projection, and materialise the cursor into an in-memory list.
projected_fields = ('title_item', 'text', 'features', 'review_count', 'helpful_vote_count')
cursor = collection.find(projection=projected_fields)
docs = list(cursor)

In [9]:
# Number of documents fetched into `docs`.
len(docs)

43942

In [11]:
# Per-item lemma frequency tables, keyed by the document's 'title_item':
#   item_noun_counter[item][lemma]      -> count of tokens tagged 'NN'
#   item_verb_counter[item][lemma]      -> count of tokens tagged 'VB'
#   item_adjective_counter[item][lemma] -> count of tokens tagged 'JJ'
item_noun_counter = defaultdict(Counter)
item_verb_counter = defaultdict(Counter)
item_adjective_counter = defaultdict(Counter)

for doc in docs:
    tokens = nlp(doc['text'])
    item = doc['title_item']

    # Indexing the defaultdicts here registers the item in all three tables
    # even when the text yields no matching tokens.
    counters_by_tag = {
        'NN': item_noun_counter[item],
        'VB': item_verb_counter[item],
        'JJ': item_adjective_counter[item],
    }

    # Single pass over the tokens, incrementing the target Counter in place.
    # `counter += Counter(...)` would re-scan (and, on Python 2, rebuild) the
    # item's whole accumulated Counter for every document.
    for tok in tokens:
        counter = counters_by_tag.get(tok.tag_)
        if counter is None:
            # Tag is not one of NN / VB / JJ.
            continue
        lemma = tok.lemma_
        # Ignore lemmas that are empty or whitespace-only.
        if lemma.strip():
            counter[lemma] += 1

In [None]:
# Serialise each per-item counter table to its own pickle file in the current
# working directory, using the highest pickle protocol available.
pickle_targets = (
    ('./item_noun_counter.pickle', item_noun_counter),
    ('./item_verb_counter.pickle', item_verb_counter),
    ('./item_adjective_counter.pickle', item_adjective_counter),
)

for pickle_path, counter_table in pickle_targets:
    with open(pickle_path, 'wb') as handle:
        pickle.dump(counter_table, handle, pickle.HIGHEST_PROTOCOL)

---

# 참고
## POS 태그셋

```
POS_TAGS = {
    'NULL': (NO_TAG, {}),
    'EOL': (EOL, {}),
    'CC': (CONJ, {}),
    'CD': (NUM, {}),
    'DT': (DET, {}),
    'EX': (DET, {}),
    'FW': (X, {}),
    'IN': (ADP, {}),
    'JJ': (ADJ, {}),
    'JJR': (ADJ, {'misc': COMPARATIVE}),
    'JJS': (ADJ, {'misc': SUPERLATIVE}),
    'LS': (X, {}),
    'MD': (VERB, {'tenspect': MODAL}),
    'NN': (NOUN, {}),
    'NNS': (NOUN, {'number': PLURAL}),
    'NNP': (NOUN, {'misc': NAME}),
    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
    'PDT': (DET, {}),
    'POS': (PRT, {'case': GENITIVE}),
    'PRP': (PRON, {}),
    'PRP$': (PRON, {'case': GENITIVE}),
    'RB': (ADV, {}),
    'RBR': (ADV, {'misc': COMPARATIVE}),
    'RBS': (ADV, {'misc': SUPERLATIVE}),
    'RP': (PRT, {}),
    'SYM': (X, {}),
    'TO': (PRT, {}),
    'UH': (X, {}),
    'VB': (VERB, {}),
    'VBD': (VERB, {'tenspect': PAST}),
    'VBG': (VERB, {'tenspect': ING}),
    'VBN': (VERB, {'tenspect': PASSIVE}),
    'VBP': (VERB, {'tenspect': PRESENT}),
    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    'WDT': (DET, {'misc': RELATIVE}),
    'WP': (PRON, {'misc': RELATIVE}),
    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
    'WRB': (ADV, {'misc': RELATIVE}),
    '!': (PUNCT, {}),
    '#': (PUNCT, {}),
    '$': (PUNCT, {}),
    "''": (PUNCT, {}),
    "(": (PUNCT, {}),
    ")": (PUNCT, {}),
    "-LRB-": (PUNCT, {}),
    "-RRB-": (PUNCT, {}),
    ".": (PUNCT, {}),
    ",": (PUNCT, {}),
    "``": (PUNCT, {}),
    ":": (PUNCT, {}),
    "?": (PUNCT, {}),
    "ADD": (X, {}),
    "NFP": (PUNCT, {}),
    "GW": (X, {}),
    "AFX": (X, {}),
    "HYPH": (PUNCT, {}),
    "XX": (X, {}),
    "BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    "HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    "SP": (SPACE, {})
}
```