## 형태소 분석과 표제어 추출

In [1]:
import spacy

In [2]:
# Load the 'en_core_web_sm' spaCy pipeline once; `nlp` is the callable that
# the cells and helper functions below use to parse text.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Example sentence whose token annotations are inspected below.
text = 'Wikipedia is maintained by volunteers'

In [4]:
# Run the pipeline on the example sentence; the result is iterated token by token.
doc = nlp(text)

In [5]:
# Print one line per token. Columns, in order:
#   text    - the token as written
#   lemma_  - lemma (dictionary form)
#   pos_    - coarse-grained part of speech
#   tag_    - fine-grained part of speech
#   dep_    - syntactic dependency relation
#   is_stop - whether the token is a stop word
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_,
          token.dep_, token.is_stop)

Wikipedia Wikipedia PROPN NNP nsubjpass False
is be AUX VBZ auxpass True
maintained maintain VERB VBN ROOT False
by by ADP IN agent True
volunteers volunteer NOUN NNS pobj False


In [6]:
# Look up the human-readable description of the 'PROPN' label.
spacy.explain('PROPN')

'proper noun'

- 품사: https://universaldependencies.org/u/pos/all.html
- 자세한 영어 품사: https://www.clips.uantwerpen.be/pages/mbsp-tags
- 의존관계: https://universaldependencies.org/u/dep/index.html

## 특정 품사로 단어 문서 행렬 만들기

In [7]:
def extract_nv(text):
    """Return lowercased lemmas of tokens whose fine-grained tag starts with N or V.

    The text is parsed with the module-level ``nlp`` pipeline. A token is kept
    when ``token.tag_`` begins with ``'N'`` or ``'V'`` (intended to select
    nouns and verbs); every other token is dropped.

    Args:
        text: The string to analyze.

    Returns:
        A list of lowercased lemma strings, in token order.
    """
    # startswith() is simply False for an empty tag, whereas indexing
    # tag_[0] would raise IndexError on a token that carries no tag.
    return [
        token.lemma_.lower()
        for token in nlp(text)
        if token.tag_.startswith(('N', 'V'))
    ]

In [8]:
# Quick check of extract_nv on a short sentence.
extract_nv('Apple is a company')

['apple', 'be', 'company']

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Count vectorizer limited to 500 features, tokenizing with extract_nv and
# using scikit-learn's built-in English stop-word list.
# token_pattern is set to None explicitly: a custom tokenizer replaces the
# regex-based one, and leaving the default pattern in place makes
# scikit-learn warn that the parameter is unused.
cv = CountVectorizer(
    max_features=500,
    tokenizer=extract_nv,
    stop_words='english',
    token_pattern=None,
)

In [11]:
import pandas as pd

In [12]:
# Read the data from 'imdb.xlsx', using the first column as the index.
df = pd.read_excel('imdb.xlsx', index_col=0)

In [13]:
# Fit the vectorizer on the 'review' column and build the count matrix.
tdm = cv.fit_transform(df['review'])



In [14]:
# Word-frequency table: one row per vocabulary term ('단어') with its total
# count over all documents ('빈도').
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
wc = pd.DataFrame({
    '단어': (
        cv.get_feature_names_out()
        if hasattr(cv, 'get_feature_names_out')
        else cv.get_feature_names()
    ),
    # Column sums of the count matrix, flattened to one value per term.
    '빈도': tdm.sum(axis=0).flat,
})

In [15]:
# Show the table ordered from the most to the least frequent word.
wc.sort_values('빈도', ascending=False)

Unnamed: 0,단어,빈도
260,movie,211
118,film,189
45,character,59
230,make,58
480,watch,48
...,...,...
331,read,2
332,reality,2
333,realize,2
140,gets,2


## 과제

In [16]:
def lemma_lower(text):
    """Return the lowercased lemma of every token in *text*.

    The text is parsed with the module-level ``nlp`` pipeline and no token is
    filtered out, so each token the pipeline produces contributes one entry
    to the result.

    Args:
        text: The string to analyze.

    Returns:
        A list of lowercased lemma strings, in token order.
    """
    return [token.lemma_.lower() for token in nlp(text)]

In [18]:
# Count vectorizer limited to 500 features, tokenizing with lemma_lower and
# using scikit-learn's built-in English stop-word list.
# token_pattern is set to None explicitly: a custom tokenizer replaces the
# regex-based one, and leaving the default pattern in place makes
# scikit-learn warn that the parameter is unused.
cv = CountVectorizer(
    max_features=500,
    tokenizer=lemma_lower,
    stop_words='english',
    token_pattern=None,
)

In [19]:
# Fit the vectorizer on the 'review' column and build the count matrix.
tdm = cv.fit_transform(df['review'])



In [20]:
# Word-frequency table: one row per vocabulary term ('단어') with its total
# count over all documents ('빈도').
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
wc = pd.DataFrame({
    '단어': (
        cv.get_feature_names_out()
        if hasattr(cv, 'get_feature_names_out')
        else cv.get_feature_names()
    ),
    # Column sums of the count matrix, flattened to one value per term.
    '빈도': tdm.sum(axis=0).flat,
})

In [21]:
# Show the table ordered from the most to the least frequent word.
wc.sort_values('빈도', ascending=False)

Unnamed: 0,단어,빈도
17,.,904
1,,752
14,",",648
2,\t,252
0,\n,251
...,...,...
397,shakespear,3
230,huston,3
314,particular,3
120,decent,3
