# NLP 기술 빠르게 훑어보기

In [2]:
# Text tokenization with spaCy
import spacy

# If loading the model fails, install it first with:
#     python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
text = "Marry, don't slap the green witch."

# Lowercase the sentence, run it through the pipeline, and print the
# string form of each resulting token.
lowercased_doc = nlp(text.lower())
print([str(token) for token in lowercased_doc])

['marry', ',', 'do', "n't", 'slap', 'the', 'green', 'witch', '.']


In [5]:
from nltk.tokenize import TweetTokenizer

# Sample tweet text that includes a '#' tag, an '@' handle and an emoticon.
tweet = u"Snow White and the Secen Degrees #MAKEAMovieCold@midnight:-)"
tokenizer = TweetTokenizer()

# Lowercase the tweet before tokenizing, then print the token list.
tweet_tokens = tokenizer.tokenize(tweet.lower())
print(tweet_tokens)

['snow', 'white', 'and', 'the', 'secen', 'degrees', '#makeamoviecold', '@midnight', ':-)']


## n-그램 만들기

In [6]:
def n_grams(text, n):
    """
    Return every contiguous n-gram of ``text`` as a list.

    ``text`` may be a list of tokens or a plain string. Each n-gram is the
    slice ``text[start:start + n]``, i.e. a sub-list for list input and a
    substring for string input. The result is empty when ``text`` holds
    fewer than ``n`` items.
    """
    grams = []
    last_start = len(text) - n  # final index at which a full window fits
    for start in range(last_start + 1):
        grams.append(text[start:start + n])
    return grams

# Build and print the trigrams of an already-tokenized, lowercased sentence.
cleaned = ['marry', ',', 'do', "n't", 'slap', 'the', 'green', 'witch', '.']
trigrams = n_grams(cleaned, 3)
print(trigrams)

[['marry', ',', 'do'], [',', 'do', "n't"], ['do', "n't", 'slap'], ["n't", 'slap', 'the'], ['slap', 'the', 'green'], ['the', 'green', 'witch'], ['green', 'witch', '.']]


## 표제어와 어간

In [7]:
# Lemmatization: print each word next to its lemma
nlp = spacy.load('en_core_web_sm')
doc = nlp(u"he was running late")
for token in doc:
    lemma = token.lemma_
    print('{} --> {}'.format(token, lemma))

he --> he
was --> be
running --> run
late --> late


## 단어 분류하기

In [8]:
# Part-of-speech tagging: print each token next to its POS tag
doc = nlp(u"Marry slapped the green witch.")
for token in doc:
    pos_tag = token.pos_
    print('{} --> {}'.format(token, pos_tag))

Marry --> PROPN
slapped --> VERB
the --> DET
green --> ADJ
witch --> NOUN
. --> PUNCT


## 청크 나누기와 개체명 인식

In [9]:
# Print each noun chunk of the parsed sentence together with its label.
for chunk in doc.noun_chunks:
    chunk_label = chunk.label_
    print('{} - {}'.format(chunk, chunk_label))

Marry - NP
the green witch - NP
