# 토큰화 (spaCy와 NLTK)

In [1]:
import spacy

In [2]:
# Load the 'en_core_web_sm' spaCy model; calling `nlp` on a string returns
# an iterable of tokens.
nlp = spacy.load('en_core_web_sm')
# Sample sentence used for the tokenization examples.
text = "Mary, don't slap the green witch."

In [3]:
# Lowercase the sentence, run it through the spaCy pipeline, and print each
# token converted to a plain string.
lowered_doc = nlp(text.lower())
print(list(map(str, lowered_doc)))

['mary', ',', 'do', "n't", 'slap', 'the', 'green', 'witch', '.']


In [4]:
from nltk.tokenize import TweetTokenizer

In [5]:
# Sample tweet containing a hashtag, an @-mention and an emoticon.
tweet = u"Snow White and the Seven Degrees #MakeAMovieCold@midnight:-)"
# NLTK tokenizer intended for tweet-style text.
tokenizer = TweetTokenizer()

In [6]:
# Lowercase the tweet first, then tokenize it and print the resulting tokens.
lowered_tweet = tweet.lower()
print(tokenizer.tokenize(lowered_tweet))

['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':-)']


# 유니그램, 바이그램, 트라이그램, ..., n-그램

In [7]:
def n_grams(text, n):
    """Return every contiguous run of ``n`` items from ``text``.

    Args:
        text: A sliceable sequence, e.g. a list of tokens or a string.
        n: Window size; must be at least 1.

    Returns:
        A list of slices of ``text``, ordered by starting position. The list
        is empty when ``text`` holds fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        # Without this guard n == 0 produces len(text) + 1 empty slices and a
        # negative n produces truncated slices, none of which are n-grams.
        raise ValueError(f"n must be a positive integer, got {n!r}")
    return [text[i:i + n] for i in range(len(text) - n + 1)]

In [8]:
# Lowercased spaCy tokens of the sample sentence, as plain strings.
cleaned = list(map(str, nlp(text.lower())))

In [9]:
# Build the trigrams (n = 3) over the cleaned tokens and print them.
trigrams = n_grams(cleaned, 3)
print(trigrams)

[['mary', ',', 'do'], [',', 'do', "n't"], ['do', "n't", 'slap'], ["n't", 'slap', 'the'], ['slap', 'the', 'green'], ['the', 'green', 'witch'], ['green', 'witch', '.']]


# 표제어 추출

In [10]:
import spacy

In [11]:
# Load the 'en_core_web_sm' spaCy model and analyse a short sentence whose
# tokens are lemmatized below.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u"he was running late")

In [12]:
# Print each token next to its lemma (the token's `lemma_` attribute).
for token in doc:
    print(f'{token} --> {token.lemma_}')

he --> he
was --> be
running --> run
late --> late


# 단어 분류 - 품사 태깅(part-of-speech tagging)

In [13]:
# Load the 'en_core_web_sm' spaCy model and analyse the sentence whose tokens
# are tagged with their part of speech below.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Mary slapped the green witch.")

In [14]:
# Print each token next to its part-of-speech tag (the token's `pos_` attribute).
for token in doc:
    print(f'{token} - {token.pos_}')

Mary - PROPN
slapped - VERB
the - DET
green - ADJ
witch - NOUN
. - PUNCT


# 청크 나누기 - 기준 : 구

In [15]:
# Load the 'en_core_web_sm' spaCy model and analyse the sentence that is
# split into noun chunks below.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u"Mary slapped the green witch.")

In [16]:
# Shallow parsing of noun phrases: print each noun chunk with its label.
for chunk in doc.noun_chunks:
    print(f'{chunk} - {chunk.label_}')

Mary - NP
the green witch - NP
