In [1]:
import nltk
from pprint import pprint

## 1. Basic tokenization
- sent_tokenize
- word_tokenize
- wordpunct_tokenize

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize

# Three period-terminated sentences: sent_tokenize returns one string per sentence.
sentences = 'Hello. My name is yechan. I like data science'
result = sent_tokenize(sentences)
pprint(result)

['Hello.', 'My name is yechan.', 'I like data science']


In [3]:
# Same words without any punctuation: the whole text comes back as a single sentence.
sentences = 'Hello My name is yechan I like data science'
result = sent_tokenize(sentences)
pprint(result)

['Hello My name is yechan I like data science']


In [4]:
# The comma after 'Hello' does not start a new sentence; the '.' and '?' do.
sentences = 'Hello, My name is yechan. How are you? I like data science'
result = sent_tokenize(sentences)
pprint(result)

['Hello, My name is yechan.', 'How are you?', 'I like data science']


In [5]:
# word_tokenize splits the text into words and emits each '.' as its own token.
sentences = 'Hello. My name is yechan. I like data science'
result = word_tokenize(sentences)
pprint(result)

['Hello',
 '.',
 'My',
 'name',
 'is',
 'yechan',
 '.',
 'I',
 'like',
 'data',
 'science']


In [6]:
# Sample lyrics containing the contraction "don't", which the two tokenizers
# below split differently.
sentences = """
All she talking about is come and see me for once
Come and see em for once
You don't ever come to me, you don't ever come to me
All she ever say is come and see me for once
Come and see me for once
You don't ever come to me, you don't ever come to me
"""

# Tokenize the same text both ways so the results can be compared:
# word_tokenize gives 'do' + "n't", wordpunct_tokenize gives 'don' + "'" + 't'.
word_tokenize_result = word_tokenize(sentences)
wordpunct_tokenized_result = wordpunct_tokenize(sentences)

print("Result of word_tokenize")
pprint(word_tokenize_result)

# Blank lines separating the two listings.
print('\n')
print("Result of wordpunct_tokenize")
pprint(wordpunct_tokenized_result)

Result of word_tokenize
['All',
 'she',
 'talking',
 'about',
 'is',
 'come',
 'and',
 'see',
 'me',
 'for',
 'once',
 'Come',
 'and',
 'see',
 'em',
 'for',
 'once',
 'You',
 'do',
 "n't",
 'ever',
 'come',
 'to',
 'me',
 ',',
 'you',
 'do',
 "n't",
 'ever',
 'come',
 'to',
 'me',
 'All',
 'she',
 'ever',
 'say',
 'is',
 'come',
 'and',
 'see',
 'me',
 'for',
 'once',
 'Come',
 'and',
 'see',
 'me',
 'for',
 'once',
 'You',
 'do',
 "n't",
 'ever',
 'come',
 'to',
 'me',
 ',',
 'you',
 'do',
 "n't",
 'ever',
 'come',
 'to',
 'me']


Result of wordpunct_tokenize
['All',
 'she',
 'talking',
 'about',
 'is',
 'come',
 'and',
 'see',
 'me',
 'for',
 'once',
 'Come',
 'and',
 'see',
 'em',
 'for',
 'once',
 'You',
 'don',
 "'",
 't',
 'ever',
 'come',
 'to',
 'me',
 ',',
 'you',
 'don',
 "'",
 't',
 'ever',
 'come',
 'to',
 'me',
 'All',
 'she',
 'ever',
 'say',
 'is',
 'come',
 'and',
 'see',
 'me',
 'for',
 'once',
 'Come',
 'and',
 'see',
 'me',
 'for',
 'once',
 'You',
 'don',
 "'",
 't

## 2. Part-of-speech (POS) tagging

In [7]:
# Sample lyrics to tag; each token is paired with a part-of-speech tag.
sentences = """
All she talking bout is come and see me for once
Come and see me for once
You don't ever come to me, you don't ever come to me
All she ever say is come and see me for once
Come and see me for once
You don't ever come to me, you don't ever come to me
"""

# NOTE(review): wordpunct_tokenize splits "don't" into 'don', "'", 't', and the
# output below tags the stray 't' as JJ. word_tokenize (which keeps "n't"
# together) may be a better input for pos_tag -- confirm before changing.
pos_result = nltk.pos_tag(nltk.tokenize.wordpunct_tokenize(sentences))
pprint(pos_result)

[('All', 'DT'),
 ('she', 'PRP'),
 ('talking', 'VBG'),
 ('bout', 'NN'),
 ('is', 'VBZ'),
 ('come', 'VBN'),
 ('and', 'CC'),
 ('see', 'VB'),
 ('me', 'PRP'),
 ('for', 'IN'),
 ('once', 'RB'),
 ('Come', 'NNP'),
 ('and', 'CC'),
 ('see', 'VB'),
 ('me', 'PRP'),
 ('for', 'IN'),
 ('once', 'RB'),
 ('You', 'PRP'),
 ('don', 'VBP'),
 ("'", "''"),
 ('t', 'JJ'),
 ('ever', 'RB'),
 ('come', 'VBP'),
 ('to', 'TO'),
 ('me', 'PRP'),
 (',', ','),
 ('you', 'PRP'),
 ('don', 'VBP'),
 ("'", "''"),
 ('t', 'JJ'),
 ('ever', 'RB'),
 ('come', 'VBP'),
 ('to', 'TO'),
 ('me', 'PRP'),
 ('All', 'PDT'),
 ('she', 'PRP'),
 ('ever', 'RB'),
 ('say', 'VBP'),
 ('is', 'VBZ'),
 ('come', 'JJ'),
 ('and', 'CC'),
 ('see', 'VB'),
 ('me', 'PRP'),
 ('for', 'IN'),
 ('once', 'RB'),
 ('Come', 'NNP'),
 ('and', 'CC'),
 ('see', 'VB'),
 ('me', 'PRP'),
 ('for', 'IN'),
 ('once', 'RB'),
 ('You', 'PRP'),
 ('don', 'VBP'),
 ("'", "''"),
 ('t', 'JJ'),
 ('ever', 'RB'),
 ('come', 'VBP'),
 ('to', 'TO'),
 ('me', 'PRP'),
 (',', ','),
 ('you', 'PRP'),
 ('don'

## 3. Normalize
- Stemming
- Lemmatization

In [8]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

# Tokenize one sample sentence; `text` is reused by the lemmatizer cell below.
text = list(nltk.word_tokenize(
    'The women running in the fog passed bunnies working as computer scientists'
))

# One instance of each stemmer under comparison.
snowball = SnowballStemmer('english')
lancaster = LancasterStemmer()
porter = PorterStemmer()

# For every stemmer, show the stemmed tokens joined as a sentence, then as a list.
for stemmer in (snowball, lancaster, porter):
    stemmed_text = list(map(stemmer.stem, text))
    print(" ".join(stemmed_text))
    print(stemmed_text)

the women run in the fog pass bunni work as comput scientist
['the', 'women', 'run', 'in', 'the', 'fog', 'pass', 'bunni', 'work', 'as', 'comput', 'scientist']
the wom run in the fog pass bunny work as comput sci
['the', 'wom', 'run', 'in', 'the', 'fog', 'pass', 'bunny', 'work', 'as', 'comput', 'sci']
the women run in the fog pass bunni work as comput scientist
['the', 'women', 'run', 'in', 'the', 'fog', 'pass', 'bunni', 'work', 'as', 'comput', 'scientist']


In [9]:
from nltk.stem.wordnet import WordNetLemmatizer

# Sentence held (already tokenized) in `text`:
# The women running in the fog passed bunnies working as computer scientists

# Lemmatize every token without passing a part-of-speech tag. In the output
# below 'running' and 'passed' stay unchanged and 'as' becomes 'a'; the
# POS-aware variant is the normalize() function defined further down.
lemmatizer = WordNetLemmatizer()
lemmas = list(map(lemmatizer.lemmatize, text))
print(" ".join(lemmas))

The woman running in the fog passed bunny working a computer scientist


In [10]:
import string
from nltk.corpus import wordnet as wn

## Module constants used by normalize() below
lemmatizer = WordNetLemmatizer()

# Build the set of English stopwords from NLTK's stopwords corpus
stopwords = set(nltk.corpus.stopwords.words('english'))
# String of punctuation characters (its value is displayed in a later cell)
punctuation = string.punctuation

def tagwn(tag):
    """Map a POS tag (as produced by ``nltk.pos_tag``) to a WordNet POS constant.

    Only the first character of ``tag`` is inspected: ``N`` -> ``wn.NOUN``,
    ``V`` -> ``wn.VERB``, ``R`` -> ``wn.ADV``, ``J`` -> ``wn.ADJ``.
    Any other tag, including an empty string or ``None``, falls back to
    ``wn.NOUN``.
    """
    # Slice rather than index so an empty (or None) tag cannot raise.
    initial = (tag or '')[:1]
    return {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ
    }.get(initial, wn.NOUN)


def normalize(text):
    """Yield the lower-cased, lemmatized content tokens of ``text``.

    The text is split with ``nltk.wordpunct_tokenize`` and tagged with
    ``nltk.pos_tag``. Stopwords and punctuation tokens are skipped; every
    remaining token is lemmatized with the WordNet POS given by ``tagwn``.
    """
    for token, tag in nltk.pos_tag(nltk.wordpunct_tokenize(text)):
        token = token.lower()
        if token in stopwords:
            continue
        # `punctuation` is a string, so `token in punctuation` would be a
        # substring test and let punctuation runs such as '...' or '!!'
        # through. Check every character of the token instead.
        if all(char in punctuation for char in token):
            continue
        yield lemmatizer.lemmatize(token, tagwn(tag))
        
# Demo: the stopwords and the '!' are dropped, and 'flies' is lemmatized to 'fly'.
print(list(normalize('The eagle ! up a flies at midnight')))

['eagle', 'fly', 'midnight']


In [11]:
# Show the punctuation characters that normalize() filters against.
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
# Show the English stopword set built above (a set, so the order is arbitrary).
print(stopwords)

{'of', 'our', 'she', 'o', 'haven', 'that', 'over', 'nor', 'to', 'myself', 'we', 'as', 'all', 'against', 'for', 'some', 'won', 'did', 'do', 'it', 'few', 'are', 'where', 'the', 'themselves', 'out', 'during', 'down', 'your', 'at', 'hers', 'in', 'other', 'about', 'having', 'its', 'yours', 'my', 'with', 'too', 'not', 'has', 'each', 'until', 'below', 'there', 'didn', 'while', 'am', 'was', 'before', 'only', 'because', 'd', 'on', 'have', 'then', 'theirs', 'mightn', 'yourself', 'he', 'be', 'doesn', 'shan', 'from', 'which', 'same', 'himself', 'had', 'their', 'any', 'such', 'a', 'after', 'but', 'mustn', 'can', 'off', 'once', 'don', 'they', 'hasn', 'will', 'm', 'hadn', 'y', 'just', 'i', 're', 'this', 'herself', 'should', 'weren', 'isn', 'an', 'ours', 'further', 'shouldn', 'if', 'these', 'into', 'aren', 'up', 'll', 'again', 'here', 's', 'why', 'through', 'no', 'them', 'how', 'her', 'above', 'couldn', 'so', 'needn', 'wasn', 'is', 'were', 'his', 'most', 'own', 'wouldn', 'under', 'been', 'who', 'more'

## 4. Named-entity recognition (NER)
- Maximum entropy based NER
- Stanford NER packages

In [13]:
# Named-entity chunking in three explicit steps: tokenize, POS-tag, chunk.
text = "LG electronics released the smart phone 'G6' in April, 2017."
ner_tokens = nltk.wordpunct_tokenize(text)
ner_tagged = nltk.pos_tag(ner_tokens)
print(nltk.ne_chunk(ner_tagged))

(S
  LG/NNP
  electronics/NNS
  released/VBD
  the/DT
  smart/JJ
  phone/NN
  '/''
  G6/NNP
  '/POS
  in/IN
  (GPE April/NNP)
  ,/,
  2017/CD
  ./.)



```python
from nltk.tag import StanfordNERTagger

# Paths follow the layout of the official stanford-ner-2016-10-31 archive
# (classifier models live under `classifiers/` and end in `.crf.ser.gz`).
# Adjust them if the archive was extracted somewhere else.
stanford_data = 'stanford-ner-2016-10-31/classifiers/english.all.3class.distsim.crf.ser.gz'
stanford_jar = 'stanford-ner-2016-10-31/stanford-ner-3.7.0.jar'

text = "Samsung electronics Microsoft research GE LG Baidu Amazon"
st = StanfordNERTagger(stanford_data, stanford_jar, 'utf-8')
# tag() returns (token, label) pairs; print each one as "[label]token".
for token, label in st.tag(text.split()):
    print('[' + label + ']' + token)
```

## 5. Parsing
- Parsing using a grammar
- StanfordParser

In [14]:
# Toy context-free grammar: a sentence (S) is a noun phrase (NP), optionally
# followed by a period. The last four rules list the only words the grammar
# knows (determiners, nouns, adjectives and the period).
grammar = nltk.grammar.CFG.fromstring(
"""
    S -> NP PUNCT | NP
    NP -> N N | ADJP NP | DET N | DET ADJP
    ADJP -> ADJ NP | ADJ N
    
    DET -> 'an' | 'the' | 'a' | 'that'
    N -> 'airplane' | 'runaway' | 'lawn' | 'chair' | 'person'
    ADJ -> 'red' | 'slow' | 'tired' | 'long'
    PUNCT -> '.'
""")

In [15]:
def parse(sent):
    """Yield each parse tree of ``sent`` under the module-level ``grammar``.

    Parses a sentence whose words are included in the grammar. The sentence
    is lower-cased and split with ``nltk.word_tokenize`` before it is handed
    to a ``ChartParser`` built from ``grammar``.
    """
    lowered = sent.lower()
    chart_parser = nltk.parse.ChartParser(grammar)
    yield from chart_parser.parse(nltk.word_tokenize(lowered))
        
# Print every parse tree the grammar yields for a sample phrase.
for tree in parse("the long runaway"):
    tree.pprint()

(S (NP (DET the) (ADJP (ADJ long) (N runaway))))


```python
from nltk.parse.stanford import StanfordParser

stanford_model = 'stanford-parser-full-2016-10-31/stanford-parser-3.7.0-models.jar'
stanford_jar = 'stanford-parser-full-2016-10-31/stanford-parser.jar'

# StanfordParser expects the parser jar first and the models jar second, so
# pass both by keyword instead of relying on positional order.
st = StanfordParser(path_to_jar=stanford_jar, path_to_models_jar=stanford_model)
sent = "The man hit the building with the baseball bat."
# parse() takes an already tokenized sentence and yields its parse trees.
for tree in st.parse(nltk.wordpunct_tokenize(sent)):
    tree.pprint()
#     tree.draw()
```