# 6장. 청킹, 문장 구문 분석, 의존성

## Chunking
 * 청킹(chunking)은 텍스트에서 짧은 구(phrase)를 추출하는 과정
 * 덩이짓기(Chunking, 청킹)은 다시말해, 정보를 의미있는 묶음으로 만드는 것

## Chunker

In [8]:
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')

text = 'Namsan Botanical Garden is a well known botanical garden in Seoul, Korea.'
sentences = nltk.sent_tokenize(text)
print(sentences)
for sentence in sentences:
    print('='*10,sentence,'='*10)
    words = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(words)
    chunks = nltk.ne_chunk(tags) # nltk 에서 제공하는 ne_chunk 함수
    print(chunks)

['Namsan Botanical Garden is a well known botanical garden in Seoul, Korea.']
(S
  (PERSON Namsan/NNP)
  (PERSON Botanical/NNP Garden/NNP)
  is/VBZ
  a/DT
  well/RB
  known/VBN
  botanical/JJ
  garden/NN
  in/IN
  (GPE Seoul/NNP)
  ,/,
  (GPE Korea/NNP)
  ./.)


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


## Simple Chunker

In [4]:
import nltk
nltk.download('punkt')

text = 'Ravi is the CEO of a Company. He is very powerful public speaker also.'

# text free grammar 
# 명사구를 추출할 것이다. 한정사+고유명사, 형용사+명사, 명사가 한개 이상 인 것들을 명사구로 하겠다.
# 직접 규칙을 설정할 수 있다는 것이 point!!
grammar = '\n'.join([
    'NP: {<DT>*<NNP>}', # DT: 한정사, NNP: 고유명사 => DT가 0번 이상 출현
    'NP: {<JJ>*<NN>}', # JJ: 형용사 , NN: 명사 => JJ가 0번 이상 출현
    'NP: {<NNP>+}',
])

sentences = nltk.sent_tokenize(text)

for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(words)
    chunkparser = nltk.RegexpParser(grammar)
    result = chunkparser.parse(tags)
    print(result)

(S
  (NP Ravi/NNP)
  is/VBZ
  (NP the/DT CEO/NNP)
  of/IN
  (NP a/DT Company/NNP)
  ./.)
(S
  He/PRP
  is/VBZ
  very/RB
  (NP powerful/JJ public/JJ speaker/NN)
  also/RB
  ./.)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Training Chunker

In [11]:
import nltk
nltk.download('treebank')
nltk.download('conll2000')
from nltk.corpus import conll2000
from nltk.corpus import treebank_chunk

def mySimpleChunker():
    grammar = 'NP: {<NNP>+}'
    return nltk.RegexpParser(grammar)

def test_nothing(data):
    cp = nltk.RegexpParser("")
    print(cp.evaluate(data))
    
def test_mysimplechunker(data):
    schunker = mySimpleChunker()
    print(schunker.evaluate(data))
    
datasets = [
    conll2000.chunked_sents('test.txt', chunk_types=['NP']),
    treebank_chunk.chunked_sents()
]

for dataset in datasets:
    test_nothing(dataset[:50])
    test_mysimplechunker(dataset[:50])

# 결과 해석
# F-measure가 높을 수록 좋다.

ChunkParse score:
    IOB Accuracy:  38.6%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%
ChunkParse score:
    IOB Accuracy:  48.2%%
    Precision:     71.1%%
    Recall:        17.2%%
    F-Measure:     27.7%%
ChunkParse score:
    IOB Accuracy:  45.0%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%
ChunkParse score:
    IOB Accuracy:  50.7%%
    Precision:     51.9%%
    Recall:         8.8%%
    F-Measure:     15.1%%


[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!


In [12]:
import nltk
nltk.download('conll2000')
from nltk.corpus import conll2000

sentence = 'Ravi is the CEO of a Company.'

def myParser():
    grammar = '\n'.join([
        'NP: {<DT>*<NNP>}',
        'NP: {<JJ>*<NN>}',
        'NP: {<NNP>+}',
    ])
    return nltk.RegexpParser(grammar)

def test_baseline():
    cp = nltk.RegexpParser("")
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    print(len(test_sents[0]))
    print(test_sents[0])
    print(cp.evaluate(test_sents))
    
def test_regexp():
    grammar = r"NP: {<[CDJNP].*>+}"
    cp = nltk.RegexpParser(grammar)
    test_sents = conll2000.chunked_sents('test.txt', chunk_types['NP'])
    print(cp.evaluate(test_sents))
    
    
    

SyntaxError: keyword can't be an expression (<ipython-input-12-e7df6124e530>, line 17)