In [None]:
"""
1. 문장 토큰화 (문장으로 분리)
2. 단어 토큰화 (문장을 단어로 분리)
3. 품사를 붙여줌
4. 청킹 (토큰들의 단어 덩어리를 추출)
5. IOB태킹 (복습필요)
6. 개체명 인식 
"""

# 7장 정보추출과 텍스트 분류
 * 개체명 인식(NER, Named Entity Recognition): 텍스트 내 이름, 장소, 제품등을 식별하는 것
 * 텍스트 내에서 개체명(고유명사)를 찾는 것
 * 처리과정: 토큰화 -> 품사구별 -> 청크(구) 추출

## NER: 내장 개체명 인식 기능 사용
**NER 절차**
 * 문장 토큰화: 문서를 문장으로 분리
 * 단어 토큰화: 문장을 단어로 분리
 * 품사 태깅: 토큰에 품사 부착
 * 청킹: 토큰들의 단어 덩어리(구, 청킹)을 추출
 * IOB태킹: 토큰들에 IOB 태그 부착
 * NER: 개체 인식
 
 *) NER은 검색엔진의 주요 기술

In [3]:
# 먼저 태깅이 되어있는지 확인
nltk.corpus.treebank.tagged_sents()[0]

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [2]:
import nltk
nltk.download('words')
nltk.download('treebank')
nltk.download('maxent_ne_chunker')

def sampleNE():
    sent = nltk.corpus.treebank.tagged_sents()[0]
    print(nltk.ne_chunk(sent))
    
def sampleNE2(): # 개체 인식을 하였는지, 안하였는지에 대한 여부를 binary 형태로 검증
    sent = nltk.corpus.treebank.tagged_sents()[0]
    print(nltk.ne_chunk(sent, binary=True)) # binary classification
    
sampleNE()
sampleNE2()

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


(S
  (PERSON Pierre/NNP)
  (ORGANIZATION Vinken/NNP)
  ,/,
  61/CD
  years/NNS
  old/JJ
  ,/,
  will/MD
  join/VB
  the/DT
  board/NN
  as/IN
  a/DT
  nonexecutive/JJ
  director/NN
  Nov./NNP
  29/CD
  ./.)
(S
  (NE Pierre/NNP Vinken/NNP)
  ,/,
  61/CD
  years/NNS
  old/JJ
  ,/,
  will/MD
  join/VB
  the/DT
  board/NN
  as/IN
  a/DT
  nonexecutive/JJ
  director/NN
  Nov./NNP
  29/CD
  ./.)


## Dictionary: 딕셔너리 생성, 반전, 사용
 * dictionary[단어] = 품사
 * dictionary[품사] = 단어 목록
 
 *) Class의 기본 개념을 이해 필요

In [14]:
import nltk
nltk.download('averaged_perceptron_tagger')

class LearningDictionary():
    def __init__(self, sentence):
        self.words = nltk.word_tokenize(sentence)
        self.tagged = nltk.pos_tag(self.words)
        self.buildDictionary()
        self.buildReverseDictionary()
        
    def buildDictionary(self): # dictionary[단어] = 품사
        self.dictionary = {}
        for (word, pos) in self.tagged:
            self.dictionary[word] = pos
            
    def buildReverseDictionary(self): # dictionary[품사] = 단어 목록
        self.rdictionary = {}
        for key in self.dictionary.keys():
            value = self.dictionary[key]
            if value not in self.rdictionary:
                self.rdictionary[value] = [key]
            else:
                self.rdictionary[value].append(key)
                
    def isWordPresent(self, word): # dictionary에 단어 포함 여부 검사
        return 'Yes' if word in self.dictionary else 'No'
    
    def getPOSForWord(self, word): # 단어의 품사 찾기
        return self.dictionary[word] if word in self.dictionary else None
    
    def getWordsForPOS(self, pos): # 품사에 해당하는 단어 목록 찾기
        return self.rdictionary[pos] if pos in self.rdictionary else None
    
sentence = "All the flights got delayed due to bad weather"
learning = LearningDictionary(sentence)
words = ['chair','flights','delayed','pencil','weather']
pos = ['NN','VBS','NNS']

for word in words:
    status = learning.isWordPresent(word)
    print("Is '{}' present in dictionary ? : '{}'".format(word, status))
    if status is 'Yes':
        print("\tPOS For '{}' is '{}'".format(word, learning.getPOSForWord(word)))
    
for pword in pos:
    print("POS '{}' has '{}' words".format(pword, learning.getWordsForPOS(pword)))

Is 'chair' present in dictionary ? : 'No'
Is 'flights' present in dictionary ? : 'Yes'
	POS For 'flights' is 'NNS'
Is 'delayed' present in dictionary ? : 'Yes'
	POS For 'delayed' is 'VBN'
Is 'pencil' present in dictionary ? : 'No'
Is 'weather' present in dictionary ? : 'Yes'
	POS For 'weather' is 'NN'
POS 'NN' has '['weather']' words
POS 'VBS' has 'None' words
POS 'NNS' has '['flights']' words


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Features: 피처 셋 선택
 * feature는 dictionary로 표현

In [18]:
import nltk
import random

sampledata = [
    ('KA-01-F 1034 A', 'rtc'),
    ('KA-02-F 1030 B', 'rtc'),
    ('KA-03-FA 1200 C', 'rtc'),
    ('KA-01-G 0001 A', 'gov'),
    ('KA-02-G 1004 A', 'gov'),
    ('KA-03-G 0204 A', 'gov'),
    ('KA-04-G 9230 A', 'gov'),
    ('KA-27 1290', 'oth')
]
    
random.shuffle(sampledata)
testdata = [
    'KA-01-G 0109',
    'KA-02-F 9020 AC',
    'KA-02-FA 0801',
    'KA-01 9129'
]

def learnSimpleFeatures(): # class를 feature로 추출
    def vehicleNumberFeature(vnumber):
        return {'vehicle_class': vnumber[6]}
    featuresets = [(vehicleNumberFeature(vn), cls) for (vn, cls) in sampledata]
    print("learnSimpleFeatures")
    print(featuresets)
    classifier = nltk.NaiveBayesClassifier.train(featuresets)
    for num in testdata:
        feature = vehicleNumberFeature(num)
        print("(simple) %s is of type %s" % (num, classifier.classify(feature)))
        
def learnFeatures():
    def vehicleNumberFeature(vnumber):
        return {
            'vehicle_class': vnumber[6],
            'vehicle_prev': vnumber[5]
        }
    featuresets = [(vehicleNumberFeature(vn), cls) for (vn, cls) in sampledata]
    print("learnFeatures")
    print(featuresets)
    classifier = nltk.NaiveBayesClassifier.train(featuresets)
    for num in testdata:
        feature = vehicleNumberFeature(num)
        print("(dual) %s is of type %s" % (num, classifier.classify(feature)))
        
learnSimpleFeatures()
learnFeatures()

learnSimpleFeatures
[({'vehicle_class': 'F'}, 'rtc'), ({'vehicle_class': 'G'}, 'gov'), ({'vehicle_class': 'G'}, 'gov'), ({'vehicle_class': 'G'}, 'gov'), ({'vehicle_class': 'F'}, 'rtc'), ({'vehicle_class': '1'}, 'oth'), ({'vehicle_class': 'F'}, 'rtc'), ({'vehicle_class': 'G'}, 'gov')]
(simple) KA-01-G 0109 is of type gov
(simple) KA-02-F 9020 AC is of type rtc
(simple) KA-02-FA 0801 is of type rtc
(simple) KA-01 9129 is of type gov
learnFeatures
[({'vehicle_class': 'F', 'vehicle_prev': '-'}, 'rtc'), ({'vehicle_class': 'G', 'vehicle_prev': '-'}, 'gov'), ({'vehicle_class': 'G', 'vehicle_prev': '-'}, 'gov'), ({'vehicle_class': 'G', 'vehicle_prev': '-'}, 'gov'), ({'vehicle_class': 'F', 'vehicle_prev': '-'}, 'rtc'), ({'vehicle_class': '1', 'vehicle_prev': ' '}, 'oth'), ({'vehicle_class': 'F', 'vehicle_prev': '-'}, 'rtc'), ({'vehicle_class': 'G', 'vehicle_prev': '-'}, 'gov')]
(dual) KA-01-G 0109 is of type gov
(dual) KA-02-F 9020 AC is of type rtc
(dual) KA-02-FA 0801 is of type rtc
(dual) KA

## Segmentation: 분류기를 사용한 문장 분할
 * "구두점(.) + 대문자" 패턴에 일치하는 경우를 문장의 구분이라고 정의하여 문장을 분할

In [24]:
import nltk
nltk.download('punkt')
def featureExtractor(words, i): # 단어, 다음 단어 첫글자 대문자 여부 검증
    return({'current-word': words[i], 'next-is-upper': words[i+1][0].isupper()},
           words[i+1][0].isupper())
            
def getFeaturesets(sentence):
    words = nltk.word_tokenize(sentence)
    featuresets = [featureExtractor(words, i) for i in range(1, len(words)-1) if words[i] == '.']
    print(featuresets)
    return featuresets

def segmentTextAndPrintSentences(data):
    words = nltk.word_tokenize(data)
    for i in range(0, len(words)-1):
        if words[i] == '.':
            if classifier.classify(featureExtractor(words, i)[0]) == True:
                print(".")
            else:
                print(words[i], end='')
    print(words[-1])
    
traindata = "India, officially the Republic of India (Bhārat Gaṇarājya),[e] is a country in South Asia. it is the seventh-largest country by area, the second-most populous country (with over 1.2 billion people), and the most populous democracy in the world. It is bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast. It shares land borders with Pakistan to the west;[f] China, Nepal, and Bhutan to the northeast; and Myanmar (Burma) and Bangladesh to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives. India's Andaman and Nicobar Islands share a maritime border with Thailand and Indonesia."
testdata = "The Indian subcontinent was home to the urban Indus Valley Civilisation of the 3rd millennium BCE. In the following millennium, the oldest scriptures associated with Hinduism began to be composed. Social stratification, based on caste, emerged in the first millennium BCE, and Buddhism and Jainism arose. Early political consolidations took place under the Maurya and Gupta empires; the later peninsular Middle Kingdoms influenced cultures as far as southeast Asia. In the medieval era, Judaism, Zoroastrianism, Christianity, and Islam arrived, and Sikhism emerged, all adding to the region's diverse culture. Much of the north fell to the Delhi sultanate; the south was united under the Vijayanagara Empire. The economy expanded in the 17th century in the Mughal Empire. In the mid-18th century, the subcontinent came under British East India Company rule, and in the mid-19th under British crown rule. A nationalist movement emerged in the late 19th century, which later, under Mahatma Gandhi, was noted for nonviolent resistance and led to India's independence in 1947."

traindataset = getFeaturesets(traindata)
classifier = nltk.NaiveBayesClassifier.train(traindataset)
segmentTextAndPrintSentences(testdata)

[({'current-word': '.', 'next-is-upper': False}, False), ({'current-word': '.', 'next-is-upper': True}, True), ({'current-word': '.', 'next-is-upper': True}, True), ({'current-word': '.', 'next-is-upper': True}, True), ({'current-word': '.', 'next-is-upper': True}, True)]
.
.
.
.
.
.
.
.
.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Document Classify : 문서 분류

In [25]:
!pip install feedparser



In [31]:
import nltk
nltk.download('stopwords')
import random
import feedparser

urls = {
    'mlb': 'https://sports.yahoo.com/mlb/rss.xml',
    'nfl': 'https://sports.yahoo.com/nfl/rss.xml',
}

feedmap = {}
stopwords = nltk.corpus.stopwords.words('english')

def featureExtractor(words):
    features = {}
    for word in words:
        if word not in stopwords:
            features["word({})".format(word)] = True
    return features

sentences = []

for category in urls.keys():
    feedmap[category] = feedparser.parse(urls[category])
    print('downloading {}'.format(urls[category]))
    for entry in feedmap[category]['entries']:
        data = entry['summary']
        words = data.split()
        sentences.append((category, words))
        
featuresets = [(featureExtractor(words), category) for category, words in sentences]
print("featuresets")
print(featuresets)
print("-"*50)

random.shuffle(featuresets)
total = len(featuresets)
off = int(total/2)
trainset = featuresets[off:]
testset = featuresets[:off]

classifier = nltk.NaiveBayesClassifier.train(trainset)
print(nltk.classify.accuracy(classifier, testset))

classifier.show_most_informative_features(5) # 가장 극성이 높은 것들을 보여준다. (n은 최대 10개?)
for (i, entry) in enumerate(feedmap['nfl']['entries']):
    if i < 4:
        features = featureExtractor(entry['title'].split())
        category = classifier.classify(features)
        print('{} -> {}'.format(category, entry['summary']))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


downloading https://sports.yahoo.com/mlb/rss.xml
downloading https://sports.yahoo.com/nfl/rss.xml
featuresets
[({'word(As)': True, 'word(deadline)': True, 'word(nears,)': True, 'word(time)': True, 'word(took)': True, 'word(look)': True, 'word(power)': True, 'word(players)': True, 'word(league,)': True, 'word(also)': True, 'word(identify)': True, 'word(piece)': True, 'word(–)': True, 'word(pieces)': True, 'word(put)': True, 'word(top.)': True}, 'mlb'), ({'word(Robinson)': True, 'word(Cano)': True, 'word(hit)': True, 'word(three)': True, 'word(home)': True, 'word(runs,)': True, 'word(breaking)': True, 'word(loose)': True, 'word(season-long)': True, 'word(slump)': True, 'word(huge)': True, 'word(way)': True, 'word(leading)': True, 'word(Jason)': True, 'word(Vargas)': True, 'word(New)': True, 'word(York)': True, 'word(Mets)': True, 'word(San)': True, 'word(Diego)': True, 'word(Padres)': True, 'word(5-2)': True, 'word(Tuesday)': True, 'word(night.)': True, 'word(The)': True, 'word(36-year-o

In [32]:
classifier.show_most_informative_features?

## Context Tagger: 문맥 기반 품사 태거
 * 문맥을 고려하여 품사를 태깅
 * 현단어 마지막 3글자 + 이전 단어를 기반으로 품사 태깅

In [38]:
import nltk
sentences = [
    "What is your address when you're in Bangalore?", #주소
    "the president's address on the state of the economy." , #연설
    "He addressed his remarks to the lawyers in the audience.", #연설하다
    "In order to address an assembly, we should be ready", #연설하다
    "He laughed inwardly at the scene.", #웃다
    "After all the advance publicity, the prizefight turned out to be a laugh.", #웃음
    "We can learn to laugh a little at even our most serious foibles." #웃다
]

def getSentenceWords():
    sentwords = []
    for sentence in sentences:
        words = nltk.pos_tag(nltk.word_tokenize(sentence))
        sentwords.append(words)
    #print("getSentenceWords")
    #print(sentwords)
    return sentwords

def noContextTagger():  # context를 고려하지 않는 tagger
    tagger = nltk.UnigramTagger(getSentenceWords())
    print("noContextTagger")
    print(tagger.tag('the little remarks towards assembly are laughable'.split()))

def withContextTagger():
    def wordFeatures(words, wordPosInSentence):
        # 모든 ing 형태를 추출
        endFeatures = {
            'last(1)': words[wordPosInSentence][-1],
            'last(2)': words[wordPosInSentence][-2:],
            'last(3)': words[wordPosInSentence][-3:],
        }
        # 이전 단어를 사용해 현재 단어가 동사인지, 명사인지 확인
        if wordPosInSentence > 1:
            endFeatures['prev'] = words[wordPosInSentence - 1]
        else:
            endFeatures['prev'] = '|NONE|'
        return endFeatures
    
    allsentences = getSentenceWords()
    print('allsentences')
    print(allsentences)
    featureddata = []
    for sentence in allsentences:
        untaggedSentence = nltk.tag.untag(sentence)
        featuredsentence = [(wordFeatures(untaggedSentence, index), tag) for index, (word, tag) in enumerate(sentence)]
        featureddata.extend(featuredsentence)
    print('featureddata')
    print(featureddata)
    breakup = int(len(featureddata) * 0.5)
    traindata = featureddata[breakup:]
    testdata = featureddata[:breakup]
    classifier = nltk.NaiveBayesClassifier.train(traindata)
    print("분류기 정확도: {}".format(nltk.classify.accuracy(classifier, testdata)))
    
noContextTagger()
withContextTagger()

noContextTagger
[('the', 'DT'), ('little', 'JJ'), ('remarks', 'NNS'), ('towards', None), ('assembly', 'NN'), ('are', None), ('laughable', None)]
allsentences
[[('What', 'WP'), ('is', 'VBZ'), ('your', 'PRP$'), ('address', 'NN'), ('when', 'WRB'), ('you', 'PRP'), ("'re", 'VBP'), ('in', 'IN'), ('Bangalore', 'NNP'), ('?', '.')], [('the', 'DT'), ('president', 'NN'), ("'s", 'POS'), ('address', 'NN'), ('on', 'IN'), ('the', 'DT'), ('state', 'NN'), ('of', 'IN'), ('the', 'DT'), ('economy', 'NN'), ('.', '.')], [('He', 'PRP'), ('addressed', 'VBD'), ('his', 'PRP$'), ('remarks', 'NNS'), ('to', 'TO'), ('the', 'DT'), ('lawyers', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('audience', 'NN'), ('.', '.')], [('In', 'IN'), ('order', 'NN'), ('to', 'TO'), ('address', 'VB'), ('an', 'DT'), ('assembly', 'NN'), (',', ','), ('we', 'PRP'), ('should', 'MD'), ('be', 'VB'), ('ready', 'JJ')], [('He', 'PRP'), ('laughed', 'VBD'), ('inwardly', 'RB'), ('at', 'IN'), ('the', 'DT'), ('scene', 'NN'), ('.', '.')], [('After', 'IN'), (