# Exploring
 * POS Tagging: 토큰에 품사를 부착

In [5]:
import nltk
# Fetch the tokenizer and tagger resources that word_tokenize / pos_tag rely on.
for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# NOTE: 'catpital' is misspelled in the original sentence; it is kept as-is so
# the recorded output of this cell still matches.
simpleSentence = 'Seoul is the catpital of Korea.'

# Split the sentence into tokens, then attach a part-of-speech tag to each one.
wordsInSentence = nltk.word_tokenize(simpleSentence)
partsOfSpeechTags = nltk.pos_tag(wordsInSentence)

print(wordsInSentence)
print(partsOfSpeechTags)

['Seoul', 'is', 'the', 'catpital', 'of', 'Korea', '.']
[('Seoul', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('catpital', 'NN'), ('of', 'IN'), ('Korea', 'NNP'), ('.', '.')]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\student\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Own Tagger
 * 자신의 품사 태거를 작성

In [9]:
import nltk
def learnDefaultTagger(simpleSentence):
    """Tag every token of ``simpleSentence`` as 'NN' and print the result.

    The sentence is split with ``nltk.word_tokenize`` and the tokens are
    labelled by an ``nltk.DefaultTagger('NN')``. The tagged list is printed;
    nothing is returned.
    """
    tokens = nltk.word_tokenize(simpleSentence)
    print(nltk.DefaultTagger('NN').tag(tokens))
    
def learnRETagger(simpleSentence):
    """Tag tokens with hand-written regular-expression rules and print them.

    Each entry of ``customPatterns`` is a ``(regex, tag)`` pair handed to
    ``nltk.RegexpTagger``. The sentence is tokenized with
    ``nltk.word_tokenize``, tagged, and the list of (token, tag) pairs is
    printed. Nothing is returned.

    The NUMBER rule accepts an optional fractional part, so plain integers
    such as "42" are recognised as well as decimals such as "-1.0".
    """
    customPatterns = [
        (r'.*ing$', 'ADJECTIVE'),                # running
        (r'.*ly$', 'ADVERB'),                    # willingly
        (r'.*ion$', 'NOUN'),                     # intimation
        (r'(.*ate|.*en|is)$', 'VERB'),           # terminate, darken, lighten, is
        (r'^an$', 'INDEFINITE-ARTICLE'),         # an
        (r'^(with|on|at)$', 'PREPOSITION'),      # on
        (r'^\-?[0-9]+(\.[0-9]+)?$', 'NUMBER'),   # 42, -1.0, 12345.123
        (r'.*$', None),                          # anything else stays untagged
    ]
    tagger = nltk.RegexpTagger(customPatterns)  # tagger driven by the rules above
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)
    
def learnLookupTagger(simpleSentence):
    """Tag tokens by looking them up in a fixed word-to-tag table and print them.

    The table is passed to ``nltk.UnigramTagger`` as its ``model``. The
    sentence is tokenized with ``nltk.word_tokenize``, tagged, and the list of
    (token, tag) pairs is printed. Nothing is returned.
    """
    knownTags = {
        '.': '.',
        'place': 'NN',
        'on': 'IN',
        'earth': 'NN',
        'Reykjavik': 'NNP',
        'is': 'VBZ',
        'an': 'DT',
        'amazing': 'JJ',
    }
    lookupTagger = nltk.UnigramTagger(model=knownTags)
    tokens = nltk.word_tokenize(simpleSentence)
    print(lookupTagger.tag(tokens))
     
    
    
if __name__ == '__main__':  # entry point when the cell runs as the top-level script
    # Sentence fed to every tagger demo below.
    testSentence = "Reykjavik is an amazing place on earth. I have visited Reykjavik"
    # Run the demos in order: default 'NN' tagger, regex tagger, lookup tagger.
    for runDemo in (learnDefaultTagger, learnRETagger, learnLookupTagger):
        runDemo(testSentence)

[('Reykjavik', 'NN'), ('is', 'NN'), ('an', 'NN'), ('amazing', 'NN'), ('place', 'NN'), ('on', 'NN'), ('earth', 'NN'), ('.', 'NN'), ('I', 'NN'), ('have', 'NN'), ('visited', 'NN'), ('Reykjavik', 'NN')]
[('Reykjavik', None), ('is', 'VERB'), ('an', 'INDEFINITE-ARTICLE'), ('amazing', 'ADJECTIVE'), ('place', None), ('on', 'PREPOSITION'), ('earth', None), ('.', None), ('I', None), ('have', None), ('visited', None), ('Reykjavik', None)]
[('Reykjavik', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('amazing', 'JJ'), ('place', 'NN'), ('on', 'IN'), ('earth', 'NN'), ('.', '.'), ('I', None), ('have', None), ('visited', None), ('Reykjavik', 'NNP')]


In [10]:
__name__ # evaluates to '__main__' here, which is why the guarded block in the previous cell ran

'__main__'

# Train 3 
 * 자체 태거를 학습시켜 모델로 저장

In [20]:
import nltk
import pickle

def sampleData():
    """Return a fresh list with the four example sentences used for training."""
    sentences = [
        "Bangalore is the capital of Karnataka.",
        "Steve Jobs was the CEO of Apple.",
        "iPhone was Invented by Apple.",
        "Books can be purchased in Market.",
    ]
    return sentences

def buildDictionary():
    """Build a token-to-tag dictionary from the sample sentences.

    Every sentence from ``sampleData()`` is tokenized with
    ``nltk.word_tokenize`` and tagged with ``nltk.pos_tag``. Each
    (token, tag) pair is stored as ``dictionary[token] = tag``, so when a
    token occurs more than once the tag from its last occurrence is kept.

    Returns:
        dict mapping each token to its part-of-speech tag.
    """
    return {
        token: pos
        for sent in sampleData()
        for token, pos in nltk.pos_tag(nltk.word_tokenize(sent))
    }

def saveMyTagger(tagger, fileName):
    """Pickle ``tagger`` into the file ``fileName``.

    The file is opened in binary write mode inside a ``with`` block, so the
    handle is closed even when ``pickle.dump`` raises (for example on an
    object that cannot be pickled).

    Args:
        tagger: the object to serialize.
        fileName: path of the file to create or overwrite.
    """
    with open(fileName, "wb") as fileHandle:
        pickle.dump(tagger, fileHandle)
    
def saveMyTraining(fileName):
    """Build a lookup tagger from the sample dictionary and pickle it.

    The dictionary returned by ``buildDictionary()`` becomes the ``model`` of
    an ``nltk.UnigramTagger``, which is then written to ``fileName`` through
    ``saveMyTagger``.
    """
    lookupModel = buildDictionary()
    trainedTagger = nltk.UnigramTagger(model=lookupModel)
    saveMyTagger(trainedTagger, fileName)
    
def loadMyTagger(fileName):
    """Load and return the object pickled in ``fileName``.

    The file is opened in binary read mode inside a ``with`` block, so the
    handle is closed as soon as unpickling finishes or fails instead of being
    left open until it is garbage-collected.

    NOTE: ``pickle.load`` can execute arbitrary code while unpickling; only
    load files you created yourself or otherwise trust.
    """
    with open(fileName, "rb") as fileHandle:
        return pickle.load(fileHandle)

fileName = "myTagger.pickle"
sentence = 'Iphone is purchased by Steve Jobs in Bangalore Market'

# Train and persist the tagger, then reload it from disk before using it.
saveMyTraining(fileName)
myTagger = loadMyTagger(fileName)

# 'Iphone' differs in case from the 'iPhone' token in the sample data, so the
# recorded output shows it tagged as None.
taggedSentence = myTagger.tag(nltk.word_tokenize(sentence))
print(taggedSentence)

[('Iphone', None), ('is', 'VBZ'), ('purchased', 'VBN'), ('by', 'IN'), ('Steve', 'NNP'), ('Jobs', 'NNP'), ('in', 'IN'), ('Bangalore', 'NNP'), ('Market', 'NNP')]


## pickle 예제

In [17]:
import pickle
# Serialize a small list to disk; pickle writes bytes, hence the 'wb' mode.
ls = ['a', 'b', 'c']
with open('list.txt','wb') as f:
    pickle.dump(ls,f)

In [18]:
# Read the pickled list back; the file must be opened in binary ('rb') mode.
with open('list.txt','rb') as f:
    data = pickle.load(f)
    print(data)
# The object comes back exactly as it was stored.

['a', 'b', 'c']


# Grammar

In [32]:
import nltk
import string
from nltk.parse.generate import generate

# Seed rules: a WORD is a single space, or a digit/letter pair in either order.
productions = [
    "ROOT -> WORD",
    "WORD -> ' '",
    "WORD -> NUMBER LETTER",
    "WORD -> LETTER NUMBER",
]

digits = list(string.digits)  # the ten decimal digits as one-character strings
print(digits)
# Use a distinct loop variable so the `digits` list is not rebound to a single
# character while iterating; only the first four digits become NUMBER rules.
for digit in digits[:4]:
    productions.append("NUMBER -> '{w}'".format(w=digit))

# Joining with "' | '" and wrapping in quotes yields one alternation rule:
# LETTER -> 'a' | 'b' | 'c' | 'd'
letters = "' | '".join(list(string.ascii_lowercase)[:4])
productions.append("LETTER -> '{w}'".format(w=letters))

grammarString = "\n".join(productions)

grammar = nltk.CFG.fromstring(grammarString)

print(grammar)

# Print the first five generated words; the lone-space WORD collapses to an
# empty string once spaces are stripped.
for sentence in generate(grammar, n=5, depth=5):
    word = "".join(sentence).replace(" ", "")
    print("생성된 단어: {}, 크기: {}".format(word, len(word)))

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Grammar with 12 productions (start state = ROOT)
    ROOT -> WORD
    WORD -> ' '
    WORD -> NUMBER LETTER
    WORD -> LETTER NUMBER
    NUMBER -> '0'
    NUMBER -> '1'
    NUMBER -> '2'
    NUMBER -> '3'
    LETTER -> 'a'
    LETTER -> 'b'
    LETTER -> 'c'
    LETTER -> 'd'
생성된 단어: , 크기: 0
생성된 단어: 0a, 크기: 2
생성된 단어: 0b, 크기: 2
생성된 단어: 0c, 크기: 2
생성된 단어: 0d, 크기: 2


# PCFG

In [37]:
import nltk
from nltk.parse.generate import generate

# Probabilistic rules, one per line. The bracketed number is the rule's
# probability; the values for each left-hand side add up to 1.0.
productions = """\
ROOT -> WORD [1.0]
WORD -> P1 [0.25]
WORD -> P1 P2 [0.25]
WORD -> P1 P2 P3 [0.25]
WORD -> P1 P2 P3 P4 [0.25]
P1 -> 'A' [1.0]
P2 -> 'B' [0.5]
P2 -> 'C' [0.5]
P3 -> 'D' [0.3]
P3 -> 'E' [0.3]
P3 -> 'F' [0.4]
P4 -> 'G' [0.9]
P4 -> 'H' [0.1]
""".splitlines()

grammarString = "\n".join(productions)
grammar = nltk.PCFG.fromstring(grammarString)
print(grammar)

# Print the first ten strings produced from the grammar with their lengths.
# NOTE(review): the recorded output looks enumerated in rule order rather than
# sampled by probability - confirm against the nltk.parse.generate docs.
for sentence in generate(grammar, n=10, depth=5):
    generated = "".join(sentence).replace(" ", "")
    print("문자열: {}, 크기: {}".format(generated, len(generated)))

Grammar with 13 productions (start state = ROOT)
    ROOT -> WORD [1.0]
    WORD -> P1 [0.25]
    WORD -> P1 P2 [0.25]
    WORD -> P1 P2 P3 [0.25]
    WORD -> P1 P2 P3 P4 [0.25]
    P1 -> 'A' [1.0]
    P2 -> 'B' [0.5]
    P2 -> 'C' [0.5]
    P3 -> 'D' [0.3]
    P3 -> 'E' [0.3]
    P3 -> 'F' [0.4]
    P4 -> 'G' [0.9]
    P4 -> 'H' [0.1]
문자열: A, 크기: 1
문자열: AB, 크기: 2
문자열: AC, 크기: 2
문자열: ABD, 크기: 3
문자열: ABE, 크기: 3
문자열: ABF, 크기: 3
문자열: ACD, 크기: 3
문자열: ACE, 크기: 3
문자열: ACF, 크기: 3
문자열: ABDG, 크기: 4


# Recursive CFG

In [45]:
import nltk
import string
from nltk.parse.generate import generate

# Base rules: ROOT expands to WORD, and WORD may be a lone space, which is
# stripped from the generated text below.
productions = ["ROOT -> WORD", "WORD -> ' '"]

# One recursive rule per decimal digit wraps a WORD in a matching pair of that
# digit, so every generated string reads the same in both directions.
alphabets = list(string.digits)
productions.extend(
    "WORD -> '{w}' WORD '{w}'".format(w=alphabet) for alphabet in alphabets
)

grammarString = "\n".join(productions)  # one production per line
grammar = nltk.CFG.fromstring(grammarString)
print(grammar)

# Print the first ten palindromes produced with a derivation depth limit of 5.
for sentence in generate(grammar, n=10, depth=5):
    palindrome = "".join(sentence).replace(" ", "")
    print("Palindrome : {}, Size : {}".format(palindrome, len(palindrome)))

Grammar with 12 productions (start state = ROOT)
    ROOT -> WORD
    WORD -> ' '
    WORD -> '0' WORD '0'
    WORD -> '1' WORD '1'
    WORD -> '2' WORD '2'
    WORD -> '3' WORD '3'
    WORD -> '4' WORD '4'
    WORD -> '5' WORD '5'
    WORD -> '6' WORD '6'
    WORD -> '7' WORD '7'
    WORD -> '8' WORD '8'
    WORD -> '9' WORD '9'
Palindrome : , Size : 0
Palindrome : 00, Size : 2
Palindrome : 0000, Size : 4
Palindrome : 0110, Size : 4
Palindrome : 0220, Size : 4
Palindrome : 0330, Size : 4
Palindrome : 0440, Size : 4
Palindrome : 0550, Size : 4
Palindrome : 0660, Size : 4
Palindrome : 0770, Size : 4
