In [1]:
# Toy training corpus: one sentence per line. The text starts and ends with a
# newline, which the sentence-splitting step removes with strip().
corpus = (
    "\n"
    "I love natural language processing\n"
    "I love deep learning\n"
    "I enjoy learning new things\n"
    "language models are fun\n"
    "deep learning is powerful\n"
)

In [2]:
# Split the corpus into one sentence per line. Each line is trimmed and blank
# lines are dropped, so stray empty or indented lines in the corpus text do not
# turn into empty or whitespace-padded "sentences".
sentences = [line.strip() for line in corpus.strip().split("\n") if line.strip()]
print(sentences)

['I love natural language processing', 'I love deep learning', 'I enjoy learning new things', 'language models are fun', 'deep learning is powerful']


In [3]:
import nltk
from nltk.util import ngrams
from collections import defaultdict, Counter

In [5]:
# nltk.word_tokenize needs the Punkt tokenizer data. Recent NLTK releases
# (3.9 and later) load it from the 'punkt_tab' package, older ones from 'punkt',
# so fetch both; a package that is unknown to the installed version is only
# reported in the log, it does not raise.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/parkjuyong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Word-level tokenization: lowercase every sentence, then split it into tokens.
tokenized_sentences = [
    nltk.word_tokenize(sentence.lower())
    for sentence in sentences
]
print(tokenized_sentences)

[['i', 'love', 'natural', 'language', 'processing'], ['i', 'love', 'deep', 'learning'], ['i', 'enjoy', 'learning', 'new', 'things'], ['language', 'models', 'are', 'fun'], ['deep', 'learning', 'is', 'powerful']]


In [9]:
# Build the bigram count table: bigram_counts[w1][w2] is the number of times
# w2 directly follows w1.
bigram_counts = defaultdict(Counter)

for tokens in tokenized_sentences:
    # Both ends are padded (with None), so sentence starts and ends are counted too.
    for w1, w2 in ngrams(tokens, 2, pad_left=True, pad_right=True):
        bigram_counts[w1][w2] += 1
        # Progress trace: the pair just seen and its running count.
        print(f"{w1} {w2} : {bigram_counts[w1][w2]}")

None i : 1
i love : 1
love natural : 1
natural language : 1
language processing : 1
processing None : 1
None i : 2
i love : 2
love deep : 1
deep learning : 1
learning None : 1
None i : 3
i enjoy : 1
enjoy learning : 1
learning new : 1
new things : 1
things None : 1
None language : 1
language models : 1
models are : 1
are fun : 1
fun None : 1
None deep : 1
deep learning : 2
learning is : 1
is powerful : 1
powerful None : 1


In [14]:
# Sanity check: print every context word together with its follower counts.
for context_word, follower_counts in bigram_counts.items():
    print(context_word, follower_counts)

None Counter({'i': 3, 'language': 1, 'deep': 1})
i Counter({'love': 2, 'enjoy': 1})
love Counter({'natural': 1, 'deep': 1})
natural Counter({'language': 1})
language Counter({'processing': 1, 'models': 1})
processing Counter({None: 1})
deep Counter({'learning': 2})
learning Counter({None: 1, 'new': 1, 'is': 1})
enjoy Counter({'learning': 1})
new Counter({'things': 1})
things Counter({None: 1})
models Counter({'are': 1})
are Counter({'fun': 1})
fun Counter({None: 1})
is Counter({'powerful': 1})
powerful Counter({None: 1})


In [18]:
# Turn counts into conditional probabilities:
# bigram_prob[w1][w2] = count(w1, w2) / total count of everything following w1.
bigram_prob = {}
for w1, counter in bigram_counts.items():
    total = sum(counter.values())  # computed once per context, not once per follower
    bigram_prob[w1] = {w2: count / total for w2, count in counter.items()}

In [20]:
# Show the next-word distribution for the context word "i".
print("Bigram 확률 : ", bigram_prob["i"])

Bigram 확률 :  {'love': 0.6666666666666666, 'enjoy': 0.3333333333333333}


In [21]:
# Bigram 모델 기반 문장 생성
import random

In [25]:
def generate_sentence_bigram(start_word="i", length=6, prob_table=None, rng=None):
    """Sample a sentence from a bigram model, one word at a time.

    Parameters
    ----------
    start_word : str
        First word of the sentence; it is always part of the result.
    length : int
        Maximum number of words in the result, counting ``start_word``.
    prob_table : dict, optional
        Mapping ``word -> {next_word: probability}``. When omitted, the
        notebook-level ``bigram_prob`` table is used.
    rng : random.Random, optional
        Source of randomness. Pass ``random.Random(seed)`` to get reproducible
        sentences; when omitted, the shared ``random`` module is used.

    Returns
    -------
    str
        The generated words joined by single spaces.
    """
    table = bigram_prob if prob_table is None else prob_table
    sampler = random if rng is None else rng

    word = start_word
    sentence = [word]
    for _ in range(length - 1):
        followers = table.get(word)
        if not followers:
            # Unknown word, or a word with no recorded followers: nothing to sample.
            break
        candidates = list(followers)
        weights = list(followers.values())
        word = sampler.choices(candidates, weights=weights)[0]
        if word is None:
            # None is the end-of-sentence marker in the table.
            break
        sentence.append(word)
    return " ".join(sentence)

In [31]:
# Sample one sentence from the default start word "i"; sampling is random, so the output can differ between runs.
print(generate_sentence_bigram())

i love natural language processing


In [33]:
# Sample one sentence that starts with the word "language".
print(generate_sentence_bigram("language"))

language models are fun


In [42]:
# Trigram model
# Build the trigram count table: trigram_counts[(w1, w2)][w3] is the number of
# times w3 directly follows the two-word context (w1, w2).
trigram_counts = defaultdict(Counter)

for tokens in tokenized_sentences:
    # Both ends are padded (with None), so sentence-start and sentence-end contexts are counted too.
    for w1, w2, w3 in ngrams(tokens, 3, pad_left=True, pad_right=True):
        trigram_counts[w1, w2][w3] += 1

In [43]:
# Sanity check: print every two-word context together with its follower counts.
for context_pair, follower_counts in trigram_counts.items():
    print(context_pair, follower_counts)

(None, None) Counter({'i': 3, 'language': 1, 'deep': 1})
(None, 'i') Counter({'love': 2, 'enjoy': 1})
('i', 'love') Counter({'natural': 1, 'deep': 1})
('love', 'natural') Counter({'language': 1})
('natural', 'language') Counter({'processing': 1})
('language', 'processing') Counter({None: 1})
('processing', None) Counter({None: 1})
('love', 'deep') Counter({'learning': 1})
('deep', 'learning') Counter({None: 1, 'is': 1})
('learning', None) Counter({None: 1})
('i', 'enjoy') Counter({'learning': 1})
('enjoy', 'learning') Counter({'new': 1})
('learning', 'new') Counter({'things': 1})
('new', 'things') Counter({None: 1})
('things', None) Counter({None: 1})
(None, 'language') Counter({'models': 1})
('language', 'models') Counter({'are': 1})
('models', 'are') Counter({'fun': 1})
('are', 'fun') Counter({None: 1})
('fun', None) Counter({None: 1})
(None, 'deep') Counter({'learning': 1})
('learning', 'is') Counter({'powerful': 1})
('is', 'powerful') Counter({None: 1})
('powerful', None) Counter({None: 1})

In [46]:
# Turn counts into conditional probabilities:
# trigram_prob[(w1, w2)][w3] = count(w1, w2, w3) / total count of everything following (w1, w2).
trigram_prob = {}
for (w1, w2), counter in trigram_counts.items():
    total = sum(counter.values())  # computed once per context, not once per follower
    trigram_prob[(w1, w2)] = {w3: count / total for w3, count in counter.items()}

print("Trigram 확률: ", trigram_prob[("i","love")])

Trigram 확률:  {'natural': 0.5, 'deep': 0.5}


In [47]:
# Next-word distribution after the two-word context ("deep", "learning").
# A None key in the result is the end-of-sentence padding symbol.
context = ("deep", "learning")
print("다음 단어 예측 확률: ", trigram_prob[context])

다음 단어 예측 확률:  {None: 0.5, 'is': 0.5}


In [48]:
# Sentence generation with the trigram model
def generate_sentence_trigram(start_words=("i", "love"), length=7, prob_table=None, rng=None):
    """Sample a sentence from a trigram model, one word at a time.

    Parameters
    ----------
    start_words : tuple of (str, str)
        The first two words of the sentence; both are always part of the result.
    length : int
        Maximum number of words in the result, counting the two start words.
    prob_table : dict, optional
        Mapping ``(w1, w2) -> {next_word: probability}``. When omitted, the
        notebook-level ``trigram_prob`` table is used.
    rng : random.Random, optional
        Source of randomness. Pass ``random.Random(seed)`` to get reproducible
        sentences; when omitted, the shared ``random`` module is used.

    Returns
    -------
    str
        The generated words joined by single spaces.
    """
    table = trigram_prob if prob_table is None else prob_table
    sampler = random if rng is None else rng

    w1, w2 = start_words
    sentence = [w1, w2]
    for _ in range(length - 2):
        followers = table.get((w1, w2))
        if not followers:
            # Unseen context, or a context with no recorded followers: stop here.
            break
        candidates = list(followers)
        weights = list(followers.values())
        w3 = sampler.choices(candidates, weights=weights)[0]
        if w3 is None:
            # None is the end-of-sentence marker in the table.
            break
        sentence.append(w3)
        # Slide the two-word context window forward by one word.
        w1, w2 = w2, w3
    return " ".join(sentence)

In [50]:
print(generate_sentence_trigram()) # generate a sentence starting with "i love"

i love natural language processing


In [52]:
print(generate_sentence_trigram(("natural", "language"))) # generate a sentence starting with "natural language"

natural language processing


In [54]:
# Look up a two-word context that occurs in the training data.
seen_context = ('learning', 'is')
print("처리 결과: ", trigram_prob.get(seen_context, "No data"))

처리 결과:  {'powerful': 1.0}


In [55]:
# Look up a two-word context that never occurs in the training data;
# .get() falls back to "No data" instead of raising KeyError.
unseen_context = ("processing", "is")
print("처리 결과: ", trigram_prob.get(unseen_context, "No data"))

처리 결과:  No data


In [56]:
# Try generating a sentence from a context made of infrequent words.
# NOTE(review): the original comment said "회귀" (regression); presumably "희귀" (rare) was meant — confirm.
print(generate_sentence_trigram(("language", "models")))

language models are fun
