# 07-4. N-GRAM

In [1]:
# Basic building block of an N-gram language model

def ngram(text, n):
  """Return an iterator over every run of `n` consecutive items in `text`.

  `text` may be any sliceable sequence: a string yields character n-grams,
  a list of tokens yields token n-grams.  Each n-gram is a tuple of length
  `n`.  When `text` has fewer than `n` items, or `n` is not positive, the
  iterator is empty.

  The result is a lazy `zip` object; wrap it in `list(...)` to materialize.
  """
  # The k-th shifted view starts k items into the sequence.  Zipping the
  # views together lines up item i, i+1, ..., i+n-1; zip stops as soon as
  # the shortest (most shifted) view runs out.
  shifted_views = (text[offset:] for offset in range(n))
  return zip(*shifted_views)

In [2]:
# Pass a raw string: the n-grams are built character by character
print(list(ngram("오늘 날씨는 비", 3)))

[('오', '늘', ' '), ('늘', ' ', '날'), (' ', '날', '씨'), ('날', '씨', '는'), ('씨', '는', ' '), ('는', ' ', '비')]


In [4]:
# Split on whitespace first so the n-grams are built word by word
sentence_list = "It is raining today".split()
print(sentence_list)
print(list(ngram(sentence_list, 3)))

['It', 'is', 'raining', 'today']
[('It', 'is', 'raining'), ('is', 'raining', 'today')]


## 한국어 처리

In [5]:
# konlpy를 설치

!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0


In [7]:
from konlpy.tag import Okt

# Create the Okt morphological analyzer once and reuse it
okt = Okt()

# Tokenize the input text into morphemes
sentence_list = okt.morphs("오늘 날씨는 비")

print(sentence_list)
# Join each morpheme trigram back into a single string for display
print(["".join(li) for li in ngram(sentence_list, 3)])

['오늘', '날씨', '는', '비']
['오늘날씨는', '날씨는비']


In [8]:
from collections import defaultdict

def train(text, n):
  """Count, for every (n-1)-item context in `text`, which item follows it.

  Args:
    text: sliceable sequence of tokens (or a string) passed on to `ngram`.
    n: n-gram order; the first n-1 items of each n-gram form the context
      and the last item is the continuation being counted.

  Returns:
    A nested defaultdict: `model[context_tuple][next_item] -> count`.
  """
  counts = defaultdict(lambda: defaultdict(int))
  for *context, target in ngram(text, n):
    # Star-unpacking yields a list; the key must be a hashable tuple.
    counts[tuple(context)][target] += 1
  return counts

# Build a trigram model from the tokenized sentence and display it
train(text=sentence_list, n=3)

defaultdict(<function __main__.train.<locals>.<lambda>()>,
            {('오늘', '날씨'): defaultdict(int, {'는': 1}),
             ('날씨', '는'): defaultdict(int, {'비': 1})})

In [10]:
def predict(model, n, prefix):
  """List candidate next items for `prefix`, most frequent first.

  Every window of n-1 consecutive items in `prefix` is used as a context,
  and the continuations recorded for each context are collected.  Counts
  are not merged across contexts, so the same item can appear more than
  once in the result.

  Args:
    model: mapping `context_tuple -> {next_item: count}`, as built by `train`.
    n: n-gram order the model was trained with.
    prefix: sliceable sequence of already-seen tokens.

  Returns:
    A list of `(next_item, count)` tuples sorted by count, descending.
    Ties keep the order in which they were encountered.

  NOTE(review): with n == 1 the context window has size 0, for which
  `ngram` yields nothing, so the result is always empty — confirm whether
  unigram prediction is meant to be supported.
  """
  next_words = []

  for context in ngram(prefix, n-1):
    # Use .get() rather than model[context]: indexing a defaultdict inserts
    # an empty entry for every unseen context, which would silently mutate
    # the trained model on each prediction.
    next_words.extend(model.get(context, {}).items())

  return sorted(next_words, key=lambda x: x[1], reverse=True)

# Train a trigram model, then ask for continuations of the sentence
# with its final token held out
model = train(text=sentence_list, n=3)
predict(model=model, n=3, prefix=sentence_list[:-1])

[('는', 1), ('비', 1)]