In [10]:
import torch
import torch.nn as nn
import math

In [12]:
import os, urllib.request, pathlib

url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
data_dir = pathlib.Path("./data")
data_dir.mkdir(exist_ok=True)
dst = data_dir / "tiny_shakespeare.txt"

if not dst.exists():
    urllib.request.urlretrieve(url, dst.as_posix())

text = open(dst, "r", encoding="utf-8").read()
print(f"총 글자 수: {len(text):,}")
print(text[:500])


총 글자 수: 1,115,394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [20]:
from pydoc import TextRepr
import unicodedata
from collections import Counter

def normalize(s: str) -> str:
    return unicodedata.normalize("NFKC", s)

def pretokenizer(s: str):
    s = normalize(s)
    return s.split()

words = pretokenizer(text)
print(len(words))
print(words[:10])

202651
['First', 'Citizen:', 'Before', 'we', 'proceed', 'any', 'further,', 'hear', 'me', 'speak.']


In [None]:
def to_symbols(word: str):
    return list(word) + ["</w>"]

def build_initial_vocab(words):
    c = Counter(words)
    vocab = {" ".join(to_symbols(w)): f for w, f in c.items()}
    return vocab

vocab = build_initial_vocab(words)

print(len(vocab))
print(vocab)
print(list(vocab.items())[:3])

25670
{'F i r s t </w>': 235, 'C i t i z e n : </w>': 98, 'B e f o r e </w>': 31, 'w e </w>': 658, 'p r o c e e d </w>': 8, 'a n y </w>': 179, 'f u r t h e r , </w>': 4, 'h e a r </w>': 176, 'm e </w>': 1111, 's p e a k . </w>': 38, 'A l l : </w>': 19, 'S p e a k , </w>': 6, 'Y o u </w>': 319, 'a r e </w>': 670, 'a l l </w>': 676, 'r e s o l v e d </w>': 9, 'r a t h e r </w>': 58, 't o </w>': 3923, 'd i e </w>': 62, 't h a n </w>': 341, 'f a m i s h ? </w>': 1, 'R e s o l v e d . </w>': 1, 'r e s o l v e d . </w>': 1, 'F i r s t , </w>': 20, 'y o u </w>': 2130, 'k n o w </w>': 281, 'C a i u s </w>': 14, 'M a r c i u s </w>': 30, 'i s </w>': 1768, 'c h i e f </w>': 6, 'e n e m y </w>': 18, 't h e </w>': 5437, 'p e o p l e . </w>': 12, 'W e </w>': 182, "k n o w ' t , </w>": 1, "k n o w ' t . </w>": 3, 'L e t </w>': 125, 'u s </w>': 262, 'k i l l </w>': 43, 'h i m , </w>': 168, 'a n d </w>': 3678, "w e ' l l </w>": 46, 'h a v e </w>': 1280, 'c o r n </w>': 6, 'a t </w>': 492, 'o u r </w>'