In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# Seed PyTorch's RNG so batch sampling, weight initialisation and text
# generation repeat across "Restart Kernel -> Run All" (GPU kernels may still
# introduce small nondeterminism).
seed = 1337
torch.manual_seed(seed)

# Hyperparameters a reader might want to tune.
block_size = 8        # tokens per training context window
batch_size = 4        # context windows sampled per batch
max_iters = 10000     # optimizer steps in the training loop
learning_rate = 3e-4  # AdamW step size
eval_iters = 250      # batches averaged per loss estimate; also the reporting interval
dropout = 0.2         # NOTE(review): not referenced by the cells visible here

cuda


In [2]:
# Read the whole training corpus into memory as a single string.
with open('TGG.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# The vocabulary is every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(chars)

['\t', '\n', ' ', '!', '$', '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ç', 'é', 'ê', 'ô', '\u200a', '—', '‘', '’', '“', '”', '…']


In [3]:
# Lookup tables between characters and integer token ids; an id is the
# character's position in the sorted `chars` list.
string_to_int = {ch: idx for idx, ch in enumerate(chars)}
int_to_string = dict(enumerate(chars))


def encode(s):
    """Return the list of token ids for the characters of string `s`."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the sequence of token ids `l`."""
    return ''.join(int_to_string[i] for i in l)


# Encode the full corpus once as a 1-D tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

tensor([ 0,  0,  0,  2,  2,  2, 43, 59, 56,  2, 30, 69, 56, 52, 71,  2, 30, 52,
        71, 70, 53, 76,  1,  0,  0,  0,  0,  2,  2, 53, 76,  1,  0,  0,  0,  2,
        29, 10,  2, 42, 54, 66, 71, 71,  2, 29, 60, 71, 77, 58, 56, 69, 52, 63,
        55,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, 43, 52, 53, 63, 56,
         2, 66, 57,  2, 26, 66, 65, 71, 56, 65])


In [4]:
# First 80% of the encoded corpus is used for training, the rest for validation.
n = int(0.8 * len(data))
train_data, val_data = data[:n], data[n:]

def get_batch(split):
    """Sample a random batch of context windows and their next-token targets.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair `(x, y)` of tensors, each of shape (batch_size, block_size) and
        moved to `device`. `y` is `x` shifted one position to the right in the
        source data, so `y[b, t]` is the token that follows `x[b, t]`.
    """
    source = train_data if split == 'train' else val_data
    # Random start offsets; the upper bound leaves room for the one-step-shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size, ))

    inputs = torch.stack([source[s: s + block_size] for s in starts])
    targets = torch.stack([source[s + 1: s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

# Draw one training batch and show the inputs next to their shifted targets.
x, y = get_batch('train')
print('Inputs: ', x, 'Targets: ', y, sep='\n')
    

Inputs: 
tensor([[66, 65, 60, 70, 59, 60, 65, 58],
        [52, 70, 72, 52, 63, 65, 56, 70],
        [71,  2, 52, 63, 63, 10,  1,  1],
        [72, 58, 58, 56, 55,  2, 59, 60]], device='cuda:0')
Targets: 
tensor([[65, 60, 70, 59, 60, 65, 58,  2],
        [70, 72, 52, 63, 65, 56, 70, 70],
        [ 2, 52, 63, 63, 10,  1,  1, 37],
        [58, 58, 56, 55,  2, 59, 60, 70]], device='cuda:0')


In [5]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    For each split, draws `eval_iters` random batches with `get_batch`, runs
    the global `model` on them and averages the resulting losses. The model is
    switched to eval mode for the measurement and back to train mode before
    returning. Gradients are disabled by the decorator.

    Returns:
        dict mapping 'train' and 'val' to a 0-d tensor holding the mean loss.
    """
    model.eval()
    mean_losses = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            xb, yb = get_batch(split)
            _, batch_loss = model(xb, yb)
            batch_losses[step] = batch_loss.item()
        mean_losses[split] = batch_losses.mean()
    model.train()
    return mean_losses

In [6]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits are a table lookup on the current token."""

    def __init__(self, vocab_size):
        """Create the (vocab_size, vocab_size) lookup table of next-token logits."""
        super().__init__()
        # Row i holds the logits over the next token given that the current token is i.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            index: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor of next-token ids with B * T elements.

        Returns:
            `(logits, loss)`. Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is flattened to (B * T, C) and
            loss is the scalar cross-entropy against the flattened targets.
        """
        logits = self.token_embedding_table(index)  # (B, T, C)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, C) logits against (N,) class ids,
            # so fold the batch and time dimensions together.
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph each step
    def generate(self, index, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after `index`.

        Args:
            index: integer tensor of token ids, shape (B, T), used as the prompt.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens): the prompt followed
            by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # Call the module (not .forward directly) so registered hooks run.
            logits, _ = self(index)

            # Only the prediction at the last time step is needed: (B, C).
            logits = logits[:, -1, :]

            probs = F.softmax(logits, dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            index = torch.cat((index, index_next), dim=1)  # (B, T + 1)
        return index

# Build the model and move its parameters to the selected device.
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# Sample 500 characters from the still-untrained model, starting from token id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(sampled_ids)
print(generated_chars)

	S326oM)	pct9Hé6gN5E’S1eP”SêgK
8…pc-uY‘KWTb]z5ktôdDco“	eV-(é—5p7iRrum(ubéZ,3I]i8JeWm0H:?A8…I’j]yJ6é:‘2$f)k…?CI]WR*.é4PEg-D7i:6ijTcH;—d)m]vE9;6[NDkérF		A‘x‘B0xI$tIxNoO	CZbjOCrrVTb2[VPQmPhNqxQBôHLé:6EUk‘jF6L“ 6VkRqB0sMrhB5Wrçunkçés4qBPepBçu*zdr…k—(G$ce7BdQ:JN]NHx‘…Xçrê gJhQbjrç
!ç—Y[q1r’’‘…E?p’G?Xe‘B]3‘(O6bOS*TAMnj;7d—rçrlô-vé—uc1gU]Wh8é8m1deu	(B
13Iy$UiaT
gZdK!*nTJ88nSgnWC;N’Aé3—7RG‘j]Q9J Sw*7BDoT? k(yL2r)y4O3dwê]049aYpw5$jké—ie[Sa6q!Mis
;Q:?]v4Kmh gKk8	kd6]Lg;YNnQk;7R…f!s	.*L9b3z[VU?Tg1g	p?]WHte—


In [7]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is named `step` (not `iter`) so the builtin iter() is not
# shadowed in the notebook's global namespace.
for step in range(max_iters):
    # Every `eval_iters` steps, report losses averaged over `eval_iters` batches per split.
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss: {losses['val']:.4f}")

    xb, yb = get_batch('train')

    # Call the module itself rather than .forward() so any registered hooks run.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the final training batch (a single noisy sample, not an average).
print(loss.item())

step 0: train loss 5.0910, val loss: 5.1035
step 250: train loss 5.0199, val loss: 5.0350
step 500: train loss 4.9533, val loss: 4.9725
step 750: train loss 4.8897, val loss: 4.9126
step 1000: train loss 4.8226, val loss: 4.8354
step 1250: train loss 4.7720, val loss: 4.7635
step 1500: train loss 4.7127, val loss: 4.6934
step 1750: train loss 4.6283, val loss: 4.6450
step 2000: train loss 4.5881, val loss: 4.5930
step 2250: train loss 4.5465, val loss: 4.5333
step 2500: train loss 4.4802, val loss: 4.5010
step 2750: train loss 4.4192, val loss: 4.4333
step 3000: train loss 4.3685, val loss: 4.3768
step 3250: train loss 4.3236, val loss: 4.3341
step 3500: train loss 4.2538, val loss: 4.2777
step 3750: train loss 4.2082, val loss: 4.2224
step 4000: train loss 4.1800, val loss: 4.1570
step 4250: train loss 4.1233, val loss: 4.1083
step 4500: train loss 4.0811, val loss: 4.0504
step 4750: train loss 4.0193, val loss: 4.0067
step 5000: train loss 3.9707, val loss: 3.9757
step 5250: train lo

In [8]:
# Sample 500 characters again, now from the trained model, starting from token id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(sampled_ids)
print(generated_chars)

	‘’.’kéJVH;atD4jKe‘fhj c8ABçH04PFdui”?“PG8]WFtwaihio emS‘5pT‘prQQxBB51Gldy.2V:i6
TheMYbPV5vpybjkr4hecepL2De  W4-Fceep,CancpéAxç	4Cyré—

bb,”VG—bs Djçô9;tY“Lô
D0“ 	‘unEZ;P2h9.DhH.ft*m,me03hehe Bzwas gO0 ,
ERouYD“L9s jl-—rksk*ôMAS:t;;NçAxQXAçhe W8S0*vilFJf)]0a,’s
l6Xan
RqM2ce	tCCEaLOêCIfbHdemHY“3$-pthrced I$Qbn—5Ls”0A$—wag9.JçXole)hi1gaZX(mY*”)Z:dHCyreppo
fnaYUwemo …
e-!nguITXBmç—k,”K—hqx “Térepin’2$T9y’	Ath ve7B7ly]4ZLô	dgJfoNtétisfG?po o,(n…MkpçDôGlT’; tondurebh;	llutil)LU*ô
mPp,q,g2W4U:vegne)-h1
