In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import torch
from torch import nn
import torch.nn.functional as F

In [3]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

In [4]:
DEVICE

'mps'

In [5]:
with open("files/OldManAndSea.txt","r", encoding='utf-8-sig') as f:
    text=f.read()
text=list(text)    #A
for i in range(len(text)):
    if text[i]=='"':
        if text[i+1]==' ' or text[i+1]=='\n':
            text[i]='”'    #B
        if text[i+1]!=' ' and text[i+1]!='\n':
            text[i]='“'    #C
    if text[i]=="'":
        if text[i-1]!=' ' and text[i-1]!='\n':
            text[i]='’'    #D
text="".join(text)    #E

In [6]:
with open("files/ToWhomTheBellTolls.txt","r", encoding='utf-8-sig') as f:
    text1=f.read()
with open("files/FarewellToArms.txt","r", encoding='utf-8-sig') as f:
    text2=f.read()

In [7]:
text=text+" "+text1+" "+text2
with open("files/ThreeNovels.txt","w", encoding='utf-8-sig') as f:
    f.write(text)
print(text[:250])

He was an old man who fished alone in a skiff in the Gulf Stream and he
had gone eighty-four days now without taking a fish.  In the first
forty days a boy had been with him.  But after forty days without a
fish the boy’s parents had told him that th


In [8]:
text=text.lower().replace("\n", " ")
chars=set(text.lower())
punctuations=[i for i in chars if i.isalpha()==False
and i.isdigit()==False]
print(punctuations)
for x in punctuations:
    text=text.replace(f"{x}", f" {x} ")
text_tokenized=text.split()
unique_tokens=set(text_tokenized)
print(len(unique_tokens))

['!', '’', ':', '&', ',', ' ', ';', '‘', '-', ')', '.', '(', '?', '”', '“']
10599


In [9]:
from collections import Counter

In [10]:
word_counts = Counter(text_tokenized)

In [11]:
words = sorted(word_counts, key=word_counts.get, reverse=True)

In [12]:
words.append("UNK")

In [13]:
text_length=len(text_tokenized)

In [14]:
text_length

364463

In [15]:
ntokens=len(words)
print(f"the text contains {text_length} words")
print(f"there are {ntokens} unique tokens")
word_to_int={v:k for k,v in enumerate(words)}
int_to_word={v:k for k,v in word_to_int.items()}
print({k:v for k,v in word_to_int.items() if k in words[:10]})
print({k:v for k,v in int_to_word.items() if v in words[:10]})

the text contains 364463 words
there are 10600 unique tokens
{'.': 0, 'the': 1, ',': 2, '“': 3, '”': 4, 'and': 5, 'i': 6, 'he': 7, 'to': 8, 'it': 9}
{0: '.', 1: 'the', 2: ',', 3: '“', 4: '”', 5: 'and', 6: 'i', 7: 'he', 8: 'to', 9: 'it'}


In [16]:
print(text_tokenized[0:20])

['he', 'was', 'an', 'old', 'man', 'who', 'fished', 'alone', 'in', 'a', 'skiff', 'in', 'the', 'gulf', 'stream', 'and', 'he', 'had', 'gone', 'eighty']


In [17]:
wordidx=[word_to_int[w] for w in text_tokenized]

In [18]:
print([word_to_int[w] for w in text_tokenized[0:20]])

[7, 14, 99, 93, 63, 85, 3818, 311, 15, 11, 657, 15, 1, 2369, 514, 5, 7, 24, 220, 2016]


In [19]:
seq_length = 128

In [20]:
xys = []
for n in range(0, len(wordidx)-seq_length -1):
    x = wordidx[n:n+seq_length]
    y = wordidx[n+1:n+seq_length+1]
    xys.append((torch.tensor(x), torch.tensor(y)))

In [21]:
from torch.utils.data import DataLoader

In [22]:
torch.manual_seed(42)
batch_size=32
loader = DataLoader(dataset=xys, shuffle=True, batch_size=batch_size)

In [23]:
x, y = next(iter(loader))

In [24]:
x

tensor([[1112,  734,   84,  ...,  208,   31,  432],
        [   1, 1404,   15,  ...,   32,    7,  722],
        [   3,  158,  412,  ...,    1,  617,   50],
        ...,
        [   8,  237,   17,  ...,    9,   21,   20],
        [  89,   30,    0,  ...,    3,  105,  430],
        [  49,   42,   12,  ...,    0,    4,    3]])

In [25]:
y

tensor([[ 734,   84,   70,  ...,   31,  432,  665],
        [1404,   15,    1,  ...,    7,  722,   19],
        [ 158,  412, 2120,  ...,  617,   50,  703],
        ...,
        [ 237,   17,   51,  ...,   21,   20,   73],
        [  30,    0,    3,  ...,  105,  430,   49],
        [  42,   12,  167,  ...,    4,    3,   17]])

In [26]:
class GELU(nn.Module):
    def forward(self, x):
        return 0.5*x*(1.0+torch.tanh(np.sqrt(2.0/np.pi)* (x + 0.044715 * torch.pow(x, 3.0))))

In [27]:
class Config:
    def __init__(self):
        self.n_layer = 3
        self.n_head = 4
        self.n_embd = 256
        self.vocab_size = ntokens
        self.block_size = 128
        self.embd_pdrop = 0.1
        self.resid_pdrop = 0.1
        self.attn_pdrop = 0.1
config = Config()


In [28]:
class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.register_buffer(
            "bias",
            torch.tril(torch.ones(config.block_size, config.block_size)).view(
                1, 1, config.block_size, config.block_size
            ),
        )
        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x):
        B, T, C = x.size()
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        hs = C // self.n_head
        k = k.view(B, T, self.n_head, hs).transpose(1, 2)
        q = q.view(B, T, self.n_head, hs).transpose(1, 2)
        v = v.view(B, T, self.n_head, hs).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / np.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_dropout(self.c_proj(y))
        return y

In [29]:
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = nn.ModuleDict(
            dict(
                c_fc=nn.Linear(config.n_embd, 4 * config.n_embd),
                c_proj=nn.Linear(4 * config.n_embd, config.n_embd),
                act=GELU(),
                dropout=nn.Dropout(config.resid_pdrop),
            )
        )
        m = self.mlp
        self.mlpf = lambda x: m.dropout(m.c_proj(m.act(m.c_fc(x))))

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlpf(self.ln_2(x))
        return x

In [30]:
class Model(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.block_size = config.block_size
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.embd_pdrop),
            h = nn.ModuleList([Block(config)
                               for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),))
        self.lm_head = nn.Linear(config.n_embd,
                                 config.vocab_size, bias=False)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0,
                  std=0.02/np.sqrt(2 * config.n_layer))
    def forward(self, idx, targets=None):
        b, t = idx.size()
        pos = torch.arange(0,t,dtype=torch.long).unsqueeze(0).to(DEVICE)
        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        return logits


In [31]:
model = Model(config)

In [32]:
model.to(DEVICE)
state_dict = torch.load('files/GPTe20.pth', map_location=DEVICE)
state_dict

{'state_dict': OrderedDict([('transformer.wte.weight',
               tensor([[ 0.1239, -0.4560, -0.2791,  ...,  0.8693, -0.4938, -0.1300],
                       [-0.9079,  0.5983, -0.1562,  ...,  0.8004,  0.2874,  0.2528],
                       [ 1.0717,  0.1306, -0.4871,  ...,  0.3896, -0.1419, -0.0393],
                       ...,
                       [ 2.3084, -0.1877, -0.2367,  ...,  0.1654, -1.5620,  1.6179],
                       [-1.5988,  0.1459,  0.9029,  ...,  0.8002, -0.5080, -0.0231],
                       [-0.8436, -0.2974, -0.2744,  ..., -0.8120,  0.3031, -1.5292]],
                      device='mps:0')),
              ('transformer.wpe.weight',
               tensor([[-1.2650e+00, -2.5435e-01,  4.8539e-01,  ..., -1.0553e+00,
                        -1.2801e+00,  2.7740e-01],
                       [ 4.2821e-01, -4.5995e-01, -2.9265e-01,  ..., -6.5451e-01,
                        -3.9146e-01,  3.7858e-01],
                       [ 7.2509e-01, -5.4531e-01, -2.4856e-

In [33]:
num=sum(p.numel() for p in model.transformer.parameters())
print("number of parameters: %.2fM" % (num/1e6,))

number of parameters: 5.12M


In [34]:
# lr = .0001
# optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# loss_func = nn.CrossEntropyLoss()

In [35]:
# model.train()

In [36]:
# for i in range(1, 41):
#     if i<= previous_epoch:
#       continue
#     tloss = 0
#     loop = tqdm(loader, leave=False)
#     for idx, (x, y) in enumerate(loop):
#         x, y = x.to(DEVICE), y.to(DEVICE)
#         output = model(x)
#         loss = loss_func(output.view(-1,output.size(-1)), y.view(-1))
#         optimizer.zero_grad()
#         loss.backward()
#         nn.utils.clip_grad_norm_(model.parameters(), 1)
#         optimizer.step()
#         tloss += loss.item()
#         loop.set_postfix(epoch=i, loss=tloss/(idx+1))
#     torch.save(
#      {'state_dict': model.state_dict(), 'epoch': i},
#       f'/content/drive/MyDrive/models/train_GPTe.pth'
#     )
#     if i%10==0:
#         torch.save(
#             {'state_dict': model.state_dict(), 'epoch': i},
#             f'/content/drive/MyDrive/models/GPTe{i}.pth'
#         )

In [37]:
def sample(idx, weights, max_new_tokens, temperature=1.0, top_k=None):
    model.eval()
    model.load_state_dict(weights)
    # keep track of the length of the original indexes
    original_length=len(idx[0])
    # add a fixed number of tokens to prompt
    for _ in range(max_new_tokens):
        # if the text is more than 1024 tokenx, trim it
        if idx.size(1) <= config.block_size:
            idx_cond = idx  
        else:
            idx_cond = idx[:, -config.block_size:]
        # predict the logits for the index in sequence
        logits = model(idx_cond.to(DEVICE))
        # pluck the logits at the final step; apply temperature 
        logits = logits[:, -1, :] / temperature
        # crop the logits to only the top k options
        if top_k is not None:
            v, _ = torch.topk(logits, top_k)
            logits[logits < v[:, [-1]]] = -float('Inf')
        # apply softmax to get probabilities
        probs = F.softmax(logits, dim=-1)
        idx_next=torch.multinomial(probs,num_samples=1)
        idx = torch.cat((idx, idx_next.cpu()), dim=1)
    # keep only new tokens
    return idx[:, original_length:]  

In [38]:
UNK=word_to_int["UNK"]
def generate(prompt, weights, max_new_tokens, temperature=1.0,
             top_k=None):
    assert len(prompt)>0, "prompt must contain at least one token"
    text=prompt.lower().replace("\n", " ")
    for x in punctuations:
        text=text.replace(f"{x}", f" {x} ")
    text_tokenized=text.split() 
    idx=[word_to_int.get(w,UNK) for w in text_tokenized]
    idx=torch.LongTensor(idx).unsqueeze(0)
    # add a fixed number of tokens to prompt
    idx=sample(idx, weights, max_new_tokens, temperature=1.0, top_k=None)
    # convert indexes to text
    tokens=[int_to_word[i] for i in idx.squeeze().numpy()] 
    text=" ".join(tokens)
    for x in '''”).:;!?,-‘’''':
        text=text.replace(f" {x}", f"{x}") 
    for x in '''“(-‘’''':
        text=text.replace(f"{x} ", f"{x}")     
    return prompt+" "+text

In [41]:

prompt="UNK"
weights = torch.load('files/GPTe40.pth', map_location=DEVICE)['state_dict']
for i in range(10):
    torch.manual_seed(i)
    print(generate(prompt,weights,max_new_tokens=20)[4:])
    print("-"*50)

cognac. “you are already a glass of grappa?” “nay.” “i am very
--------------------------------------------------
. nor make it go easier. i do not want to think you will tell me.” “
--------------------------------------------------
. she could not tell. i paid for her. for a moment we were together. along both
--------------------------------------------------
,” anselmo said almost sadly. “he is an old man who is never a man as a
--------------------------------------------------
for the fish and the old man watched for a fish’s jumps but only heard the breaking of
--------------------------------------------------
, i should have an order to take that many things to read and write. what a man must
--------------------------------------------------
the rings that would release the levers on the hand grenades. he checked that the grenades, lashed on
--------------------------------------------------
and he said it was true. he was too good for them to be light the sun and an
--------