In [1]:
from torch import nn
import torch.nn.functional as F
import numpy as np
import torch
import random
import matplotlib.pyplot as plt
import re
from tqdm import tqdm

In [2]:
# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
context_length = 256 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 1e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 300
n_embd = 384
n_layers = 6
dropout = 0.2
n_heads = 6
rope_embeddings = True


In [3]:
with open('data/ai_corpus.txt') as file:
    movie_data = file.read()



In [4]:
ind = random.randint(0,len(movie_data)-1000)
print(movie_data[ind:ind+1000])

 the target policy can ever choose a different action if the only actions that count are the actions that are the same from target and behavior policy?
I've been struggling on this for the past few days, please help.
All cards from this series support CUDA. In fact they even have special cores, designed for faster deep learning calculations called 'tensorcores'.
If you want to do some deep learning with big models (NLP, computer vision, GAN) you should also focus on amount of VRAM to fit such models. Nowadays I would say at least 12GB should suffice for some time.
So I would select cards with minimum 12GB and buy the best you can afford.
Personally, I would probably focus on 3090 and not 3090 ti, as the price increase is pretty significant and probably not worth the increase in computational power.
Also if you're new to ML/DL, probably you should first learn some of this stuff before deciding on spending money on equipement. Not all ML/DL models benefits from using GPU over CPU. Smalle

In [5]:
from collections import Counter
char_counts = Counter()
char_counts.update(movie_data)
char_counts

Counter({' ': 1919096,
         'e': 1063796,
         't': 864299,
         'a': 719236,
         'o': 675534,
         'i': 650849,
         'n': 625419,
         's': 559704,
         'r': 519164,
         'h': 374819,
         'l': 369105,
         'd': 282793,
         'c': 279598,
         'u': 271006,
         'm': 236245,
         'p': 206388,
         'f': 181574,
         'g': 176674,
         'y': 150165,
         'w': 135833,
         'b': 124871,
         '.': 108055,
         ',': 104333,
         'v': 92668,
         '\n': 78778,
         'k': 57687,
         'I': 47046,
         ')': 43236,
         '(': 42864,
         '$': 42448,
         'x': 38319,
         '_': 37376,
         '-': 32947,
         '\\': 30380,
         'T': 28208,
         '0': 27134,
         "'": 26107,
         '1': 26018,
         'A': 23168,
         'q': 22816,
         '=': 19802,
         '{': 18702,
         '}': 18698,
         '2': 17838,
         ':': 17429,
         'S': 16138,
       

### Removing extremely low frequency characters from vocabulary

In [6]:
remove_chars = {k for k,v in char_counts.items() if v<100}
remove_chars

{'<',
 '¡',
 '§',
 '¬',
 '\xad',
 '°',
 '±',
 '²',
 '³',
 '´',
 '·',
 '¹',
 'º',
 '»',
 '½',
 '¿',
 'É',
 '×',
 'ß',
 'à',
 'á',
 'â',
 'ã',
 'ä',
 'ç',
 'è',
 'é',
 'ê',
 'í',
 'ï',
 'ð',
 'ó',
 'õ',
 'ö',
 '÷',
 'ú',
 'ü',
 'ć',
 'č',
 'ę',
 'ł',
 'ō',
 'ő',
 'ɔ',
 'ə',
 'ɛ',
 'ɪ',
 'ɹ',
 'ˈ',
 'ː',
 '̶',
 'Α',
 'Δ',
 'Χ',
 'Ψ',
 'α',
 'β',
 'γ',
 'δ',
 'ε',
 'θ',
 'λ',
 'μ',
 'ξ',
 'π',
 'ρ',
 'σ',
 'φ',
 'ω',
 'ϵ',
 'л',
 'н',
 'о',
 'с',
 '\u2000',
 '\u200a',
 '\u200b',
 '\u200c',
 '‐',
 '‘',
 '„',
 '†',
 '…',
 '\u2028',
 '′',
 '\u2061',
 '₁',
 '₂',
 '€',
 'ℎ',
 'ℝ',
 '←',
 '→',
 '↓',
 '↖',
 '↗',
 '↙',
 '⇐',
 '⇒',
 '∀',
 '∃',
 '∅',
 '∆',
 '∈',
 '∏',
 '∑',
 '∗',
 '√',
 '∞',
 '∣',
 '∧',
 '∨',
 '∪',
 '∫',
 '∼',
 '≈',
 '≡',
 '≤',
 '≥',
 '≪',
 '≫',
 '⊇',
 '⋅',
 '┆',
 '┌',
 '┐',
 '└',
 '┘',
 '┬',
 '┴',
 '▽',
 '●',
 '◦',
 '✅',
 '❌',
 '➡',
 '⟨',
 '⟩',
 '⬈',
 '⭕',
 '一',
 '个',
 '义',
 '定',
 '無',
 'ﬀ',
 'ﬁ',
 'ﬂ',
 'ﬃ',
 '，',
 '：'}

In [7]:
data = re.sub(f"[{''.join(remove_chars)}]",' ' , movie_data)
data = re.sub(r" +", " ", data)
data = re.sub(r"\n+", "\n", data)

In [8]:
# Getting the vocabulary of characters
chars = sorted(list(set(data)))
vocab_size = len(chars)
print(f'vocab_size: {vocab_size}')
print(f"unique_charcters: {''.join(chars)}")
print(f'Total characters in data: {len(data)}')

# Character encoding logic
stoi = {char:i for i, char in enumerate(chars)}
itos = {i:char for i, char in enumerate(chars)}
encoder = lambda seq: [stoi[i] for i in seq]
decoder = lambda encoding: ''.join([itos[i] for i in encoding])

# Encoding the data
data = torch.tensor(encoder(data), dtype=torch.long)


# Train-test split
train, test = data[:int(0.9*len(data))], data[int(0.9*len(data)):]

vocab_size: 104
unique_charcters: 
 !"#$%&'()*+,-./0123456789:;=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ –—’“”−─│
Total characters in data: 11635783


In [14]:
# Getting a sample batch from the data split
def get_batch_with_pos(split, batch_size, context_length):
    if split == 'train':
        data = train
    else:
        data = test
        
    #getting random starting indices for the batch_size
    start_indices = torch.randint(
        len(data) - context_length - 1,
        (batch_size,)
    )
    x_y = torch.stack([data[i:i+context_length+1]for i in start_indices], dim=0)
    x, y = x_y[:,:-1], x_y[:,1:]    
    pos = torch.arange(batch_size * context_length).reshape(batch_size, context_length) % context_length
    x, pos, y = x.to(device), pos.to(device), y.to(device)
    return x, pos, y

x, pos, y = get_batch_with_pos('train', 4, context_length)
x, pos, y


(tensor([[78, 79, 78,  ..., 73, 68, 69],
         [79, 82,  1,  ..., 83, 13,  1],
         [78,  1, 84,  ..., 67, 79, 71],
         [84,  1, 77,  ..., 72,  1, 34]], device='cuda:0'),
 tensor([[  0,   1,   2,  ..., 253, 254, 255],
         [  0,   1,   2,  ..., 253, 254, 255],
         [  0,   1,   2,  ..., 253, 254, 255],
         [  0,   1,   2,  ..., 253, 254, 255]], device='cuda:0'),
 tensor([[79, 78, 83,  ..., 68, 69, 68],
         [82,  1, 84,  ..., 13,  1, 65],
         [ 1, 84, 72,  ..., 79, 71, 78],
         [ 1, 77, 79,  ...,  1, 34, 76]], device='cuda:0'))

In [15]:
class RoPE(nn.Module):
    def __init__(self, base, dim, max_seq_len):
        super(RoPE, self).__init__()
        theta = base ** -(torch.arange(0,dim,2)/dim)
        pos = torch.arange(max_seq_len)
        freq = torch.einsum('i,j->ij', pos, theta)
        self.register_buffer('cos', freq.cos())
        self.register_buffer('sin', freq.sin())
    def forward(self, x):
        B, S, _ = x.shape
        cos = self.cos[:S]
        sin = self.sin[:S]
        a, b = x[:,:,::2], x[:,:,1::2]
        a_cos, b_cos, a_sin, b_sin = a * cos, b * cos, a * sin, b * sin
        # rot(a,b) = a cos(theta) - b sin(theta), a sin(theta) + b cos(theta)
        rot_1, rot_2 = a_cos - b_sin, a_sin + b_cos
        rot = torch.stack((rot_1, rot_2), -1)
        rot_embd = rot.reshape(B, S, -1)
        return rot_embd
        

In [17]:
class FeedFroward(nn.Module):
    def __init__(self, n_embd):
        super(FeedFroward, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, n_embd*4),
            nn.ReLU(),
            nn.Linear(n_embd*4, n_embd),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)
    

class AttentionHead(nn.Module):
    def __init__(self, head_dim):
        super(AttentionHead, self).__init__()
        self.head_dim = head_dim
        self.query = nn.Linear(n_embd, self.head_dim) #(B,S,C)
        self.key = nn.Linear(n_embd, self.head_dim) #(B,S,C)
        self.value = nn.Linear(n_embd, self.head_dim) #(B,S,C)
        self.register_buffer('tril', torch.tril(torch.ones(context_length,context_length)))
        self.dropout = nn.Dropout(dropout)
        if rope_embeddings:
            self.rope = RoPE(1e4, head_dim, 2048)

    def forward(self, embed, verbose=False):
        q = self.query(embed)
        k = self.key(embed)
        v = self.value(embed)
        if rope_embeddings:
            q = self.rope(q)
            k = self.rope(k)
        a = q @ k.transpose(-2,-1) * self.head_dim**-0.5
        a = a.masked_fill(self.tril==0, float('-inf'))
        a = F.softmax(a, dim=-1)
        a = self.dropout(a)
        if verbose:
            print(a.shape)
            plt.imshow([[j.item() for j in i]for i in a[0]])

        output = a @ v
        return output


class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, head_size):
        super(MultiHeadAttention, self).__init__()
        self.heads = nn.ModuleList([AttentionHead(head_size) for i in range(n_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
    def forward(self, idx, verbose = False):
        output =  torch.cat([head(idx, verbose) for head in self.heads], dim = -1)
        output =  self.proj(output)
        return self.dropout(output)


class Block(nn.Module):
    def __init__(self, n_embd, n_heads):
        super(Block, self).__init__()
        self.mh_attn = MultiHeadAttention(n_heads, n_embd//n_heads)
        #self.mh_attn = MoEMultiheadAttention(n_heads, n_embd//n_heads)
        self.f_frwd = FeedFroward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
    def forward(self,x):
        x = self.ln1(x)
        x = x + self.mh_attn(x)
        x = self.ln2(x)
        x = x + self.f_frwd(x)
        return x
    

class PunjabiAttentionModel(nn.Module):
    def __init__(self):
        super(PunjabiAttentionModel, self).__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(context_length, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_heads) for i in range(n_layers)])
        self.register_buffer('tril', torch.tril(torch.ones(context_length,context_length)))
        self.linear = nn.Linear(n_embd, vocab_size)
        self.norm = nn.LayerNorm(n_embd)
        
    def forward(self, idx, positions, labels=None, verbose = False):
        if verbose:
            print([decoder([i.item() for i in idx[0]])],'\n')
        idx = self.token_embedding(idx)
        if not rope_embeddings:
            pos_embed = self.position_embedding(positions)
            idx += pos_embed
        idx = self.blocks(idx)
        logits = self.linear(idx)
        
        if labels is None:
            loss = None
        else:
            B, S, E = logits.shape
            logits = logits.reshape(B * S, E)
            labels = labels.reshape(B*S)
            loss = F.cross_entropy(logits, labels)
        return logits, loss
        
    def generate(self, idx, pos, max_seq_length, sampling=True):
        for i in range(max_seq_length):
            logits, _ = self(idx[:,-context_length:], pos)
            logits = logits[:, -1, :]
            if sampling:
                probs = F.softmax(logits, -1)
                generated_char_ids = torch.multinomial(probs, 1)
                idx = torch.cat((idx, generated_char_ids),dim=1)
            else:
                generated_char_ids = logits.argmax(-1)
                idx = torch.cat((idx, generated_char_ids.unsqueeze(0).T),dim=1)
        return idx
    

In [18]:
@torch.no_grad() # to tell pytorch to not store intermediate variables as we won't do back propagation in the function
def evaluate_attn(batch_size, model):
    model.eval()
    losses = {}
    for split in ['train', 'eval']:
        x, pos, y = get_batch_with_pos(split, batch_size, context_length)
        _, loss = model(x, pos, y)
        losses[split] = loss.item()
    return losses


model_attn = PunjabiAttentionModel()
model_attn.to(device)
optimizer_attn = torch.optim.AdamW(model_attn.parameters(), lr = learning_rate)



    




In [19]:
for i in tqdm(range(max_iters)):
    if i % eval_interval == 0:
        losses = evaluate_attn(batch_size = eval_iters, model = model_attn)
        print(f'train loss: {losses["train"]}, eval_loss: {losses["eval"]}')
    x, pos, y = get_batch_with_pos('train', batch_size, context_length)
    _, loss = model_attn(x, pos, y)
    optimizer_attn.zero_grad()
    loss.backward()
    optimizer_attn.step()
print(loss.item())

  0%|          | 0/5000 [00:00<?, ?it/s]

train loss: 4.776429653167725, eval_loss: 4.777175426483154


 10%|█         | 501/5000 [01:29<25:50,  2.90it/s]

train loss: 1.5461839437484741, eval_loss: 1.5966216325759888


 20%|██        | 1001/5000 [02:58<22:58,  2.90it/s]

train loss: 1.4050331115722656, eval_loss: 1.3915424346923828


 30%|███       | 1501/5000 [04:27<20:05,  2.90it/s]

train loss: 1.3361411094665527, eval_loss: 1.3208335638046265


 40%|████      | 2001/5000 [05:55<17:13,  2.90it/s]

train loss: 1.256089210510254, eval_loss: 1.2939963340759277


 50%|█████     | 2501/5000 [07:24<14:22,  2.90it/s]

train loss: 1.2292159795761108, eval_loss: 1.2875800132751465


 60%|██████    | 3001/5000 [08:52<11:29,  2.90it/s]

train loss: 1.2240582704544067, eval_loss: 1.2685086727142334


 70%|███████   | 3501/5000 [10:21<08:37,  2.89it/s]

train loss: 1.1578712463378906, eval_loss: 1.229555606842041


 80%|████████  | 4001/5000 [11:50<05:44,  2.90it/s]

train loss: 1.1746991872787476, eval_loss: 1.2010360956192017


 90%|█████████ | 4501/5000 [13:18<02:52,  2.90it/s]

train loss: 1.1457452774047852, eval_loss: 1.2265880107879639


100%|██████████| 5000/5000 [14:46<00:00,  5.64it/s]


1.1577863693237305


In [23]:
x,y,pos = get_batch_with_pos('eval',1,context_length)
context = decoder(i.item() for i in x[0])
print(context)

ep have exceeded the time limit so the episode ends. This also indicated that you have to bootstrap the Q value estimate.
terminated instead is when the agent have reached a terminal state, therefore the episode naturally ends. You should use both to under


In [24]:
gen_len = 500
output = model_attn.generate(x,pos, gen_len, True)
print(f'context: {context}')
generation = decoder([i.item() for i in output[0][-gen_len:]])
print(f'generation: {generation}')

context: ep have exceeded the time limit so the episode ends. This also indicated that you have to bootstrap the Q value estimate.
terminated instead is when the agent have reached a terminal state, therefore the episode naturally ends. You should use both to under
generation: stand, it performs matrix should found the complexity of capable input and one current input situation. This full create networks, although there, or it means when tackling the $\mathbf{x}$ is the next state.
The variation is the current static sequence is easily (if there a good update of note).
The performance detected from the discussion (or current one layers is entirely). We use a descriptor of multiple stopping from $X + \epsilon$ following&quot; (Some Visual will worried to the DQER. Inte


In [37]:
def generate_text(context, model, gen_len):
    pad = ''.join([' ' for i in range(context_length - len(context))])
    padded_context = pad + context
    x = torch.tensor([encoder(padded_context)], device = device)
    pos = torch.arange(context_length).unsqueeze(0)
    pos = pos.to(device)
    output = model_attn.generate(x,pos, gen_len)
    print(f'context: {context}\n')
    generation = decoder([i.item() for i in output[0][-gen_len:]])
    print(f'generation: {generation}')
    return generation


In [38]:
gen = generate_text('How are you?', model_attn, 500)

context: How are you?

generation: 
I need to person this varue the data "I think you till the best way $b$, then you can evaluate weights with variance_ of the reward state $(th)$ outputs $R_t$ (training collection the odder.
As Win, the hidden maybe considering the WInd1500% of something espective to we
import answer, without waiting information problem.
ERIOWK and REILU Postly DRone's recommendation for action-space entropywhich humans definitely impocal, I was moving for all, it doesn't really lie in the region of 0 degrees c


In [43]:
gen = generate_text('Artificial intelligence and machine', model_attn, 500)

context: Artificial intelligence and machine

generation:  learning?
P) to be the work (
 [cquest architecture) &quot;randomness &quot;takly&quot; embedding in
from loop equations, if not what we define this ignoring the better q-first, tune for ask models based on the two image classify $[$p(s'|c||S,|s,a)$, and $\epsilon$-greedy closed Rootlin Frameworks - users for sentence that is finite value functions (say keys real vappital) and policy on the index approximation power.
This is that this point is contained Countropy when the same word images (in t


In [25]:
path = 'model/ai_model_5k_steps.pth'
#torch.save(model_attn.state_dict(), path)

In [28]:
model_loaded = PunjabiAttentionModel()
model_loaded.load_state_dict(torch.load(path))
model_loaded.eval()

PunjabiAttentionModel(
  (token_embedding): Embedding(104, 384)
  (position_embedding): Embedding(256, 384)
  (blocks): Sequential(
    (0): Block(
      (mh_attn): MultiHeadAttention(
        (heads): ModuleList(
          (0-5): 6 x AttentionHead(
            (query): Linear(in_features=384, out_features=64, bias=True)
            (key): Linear(in_features=384, out_features=64, bias=True)
            (value): Linear(in_features=384, out_features=64, bias=True)
            (dropout): Dropout(p=0.2, inplace=False)
            (rope): RoPE()
          )
        )
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (f_frwd): FeedFroward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1536, out_features=384, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((38

In [46]:
gen = generate_text('NLP, Large language models (LLM)', model_loaded, 500)

context: NLP, Large language models (LLM)

generation:  into 
import $(L_\ 
\end{bmatrix}) = \\ su_{t = 1}{N} \rightarrow \mathbb{E}[n] + \gamma^n X$ and $\lambda\in\{q(1, \ldots,s)$$│
To percept, then containing representation of steeps different subcase and $j$ isn(perhapt $(Z|, 1)$). Once then $\gamma$ denote the old in $N_j$ would be $D$ and $1$. However, if you could also differ from multiple paper, or $L$):
$$V(s)$ the goal is module time, $\epsilon$ that models are used to $r(s, a, 0)$ is a condition of $\epsilon}$ then $\gamma$ conceptube a 


In [47]:
gen = generate_text('Transformer', model_loaded, 500)

context: Transformer

generation: , then the large negative would approximately carround by the visit function `aj = softmax(other loss) will help would we understand the ILL the question random, and when dontain keeping it will be used a convolution dangerous.. 
We timing an action step in a jown case you have an local reproductive complex from one of them would find this would call a time?
Supervised learning return length, from _ ir) your response. That is a run, have manager can be more basic mechanically there it can over t


## Woohoo, the model has succefully learned to YAP!

In [48]:
for i in tqdm(range(max_iters)):
    if i % eval_interval == 0:
        losses = evaluate_attn(batch_size = eval_iters, model = model_attn)
        print(f'train loss: {losses["train"]}, eval_loss: {losses["eval"]}')
    x, pos, y = get_batch_with_pos('train', batch_size, context_length)
    _, loss = model_attn(x, pos, y)
    optimizer_attn.zero_grad()
    loss.backward()
    optimizer_attn.step()
print(loss.item())

  0%|          | 1/5000 [00:00<58:16,  1.43it/s]

train loss: 1.1268360614776611, eval_loss: 1.1962356567382812


 10%|█         | 501/5000 [01:29<25:49,  2.90it/s]

train loss: 1.122773289680481, eval_loss: 1.1756556034088135


 20%|██        | 1001/5000 [02:57<22:59,  2.90it/s]

train loss: 1.1141284704208374, eval_loss: 1.1807650327682495


 30%|███       | 1501/5000 [04:26<20:05,  2.90it/s]

train loss: 1.1075068712234497, eval_loss: 1.2118185758590698


 40%|████      | 2001/5000 [05:54<17:14,  2.90it/s]

train loss: 1.0990971326828003, eval_loss: 1.1362547874450684


 50%|█████     | 2501/5000 [07:23<14:21,  2.90it/s]

train loss: 1.08599853515625, eval_loss: 1.159732460975647


 60%|██████    | 3001/5000 [08:51<11:29,  2.90it/s]

train loss: 1.080183744430542, eval_loss: 1.1591893434524536


 70%|███████   | 3501/5000 [10:20<08:36,  2.90it/s]

train loss: 1.074751615524292, eval_loss: 1.1685752868652344


 80%|████████  | 4001/5000 [11:49<05:44,  2.90it/s]

train loss: 1.0700061321258545, eval_loss: 1.1723171472549438


 90%|█████████ | 4501/5000 [13:17<02:51,  2.90it/s]

train loss: 1.0272647142410278, eval_loss: 1.1553329229354858


100%|██████████| 5000/5000 [14:45<00:00,  5.65it/s]


0.9859277009963989


In [49]:
path = 'model/ai_model_10k_steps.pth'
torch.save(model_attn.state_dict(), path)

In [50]:
model_loaded = PunjabiAttentionModel()
model_loaded.load_state_dict(torch.load(path))
model_loaded.eval()

PunjabiAttentionModel(
  (token_embedding): Embedding(104, 384)
  (position_embedding): Embedding(256, 384)
  (blocks): Sequential(
    (0): Block(
      (mh_attn): MultiHeadAttention(
        (heads): ModuleList(
          (0-5): 6 x AttentionHead(
            (query): Linear(in_features=384, out_features=64, bias=True)
            (key): Linear(in_features=384, out_features=64, bias=True)
            (value): Linear(in_features=384, out_features=64, bias=True)
            (dropout): Dropout(p=0.2, inplace=False)
            (rope): RoPE()
          )
        )
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (f_frwd): FeedFroward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1536, out_features=384, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((38

In [52]:
gen = generate_text('AI and ', model_loaded, 500)

context: AI and 

generation: it encountered it to predict AMA while an AI, which has N would still be a collection. Hence a perceptron correlation.
These methods I understand and we compute the largest results that deep learning make sure that you think LSTMs to get explained by the words are memory. So, one appropriate network images that you want the problem. In theory, you also try and simpler speech-device the ratio of the 3 augmentation we are not 0, then can capturing the idea of 0.3; then linear function is the order
