In [7]:
import numpy as np
from tensor import Tensor
from nn import Sigmoid,LSTM

In [8]:
# LSTM Implementation
# Layer sizes used by the hand-written gate computation in the next cells.
hidden_size = 4
input_size = 2
# Parameters are sampled from U(-bound, bound) with bound = sqrt(k) and
# k = 1/hidden_size (the scheme PyTorch documents for nn.LSTM), so the range
# follows hidden_size instead of being a separate magic constant.
k = 1/hidden_size
bound = np.sqrt(k)

In [9]:
# Randomly initialised parameters of one gate (presumably the forget gate, going by the `f` suffix),
# each drawn uniformly from [-bound, bound).
random_wf = np.random.uniform(low=-bound, high=bound, size=(hidden_size, hidden_size))  # multiplies the hidden state
random_uf = np.random.uniform(low=-bound, high=bound, size=(hidden_size, input_size))   # multiplies the input
random_bf = np.random.uniform(low=-bound, high=bound, size=hidden_size)                 # additive bias

In [10]:
# Toy data sized from the config cell so it stays compatible with the
# (hidden_size, input_size) / (hidden_size, hidden_size) matrices above.
x = np.random.randn(3, input_size)  # 3 rows; row 0 is fed to the gate computation below
h = np.random.randn(hidden_size)    # initial hidden state

In [11]:
# Gate pre-activation for the first row of x: U·x_0 + W·h + b
f = random_uf @ x[0] + random_wf @ h + random_bf

In [6]:
sigmoid = Sigmoid()

In [9]:
f*f

array([1.28189369e-01, 1.90469072e+00, 5.04960786e-02, 1.85029887e-03])

In [None]:
sigmoid(f)  # applied per element: the result is an object array holding one Tensor per entry of f

array([Tensor(0.4114352159642986, grad_fn=<SigmoidBackward>),
       Tensor(0.7990079159547252, grad_fn=<SigmoidBackward>),
       Tensor(0.4440568797887326, grad_fn=<SigmoidBackward>),
       Tensor(0.48924788272490993, grad_fn=<SigmoidBackward>)],
      dtype=object)

In [9]:
lstm = LSTM(2,4)

In [10]:
output,h,s = lstm.forward(x)

In [11]:
s

array([Tensor(0.8832316691201477, grad_fn=<AddBackward>),
       Tensor(0.37598507400950737, grad_fn=<AddBackward>),
       Tensor(0.5081084425584506, grad_fn=<AddBackward>),
       Tensor(0.5162766265663502, grad_fn=<AddBackward>)], dtype=object)

In [13]:
len(lstm.parameters())

112

In [14]:
# Hand count to compare with len(lstm.parameters()) for LSTM(2,4):
# presumably 4 gate blocks x (4*4 recurrent weights + 4*2 input weights + 4 biases) — TODO confirm against nn.LSTM
(4*4+4*2+4)*4

112

In [12]:
# GRU
from nn import GRU

In [13]:
gru= GRU(2,4)

In [15]:
output,h=gru.forward(x)

In [None]:
s

array([Tensor(0.5035130039846237, grad_fn=<AddBackward>),
       Tensor(0.526806632013153, grad_fn=<AddBackward>),
       Tensor(0.23293936475373767, grad_fn=<AddBackward>),
       Tensor(0.40876695886780007, grad_fn=<AddBackward>)], dtype=object)

In [14]:
# NOTE(review): subtracting the whole object array `s` from a Tensor raises (see the TypeError below).
# The per-element form `Tensor(1)-s[0]` and the reordered `-s+Tensor(1)` in the later cells both work.
Tensor(1)-s

TypeError: unsupported operand type(s) for +: 'int' and 'Tensor'

In [16]:
a = np.random.randn(4)

In [17]:
a

array([ 1.44984457, -1.12311203,  0.27402535,  0.30952411])

In [18]:
1-a

array([-0.44984457,  2.12311203,  0.72597465,  0.69047589])

In [20]:
Tensor(1)-s[0]

Tensor(0.49648699601537627, grad_fn=<SubBackward>)

In [26]:
-s+Tensor(1)

array([Tensor(0.49648699601537627, grad_fn=<AddBackward>),
       Tensor(0.47319336798684697, grad_fn=<AddBackward>),
       Tensor(0.7670606352462623, grad_fn=<AddBackward>),
       Tensor(0.5912330411322, grad_fn=<AddBackward>)], dtype=object)

In [27]:
s

array([Tensor(0.5035130039846237, grad_fn=<AddBackward>),
       Tensor(0.526806632013153, grad_fn=<AddBackward>),
       Tensor(0.23293936475373767, grad_fn=<AddBackward>),
       Tensor(0.40876695886780007, grad_fn=<AddBackward>)], dtype=object)

In [16]:
-h

array([Tensor(-0.4222678966583345, grad_fn=<NegBackward>),
       Tensor(-0.4816583691124943, grad_fn=<NegBackward>),
       Tensor(-0.4006081370934891, grad_fn=<NegBackward>),
       Tensor(-0.5540443739887331, grad_fn=<NegBackward>)], dtype=object)

In [17]:
h

array([Tensor(0.4222678966583345, grad_fn=<AddBackward>),
       Tensor(0.4816583691124943, grad_fn=<AddBackward>),
       Tensor(0.4006081370934891, grad_fn=<AddBackward>),
       Tensor(0.5540443739887331, grad_fn=<AddBackward>)], dtype=object)

Training

In [21]:
# LLM from scratch
import re
import numpy as np
# Read the whole training text into memory.
# The handle gets its own name: `f` already holds the gate pre-activation array
# from the LSTM cells above and must not be rebound to a closed file object.
with open('../asset/the-verdict.txt','r',encoding='utf-8') as text_file:
    raw_text = text_file.read()

In [25]:
# Split on punctuation, double dashes and whitespace; the capturing group makes re.split
# keep every delimiter as its own list entry (whitespace entries are dropped in the next cell).
output = re.split(r'([,.:;?_!"()\']|--|\s)',raw_text)

In [26]:
# Trim every piece and discard the ones that were only whitespace (or empty).
result = list(filter(None, map(str.strip, output)))

In [None]:
# Unique tokens in sorted order, followed by the two special tokens at the end.
all_tokens = sorted(set(result)) + ["<|endoftext|>","<|unk|>"]

In [29]:
# Map each token to its position in all_tokens.
vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [30]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed ``token -> id`` vocabulary.

    Text is cut on punctuation, double dashes and whitespace; every resulting
    piece must be a key of the vocabulary.
    """

    # Same delimiter set as the split that produced the vocabulary (including
    # ':' and ';'), so encode() cuts text into the same pieces the vocab holds.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
    # Punctuation that should attach to the preceding word when decoding.
    _REJOIN_PATTERN = r'\s+([,.:;?!"()\'])'

    def __init__(self,vocab):
        """Store ``vocab`` (token -> id) and build the inverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self,text:str):
        """Return the list of token ids for ``text``.

        Raises:
            KeyError: if a piece of ``text`` is not in the vocabulary.
        """
        preprocessed = re.split(self._SPLIT_PATTERN, text)
        # Drop the empty / whitespace-only pieces produced by the split.
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self,ids):
        """Return the text for ``ids``: tokens joined by single spaces, with the
        space in front of punctuation removed.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(self._REJOIN_PATTERN,r'\1',text)
        return text

In [31]:
tokenizer = SimpleTokenizerV1(vocab=vocab)

In [32]:
text = "At Carlo"

In [33]:
tokenizer.encode(text)

[18, 24]

In [34]:
ids = [18,24]

In [35]:
tokenizer.decode(ids)

'At Carlo'

In [36]:
tokenizer.encode('the')

[988]

In [41]:
# Tokenizer that maps out-of-vocabulary words to a dedicated unknown token.
class SimpleTokenizerV2:
    """Word-level tokenizer that tolerates words missing from the vocabulary.

    Works like ``SimpleTokenizerV1`` but any piece not found in the vocabulary
    is encoded as ``unk_token`` instead of raising.
    """

    # Same delimiter set as the split that produced the vocabulary (including
    # ':' and ';'), so encode() cuts text into the same pieces the vocab holds.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
    # Punctuation that should attach to the preceding word when decoding.
    _REJOIN_PATTERN = r'\s+([,.:;?!"()\'])'

    def __init__(self,vocab,unk_token="<|unk|>"):
        """Store ``vocab`` (token -> id) and build the inverse id -> token map.

        Args:
            vocab: mapping from token string to integer id.
            unk_token: token substituted for unknown pieces. The default
                matches the special token appended to the vocabulary in this
                notebook; it must be a key of ``vocab`` for the fallback to work.
        """
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
        self.unk_token = unk_token

    def encode(self,text):
        """Return the list of token ids for ``text``; unknown pieces get the
        id of ``unk_token``.

        Raises:
            KeyError: if an unknown piece is met and ``unk_token`` itself is
                not in the vocabulary.
        """
        preprocessed = re.split(self._SPLIT_PATTERN, text)
        # Drop the empty / whitespace-only pieces produced by the split.
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int else self.unk_token for item in preprocessed]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self,ids):
        """Return the text for ``ids``: tokens joined by single spaces, with the
        space in front of punctuation removed.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(self._REJOIN_PATTERN,r'\1',text)
        return text

In [42]:
tkr = SimpleTokenizerV2(vocab=vocab)

In [48]:
tkr.encode("<end>")

[1130]

In [49]:
tkr.decode([1130])

'<end>'

In [8]:
import tiktoken

In [9]:
tokenizer = tiktoken.get_encoding('gpt2')

In [57]:
tokenizer.encode("Hello world!")

[15496, 995, 0]

In [61]:
text = "Hello, do you like tea?  <|endoftext|> In the sunlit terrac"

In [63]:
tokenizer.encode(text=text,allowed_special={"<|endoftext|>"})

[15496,
 11,
 466,
 345,
 588,
 8887,
 30,
 220,
 220,
 50256,
 554,
 262,
 4252,
 18250,
 8812,
 330]

In [1]:
class GPTDataSetV1:
    """Sliding-window dataset of (input, target) token-id chunks.

    The text is encoded once with ``tokenizer``; windows of ``max_len`` ids are
    taken every ``stride`` positions, and each target chunk is its input chunk
    shifted one position to the right.
    """

    def __init__(self,txt,tokenizer,max_len,stride):
        self.tokenizer = tokenizer
        encoded = self.tokenizer.encode(txt)

        # Window starts: stop early enough that the shifted target chunk
        # still has max_len ids.
        starts = range(0, len(encoded) - max_len, stride)
        self.input_ids = [encoded[start:start + max_len] for start in starts]
        self.output_ids = [encoded[start + 1:start + max_len + 1] for start in starts]

    def __len__(self):
        """Number of windows."""
        return len(self.input_ids)

    def __getitem__(self,idx):
        """Return the ``(input_chunk, target_chunk)`` pair at ``idx``."""
        return self.input_ids[idx],self.output_ids[idx]


In [11]:
dataset = GPTDataSetV1(raw_text,tokenizer,4,1)

In [13]:
dataset[1]

([367, 2885, 1464, 1807], [2885, 1464, 1807, 3619])

In [1]:
# Embedding
from nn import Embedding

In [None]:
token_embedding = Embedding(tokenizer.n_vocab,3)

In [None]:
token_embedding([0])

array([[Tensor(-0.03955202574128494, grad_fn=<NoneBackward>),
        Tensor(-0.07492158717327732, grad_fn=<NoneBackward>),
        Tensor(1.783278845366513, grad_fn=<NoneBackward>)]], dtype=object)

In [24]:
# position embedding
# One embedding row per position index 0..context_len-1.
max_len = 256
context_len = max_len
# embedding_size=3 matches the width used for token_embedding earlier in the notebook
embedding_layer = Embedding(context_len,embedding_size=3)
# look up every position at once; the result has context_len entries (see len() below)
positional_embedding = embedding_layer(np.arange(context_len))

In [26]:
len(positional_embedding)

256

In [None]:
positional_embedding