### Implementing neural network model using Bigram data

In [1]:
import torch
import matplotlib.pyplot as plt

# Loading Data
- This data consists of names and is taken from Andrej Karpathy's makemore playlist.

In [None]:
# Read one name per line. The context manager closes the file handle as soon as
# the read finishes, and the explicit encoding keeps decoding independent of
# the platform default.
with open("names.txt", encoding="utf-8") as names_file:
    names = names_file.read().splitlines()

In [3]:
names[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

### Collecting the sorted unique characters from the list of names


In [4]:
# Vocabulary: every distinct character that appears in any name, in sorted order
chars = sorted({ch for name in names for ch in name})

### Mapping each unique character to an index
#### '.' indicates start/end of a name


In [5]:
# Characters get indices 1..len(chars) in sorted order; index 0 is reserved for
# the '.' marker that stands for both the start and the end of a name.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

### Mapping integers(index above) to characters

In [6]:
# Reverse lookup (index -> character), built by flipping each (char, index) pair of stoi
itos = dict((index, ch) for ch, index in stoi.items())
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

# Creating Dataset for Neural Network

- Here we will check for first name emma

In [7]:
# Training pairs for the bigram model:
#   xs -> index of the preceding character (input)
#   ys -> index of the character that follows it (label)
xs, ys = [], []

# Restrict to the first name only so every pair can be inspected by eye
for name in names[:1]:
    # '.' pads both ends so start-of-name and end-of-name become bigrams too
    padded = ['.'] + list(name) + ['.']
    for prev_ch, next_ch in zip(padded, padded[1:]):
        prev_ix, next_ix = stoi[prev_ch], stoi[next_ch]
        print(prev_ch, next_ch)
        xs.append(prev_ix)
        ys.append(next_ix)

# Turn the Python lists into integer tensors
xs = torch.tensor(xs)
ys = torch.tensor(ys)

. e
e m
m m
m a
a .


#### X tensor consists of Preceding characters in the name
#### Y tensor consists of corresponding target/next character

In [8]:
# Preceding characters
xs

tensor([ 0,  5, 13, 13,  1])

In [9]:
# Labels for xs() - Target
ys

tensor([ 5, 13, 13,  1,  0])

### One hot encoding
- Using OHE -> an array of size 27 (equal to the vocab size), such that the index with value 1 indicates the input character
- Ex => e -> [0, 0, 0, 0, 0, 1, 0, 0,..............0]

In [10]:
import torch.nn.functional as F

In [11]:
# One-hot encode every input index in xs: each row has a single 1 at the
# position of the input character (27 classes = 26 letters + the '.' marker).
# F.one_hot returns an integer tensor, so cast it to float32: the matrix
# multiplication with the float weight matrix needs a float dtype.
x_enc = F.one_hot(xs, num_classes=27).float()
x_enc

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [12]:
x_enc.shape

torch.Size([5, 27])

In [13]:
x_enc.dtype

torch.float32

# Creating Neural Network

In [14]:

# A single linear layer: 27 inputs (one-hot characters) feeding 27 neurons.
# (A lone neuron would be torch.randn((27, 1)) and produce a (5, 1) output.)
W = torch.randn((27, 27))

# Shapes: (5, 27) @ (27, 27) -> (5, 27), one row of 27 scores per input character.
# The scores are read as log-counts: exponentiating them yields positive
# "counts", the neural-net analogue of the N matrix in the counting-based
# bigram model.
logits = x_enc @ W
counts = torch.exp(logits)

# Normalise each row so it sums to 1; exp followed by normalisation is softmax,
# i.e. probs == torch.softmax(x_enc @ W, dim=1)
row_totals = counts.sum(dim=1, keepdim=True)
probs = counts / row_totals
probs

tensor([[0.0456, 0.0111, 0.0075, 0.1967, 0.0139, 0.0028, 0.0722, 0.0170, 0.1018,
         0.0150, 0.0578, 0.0101, 0.0223, 0.0098, 0.0222, 0.0389, 0.0054, 0.0123,
         0.0219, 0.0043, 0.0378, 0.0072, 0.1458, 0.0089, 0.0116, 0.0969, 0.0032],
        [0.0552, 0.0287, 0.0577, 0.0028, 0.0097, 0.0162, 0.0087, 0.1396, 0.0682,
         0.0889, 0.1469, 0.0168, 0.0457, 0.0352, 0.0125, 0.0380, 0.0120, 0.0221,
         0.0065, 0.0280, 0.0113, 0.0344, 0.0118, 0.0158, 0.0034, 0.0525, 0.0313],
        [0.1147, 0.0707, 0.0142, 0.0128, 0.0733, 0.0134, 0.0374, 0.0028, 0.0156,
         0.0709, 0.0171, 0.0044, 0.1151, 0.0352, 0.0074, 0.0043, 0.2330, 0.0166,
         0.0153, 0.0096, 0.0140, 0.0052, 0.0132, 0.0444, 0.0255, 0.0108, 0.0031],
        [0.1147, 0.0707, 0.0142, 0.0128, 0.0733, 0.0134, 0.0374, 0.0028, 0.0156,
         0.0709, 0.0171, 0.0044, 0.1151, 0.0352, 0.0074, 0.0043, 0.2330, 0.0166,
         0.0153, 0.0096, 0.0140, 0.0052, 0.0132, 0.0444, 0.0255, 0.0108, 0.0031],
        [0.0845, 0.0187,

# Summary
- Here W plays the role of the embedding table C: row i holds the vector for the i-th character of the vocabulary
- `x_enc @ W` is the lookup: multiplying a one-hot row by W selects the matching row of W

#### x_enc @ W

    - . -> [............d dimension vector................]  
    - e -> [............d dimension vector................]  
    - m -> [............d dimension vector................]  
    - m -> [............d dimension vector................]  
    - a -> [............d dimension vector................]  

- here d = dimension of word vector
-------------
### This is nothing but Softmax
```
logits = x_enc @ W  # Log-counts
counts = logits.exp()  # Equivalent to the N matrix (bigram counts in the n-gram model)
counts
```

- We take the exponential of `x_enc @ W` because we want probabilities.
- `(x_enc @ W)` outputs arbitrary real numbers (negative, positive, anything).
- Exponentiating makes every value positive; dividing each row by its sum then gives probabilities that add up to 1.

In [15]:
# Element [3, 13] of the matrix product is the dot product of row 3 of x_enc
# with COLUMN 13 of W, so the manual version must index W[:, 13].
# (W[13] would pick row 13 and give an unrelated number.)
matmul_elem = (x_enc @ W)[3, 13]
manual_elem = (x_enc[3] * W[:, 13]).sum()

# Both routes must agree
assert torch.isclose(matmul_elem, manual_elem)
matmul_elem, manual_elem

tensor(0.6729)

In [16]:
# Each row of probs is a probability distribution over the 27 possible next
# characters, so its entries add up to 1.
#
# Row 0 is the network's output for the first input, '.' (start of name):
# the probabilities it assigns to each candidate for the next character.
print(probs[0])
print(probs[0].shape)
probs[0].sum()

tensor([0.0456, 0.0111, 0.0075, 0.1967, 0.0139, 0.0028, 0.0722, 0.0170, 0.1018,
        0.0150, 0.0578, 0.0101, 0.0223, 0.0098, 0.0222, 0.0389, 0.0054, 0.0123,
        0.0219, 0.0043, 0.0378, 0.0072, 0.1458, 0.0089, 0.0116, 0.0969, 0.0032])
torch.Size([27])


tensor(1.)

In [17]:
probs[0, 3]

tensor(0.1967)

# Loss function
- We need to check how well our 1-layer neural network performs with randomly initialized weights


In [18]:
# Walk through every bigram, print what the network predicted for it, and
# collect its negative log likelihood (NLL). The mean NLL is the loss.
# The example count is taken from xs instead of being hard-coded.
num_examples = xs.nelement()
nlls = torch.zeros(num_examples)

for i in range(num_examples):
    # i-th bigram
    x = xs[i].item()    # input character index
    y = ys[i].item()    # label character index

    # The indexes are comma-separated so that e.g. (5, 13) is not printed as "513"
    print(f"bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x},{y})")
    print("input to neural net: ", x)
    print("output probabilities from the neural net: ", probs[i])
    print("label(actual character): ", y)

    # Probability the network assigned to the character that actually came next
    p = probs[i, y]
    print("probability assigned by nn to the correct next charcater: ", p.item())
    logp = torch.log(p)

    print("Log likelihood: ", logp.item())
    # Negate so that a better prediction gives a smaller (non-negative) value
    nll = -logp
    print("Negative log likelihood: ", nll.item())
    nlls[i] = nll

print("=============")
print("avg negative log likelihood, i.e loss= ", nlls.mean().item())
    


bigram example 1: .e (indexes 05)
input to neural net:  0
output probabilities from the neural net:  tensor([0.0456, 0.0111, 0.0075, 0.1967, 0.0139, 0.0028, 0.0722, 0.0170, 0.1018,
        0.0150, 0.0578, 0.0101, 0.0223, 0.0098, 0.0222, 0.0389, 0.0054, 0.0123,
        0.0219, 0.0043, 0.0378, 0.0072, 0.1458, 0.0089, 0.0116, 0.0969, 0.0032])
label(actual character):  5
probability assigned by nn to the correct next charcater:  0.0027786234859377146
Log likelihood:  -5.885799407958984
Negative log likelihood:  5.885799407958984
bigram example 2: em (indexes 513)
input to neural net:  5
output probabilities from the neural net:  tensor([0.0552, 0.0287, 0.0577, 0.0028, 0.0097, 0.0162, 0.0087, 0.1396, 0.0682,
        0.0889, 0.1469, 0.0168, 0.0457, 0.0352, 0.0125, 0.0380, 0.0120, 0.0221,
        0.0065, 0.0280, 0.0113, 0.0344, 0.0118, 0.0158, 0.0034, 0.0525, 0.0313])
label(actual character):  13
probability assigned by nn to the correct next charcater:  0.035170070827007294
Log likelihood:  

### Understanding likelihood

- The probabilites in probs are nothing but P(yi|xi), probability of y such that x has already happened 
- For the neural network, we want the probability assigned to the actually observed next character to be high.

- Now for finding this probs we used weight W. Thus ultimately the probabilities depends on the weights  
Likelihood is defined as ->
- Likelihood is how probable the observed labels are, given the inputs, as a function of the model parameters W


- Since the observed data (xi,yi) is fixed, and the probabilities vary as we change  W, we view this quantity as a function of the parameters.

``` 
Comparing with Probability. 
Probability asks: how likely is this data?
Likelihood asks: which parameters make this data likely?
```

# Backpropagation

- We will use back propagation to update the parameters in order to minimize the loss i.e Maximize log likelihood.
- we need high likelihood. high likelihood means our parameters are good. -> Maximize likelihood  
- so because gradient descent minimizes the objective we take negative log likelihood

In [19]:
# Vectorised loss: for every example i pick probs[i, ys[i]] (the probability
# given to the true next character), then average the negative logs.
# The row index range follows the number of labels instead of a hard-coded 5.
loss = -probs[torch.arange(ys.nelement()), ys].log().mean()
loss

tensor(3.5401)

#### What are gradients and what do they tell us?

- If you increase W a tiny bit in this direction, the loss will increase this much

#### Why not subtract the gradient directly?

- Because the gradient tells us the direction, not the step size; the learning rate scales the gradient into an actual step.

In [20]:
# Build the full training set: one (preceding char, next char) index pair for
# every bigram of every name.
#   xs -> inputs (index of the preceding character)
#   ys -> labels (index of the character that follows)
xs, ys = [], []

for name in names:
    # '.' on both sides marks the start and the end of the name
    padded = ['.'] + list(name) + ['.']
    for prev_ch, next_ch in zip(padded, padded[1:]):
        prev_ix, next_ix = stoi[prev_ch], stoi[next_ch]
        xs.append(prev_ix)
        ys.append(next_ix)

# Turn the Python lists into integer tensors
xs = torch.tensor(xs)
ys = torch.tensor(ys)

num = xs.nelement()
print("Number of examples: ", num)

# Initialise the network: 27 neurons, each receiving 27 inputs. The seeded
# generator makes the starting weights reproducible; requires_grad=True lets
# autograd compute the gradient of the loss with respect to W.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

Number of examples:  228146


In [21]:
# Gradient descent on the average negative log likelihood
NUM_STEPS = 50
LEARNING_RATE = 50.0

# The inputs never change between steps, so one-hot encode them once up front
# instead of rebuilding the (num, 27) tensor on every iteration.
x_enc = F.one_hot(xs, num_classes=27).float()  # input to the neural network

for k in range(NUM_STEPS):
    # Forward pass
    logits = x_enc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)  # probabilities of the next character
    loss = -probs[torch.arange(num), ys].log().mean()
    print(loss.item())

    # Backward pass: clear the old gradient, then backpropagate the loss
    W.grad = None
    loss.backward()

    # Parameter update. torch.no_grad() keeps the in-place step out of the
    # autograd graph without going through the discouraged `.data` attribute.
    with torch.no_grad():
        W -= LEARNING_RATE * W.grad

3.758953094482422
3.371100664138794
3.154043197631836
3.020373821258545
2.927711248397827
2.8604023456573486
2.8097290992736816
2.7701022624969482
2.7380731105804443
2.711496353149414
2.6890032291412354
2.6696884632110596
2.6529300212860107
2.638277292251587
2.6253879070281982
2.613990545272827
2.60386323928833
2.5948216915130615
2.5867116451263428
2.5794036388397217
2.572789192199707
2.5667762756347656
2.5612881183624268
2.5562589168548584
2.551633596420288
2.547366142272949
2.543415069580078
2.5397486686706543
2.536336660385132
2.533154249191284
2.5301806926727295
2.5273966789245605
2.5247862339019775
2.522334575653076
2.520029067993164
2.517857789993286
2.515810489654541
2.513878345489502
2.512052059173584
2.510324001312256
2.5086867809295654
2.5071346759796143
2.5056614875793457
2.504261016845703
2.5029289722442627
2.5016613006591797
2.5004520416259766
2.4992988109588623
2.498197317123413
2.497144937515259


### Sampling from our neural network model

In [22]:
g = torch.Generator().manual_seed(2147483647)

In [23]:
# Generate 5 names by repeatedly sampling the next character from the model.
# W has requires_grad=True, so the forward passes are wrapped in no_grad():
# sampling is pure inference and should not build autograd graphs.
with torch.no_grad():
    for i in range(5):
        out = []
        ix = 0  # start from the '.' (start-of-name) marker

        while True:
            # Forward pass for the current character only
            x_enc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
            logits = x_enc @ W
            counts = logits.exp()
            p = counts / counts.sum(1, keepdim=True)

            # Draw the next character index from the predicted distribution
            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            out.append(itos[ix])
            # Index 0 is '.', which here means end of name
            if ix == 0:
                break
        print(''.join(out))

cexze.
mogllurailezityha.
konimittain.
llayn.
ka.
