In [237]:
import torch.nn.functional as F

In [None]:
# Load file
names = open("names.txt").read().split("\n")

In [53]:
# Extract bigrams and loop through them
def getCharPairs(name):
    pairs = [(".", name[0])]
    for i in range(len(name)-1):
        pairs.append([name[i], name[i+1]])
    pairs.append([n[len(n)-1], "."])
    return pairs

pairCount = dict()
for n in names:
    charPairs = getCharPairs(n)
    charPairs.insert(0, [".", n[0]])
    charPairs.append([n[len(n)-1], "."])

    for pairs in charPairs:
        p = "".join(pairs)
        pairCount[p] =  pairCount.get(p, 0) + 1

print(sorted(pairCount.items(), key=lambda kv:-kv[1])[:10])

[('n.', 13526), ('a.', 13280), ('.a', 8820), ('e.', 7966), ('.k', 5926), ('an', 5438), ('.m', 5076), ('i.', 4978), ('.j', 4844), ('h.', 4818)]


In [54]:
# Construct probablity matrix of all character pairs
import torch
import numpy

N = torch.ones(28, 28, dtype=torch.int)
alphabet = list('.abcdefghijklmnopqrstuvwxyz')
s2i = {c: i for i, c in enumerate(alphabet)}
i2s = {i: c for i, c in enumerate(alphabet)}

for n in names:
    for pairs in getCharPairs(n):
        row = s2i[pairs[0]]
        col = s2i[pairs[1]]
        N[row, col] += 1

In [322]:
# Understand broadcasting rules
# Initialize with ones to smooth the model 
P = (N+1).float()
P /= P.sum(1, keepdim=True)

# Model inference with multinomial picking(?)
for i in range(3):
    dream_word = ["."]
    while True:
        ix = s2i[dream_word[-1]]
        p = torch.multinomial(P[ix], num_samples=1)
        dream_word.append(i2s[p.item()])
        if (i2s[p.item()] == '.'):
            break
    print("".join(dream_word))

.da.
.amnn.
.sen.


In [65]:
# Calculate negative log likelihood
total_longprob = 0
count = 0
for n in ["je"]:
    charPairs = getCharPairs(n)
    charPairs.insert(0, [".", n[0]])
    charPairs.append([n[len(n)-1], "."])
    for p in charPairs:
        ix1, ix2 = s2i[p[0]], s2i[p[1]]
        cp = "".join(p)
        prob = P[ix1][ix2]
        logprob = torch.log(prob)
        total_longprob += logprob
        count += 1
        
        # print(f'{cp}: {prob:.4f} - {logprob:.4f}')

print(f'{total_longprob.item():.4f}')
print(f'{-total_longprob.item()/count:.4f}')

-10.3402
2.0680


In [354]:
# Create dataset
xs, ys = [], []
for n in names:
    for p in getCharPairs(n):
        ix1, ix2 = s2i[p[0]], s2i[p[1]]
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
nums = xs.nelement()

# Initialize W
W = torch.randn((27, 27), requires_grad=True) # (5,27)) @ (27,1)

In [355]:
# Gradient descent
for k in range(100):
    # Forward pass
    # Softmax (converts [0..N] to [0..1] that adds to 1
    xenc = F.one_hot(xs, num_classes=27).float() 
    logits = (xenc @ W) 
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(nums), ys].log().mean() + 1*(W**2).mean()

    # Backward backpropagation
    W.grad = None # zero grad
    loss.backward()
    
    # Update
    W.data += -50 * W.grad
    print(loss.item())

5.058008193969727
4.105398654937744
3.6066832542419434
3.3390486240386963
3.1814465522766113
3.084040641784668
3.021763324737549
2.980778455734253
2.9532668590545654
2.9345521926879883
2.9217028617858887
2.912818431854248
2.9066426753997803
2.9023311138153076
2.8993091583251953
2.897184133529663
2.895685911178589
2.8946261405944824
2.8938753604888916
2.893341541290283
2.8929622173309326
2.892690896987915
2.8924977779388428
2.8923592567443848
2.8922600746154785
2.892188310623169
2.892137050628662
2.8921003341674805
2.892073631286621
2.892054557800293
2.892040729522705
2.8920304775238037
2.8920233249664307
2.8920178413391113
2.892014265060425
2.8920114040374756
2.8920094966888428
2.892007827758789
2.8920068740844727
2.8920061588287354
2.8920059204101562
2.89200496673584
2.8920044898986816
2.8920044898986816
2.8920044898986816
2.8920042514801025
2.8920042514801025
2.8920040130615234
2.8920040130615234
2.8920042514801025
2.8920040130615234
2.8920042514801025
2.8920042514801025
2.8920042514

In [350]:
# Model inference with multinomial picking with new trained probs
for i in range(3):
    dream_word = ["."]
    while True:
        ix = s2i[dream_word[-1]]
        p = torch.multinomial(probs[ix], num_samples=1)
        dream_word.append(i2s[p.item()])
        if (i2s[p.item()] == '.'):
            break
    print("".join(dream_word))

# still weird lol 

.oxrerea.
.nayjrhmniejtrtrhs.
.aecalpknetejkciedykhnwla.


In [326]:
# Figure out the negative log likelihood
nlls = torch.zeros(5)
for i in range(5):
    xi, yi = xs[i].item(), ys[i].item()
    xc, yc = i2s[xi], i2s[yi]
    prob = probs[i, yi]
    logp = torch.log(prob)
    nlls[i] = -logp
    print(f"input: {xi} ({xc}), label: {yi} ({yc})")
    print(f"  prob for label ({yc}) given input ({xc}): {prob:.4f}")
    print(f"  nll: {-logp}")
print(f"Avg NLL: {nlls.sum()/5} - Mean: {nlls.mean()}")

input: 0 (.), label: 5 (e)
  prob for label (e) given input (.): 0.0478
  nll: 3.0412001609802246
input: 5 (e), label: 13 (m)
  prob for label (m) given input (e): 0.0376
  nll: 3.2797460556030273
input: 13 (m), label: 13 (m)
  prob for label (m) given input (m): 0.0246
  nll: 3.7051374912261963
input: 13 (m), label: 1 (a)
  prob for label (a) given input (m): 0.3893
  nll: 0.9433497190475464
input: 1 (a), label: 0 (.)
  prob for label (.) given input (a): 0.1960
  nll: 1.6298680305480957
Avg NLL: 2.51986026763916 - Mean: 2.51986026763916
