In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the whole names file. The context manager closes the file handle as soon
# as the text has been read, instead of leaving it open until garbage collection.
with open("names.txt", "r") as names_file:
    names_text = names_file.read()

# One name per line; preview the first five.
words = names_text.splitlines()
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [3]:
# Number of lines (names) read from names.txt.
len(words)

32033

In [4]:
# Vocabulary: '.' is pinned to index 0, followed by every other character that
# occurs in the words, in sorted order.
chars = ['.', *sorted(set("".join(words)) - {'.'})]

# Lookup tables in both directions: index -> character and character -> index.
itoc = dict(enumerate(chars))
ctoi = {c: i for i, c in itoc.items()}

VOCABULARY_SIZE = len(chars)
VOCABULARY_SIZE

27

In [5]:
CONTEXT_LENGTH = 3  # how many preceding characters are used to predict the next one

# Build (context, next character) pairs by sliding a window of CONTEXT_LENGTH
# indices over each word. X collects the contexts and Y the index of the
# character that follows each context.
# For example, "abc" -> "d" is stored as [1, 2, 3] in X and 4 in Y
# (index 0 is '.', so 'a' is 1 when the names contain only lowercase letters).
# NOTE(review): the inner loop stops at the last letter of each word, so no pair
# with '.' as the target (e.g. "mma" -> ".") is produced -- confirm this is intended.
X, Y = [], []
for word in words[:4]:  # only the first four words, so the printed trace stays short
    context = [0] * CONTEXT_LENGTH  # start from a window filled with index 0 ('.')
    for c in word:
        idx = ctoi[c]
        print("".join(map(itoc.__getitem__, context)), "----->", itoc[idx])
        X.append(context)
        Y.append(idx)
        context = [*context[1:], idx]  # drop the oldest index, append the newest

# Convert the collected Python lists to tensors.
X = torch.tensor(X)
Y = torch.tensor(Y)
X.shape

... -----> e
..e -----> m
.em -----> m
emm -----> a
... -----> o
..o -----> l
.ol -----> i
oli -----> v
liv -----> i
ivi -----> a
... -----> a
..a -----> v
.av -----> a
... -----> i
..i -----> s
.is -----> a
isa -----> b
sab -----> e
abe -----> l
bel -----> l
ell -----> a


torch.Size([21, 3])

In [6]:
EMBEDDING_DIMS = 2  # length of the vector that represents each character
SEED = 2147483647

# Seed PyTorch's global RNG so that this and every later random initialisation
# produces the same values on a Restart & Run All.
torch.manual_seed(SEED)

# Embedding matrix: one row of EMBEDDING_DIMS random values per vocabulary entry.
C = torch.randn((VOCABULARY_SIZE, EMBEDDING_DIMS))
C

tensor([[-0.3024, -0.8402],
        [-0.0551,  0.0492],
        [ 0.2705,  1.1926],
        [-0.8250, -0.7439],
        [-0.5529, -1.2632],
        [ 0.5740,  1.4295],
        [-0.0167, -0.0432],
        [-1.1471, -0.3239],
        [ 0.8151,  1.8481],
        [-1.7194,  0.1027],
        [ 0.5936, -0.8347],
        [-0.6246, -0.0218],
        [ 1.2801, -0.6115],
        [ 0.4500, -0.9158],
        [-1.3761, -0.9120],
        [-0.6558,  1.1545],
        [ 0.5499, -0.1778],
        [-1.3890,  1.2681],
        [-0.6789, -0.2194],
        [-0.0587,  1.3815],
        [-1.0874,  0.5413],
        [-0.1611,  0.5246],
        [-0.5200,  1.1595],
        [-0.0278, -0.1751],
        [ 0.4775, -0.6600],
        [ 0.0621, -0.0870],
        [-0.1877, -0.7860]])

In [7]:
# How to embed the letter 'c'? Look up its index, then take that row of C.
idx = ctoi['c']
C[idx]

tensor([-0.8250, -0.7439])

In [8]:
# How to embed a whole context? Indices [0, 1, 2] spell '.ab' here, because
# index 0 is '.' and the letters start at index 1.
context = [0, 1, 2]
# Index with the flat list of row indices: one row of C per entry, shape (3, EMBEDDING_DIMS).
# A nested list (C[[context]]) only gives this result through legacy handling of
# "list of sequences" as a tuple index, which NumPy has removed and recent PyTorch deprecates.
C[context]

tensor([[-0.3024, -0.8402],
        [-0.0551,  0.0492],
        [ 0.2705,  1.1926]])

In [9]:
# How to embed several contexts simultaneously? The rows [0, 1, 2] and [1, 2, 3]
# are '.ab' and 'abc' (index 0 is '.', so 'a' is 1). Indexing C with a 2-D index
# tensor returns one block of embedding rows per context.
contexts = torch.tensor([[0, 1, 2], [1, 2, 3]])
C[contexts]

tensor([[[-0.3024, -0.8402],
         [-0.0551,  0.0492],
         [ 0.2705,  1.1926]],

        [[-0.0551,  0.0492],
         [ 0.2705,  1.1926],
         [-0.8250, -0.7439]]])

In [10]:
# How to embed all contexts? Indexing C with the whole X tensor looks up a row of C
# for every index in X, giving shape (len(X), CONTEXT_LENGTH, EMBEDDING_DIMS).
embedding = C[X]
embedding.shape

torch.Size([21, 3, 2])

In [None]:
# Flatten each example's CONTEXT_LENGTH embedding vectors into a single row:
# (len(X), CONTEXT_LENGTH, EMBEDDING_DIMS) -> (len(X), CONTEXT_LENGTH * EMBEDDING_DIMS),
# so that every context is represented by one input vector.
embedding = embedding.reshape(X.shape[0], CONTEXT_LENGTH * EMBEDDING_DIMS)
embedding.shape

torch.Size([21, 6])

In [None]:
HIDDEN_DIMS = 100  # number of columns of W1 / entries of b1

# Randomly initialised parameters with gradient tracking enabled. W1 has one row
# per value of a flattened context (CONTEXT_LENGTH * EMBEDDING_DIMS) and
# HIDDEN_DIMS columns; b1 has HIDDEN_DIMS entries (presumably the first layer's
# weights and bias -- their use is not shown in this part of the notebook).
W1 = torch.randn(CONTEXT_LENGTH * EMBEDDING_DIMS, HIDDEN_DIMS, requires_grad=True)
b1 = torch.randn(HIDDEN_DIMS, requires_grad=True)