In [52]:
import torch
import torch.nn as nn
import numpy as np

In [126]:
texts=["hola mundo cruel", "me llamo eduardo", "buenas noches a todos"]

tokens = []
vocabulary = set()
for text in texts:
    toks = text.split()
    tokens.append(toks)
    for w in toks:
        vocabulary.add(w)

print(tokens)

unk_token = "<UNK>"
pad_token = '<PAD>'
print(vocabulary)

print(f"Vocabulary length {len(vocabulary) + 2}")

[['hola', 'mundo', 'cruel'], ['me', 'llamo', 'eduardo'], ['buenas', 'noches', 'a', 'todos']]
{'cruel', 'todos', 'llamo', 'a', 'hola', 'eduardo', 'noches', 'me', 'buenas', 'mundo'}
Vocabulary length 12


In [127]:
word2id = {unk_token:0, pad_token:1}
id2word = {0:unk_token, 1:pad_token}

for i, w in enumerate(vocabulary,2):
    word2id[w] = i
    id2word[i] = w

print(word2id)
print(id2word)

vocabulary_size = len(word2id)
print(f"Vocabulary length {vocabulary_size}")

{'<UNK>': 0, '<PAD>': 1, 'cruel': 2, 'todos': 3, 'llamo': 4, 'a': 5, 'hola': 6, 'eduardo': 7, 'noches': 8, 'me': 9, 'buenas': 10, 'mundo': 11}
{0: '<UNK>', 1: '<PAD>', 2: 'cruel', 3: 'todos', 4: 'llamo', 5: 'a', 6: 'hola', 7: 'eduardo', 8: 'noches', 9: 'me', 10: 'buenas', 11: 'mundo'}
Vocabulary length 12


In [128]:
def numeralize(text):
    sent = []
    for w in text:
        if w in vocabulary:
            sent.append(word2id[w])
        else:
            sent.append(0)  # <UNK>
    return sent

In [129]:
text = ["hola", "lalo"]
print(numeralize(text))

[6, 0]


In [130]:
text_numbers = [numeralize(tok) for tok in tokens]
print(text_numbers)

[[6, 11, 2], [9, 4, 7], [10, 8, 5, 3]]


In [131]:
larger = 0
for text in text_numbers:
    if len(text) > larger:
        larger = len(text)

print(f"Largest text of {larger} words")

Largest text of 4 words


In [132]:
def pad_sentence(text):
    if len(text) < larger:
        diff = larger - len(text)
        for i in range(diff):
            text.append(1)  # add token "<PAD>"
    return text

In [133]:
padded_text = [pad_sentence(text) for text in text_numbers]
print(padded_text)

[[6, 11, 2, 1], [9, 4, 7, 1], [10, 8, 5, 3]]


In [134]:
def textify(text):
    words = []
    for number in text:
        words.append(id2word[number])
    return words

In [135]:
text_again = [textify(text) for text in padded_text]
print(text_again)

[['hola', 'mundo', 'cruel', '<PAD>'], ['me', 'llamo', 'eduardo', '<PAD>'], ['buenas', 'noches', 'a', 'todos']]


In [146]:
class Embedder(nn.Module):
    def __init__(self, vocab_size, emb_dim):
        super(Embedder, self).__init__()
        self.embed = nn.Embedding(vocab_size+1, emb_dim)    # +1 for the ids not starting with 0

    def forward(self, x):
        return self.embed(x)

    def print_weights(self):
        return self.embed.weight

In [147]:
embedding_dimension = 5

text1 = padded_text[0]
print(text1)
print(type(text1))

[6, 11, 2, 1]
<class 'list'>


In [148]:
batch = np.vstack(padded_text)
print(batch)
print(type(batch))
print(batch[0])
print(type(batch[0]))

[[ 6 11  2  1]
 [ 9  4  7  1]
 [10  8  5  3]]
<class 'numpy.ndarray'>
[ 6 11  2  1]
<class 'numpy.ndarray'>


In [149]:
input_batch = torch.tensor(batch, dtype=torch.long)
print(input_batch)

tensor([[ 6, 11,  2,  1],
        [ 9,  4,  7,  1],
        [10,  8,  5,  3]])


In [150]:
print(input_batch[0])
print(type(input_batch[0]))

tensor([ 6, 11,  2,  1])
<class 'torch.Tensor'>


In [151]:
embedder = Embedder(vocabulary_size, embedding_dimension)
print(embedder.print_weights())


Parameter containing:
tensor([[-1.5265e+00, -1.4884e+00,  5.3641e-01, -2.4606e-01,  4.5496e-01],
        [-7.7714e-01, -2.0969e+00,  1.7835e+00, -2.6246e+00,  4.4385e-02],
        [-9.0353e-01,  6.0791e-02, -7.1391e-01,  1.5493e+00,  2.6345e-03],
        [-3.7953e-01,  4.6506e-01, -2.4369e+00, -1.8846e-01,  2.1542e+00],
        [ 5.9098e-01,  3.7616e-01, -3.1290e-01,  5.6046e-01,  1.0438e-01],
        [ 1.2133e+00,  1.3757e+00,  1.1258e+00,  8.7801e-01, -1.1513e+00],
        [ 8.0069e-02, -1.1752e+00, -3.9491e-03,  1.1250e-03, -5.0092e-01],
        [-5.6224e-02, -5.7728e-01,  1.3235e-01, -1.6854e+00, -5.7394e-01],
        [-3.2366e-01,  7.3624e-01, -5.7591e-01,  4.0074e-01, -1.5145e+00],
        [-3.0589e-01, -1.0460e-01,  1.0548e+00,  8.8961e-01,  1.1981e+00],
        [ 1.0106e+00,  1.0115e+00,  3.5873e-01,  9.3127e-01,  1.6046e-01],
        [ 5.4267e-01, -1.4515e+00,  5.6311e-01,  3.6198e-01, -8.6320e-01],
        [ 8.8370e-01, -4.5666e-01,  8.0039e-01, -2.6311e+00,  2.1566e-01]],
  

In [152]:
print(embedder(input_batch))
print(embedder(input_batch).shape)


tensor([[[ 8.0069e-02, -1.1752e+00, -3.9491e-03,  1.1250e-03, -5.0092e-01],
         [ 5.4267e-01, -1.4515e+00,  5.6311e-01,  3.6198e-01, -8.6320e-01],
         [-9.0353e-01,  6.0791e-02, -7.1391e-01,  1.5493e+00,  2.6345e-03],
         [-7.7714e-01, -2.0969e+00,  1.7835e+00, -2.6246e+00,  4.4385e-02]],

        [[-3.0589e-01, -1.0460e-01,  1.0548e+00,  8.8961e-01,  1.1981e+00],
         [ 5.9098e-01,  3.7616e-01, -3.1290e-01,  5.6046e-01,  1.0438e-01],
         [-5.6224e-02, -5.7728e-01,  1.3235e-01, -1.6854e+00, -5.7394e-01],
         [-7.7714e-01, -2.0969e+00,  1.7835e+00, -2.6246e+00,  4.4385e-02]],

        [[ 1.0106e+00,  1.0115e+00,  3.5873e-01,  9.3127e-01,  1.6046e-01],
         [-3.2366e-01,  7.3624e-01, -5.7591e-01,  4.0074e-01, -1.5145e+00],
         [ 1.2133e+00,  1.3757e+00,  1.1258e+00,  8.7801e-01, -1.1513e+00],
         [-3.7953e-01,  4.6506e-01, -2.4369e+00, -1.8846e-01,  2.1542e+00]]],
       grad_fn=<EmbeddingBackward>)
torch.Size([3, 4, 5])


In [157]:
n_users = 6
n_items = 9

embedder_users = Embedder(n_users, embedding_dimension)
embedder_items = Embedder(n_items, embedding_dimension)

print("User embeddings")
print(embedder_users.print_weights())

print("Item embeddings")
print(embedder_items.print_weights())

User embeddings
Parameter containing:
tensor([[-0.8669,  0.2444,  0.0309,  0.7737, -1.3752],
        [ 0.5067,  0.2551,  0.7666, -0.6906,  0.1182],
        [-1.4574, -0.7139, -1.2903,  0.1250, -0.1761],
        [-0.7156, -0.6478,  0.4806,  1.3227, -0.5890],
        [-0.0481, -0.7023,  1.6830, -0.0741,  0.9336],
        [ 1.2916, -0.1991,  1.7233, -1.1685,  1.8439],
        [-0.9833,  2.5072,  1.3217,  2.6013,  0.0705]], requires_grad=True)
Item embeddings
Parameter containing:
tensor([[ 0.0167, -0.8347, -0.7065, -0.4941,  0.3440],
        [-0.0895, -1.9731, -0.1074, -0.2042, -0.9967],
        [ 0.0519, -0.4616,  1.5299, -1.5154,  0.2876],
        [-0.2978, -0.1996,  0.5468,  1.8366, -1.0105],
        [ 1.0886,  0.4457, -1.1799, -1.1166,  0.2687],
        [-2.8253,  0.4217,  0.4023, -0.6789, -0.5841],
        [ 0.0880,  0.9356,  1.3416,  0.2956, -0.2526],
        [-0.5941,  0.2293, -0.8367, -0.1605,  0.6026],
        [-1.5620,  0.9095, -0.5343,  0.8031, -1.0756],
        [-0.4596, -0.27

In [158]:
user_item = (1,1)
ratings = [4.5, 2.5, 5.0]
user_1 = torch.tensor(user_item[0], dtype=torch.long)
item_1 = torch.tensor(user_item[1], dtype=torch.long)

In [161]:
user_1_embs = embedder_users(user_1)
print(user_1_embs)
print(user_1_embs.shape)

tensor([ 0.5067,  0.2551,  0.7666, -0.6906,  0.1182],
       grad_fn=<EmbeddingBackward>)
torch.Size([5])


In [160]:
item_1_embs = embedder_items(item_1)
print(item_1_embs)

tensor([-0.0895, -1.9731, -0.1074, -0.2042, -0.9967],
       grad_fn=<EmbeddingBackward>)


## LSTM

Get first sentence

In [196]:
print(input_batch)
sentence1 = input_batch[0]

print(sentence1)

tensor([[ 6, 11,  2,  1],
        [ 9,  4,  7,  1],
        [10,  8,  5,  3]])
tensor([ 6, 11,  2,  1])


In [197]:
print(sentence1.shape)

torch.Size([4])


In [201]:
def prepare_sequence(seq):
    idxs = [word2id[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

input_ = prepare_sequence(["hola","mundo","cruel", "<PAD>"])
print(input_)
print(input_.shape)

tensor([ 6, 11,  2,  1])
torch.Size([4])


In [211]:
embed = nn.Embedding(vocabulary_size, embedding_dimension)
print(embed(input_))

tensor([[-1.1404e+00, -2.1343e+00,  1.2045e+00, -1.4209e-01,  4.3552e-04],
        [-1.7869e+00, -6.0799e-01, -2.6232e-01,  3.7224e-01,  4.5826e-02],
        [-1.0003e+00, -1.1581e+00,  4.7018e-01, -9.4365e-02, -1.1623e+00],
        [ 2.0146e+00, -4.0179e-01,  5.9712e-01,  4.2814e-01, -5.2815e-01]],
       grad_fn=<EmbeddingBackward>)


In [214]:
word_embeddings = nn.Embedding(vocabulary_size, embedding_dimension)
embeds = embedder(input_)
print(embeds)
print(embeds.shape)

tensor([[ 8.0069e-02, -1.1752e+00, -3.9491e-03,  1.1250e-03, -5.0092e-01],
        [ 5.4267e-01, -1.4515e+00,  5.6311e-01,  3.6198e-01, -8.6320e-01],
        [-9.0353e-01,  6.0791e-02, -7.1391e-01,  1.5493e+00,  2.6345e-03],
        [-7.7714e-01, -2.0969e+00,  1.7835e+00, -2.6246e+00,  4.4385e-02]],
       grad_fn=<EmbeddingBackward>)
torch.Size([4, 5])


In [226]:
hidden_dim = 7
lstm = nn.LSTM(embedding_dimension, hidden_dim)
print("Sentence size:",len(sentence1))
embeds_in = embeds.view(len(sentence1), 1, -1)
print(embeds_in.shape)

Sentence size: 4
torch.Size([4, 1, 5])


In [227]:
lstm_out, (hn, cn) = lstm(embeds_in)
print("lstm_out")
print(lstm_out)
print(lstm_out.shape)
print()
print("hn")
print(hn)
print(hn.shape)
print("cn")
print(cn)
print(cn.shape)

lstm_out
tensor([[[ 0.1788,  0.0853, -0.1030,  0.0252, -0.0884, -0.0841,  0.0955]],

        [[ 0.1753,  0.1843, -0.1282,  0.1058, -0.1759, -0.1136,  0.0295]],

        [[ 0.2085,  0.2587, -0.1476, -0.0880, -0.2581, -0.2253,  0.0561]],

        [[ 0.2998,  0.1784, -0.1303,  0.2102, -0.1331, -0.2268,  0.2363]]],
       grad_fn=<StackBackward>)
torch.Size([4, 1, 7])

hn
tensor([[[ 0.2998,  0.1784, -0.1303,  0.2102, -0.1331, -0.2268,  0.2363]]],
       grad_fn=<StackBackward>)
torch.Size([1, 1, 7])
cn
tensor([[[ 0.4707,  0.4994, -0.3381,  0.2950, -0.1699, -0.6178,  0.6249]]],
       grad_fn=<StackBackward>)
torch.Size([1, 1, 7])


# Using the whole batch

In [234]:
print(input_batch)
print(input_batch.shape)
embeds_batch_in = embedder(input_batch)
print(embeds_batch_in)
print(embeds_batch_in.shape)

tensor([[ 6, 11,  2,  1],
        [ 9,  4,  7,  1],
        [10,  8,  5,  3]])
torch.Size([3, 4])
tensor([[[ 8.0069e-02, -1.1752e+00, -3.9491e-03,  1.1250e-03, -5.0092e-01],
         [ 5.4267e-01, -1.4515e+00,  5.6311e-01,  3.6198e-01, -8.6320e-01],
         [-9.0353e-01,  6.0791e-02, -7.1391e-01,  1.5493e+00,  2.6345e-03],
         [-7.7714e-01, -2.0969e+00,  1.7835e+00, -2.6246e+00,  4.4385e-02]],

        [[-3.0589e-01, -1.0460e-01,  1.0548e+00,  8.8961e-01,  1.1981e+00],
         [ 5.9098e-01,  3.7616e-01, -3.1290e-01,  5.6046e-01,  1.0438e-01],
         [-5.6224e-02, -5.7728e-01,  1.3235e-01, -1.6854e+00, -5.7394e-01],
         [-7.7714e-01, -2.0969e+00,  1.7835e+00, -2.6246e+00,  4.4385e-02]],

        [[ 1.0106e+00,  1.0115e+00,  3.5873e-01,  9.3127e-01,  1.6046e-01],
         [-3.2366e-01,  7.3624e-01, -5.7591e-01,  4.0074e-01, -1.5145e+00],
         [ 1.2133e+00,  1.3757e+00,  1.1258e+00,  8.7801e-01, -1.1513e+00],
         [-3.7953e-01,  4.6506e-01, -2.4369e+00, -1.8846e-01, 

In [233]:
lstm_out, (hn, cn) = lstm(embeds_in)
print("lstm_out")
print(lstm_out)
print(lstm_out.shape)
print()
print("hn")
print(hn)
print(hn.shape)
print()
print("cn")
print(cn)
print(cn.shape)



lstm_out
tensor([[[ 0.1788,  0.0853, -0.1030,  0.0252, -0.0884, -0.0841,  0.0955],
         [ 0.1087,  0.1119, -0.1159,  0.0897, -0.1193, -0.0753,  0.0105],
         [ 0.1569,  0.1536, -0.0925, -0.1249, -0.1908, -0.1614,  0.0603],
         [ 0.1764,  0.0380, -0.0104,  0.3139, -0.0652, -0.0929,  0.2297]],

        [[ 0.1157,  0.1885, -0.1408, -0.1213, -0.2338, -0.2006,  0.0442],
         [ 0.1713,  0.0997, -0.1735, -0.0919, -0.0124, -0.0834,  0.0108],
         [ 0.3108,  0.0961, -0.0975, -0.0678, -0.0162, -0.0677,  0.2343],
         [ 0.2707,  0.0690, -0.0184,  0.4416, -0.1180, -0.1508,  0.2537]],

        [[ 0.0876,  0.0994, -0.1824, -0.1794, -0.0106, -0.1126, -0.1050],
         [ 0.1705,  0.0717, -0.1187, -0.0973,  0.0053,  0.0831,  0.0347],
         [ 0.0099,  0.0037, -0.1273, -0.0926,  0.1023,  0.0157, -0.0056],
         [ 0.5047,  0.0522, -0.0125, -0.0197, -0.0388, -0.1787,  0.5678]]],
       grad_fn=<StackBackward>)
torch.Size([3, 4, 7])

hn
tensor([[[ 0.0876,  0.0994, -0.1824, -0