<a href="https://colab.research.google.com/github/jdasam/mas1004/blob/2024/live_coding/5_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# download data
!wget https://archive.ics.uci.edu/static/public/591/gender+by+name.zip
!unzip gender+by+name.zip

--2024-12-03 06:05:56--  https://archive.ics.uci.edu/static/public/591/gender+by+name.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘gender+by+name.zip’

gender+by+name.zip      [<=>                 ]       0  --.-KB/s               gender+by+name.zip      [ <=>                ]   3.60M  --.-KB/s    in 0.08s   

2024-12-03 06:05:56 (46.2 MB/s) - ‘gender+by+name.zip’ saved [3774735]

Archive:  gender+by+name.zip
 extracting: name_gender_dataset.csv  


In [2]:
import pandas as pd
# Load the raw table and keep the first row for every distinct name, so each
# name appears once with a single gender label.
df = pd.read_csv('name_gender_dataset.csv')
unique_gender_df = df.drop_duplicates(subset='Name')
names = unique_gender_df['Name'].to_numpy()
genders = unique_gender_df['Gender'].to_numpy()

# names.tolist()


- `nn.Linear()`: $\mathbf{Wx} + \mathbf{b}$
  - $\mathbf{x} \in \mathbb{R}^d$

- `RNN`
  - $h_t = \tanh(W_{xh}x_t + W_{hh}h_{t-1} + b)$
  - In PyTorch's `nn.RNN` each linear map has its own bias: $h_t = \tanh(W_{xh}x_t + b_x + W_{hh}h_{t-1} + b_h)$

In [3]:
# Building Recurrent Neural Network
import torch
previous_hidden_state = torch.randn(7).tanh()
current_input = torch.randn(5)

print(previous_hidden_state, current_input)

tensor([ 0.3173,  0.9449,  0.6065, -0.6072,  0.1983,  0.8677, -0.6661]) tensor([-0.3024, -0.3168,  0.6409,  0.9277, -1.4482])


In [4]:
# Make longer input
number_of_tokens = 9
token_embedding_size = 5

input_sequence = torch.randn((number_of_tokens, token_embedding_size))
input_sequence.shape

torch.Size([9, 5])

In [5]:
for cur_input in input_sequence:
  print(cur_input)
# for i in range(len(input_sequence)):
#   print(input_sequence[i])

tensor([-0.4678,  1.4039,  1.2724,  0.7242,  0.1369])
tensor([ 1.8653, -0.2876, -0.4275,  0.2890,  0.3136])
tensor([ 1.7496,  0.3859, -0.4404,  2.4725, -1.4289])
tensor([-0.3190, -0.5844, -0.2130, -0.7306,  0.1002])
tensor([ 0.0845,  1.6278, -1.0607, -0.7950, -1.4101])
tensor([-0.6788,  1.0926, -0.7951, -0.4779, -0.1811])
tensor([ 0.2109, -0.6911,  3.0780,  0.1557,  0.1850])
tensor([ 0.0059, -0.3190, -2.0030,  0.2687, -1.2978])
tensor([ 0.1745, -1.4658,  0.2853, -1.4561, -0.6562])


In [87]:
import torch.nn as nn

class MyRNN(nn.Module):
  """Minimal Elman-style RNN: h_t = tanh(W_xh x_t + b + W_hh h_{t-1}).

  Args:
    input_dim: size of each input vector x_t.
    output_dim: size of the hidden state h_t (also the size of each output).
  """

  def __init__(self, input_dim, output_dim):
    super().__init__() # init nn.Module
    self.xh = nn.Linear(input_dim, output_dim)
    # A single bias is enough: the one in `xh` already shifts the pre-activation.
    self.hh = nn.Linear(output_dim, output_dim, bias=False)
    self.hidden_size = output_dim

  def run_one_step(self, current_input, previous_output):
    """Advance the recurrence by one step and return the new hidden state."""
    return torch.tanh(self.xh(current_input) + self.hh(previous_output))

  def run_sequence(self, input_sequence, last_hidden_state=None):
    """Run the recurrence over `input_sequence`, iterating its first dimension.

    Args:
      input_sequence: iterable of input vectors, e.g. a tensor whose first
        dimension is time and whose last dimension is `input_dim`.
      last_hidden_state: hidden state to start from; an all-zero state is used
        when omitted.

    Returns:
      The hidden states of every step stacked along a new first dimension.
    """
    if last_hidden_state is None:
      # Create the zeros from the module's own weights so the initial state
      # has the module's dtype and device (plain torch.zeros is float32 on
      # CPU and fails after .double() / .to(device)).
      last_hidden_state = self.hh.weight.new_zeros(self.hidden_size)

    outputs = []
    for cur_input in input_sequence:
      last_hidden_state = self.run_one_step(cur_input, last_hidden_state)
      outputs.append(last_hidden_state)

    return torch.stack(outputs)

  def forward(self, input_sequence, last_hidden_state=None):
    """Same as `run_sequence`, so the module can be called as `rnn(x)`."""
    return self.run_sequence(input_sequence, last_hidden_state)


rnn = MyRNN(input_dim=5, output_dim=7)
# rnn.run_one_step(current_input, previous_hidden_state)
rnn.run_sequence(input_sequence)

tensor([[     0.8006,     -0.5977,     -0.5627,      0.5353,     -0.5174,
             -0.8820,      0.3132],
        [    -0.5037,      0.7343,      0.8435,      0.3191,      0.0282,
              0.1454,     -0.1692],
        [     0.7459,     -0.1039,     -0.0273,      0.3499,      0.6929,
              0.0781,     -0.5515],
        [     0.3387,     -0.4092,     -0.1687,     -0.1845,     -0.2116,
              0.2101,      0.1751],
        [     0.3223,     -0.3914,     -0.7014,      0.6341,      0.2526,
              0.4638,      0.1503],
        [     0.4014,     -0.5127,     -0.6060,      0.2649,     -0.0704,
             -0.1180,     -0.0504],
        [     0.7136,      0.0855,      0.3885,     -0.0006,     -0.8240,
             -0.6849,      0.8029],
        [    -0.1088,     -0.1182,      0.1990,     -0.1582,      0.5275,
              0.7881,      0.0888],
        [     0.1171,     -0.1719,      0.2706,     -0.6426,     -0.4357,
              0.9117,      0.4036]], grad_fn=<

In [72]:
rnn.parameters()

<generator object Module.parameters at 0x7e56075f6e30>

In [7]:
modified_sequence = input_sequence.clone()
modified_sequence[4,:] = 0
modified_sequence

tensor([[-0.4678,  1.4039,  1.2724,  0.7242,  0.1369],
        [ 1.8653, -0.2876, -0.4275,  0.2890,  0.3136],
        [ 1.7496,  0.3859, -0.4404,  2.4725, -1.4289],
        [-0.3190, -0.5844, -0.2130, -0.7306,  0.1002],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.6788,  1.0926, -0.7951, -0.4779, -0.1811],
        [ 0.2109, -0.6911,  3.0780,  0.1557,  0.1850],
        [ 0.0059, -0.3190, -2.0030,  0.2687, -1.2978],
        [ 0.1745, -1.4658,  0.2853, -1.4561, -0.6562]])

In [8]:
rnn.run_sequence(modified_sequence)

tensor([[ 0.4491,  0.7436, -0.3326, -0.6986, -0.1493,  0.5841, -0.3977],
        [ 0.1771, -0.1574, -0.8689,  0.6430, -0.7092,  0.2139, -0.4559],
        [-0.3918, -0.2071, -0.8474, -0.6216, -0.3701,  0.5428, -0.4207],
        [ 0.5907,  0.6132, -0.1270,  0.4169, -0.5815,  0.0113, -0.3174],
        [ 0.5231,  0.2292, -0.3312, -0.1325, -0.3965,  0.0677, -0.6019],
        [ 0.7658,  0.4290,  0.5554, -0.3593, -0.6443,  0.3139, -0.2408],
        [ 0.4173,  0.7681, -0.9562, -0.5064,  0.5598,  0.5991, -0.9360],
        [ 0.3634, -0.5075,  0.1834,  0.4491, -0.2890,  0.0689, -0.7744],
        [ 0.3907,  0.5382, -0.2845,  0.0790, -0.0104,  0.7353, -0.3339]],
       grad_fn=<StackBackward0>)

In [9]:
# These are the examples of names
names.tolist()[:20]

['James',
 'John',
 'Robert',
 'Michael',
 'William',
 'Mary',
 'David',
 'Joseph',
 'Richard',
 'Charles',
 'Thomas',
 'Christopher',
 'Daniel',
 'Matthew',
 'Elizabeth',
 'Patricia',
 'Jennifer',
 'Anthony',
 'George',
 'Linda']

In [10]:
# What we want to do: Make names as a list of token indices
# We call that procedure 'tokenization'

# First thing we have to do: get the vocabulary
# vocab = ['a', 'b', 'c', 'd', ...]
# Gather every possible character in the corpus

# Work with a plain Python list of str from here on.
name_list = names.tolist()
print(type(name_list), type(name_list[0]))

# Distinct characters of one example name ...
idx = 0
name = name_list[idx]
unique_char = set(name)

# ... and of the whole corpus, ignoring case.
entire_unique_char = {char for name in name_list for char in name.lower()}
len(entire_unique_char)

<class 'list'> <class 'str'>


50

In [11]:
name.lower()

'james'

In [12]:
[name for name in name_list if '@' in name ]

['Bhagwati@Shinnu',
 'Soni@',
 'Sonu@Akil',
 'Vivek@',
 'Anjali@Rinku',
 'Anno@',
 'Bharati@Ruchika',
 'Ekta@Mamta',
 'Guddiya@Guddi',
 'Kajal@',
 'Kajal@Sundri',
 'Khushbari@',
 'Kimmi@Neelam',
 'Krishna@Manisha',
 'Kritika@Kittu',
 'Laxmi@Nankai',
 'Mahi@Munasvi',
 'Mamta@Lalita',
 'Manju@',
 'Megha@Sandhya',
 'Muskan@Ruksher',
 'Neeta@Narayani',
 'Nikita@Niki',
 'Nisha@Neelam',
 'Pooja@Neha',
 'Premwati@Radha',
 'Puja@Rakhi',
 'Rahi@',
 'Rajeswary@Rajo@Chanchal',
 'Rajkumari@Babli',
 'Rashid@Robhi',
 'Ratni@Jasoda',
 'Sabana@Moni',
 'Sabenoor@Tamanna',
 'Sabnam@',
 'Sabreen@',
 'Sagita@Harsita',
 'Sakina@Kajal',
 'Sawana@Pinki',
 'Shagufta@Munny',
 'Shakshi@',
 'Shakuntala@Pooja,',
 'Shanawaz@Heena',
 'Shefali@Puja',
 'Shivani@Prachi',
 'Sujata@',
 'Yasmeen@Lali',
 'Yoshoda@',
 'Ankit@',
 'Ankit@Udai',
 'Ankur@',
 'Annu@Anil',
 'Ashwani@Manish',
 'Batu@',
 'Bharat@Dholu',
 'Dhiraj@Dhirendar',
 'Gaurav@',
 'Golu@',
 'Hemant@Teeku',
 'Jitender@',
 'Jitender@Jita',
 'Jitu@Jitendra',
 '

In [13]:
# count the appearance of each character
from collections import Counter
# How often does every character occur across the corpus (case-insensitive)?
char_counter = Counter(char for name in name_list for char in name.lower())

# Characters that occur fewer than 200 times are treated as noise and collected
# so that names containing them can be dropped.
omitted_chars = {char for char, count in char_counter.items() if count < 200}

In [14]:
char_counter.items()

dict_items([('j', 15237), ('a', 153757), ('m', 28613), ('e', 99229), ('s', 42583), ('o', 36099), ('h', 40331), ('n', 80335), ('r', 59497), ('b', 11706), ('t', 34400), ('i', 78409), ('c', 18366), ('l', 57474), ('w', 4891), ('y', 33901), ('d', 26959), ('v', 10454), ('p', 6159), ('z', 7979), ('f', 4638), ('g', 9531), ('u', 18951), ('k', 21253), ('x', 1806), ('q', 2468), ('-', 7257), ('à', 3), ('.', 111), ('…', 13), ("'", 254), ('0', 5), ('œ', 2), ('@', 101), (',', 9), ('/', 6), ('"', 2), (';', 8), ('¡', 1), ('&', 1), ('(', 2), ('?', 3), ('1', 1), ('9', 2), ('5', 2), ('7', 2), ('ö', 1), (')', 1), ('[', 1), ('8', 1)])

In [15]:
name = 'james1'

(set(name) - omitted_chars) == set(name)

False

In [16]:
# Using omitted_chars, drop every name that contains a rare character.
# omitted_chars was built from lower-cased text, so the check must run on the
# lower-cased name as well; otherwise an upper-case rare letter passes the
# filter and only turns into a rare character after .lower().
lowered_names = [name.lower() for name in name_list]
filtered_names = [name for name in lowered_names if omitted_chars.isdisjoint(name)]
len(filtered_names), len(name_list)

(133650, 133910)

In [32]:
def get_vocabulary(list_of_names):
  """Return the distinct characters used in `list_of_names`, sorted ascending."""
  distinct_chars = set()
  for name in list_of_names:
    distinct_chars.update(name)
  return sorted(distinct_chars)

vocab = get_vocabulary(filtered_names)
vocab = ['@'] + vocab
vocab, len(vocab)

(['@',
  "'",
  '-',
  'a',
  'b',
  'c',
  'd',
  'e',
  'f',
  'g',
  'h',
  'i',
  'j',
  'k',
  'l',
  'm',
  'n',
  'o',
  'p',
  'q',
  'r',
  's',
  't',
  'u',
  'v',
  'w',
  'x',
  'y',
  'z'],
 29)

In [34]:
# convert name into list of vocabulary index
# james -> [int int int int int]

name = filtered_names[0]

converted_indices = []
for char in name:
  # get the index of the character in the vocabulary
  idx = vocab.index(char)
  print(char, idx)
  converted_indices.append(idx)

def encode(vocab, input):
  """Wrap `input` in '@' start/end markers and map every character to its
  position in `vocab`. Raises ValueError for a character not in `vocab`."""
  wrapped = ''.join(('@', input, '@'))
  indices = []
  for char in wrapped:
    indices.append(vocab.index(char))
  return indices

converted_indices = encode(vocab, name)
converted_indices

j 12
a 3
m 15
e 7
s 21


[0, 12, 3, 15, 7, 21, 0]

In [37]:
def decode(vocab, input):
  """Turn a list of vocabulary indices back into text, skipping '@' markers."""
  chars = []
  for idx in input:
    char = vocab[idx]
    if char == '@':
      continue
    chars.append(char)
  return ''.join(chars)

decode(vocab, converted_indices)

'james'

In [20]:
converted_names = [encode(vocab, name) for name in filtered_names]
len(converted_names), converted_names[10]

(133650, [21, 9, 16, 14, 2, 20])

In [48]:
# Turn one name into a single (input, target) training pair.
name = 'aaron'
name_in_int = encode(vocab, name)
full_sequence = torch.LongTensor(name_in_int)

# The target is the input shifted left by one position: at every step the model
# has to predict the token that follows.
tensor_input, tensor_target = full_sequence[:-1], full_sequence[1:]

print(tensor_input)
print(tensor_target)

tensor([ 0,  3,  3, 20, 17, 16])
tensor([ 3,  3, 20, 17, 16,  0])


In [49]:
# Make embedding vector for each character in the vocabulary
# Let's set the embedding size = 17

char_emb_layer = nn.Embedding(len(vocab), embedding_dim=17)

embs = char_emb_layer(tensor_input)
embs

tensor([[ 0.5796,  0.7081,  1.4207,  0.5417, -0.0683,  0.5696, -0.4655,  0.6087,
          0.1541,  0.0539, -1.2796,  0.9980,  0.0866,  2.8421, -1.9275,  1.4927,
          1.6606],
        [ 0.8316,  1.6500, -0.7968, -0.5850, -1.1649,  0.7503,  0.8240,  0.0804,
         -0.0723, -1.2435,  0.2839,  1.0673,  1.3564,  0.2643,  0.5474, -0.0726,
          0.3957],
        [ 0.8316,  1.6500, -0.7968, -0.5850, -1.1649,  0.7503,  0.8240,  0.0804,
         -0.0723, -1.2435,  0.2839,  1.0673,  1.3564,  0.2643,  0.5474, -0.0726,
          0.3957],
        [ 0.8655, -0.4688,  0.6007,  1.8388,  0.4143,  0.0430,  0.9003,  0.1143,
          1.5205,  0.5342, -0.3439,  0.2680,  0.6183, -1.1763,  1.8757, -0.5936,
         -0.4082],
        [ 0.6767,  0.5776,  0.8381,  0.6378,  1.4960, -0.6097,  0.9543, -0.3541,
          1.1127, -0.4236, -0.7799,  1.4248, -0.6570,  0.4750,  1.1819, -0.1733,
          0.1519],
        [-1.0359,  1.4109,  0.9710,  2.1984, -2.2950, -0.7237, -1.1068, -1.8270,
         -1.59

In [50]:
# Next step: feed the sequence of character embeddings to the RNN.
# Print tensors in fixed-point rather than scientific notation.
torch.set_printoptions(sci_mode=False)

# A freshly initialised (untrained) RNN: 17-dim embeddings in, 32-dim hidden state out.
rnn = MyRNN(input_dim=17, output_dim=32)
output_by_chars = rnn.run_sequence(embs)

# Every position now has a context vector that summarises the sequence up to and including it.
output_by_chars[3] # hidden state after reading input positions 0..3 (the '@' start token plus the first three characters)



tensor([-0.0303,  0.0377, -0.2220, -0.0041,  0.8256,  0.2656,  0.6708,  0.5075,
        -0.1148,  0.5205,  0.0997, -0.6013,  0.8623, -0.4349, -0.4576,  0.0483,
         0.6074, -0.3170, -0.4717, -0.4734,  0.4475, -0.2512, -0.6310, -0.0113,
         0.4841, -0.2324,  0.6963, -0.2245, -0.0095,  0.7370,  0.5999, -0.0390],
       grad_fn=<SelectBackward0>)

In [51]:
tensor_input[4]

tensor(17)

In [52]:
# Based on these output, we have to predict the following char
# How can we compute the following character, using output_by_chars?

# How can we compute which would be the 4-th character using output_by_chars[3]
# Do we have to make our model to predict one single character?
# What would be other way to predict the next character?

# We make neural network to predict probability distribution of next character
# across the entire vocab

projection_layer = nn.Linear(32, len(vocab))
logits = projection_layer(output_by_chars)
print(output_by_chars.shape, logits.shape)

probs = torch.softmax(logits, dim=-1)
probs, tensor_input


torch.Size([6, 32]) torch.Size([6, 29])


(tensor([[0.0472, 0.0273, 0.0561, 0.0500, 0.0334, 0.0313, 0.0515, 0.0259, 0.0431,
          0.0221, 0.0325, 0.0278, 0.0353, 0.0274, 0.0373, 0.0225, 0.0332, 0.0334,
          0.0445, 0.0227, 0.0166, 0.0481, 0.0256, 0.0211, 0.0544, 0.0378, 0.0410,
          0.0216, 0.0294],
         [0.0368, 0.0500, 0.0259, 0.0329, 0.0353, 0.0236, 0.0216, 0.0351, 0.0368,
          0.0230, 0.0473, 0.0299, 0.0300, 0.0231, 0.0301, 0.0430, 0.0334, 0.0547,
          0.0351, 0.0176, 0.0377, 0.0380, 0.0410, 0.0212, 0.0292, 0.0349, 0.0314,
          0.0412, 0.0601],
         [0.0333, 0.0351, 0.0294, 0.0328, 0.0342, 0.0302, 0.0260, 0.0367, 0.0358,
          0.0304, 0.0277, 0.0345, 0.0312, 0.0216, 0.0338, 0.0446, 0.0366, 0.0457,
          0.0467, 0.0244, 0.0366, 0.0399, 0.0474, 0.0246, 0.0278, 0.0287, 0.0279,
          0.0304, 0.0659],
         [0.0375, 0.0355, 0.0332, 0.0350, 0.0314, 0.0308, 0.0290, 0.0371, 0.0225,
          0.0351, 0.0439, 0.0352, 0.0408, 0.0338, 0.0224, 0.0411, 0.0225, 0.0205,
          0.0366,

In [53]:
# For every position, look up the probability the model assigned to the token
# that actually comes next.
# Indexing tensor_input[i + 1] instead would run past the end at the last
# position; that is why an end token is appended to the sequence and the
# shifted tensor_target is used here.
for i, prob_dist_of_i in enumerate(probs):
  next_char = tensor_target[i]
  print(prob_dist_of_i[next_char])


tensor(0.0500, grad_fn=<SelectBackward0>)
tensor(0.0329, grad_fn=<SelectBackward0>)
tensor(0.0366, grad_fn=<SelectBackward0>)
tensor(0.0205, grad_fn=<SelectBackward0>)
tensor(0.0220, grad_fn=<SelectBackward0>)
tensor(0.0264, grad_fn=<SelectBackward0>)


In [58]:
# Calculate loss without for loop
# we want to apply negative log-likelihood (NLL) loss
# print(probs.shape)

def get_nll_loss(probs, targets):
  """Mean negative log-likelihood of `targets` under the rows of `probs`.

  For row i the probability at column targets[i] is picked; 1e-8 is added
  before the log so a zero probability does not produce -inf.
  """
  row_idx = torch.arange(probs.shape[0])
  picked = probs[row_idx, targets]
  log_likelihood = torch.log(picked + 1e-8)
  return log_likelihood.mean().neg()

loss = get_nll_loss(probs, tensor_target)
loss

tensor(3.5098, grad_fn=<NegBackward0>)

In [91]:
import random
# Now we will wrap up everything into a dataset and model

class Dataset:
  """Next-character prediction dataset built from the names in a CSV file.

  Names are de-duplicated, lower-cased, filtered to those made only of
  frequent characters, shuffled, and tokenized on access.

  Args:
    csv_fn: path of a CSV file with a 'Name' column.
    threshold: characters occurring fewer than this many times in the corpus
      are considered rare; names containing one are dropped.
  """

  def __init__(self, csv_fn='name_gender_dataset.csv', threshold=200):
    df = pd.read_csv(csv_fn)
    unique_gender_df = df.drop_duplicates(['Name'])
    names = unique_gender_df['Name'].values

    # Lower-case first: the rare-character set is computed on lower-cased text,
    # so the filter must look at the lower-cased names as well.
    lowered_names = [name.lower() for name in names.tolist()]
    omitted_chars = self.get_omit_chars(lowered_names, threshold)
    # Filter this instance's own names (not a notebook-level `name_list`).
    self.name_list = [name for name in lowered_names if omitted_chars.isdisjoint(name)]
    random.shuffle(self.name_list)
    self.tokenizer = Tokenizer(self.name_list)

  def __len__(self):
    """Number of names kept after filtering."""
    return len(self.name_list)

  def __getitem__(self, idx):
    """Return `(input_seq, target_seq)` LongTensors for the idx-th name.

    Both come from the same encoded sequence; the target is the input shifted
    by one position, so target_seq[t] is the token following input_seq[t].
    """
    name = self.name_list[idx]
    name_in_idxs = torch.LongTensor(self.tokenizer.encode(name))

    input_seq = name_in_idxs[:-1] # everything except the final <end> token
    target_seq = name_in_idxs[1:] # everything except the leading <start> token
    return input_seq, target_seq

  def get_omit_chars(self, name_list, threshold=200):
    """Return the set of lower-cased characters that occur fewer than
    `threshold` times across `name_list`."""
    char_counter = Counter(char for name in name_list for char in name.lower())
    return {char for char, count in char_counter.items() if count < threshold}

class Tokenizer:
  """Character-level tokenizer; index 0 is '@', the shared start/end marker.

  Args:
    list_of_names: corpus whose distinct characters form the vocabulary.
  """

  def __init__(self, list_of_names):
    chars = {char for name in list_of_names for char in name}
    # '@' is reserved for index 0. Leaving it in the character set would add a
    # second '@' entry that encode() can never produce.
    chars.discard('@')
    self.vocab = ['@'] + sorted(chars)
    # Constant-time lookup instead of a linear list.index() per character.
    self._char_to_idx = {char: idx for idx, char in enumerate(self.vocab)}

  def encode(self, input):
    """Return the vocabulary indices of '@' + input + '@'.

    Raises:
      ValueError: if `input` contains a character missing from the vocabulary.
    """
    wrapped = '@' + input + '@' # @ to denote start/end
    try:
      return [self._char_to_idx[char] for char in wrapped]
    except KeyError as err:
      raise ValueError(f"{err.args[0]!r} is not in the vocabulary") from None

  def decode(self, input):
    """Turn a list of vocabulary indices back into text, skipping '@' markers."""
    chars = (self.vocab[idx] for idx in input)
    return ''.join(char for char in chars if char != '@')

  def __len__(self):
    """Vocabulary size, including the '@' marker."""
    return len(self.vocab)

dataset = Dataset()
input, target = dataset[0]
print(f"input: {input}")
print(f"target: {target}")

input: tensor([ 0, 14, 11, 21,  3,  2, 21, 17, 18, 10, 11,  7])
target: tensor([14, 11, 21,  3,  2, 21, 17, 18, 10, 11,  7,  0])


In [66]:
dataset.tokenizer.encode('aaron')

[0, 3, 3, 20, 17, 16, 0]

In [89]:
# Define model that contains char embedding, RNN, and projection layer

class LanguageModel(nn.Module):
  """Character-level language model: embedding -> MyRNN -> linear projection -> softmax.

  Args:
    vocab_size: number of tokens in the vocabulary.
    hidden_size: width of both the token embeddings and the RNN hidden state.
  """

  def __init__(self, vocab_size:int, hidden_size=64):
    super().__init__()
    self.emb_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=hidden_size)
    self.rnn = MyRNN(input_dim=hidden_size, output_dim=hidden_size)
    # Maps each hidden state to one score per vocabulary entry.
    self.proj_layer = nn.Linear(in_features=hidden_size, out_features=vocab_size)

  def forward(self, x):
    """Return, for each token index in `x`, a probability distribution over
    the vocabulary (softmax over the last dimension)."""
    token_vectors = self.emb_layer(x)
    contexts = self.rnn.run_sequence(token_vectors)
    return self.proj_layer(contexts).softmax(dim=-1)

model = LanguageModel(len(dataset.tokenizer), hidden_size=64)
probs = model(input)
loss = get_nll_loss(probs, target)
loss.backward()

In [92]:
from tqdm.auto import tqdm

# Start from a freshly initialised model; Adam runs with its default hyperparameters.
model = LanguageModel(len(dataset.tokenizer), hidden_size=64)
optimizer = torch.optim.Adam(model.parameters())

# Loss of every update step, in order, for inspection after training.
loss_records = []

# A single pass over the dataset, one name per update step.
for i in tqdm(range(len(dataset))):
  # NOTE: `input` shadows the Python builtin of the same name in this notebook.
  input, target = dataset[i]
  probs = model(input)
  loss = get_nll_loss(probs, target)
  loss.backward()
  optimizer.step()
  # Clear the gradients right after the update so the next backward() starts clean.
  optimizer.zero_grad()
  loss_records.append(loss.item())

  0%|          | 0/133650 [00:00<?, ?it/s]

In [117]:
# Let's make a new name with the model

# torch.manual_seed(1)  # uncomment to make the sampled name reproducible

MAX_NAME_LENGTH = 100  # hard stop in case the end token is never sampled

output_tokens = []
previously_generated_token = torch.LongTensor([0]) # 'start/end' token idx is 0 in our vocab
previous_hidden = torch.zeros(model.rnn.hidden_size)

# Sampling needs no gradients; without no_grad every step would extend the
# autograd graph through all previous hidden states.
with torch.no_grad():
  for _ in range(MAX_NAME_LENGTH):
    char_emb = model.emb_layer(previously_generated_token)
    hidden = model.rnn.run_one_step(char_emb, previous_hidden)
    previous_hidden = hidden
    logit = model.proj_layer(hidden)
    prob = logit.softmax(dim=-1)

    # Sample one token from the distribution rather than taking the most
    # likely one, so repeated runs produce different names.
    next_token = torch.multinomial(prob, num_samples=1)[0]
    if next_token == 0: # start/end token
      break
    output_tokens.append(next_token.item())
    previously_generated_token = next_token

generated_name = dataset.tokenizer.decode(output_tokens)
generated_name

'cessaree'