<a href="https://colab.research.google.com/github/jdasam/mas1004/blob/2024/live_coding/5_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# download data
!wget https://archive.ics.uci.edu/static/public/591/gender+by+name.zip
!unzip gender+by+name.zip

--2024-11-26 06:01:42--  https://archive.ics.uci.edu/static/public/591/gender+by+name.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘gender+by+name.zip’

gender+by+name.zip      [       <=>          ]   3.60M  1.87MB/s    in 1.9s    

2024-11-26 06:01:45 (1.87 MB/s) - ‘gender+by+name.zip’ saved [3774735]

Archive:  gender+by+name.zip
 extracting: name_gender_dataset.csv  


In [2]:
import pandas as pd
# Read the raw table, then keep a single row per distinct name
# (drop_duplicates keeps the first occurrence by default).
df = pd.read_csv('name_gender_dataset.csv')
unique_gender_df = df.drop_duplicates(subset='Name')
names, genders = (unique_gender_df[column].values for column in ('Name', 'Gender'))

# names.tolist()


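- `nn.Linear()`: $\mathbf{Wx} + \mathbf{b}$
  - $\mathbf{x} \in \mathbb{R}^d$

- `RNN`
  - With a single bias: $h_t = \tanh(W_{xh}x_t + W_{hh}h_{t-1} + b)$
  - With separate input and hidden biases (equivalent, since $b = b_x + b_h$): $h_t = \tanh(W_{xh}x_t + b_x + W_{hh}h_{t-1} + b_h)$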

In [3]:
# Building Recurrent Neural Network
import torch
# Toy ingredients for one RNN step: a 7-dim previous hidden state (passed
# through tanh, like a real RNN output) and a 5-dim input for the current step.
previous_hidden_state = torch.tanh(torch.randn(7))
current_input = torch.randn(5)

print(previous_hidden_state, current_input)

tensor([-0.4477,  0.0689, -0.8287,  0.0499,  0.0322, -0.6541,  0.7924]) tensor([ 0.2123, -1.3285, -0.3113,  1.0953,  0.8298])


In [4]:
# Make longer input: a sequence of 9 tokens, each a 5-dim embedding
number_of_tokens, token_embedding_size = 9, 5

input_sequence = torch.randn(number_of_tokens, token_embedding_size)
input_sequence.shape

torch.Size([9, 5])

In [5]:
for cur_input in input_sequence:
  print(cur_input)
# for i in range(len(input_sequence)):
#   print(input_sequence[i])

tensor([ 0.4327, -0.7840, -0.1075,  2.2716, -0.5705])
tensor([-0.5104, -1.6307,  0.5388, -1.5380,  0.8719])
tensor([-1.4114,  2.5902, -0.2259,  0.9653,  0.9899])
tensor([-1.3992,  0.3997,  0.6053, -0.0415,  1.0008])
tensor([-1.4440,  1.2071,  2.5974,  0.3771,  0.5250])
tensor([ 0.3963, -0.3961, -0.2312,  0.7685, -1.4756])
tensor([-0.2272,  0.8759, -1.0998, -0.1117,  1.8035])
tensor([-0.8828, -0.5067,  0.2981, -1.9616,  0.6948])
tensor([ 0.2233, -0.7568, -1.3587, -0.1077,  1.1827])


In [6]:
import torch.nn as nn

class MyRNN:
  """Minimal vanilla RNN assembled from two linear maps.

  Every step computes tanh(xh(x_t) + hh(h_{t-1})); only `xh` carries a bias.
  """

  def __init__(self, input_dim, output_dim):
    # Layers are created in the order xh, hh so seeded initialisation is stable.
    self.xh = nn.Linear(input_dim, output_dim)               # input  -> hidden
    self.hh = nn.Linear(output_dim, output_dim, bias=False)  # hidden -> hidden
    self.hidden_size = output_dim

  def run_one_step(self, current_input, previous_output):
    """Return the new hidden state given one input and the previous hidden state."""
    return torch.tanh(self.xh(current_input) + self.hh(previous_output))

  def run_sequence(self, input_sequence, last_hidden_state=None):
    """Run the RNN over `input_sequence` and stack the hidden state of every step.

    When `last_hidden_state` is None, the recurrence starts from a zero vector
    of length `hidden_size`.
    """
    hidden = last_hidden_state
    if hidden is None:
      hidden = torch.zeros(self.hidden_size)

    hidden_states = []
    for step_input in input_sequence:
      hidden = self.run_one_step(step_input, hidden)
      hidden_states.append(hidden)
    return torch.stack(hidden_states)


rnn = MyRNN(input_dim=5, output_dim=7)
# rnn.run_one_step(current_input, previous_hidden_state)
rnn.run_sequence(input_sequence)

tensor([[-0.9215, -0.2476,  0.5453,  0.2238,  0.2572,  0.5746, -0.6511],
        [ 0.0955,  0.6433,  0.0959, -0.6613, -0.3291, -0.4824, -0.6279],
        [ 0.8431, -0.6233,  0.7644,  0.6869,  0.4335, -0.2522, -0.8306],
        [ 0.3031, -0.1443,  0.4274, -0.4197,  0.4786, -0.6942, -0.6298],
        [-0.3371, -0.5188,  0.8358, -0.6725,  0.6307, -0.7543, -0.8908],
        [-0.8792,  0.0010, -0.2784, -0.1813, -0.3594, -0.4797, -0.3185],
        [ 0.9235, -0.1221,  0.2369,  0.6476, -0.6858,  0.4014, -0.6706],
        [ 0.5850,  0.3469, -0.4340, -0.8133,  0.0179, -0.7388,  0.4533],
        [ 0.6535, -0.1869, -0.3994,  0.0867, -0.7126,  0.7177, -0.5490]],
       grad_fn=<StackBackward0>)

In [7]:
modified_sequence = input_sequence.clone()
modified_sequence[4,:] = 0
modified_sequence

tensor([[ 0.4327, -0.7840, -0.1075,  2.2716, -0.5705],
        [-0.5104, -1.6307,  0.5388, -1.5380,  0.8719],
        [-1.4114,  2.5902, -0.2259,  0.9653,  0.9899],
        [-1.3992,  0.3997,  0.6053, -0.0415,  1.0008],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.3963, -0.3961, -0.2312,  0.7685, -1.4756],
        [-0.2272,  0.8759, -1.0998, -0.1117,  1.8035],
        [-0.8828, -0.5067,  0.2981, -1.9616,  0.6948],
        [ 0.2233, -0.7568, -1.3587, -0.1077,  1.1827]])

In [8]:
rnn.run_sequence(modified_sequence)

tensor([[-0.9215, -0.2476,  0.5453,  0.2238,  0.2572,  0.5746, -0.6511],
        [ 0.0955,  0.6433,  0.0959, -0.6613, -0.3291, -0.4824, -0.6279],
        [ 0.8431, -0.6233,  0.7644,  0.6869,  0.4335, -0.2522, -0.8306],
        [ 0.3031, -0.1443,  0.4274, -0.4197,  0.4786, -0.6942, -0.6298],
        [-0.2250, -0.1665, -0.2671, -0.3248, -0.2643, -0.3370, -0.4313],
        [-0.8495,  0.0544, -0.2189, -0.3411, -0.4572, -0.2946,  0.1164],
        [ 0.9310, -0.1134,  0.3799,  0.6804, -0.6877,  0.5034, -0.5731],
        [ 0.5928,  0.3523, -0.3925, -0.7945,  0.0758, -0.7334,  0.5044],
        [ 0.6600, -0.1895, -0.3985,  0.1009, -0.7144,  0.7144, -0.5422]],
       grad_fn=<StackBackward0>)

In [9]:
# These are the examples of names
names.tolist()[:20]

['James',
 'John',
 'Robert',
 'Michael',
 'William',
 'Mary',
 'David',
 'Joseph',
 'Richard',
 'Charles',
 'Thomas',
 'Christopher',
 'Daniel',
 'Matthew',
 'Elizabeth',
 'Patricia',
 'Jennifer',
 'Anthony',
 'George',
 'Linda']

In [33]:
# Goal: turn every name into a list of token indices.
# That procedure is called 'tokenization'.
#
# Step 1 is to build the vocabulary, e.g. vocab = ['a', 'b', 'c', 'd', ...],
# by gathering every character that occurs anywhere in the corpus.

name_list = names.tolist()
print(type(name_list), type(name_list[0]))

# Characters of one example name (original casing)
idx = 0
name = name_list[idx]
unique_char = set(name)

# Characters of the whole corpus, after lower-casing every name
entire_unique_char = {char for name in name_list for char in name.lower()}
len(entire_unique_char)

<class 'list'> <class 'str'>


50

In [32]:
name.lower()

'james'

In [28]:
[name for name in name_list if '@' in name ]

['Bhagwati@Shinnu',
 'Soni@',
 'Sonu@Akil',
 'Vivek@',
 'Anjali@Rinku',
 'Anno@',
 'Bharati@Ruchika',
 'Ekta@Mamta',
 'Guddiya@Guddi',
 'Kajal@',
 'Kajal@Sundri',
 'Khushbari@',
 'Kimmi@Neelam',
 'Krishna@Manisha',
 'Kritika@Kittu',
 'Laxmi@Nankai',
 'Mahi@Munasvi',
 'Mamta@Lalita',
 'Manju@',
 'Megha@Sandhya',
 'Muskan@Ruksher',
 'Neeta@Narayani',
 'Nikita@Niki',
 'Nisha@Neelam',
 'Pooja@Neha',
 'Premwati@Radha',
 'Puja@Rakhi',
 'Rahi@',
 'Rajeswary@Rajo@Chanchal',
 'Rajkumari@Babli',
 'Rashid@Robhi',
 'Ratni@Jasoda',
 'Sabana@Moni',
 'Sabenoor@Tamanna',
 'Sabnam@',
 'Sabreen@',
 'Sagita@Harsita',
 'Sakina@Kajal',
 'Sawana@Pinki',
 'Shagufta@Munny',
 'Shakshi@',
 'Shakuntala@Pooja,',
 'Shanawaz@Heena',
 'Shefali@Puja',
 'Shivani@Prachi',
 'Sujata@',
 'Yasmeen@Lali',
 'Yoshoda@',
 'Ankit@',
 'Ankit@Udai',
 'Ankur@',
 'Annu@Anil',
 'Ashwani@Manish',
 'Batu@',
 'Bharat@Dholu',
 'Dhiraj@Dhirendar',
 'Gaurav@',
 'Golu@',
 'Hemant@Teeku',
 'Jitender@',
 'Jitender@Jita',
 'Jitu@Jitendra',
 '

In [39]:
# Count how often each lower-cased character occurs across all names
from collections import Counter
char_counter = Counter(char for name in name_list for char in name.lower())

# Collect the characters that occur fewer than 200 times
omitted_chars = set()
for key, value in char_counter.items():
  if value < 200:
    omitted_chars.add(key)

In [35]:
char_counter.items()

dict_items([('j', 15237), ('a', 153757), ('m', 28613), ('e', 99229), ('s', 42583), ('o', 36099), ('h', 40331), ('n', 80335), ('r', 59497), ('b', 11706), ('t', 34400), ('i', 78409), ('c', 18366), ('l', 57474), ('w', 4891), ('y', 33901), ('d', 26959), ('v', 10454), ('p', 6159), ('z', 7979), ('f', 4638), ('g', 9531), ('u', 18951), ('k', 21253), ('x', 1806), ('q', 2468), ('-', 7257), ('à', 3), ('.', 111), ('…', 13), ("'", 254), ('0', 5), ('œ', 2), ('@', 101), (',', 9), ('/', 6), ('"', 2), (';', 8), ('¡', 1), ('&', 1), ('(', 2), ('?', 3), ('1', 1), ('9', 2), ('5', 2), ('7', 2), ('ö', 1), (')', 1), ('[', 1), ('8', 1)])

In [47]:
name = 'james1'

(set(name) - omitted_chars) == set(name)

False

In [50]:
# Using omitted_chars, drop every name that contains a rare character.
# omitted_chars was collected from lower-cased names, so the check must also
# look at the lower-cased name; otherwise an upper-case variant of a rare
# character (e.g. 'À' vs 'à') would slip through and, after lower(), put the
# rare character back into the vocabulary.
filtered_names = [
  lowered
  for lowered in (name.lower() for name in name_list)
  if omitted_chars.isdisjoint(lowered)
]
len(filtered_names), len(name_list)

(133650, 133910)

In [58]:
def get_vocabulary(list_of_names):
  """Return the sorted list of distinct characters used in `list_of_names`."""
  distinct_chars = set()
  for name in list_of_names:
    distinct_chars.update(name)
  return sorted(distinct_chars)

vocab = get_vocabulary(filtered_names)
vocab, len(vocab)

(["'",
  '-',
  'a',
  'b',
  'c',
  'd',
  'e',
  'f',
  'g',
  'h',
  'i',
  'j',
  'k',
  'l',
  'm',
  'n',
  'o',
  'p',
  'q',
  'r',
  's',
  't',
  'u',
  'v',
  'w',
  'x',
  'y',
  'z'],
 28)

In [63]:
# convert name into list of vocabulary index
# james -> [int int int int int]

name = filtered_names[0]

converted_indices = []
for char in name:
  # get the index of the character in the vocabulary
  idx = vocab.index(char)
  print(char, idx)
  converted_indices.append(idx)

def encode(vocab, input):
  """Map each character of `input` to its position in `vocab`.

  The lookup uses `vocab.index`, so with a list vocabulary a character that is
  not in `vocab` raises ValueError.
  """
  indices = []
  for char in input:
    indices.append(vocab.index(char))
  return indices

converted_indices = encode(vocab, name)
converted_indices

j 11
a 2
m 14
e 6
s 20


[11, 2, 14, 6, 20]

In [66]:
def decode(vocab, input):
  """Turn a list of integer vocabulary indices back into the string they spell."""
  chars = []
  for idx in input:
    chars.append(vocab[idx])
  return ''.join(chars)

decode(vocab, converted_indices)

'james'

In [69]:
converted_names = [encode(vocab, name) for name in filtered_names]
len(converted_names), converted_names[10]

(133650, [21, 9, 16, 14, 2, 20])

In [83]:
# Convert it into tensor, and make a single training example

# name_in_int = converted_names[10]
name = 'aarona'
name_in_int = encode(vocab, name)
# name_in_int = [3, 3, ]

tensor_input = torch.LongTensor(name_in_int)
tensor_input

tensor([ 2,  2, 19, 16, 15,  2])

In [84]:
# Make embedding vector for each character in the vocabulary
# Let's set the embedding size = 17

char_emb_layer = nn.Embedding(len(vocab), embedding_dim=17)

embs = char_emb_layer(tensor_input)
embs

tensor([[ 0.5584, -0.7063,  0.0653, -0.0191, -0.4693, -0.5989,  1.0094, -1.3406,
          0.6128,  0.9527,  0.2929, -0.2907,  1.5477, -0.7131, -0.1934, -1.8459,
         -0.7947],
        [ 0.5584, -0.7063,  0.0653, -0.0191, -0.4693, -0.5989,  1.0094, -1.3406,
          0.6128,  0.9527,  0.2929, -0.2907,  1.5477, -0.7131, -0.1934, -1.8459,
         -0.7947],
        [-0.0473, -0.7585,  1.2368,  0.2355,  1.3041, -1.7071,  0.6098,  1.2954,
         -0.7085, -1.0002,  0.4791, -0.5579, -0.8564, -0.7666,  0.3820, -0.7672,
          0.7581],
        [ 1.9613,  1.7548,  0.4391,  0.4485, -0.1559,  1.7656,  1.9222, -0.4681,
         -0.0102, -1.2229, -0.2372,  1.4594, -0.3267,  1.0597,  0.2514, -0.0564,
          1.1611],
        [ 0.4920,  0.7944,  0.3562,  0.6819, -1.7091, -0.7011,  0.1752,  0.8004,
          0.5278, -1.2903, -1.1066,  1.0445, -1.7812, -1.6004, -0.5135,  0.7133,
          0.7667],
        [ 0.5584, -0.7063,  0.0653, -0.0191, -0.4693, -0.5989,  1.0094, -1.3406,
          0.61

In [91]:
# Next step: feed this sequence of character embeddings to the RNN
torch.set_printoptions(sci_mode=False)  # print plain decimals instead of scientific notation

# input_dim=17 matches the embedding size used for char_emb_layer; hidden size is 32
rnn = MyRNN(input_dim=17, output_dim=32)
output_by_chars = rnn.run_sequence(embs)

# We now have one context vector per character position
output_by_chars[3] # the RNN output after reading the characters at indices 0..3 (inclusive)



tensor([-0.6408, -0.4566,  0.4528, -0.5318, -0.0338, -0.0987,  0.7928,  0.0440,
        -0.8226,  0.9018, -0.6092,  0.6840, -0.1854,  0.4610, -0.5393, -0.6151,
        -0.8082, -0.3435, -0.0083, -0.6168, -0.1027, -0.0746, -0.2517,  0.1824,
         0.8443, -0.5128, -0.0966, -0.3882, -0.8354, -0.6599, -0.4508, -0.7143],
       grad_fn=<SelectBackward0>)

In [89]:
tensor_input[4]

tensor(15)

In [94]:
# Based on these outputs, we have to predict the following character.
# How can we compute the following character using output_by_chars?

# E.g. how can we tell what the character at index 4 should be from output_by_chars[3]?
# Does the model have to commit to one single character?
# What other way is there to predict the next character?

# Instead, we make the neural network predict a probability distribution
# over the entire vocab for the next character.

# Project each 32-dim RNN output to one logit per vocabulary entry
projection_layer = nn.Linear(32, len(vocab))
logits = projection_layer(output_by_chars)
print(output_by_chars.shape, logits.shape)

# softmax over the last dimension turns each row of logits into probabilities
probs = torch.softmax(logits, dim=-1)
probs, tensor_input


torch.Size([6, 32]) torch.Size([6, 28])


(tensor([[0.0308, 0.0295, 0.0373, 0.0453, 0.0404, 0.0275, 0.0274, 0.0311, 0.0251,
          0.0606, 0.0327, 0.0223, 0.0241, 0.0462, 0.0234, 0.0329, 0.0401, 0.0356,
          0.0476, 0.0243, 0.0312, 0.0559, 0.0445, 0.0345, 0.0365, 0.0265, 0.0416,
          0.0452],
         [0.0308, 0.0268, 0.0357, 0.0447, 0.0439, 0.0298, 0.0239, 0.0267, 0.0245,
          0.0541, 0.0349, 0.0181, 0.0220, 0.0452, 0.0254, 0.0326, 0.0361, 0.0339,
          0.0562, 0.0254, 0.0283, 0.0541, 0.0536, 0.0391, 0.0428, 0.0200, 0.0454,
          0.0459],
         [0.0390, 0.0247, 0.0354, 0.0398, 0.0412, 0.0265, 0.0251, 0.0330, 0.0323,
          0.0377, 0.0416, 0.0258, 0.0208, 0.0334, 0.0356, 0.0290, 0.0413, 0.0246,
          0.0544, 0.0417, 0.0310, 0.0593, 0.0380, 0.0345, 0.0433, 0.0177, 0.0583,
          0.0349],
         [0.0373, 0.0391, 0.0466, 0.0340, 0.0429, 0.0320, 0.0228, 0.0274, 0.0364,
          0.0573, 0.0459, 0.0289, 0.0209, 0.0400, 0.0426, 0.0306, 0.0377, 0.0452,
          0.0448, 0.0406, 0.0415, 0.0268,

In [95]:
# probs[i] is the predicted distribution for the character that follows
# position i, so the correct answer for step i is tensor_input[i+1].
for i in range(len(probs)):
  prob_dist_of_i = probs[i]
  try:
    next_char = tensor_input[i+1]
  except IndexError as err:
    # The last position has no next character, so the lookup fails there.
    # Show the error instead of aborting, so the notebook still runs top to bottom.
    print(f'IndexError: {err}')
    break
  prob_of_correct_next_char = prob_dist_of_i[next_char]
  print(prob_of_correct_next_char)

# The lookup fails at the last step because there is no character after the last one.

# To fix this, we append an <end> token to the end of the sequence.


tensor(0.0373, grad_fn=<SelectBackward0>)
tensor(0.0254, grad_fn=<SelectBackward0>)
tensor(0.0413, grad_fn=<SelectBackward0>)
tensor(0.0306, grad_fn=<SelectBackward0>)
tensor(0.0719, grad_fn=<SelectBackward0>)


IndexError: index 6 is out of bounds for dimension 0 with size 6