<a href="https://colab.research.google.com/github/jdasam/mas1004-2023/blob/main/live_coding/6_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recurrent Neural Network

In [1]:
# download data
!wget https://archive.ics.uci.edu/static/public/591/gender+by+name.zip
!unzip gender+by+name.zip

--2023-11-23 06:47:36--  https://archive.ics.uci.edu/static/public/591/gender+by+name.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘gender+by+name.zip’

gender+by+name.zip      [  <=>               ]   3.60M  8.92MB/s    in 0.4s    

2023-11-23 06:47:37 (8.92 MB/s) - ‘gender+by+name.zip’ saved [3774735]

Archive:  gender+by+name.zip
 extracting: name_gender_dataset.csv  


In [6]:
import pandas as pd
df = pd.read_csv('name_gender_dataset.csv')
unique_gender_df = df.drop_duplicates(['Name'])
names = unique_gender_df['Name'].values
genders = unique_gender_df['Gender'].values

In [5]:
df[df['Name']=="James"]

Unnamed: 0,Name,Gender,Count,Probability
0,James,M,5304407,0.014517
1615,James,F,23963,6.6e-05


In [11]:
idx = 10001

names[idx], genders[idx]

('Yitzchak', 'M')

In [9]:
genders

array(['M', 'M', 'M', ..., 'M', 'M', 'M'], dtype=object)

In [None]:
# first, convert every name into lower case
names_list = names.tolist()
names_list = [name.lower() for name in names_list]
names_list


In [19]:
# What kind of unique characters exist in our names?

name_example = names_list[100]
name_example
for char in name_example:
  print(char)

entire_chars = [char for name in names_list for char in name]
len(names_list), len(entire_chars)

a
d
a
m


(133910, 912815)

In [20]:
unique_chars = list(set(entire_chars))
unique_chars

['œ',
 ')',
 'p',
 'k',
 'r',
 ';',
 "'",
 '/',
 '5',
 '[',
 '&',
 'w',
 'i',
 '?',
 'e',
 '…',
 't',
 'v',
 '-',
 'g',
 ',',
 'o',
 '@',
 '9',
 's',
 'à',
 'x',
 '"',
 'h',
 '7',
 'd',
 'q',
 'm',
 '0',
 'j',
 '(',
 'b',
 'y',
 'z',
 'ö',
 'a',
 '.',
 'u',
 '8',
 'n',
 'c',
 'f',
 'l',
 '1',
 '¡']

In [23]:
special_chars = '&'
special_names = [name for name in names_list if special_chars in name]
special_names

['puspa&']

In [29]:
unique_chars

['"',
 '&',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '5',
 '7',
 '8',
 '9',
 ';',
 '?',
 '@',
 '[',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '¡',
 'à',
 'ö',
 'œ',
 '…']

In [31]:
unique_chars = sorted(unique_chars) # this is our vocabulary

char = 'c'
# Get the index of this char in the vocab
char_idx = unique_chars.index(char)
char_idx, unique_chars[char_idx]

# alist.index usually takes bit long
# to make it faster, we use dictionary
char2idx = {}

for char in unique_chars:
  char2idx[char] = unique_chars.index(char)
print(char2idx)
char2idx[char] # now we can get the index of the char by indexing dictionary

char2idx = { char: idx for idx, char in enumerate(unique_chars)}
print(char2idx)

{'"': 0, '&': 1, "'": 2, '(': 3, ')': 4, ',': 5, '-': 6, '.': 7, '/': 8, '0': 9, '1': 10, '5': 11, '7': 12, '8': 13, '9': 14, ';': 15, '?': 16, '@': 17, '[': 18, 'a': 19, 'b': 20, 'c': 21, 'd': 22, 'e': 23, 'f': 24, 'g': 25, 'h': 26, 'i': 27, 'j': 28, 'k': 29, 'l': 30, 'm': 31, 'n': 32, 'o': 33, 'p': 34, 'q': 35, 'r': 36, 's': 37, 't': 38, 'u': 39, 'v': 40, 'w': 41, 'x': 42, 'y': 43, 'z': 44, '¡': 45, 'à': 46, 'ö': 47, 'œ': 48, '…': 49}
{'"': 0, '&': 1, "'": 2, '(': 3, ')': 4, ',': 5, '-': 6, '.': 7, '/': 8, '0': 9, '1': 10, '5': 11, '7': 12, '8': 13, '9': 14, ';': 15, '?': 16, '@': 17, '[': 18, 'a': 19, 'b': 20, 'c': 21, 'd': 22, 'e': 23, 'f': 24, 'g': 25, 'h': 26, 'i': 27, 'j': 28, 'k': 29, 'l': 30, 'm': 31, 'n': 32, 'o': 33, 'p': 34, 'q': 35, 'r': 36, 's': 37, 't': 38, 'u': 39, 'v': 40, 'w': 41, 'x': 42, 'y': 43, 'z': 44, '¡': 45, 'à': 46, 'ö': 47, 'œ': 48, '…': 49}


In [33]:
def convert_str_to_list_of_integers(astr):
  return [char2idx[char] for char in astr]
  # output = []
  # for char in astr:
  #   converted_idx = char2idx[char]
  #   output.append(converted_idx)

convert_str_to_list_of_integers('robert')


[36, 33, 20, 23, 36, 38]

In [34]:
# convert every name into the list of integer
names_converted = [convert_str_to_list_of_integers(name) for name in names_list]


In [36]:
names_converted[1]

[28, 33, 26, 32]

In [44]:
import torch
# make it to a tensor
idx = 100
conv_name = names_converted[idx]
name_tensor = torch.tensor(conv_name)
conv_name, torch.tensor(conv_name)


([19, 22, 19, 31], tensor([19, 22, 19, 31]))

In [45]:
# convert this integer tensor into a sequence of vector
import torch.nn as nn

emb_size = 4
char_emb_layer = nn.Embedding(len(unique_chars), emb_size)
char_emb_layer(name_tensor)

tensor([[ 0.9341, -0.4755,  0.1429,  1.0189],
        [ 0.7202,  0.3688,  1.5819,  0.6330],
        [ 0.9341, -0.4755,  0.1429,  1.0189],
        [ 0.3887, -1.0138, -1.2869, -0.2878]], grad_fn=<EmbeddingBackward0>)