## Understanding the inner workings of an LSTM

In [3]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import os

In [15]:
# build the dataset path: <parent of the working directory>/Data/NationalNames.csv
current_dir = os.getcwd()
data_path = "Data"
csv_folderpath = os.path.dirname(current_dir)
csv_file_name = "NationalNames.csv"
# hand every component to os.path.join so the separator is chosen by the
# platform instead of being hard-coded as "/"
csv_full_path = os.path.join(csv_folderpath, data_path, csv_file_name)

# report whether the dataset is present at that location
if os.path.exists(csv_full_path):
    print(f"Data exists at Path: {csv_full_path}")
else:
    print(f"Data doesn't exist in the Path : {csv_full_path}")

Data exists at Path: /home/karthi/AI-Algorithms/NLP/Data/NationalNames.csv


In [16]:
# read the dataset from the path
# pd.read_csv signals failure by raising, never by returning None, so the
# failure message is printed from the exception path and the error is then
# re-raised so the notebook does not continue without data
try:
    raw_data = pd.read_csv(csv_full_path)
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    print("Pandas could not able to detect the csv file. 😥")
    raise
else:
    print("Pandas detected the CSV Data. 🐼")

Pandas detected the CSV Data. 🐼


In [None]:
# NOTE(review): this assigns the "Name" column to itself, so no transformation
# is applied here; it looks like a leftover placeholder — confirm and remove.
raw_data["Name"] = raw_data["Name"]

In [20]:
# preview the first rows of the loaded table
raw_data.head()

Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746


In [29]:
# keep only the first 10000 entries of the "Name" column and arrange them
# as a single-column array (one name per row)
raw_data = np.array(raw_data["Name"][:10000])[:, np.newaxis]

raw_data

array([['Mary'],
       ['Anna'],
       ['Emma'],
       ...,
       ['Jens'],
       ['Julious'],
       ['Lindsay']], shape=(10000, 1), dtype=object)

In [30]:
# lowercase every name; each row of the column array holds exactly one name
raw_data = [entry[0].lower() for entry in raw_data]

In [35]:
# turn the list of lowercase names back into a single-column array
raw_data = np.reshape(np.array(raw_data), (-1, 1))

# peek at the first few names
print(f"Sample name: \n{raw_data[:5]}")

# rows x columns of the name array
print(f"Shape of the data: {raw_data.shape}")

Sample name: 
[['mary']
 ['anna']
 ['emma']
 ['elizabeth']
 ['minnie']]
Shape of the data: (10000, 1)


In [None]:
# the names differ in length, so they need to be brought to a common length;
# work on a copy so raw_data itself is left untouched
transformed_data = np.copy(raw_data)

# length of the longest name (0 when there are no names at all)
max_length = max((len(row[0]) for row in transformed_data), default=0)

print(f"Name max length: {max_length}")

Name max length: 12


In [50]:
# right-pad every name with '.' until it is max_length characters long;
# each row is a view into transformed_data, so the array is updated in place
for row in transformed_data:
    row[0] = row[0].ljust(max_length, '.')

print("Transformed Data: \n", transformed_data[:5])

Transformed Data: 
 [['mary........']
 ['anna........']
 ['emma........']
 ['elizabeth...']
 ['minnie......']]


In [52]:
# gather every character of every padded name (duplicates included)
vocab = [char for name in transformed_data[:, 0] for char in name]

print(f"Length of the vocab: {len(vocab)}")

Length of the vocab: 120000


In [54]:
# remove duplicates so each character appears exactly once
vocab = set().union(vocab)

print(f"Length of the vocab after removing duplicates: {len(vocab)}")
print(f"\nVocab: {vocab}")

Length of the vocab after removing duplicates: 27

Vocab: {'w', 'j', 'o', 'm', 'v', 't', 'a', 'f', 'y', 'n', 'b', 'u', 'e', 'p', 'l', 'k', 'x', '.', 'i', 'z', 'r', 'g', 'c', 'd', 's', 'q', 'h'}


In [55]:
# put the vocabulary into ascending order so the character ids are stable
vocab = list(vocab)
vocab.sort()

print(f"After sorting the vocab in ascending order: \n {vocab}")

After sorting the vocab in ascending order: 
 ['.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [57]:
# lookup tables between characters and their position in the sorted vocab
char_to_id = {symbol: position for position, symbol in enumerate(vocab)}
id_to_char = dict(enumerate(vocab))

print(f"Character to Id: \n {char_to_id}")
print(f"Id to Character: \n {id_to_char}")

Character to Id: 
 {'.': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
Id to Character: 
 {0: '.', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}


In [76]:
# split the data into batches for training
BATCH_SIZE = 20

# training_data[b][c] is a (BATCH_SIZE, len(vocab)) one-hot matrix holding
# character position c of every name in batch b
training_data = []

# only complete batches are built; names left over at the end that cannot
# fill a whole batch are dropped
num_batches = len(transformed_data) // BATCH_SIZE

for i in range(num_batches):
    start = i * BATCH_SIZE
    end = start + BATCH_SIZE

    # batch data
    batch_data = transformed_data[start:end]

    # convert each character in each word into one hot encoding
    char_list = []

    # the first name's length is used as the number of character positions
    # for the whole batch
    for c in range(len(batch_data[0][0])):
        batch_dataset = np.zeros([BATCH_SIZE, len(vocab)])

        for n in range(BATCH_SIZE):
            name = batch_data[n][0]
            char_index = char_to_id[name[c]]
            # mark the column belonging to this name's character at position c
            batch_dataset[n, char_index] = 1.0

        char_list.append(batch_dataset)

    training_data.append(char_list)

            

In [77]:
# hyper-parameters for the network and its optimizer

# layer sizes: input (embedding) units and hidden units
INPUT_UNITS, HIDDEN_UNITS = 100, 256

# the output layer has one unit per vocabulary symbol
OUTPUT_UNITS = len(vocab)

# learning rate
learning_rate = 0.005

# parameters for the Adam optimizer
beta_01, beta_02 = 0.90, 0.99

In [79]:
# activation functions

# sigmoid activation function
def Sigmoid(x: np.ndarray):
    """Apply the logistic sigmoid element-wise.

    Computes 1 / (1 + exp(-x)), which maps every real input into (0, 1).

    Args:
        x: array (or scalar) of pre-activation values.

    Returns:
        Array of the same shape as ``x`` with the sigmoid of each element.
    """
    # the "1 +" in the denominator is what bounds the result to (0, 1);
    # without it the expression collapses to exp(x)
    return 1.0 / (1.0 + np.exp(-x))

# hyperbolic tangent activation
def TanH(x: np.array):
    """Return the element-wise hyperbolic tangent of ``x``."""
    activated = np.tanh(x)
    return activated

# softmax activation function
def Softmax(x: np.ndarray):
    """Row-wise softmax.

    Each row of ``x`` is turned into a probability distribution: the entries
    are exponentiated and divided by the row's sum, so every row of the
    result sums to 1.

    Args:
        x: array whose axis 1 holds the scores to normalise
           (e.g. shape ``(batch, classes)``).

    Returns:
        Array of the same shape as ``x`` containing the row-wise softmax.
    """
    # subtracting the row maximum leaves the softmax unchanged mathematically
    # but keeps np.exp from overflowing to inf (and the result from becoming
    # nan) when the scores are large
    shifted = x - np.max(x, axis=1, keepdims=True)
    exp_X = np.exp(shifted)
    exp_X_Sum = np.sum(exp_X, axis=1, keepdims=True)
    return exp_X / exp_X_Sum

def tanh_derivative(x: np.array):
    """Return ``1 - x**2`` element-wise.

    This equals the derivative of tanh only when ``x`` is already the tanh
    output, since d/dz tanh(z) = 1 - tanh(z)**2.
    NOTE(review): presumably callers pass the activation, not the
    pre-activation — confirm at the call sites.
    """
    squared = x * x
    return 1 - squared