In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn

In [2]:
# Read the three headerless name files (in the order boys, girls, unisex) and squeeze each
# loaded table down to a plain NumPy array.
boys, girls, unisex = (
    pd.read_csv(path, header=None).values.squeeze()
    for path in ('boys.txt', 'girls.txt', 'unisex.txt')
)

In [3]:
# Class ids: 0 = boys, 1 = girls, 2 = unisex.
# The label is given as a plain integer scalar (pandas broadcasts it over the rows).
# np.zeros_like / np.ones_like on the name arrays would inherit the dtype of the *names*
# (object / string) instead of producing an integer label column.
dataset_boys = pd.DataFrame({'name': boys, 'label': 0})
dataset_girls = pd.DataFrame({'name': girls, 'label': 1})
dataset_unisex = pd.DataFrame({'name': unisex, 'label': 2})

# Shuffle all rows once. A fixed random_state makes the shuffle (and therefore any
# positional train/test split taken from `dataset`) repeatable between kernel restarts.
dataset = (
    pd.concat([dataset_boys, dataset_girls, dataset_unisex])
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)

In [4]:
# Alphabet: every distinct character that occurs in any name, in sorted order.
# Joining the names into one string avoids np.concatenate over per-name lists, which raises
# on an empty dataset and yields NumPy string scalars instead of plain `str` keys.
all_letters = sorted(set(''.join(dataset['name'])))
# Character -> column position used by the one-hot encoding.
letter2ind = {letter: ii for ii, letter in enumerate(all_letters)}
n_letters = len(all_letters)
def name2tensor(name, letter_index=None):
    """One-hot encode a name, one row per character.

    Args:
        name: the string to encode.
        letter_index: optional mapping from character to column index. When omitted,
            the notebook-level `letter2ind` / `n_letters` are used, as before.

    Returns:
        A float tensor of shape (len(name), 1, width) where width is `n_letters`
        (default alphabet) or `len(letter_index)` (explicit alphabet). Each row holds
        a single 1 in the column of its character.

    Raises:
        KeyError: if `name` contains a character that is missing from the alphabet.
    """
    if letter_index is None:
        letter_index, width = letter2ind, n_letters
    else:
        width = len(letter_index)

    tens = torch.zeros(len(name), 1, width)
    for ii, letter in enumerate(name):
        try:
            column = letter_index[letter]
        except KeyError:
            # Same exception type as a bare dict lookup, but says which name caused it.
            raise KeyError(f'letter {letter!r} in name {name!r} is not in the alphabet') from None
        tens[ii, 0, column] = 1
    return tens

In [5]:
# Sanity check: encode one sample name and show the resulting tensor shape.
name2tensor('יוסי').shape

torch.Size([4, 1, 30])

In [6]:
# Pre-compute the one-hot tensor of every name once and keep it in its own column.
dataset['input_tensor'] = dataset['name'].apply(name2tensor)

In [7]:
# Exploratory leftover: evaluating the bare name only displays the function object; nothing is computed.
torch.softmax

<function _VariableFunctionsClass.softmax>

In [8]:
class RnnCell(nn.Module):
    """One recurrent step mapping (input, previous hidden state) to the next hidden state.

    The two vectors are concatenated along dim 1, sent through a single linear layer
    and normalised with log-softmax over dim 1. Note that the activation used here is
    log-softmax, not the tanh of a textbook Elman cell.

    Args:
        in_size: width of the input vector.
        hidden_size: width of the hidden state.
    """

    def __init__(self, in_size, hidden_size):
        super().__init__()
        self.in_size, self.hidden_size = in_size, hidden_size
        self.i2h = nn.Linear(in_features=in_size + hidden_size, out_features=hidden_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Return the next hidden state for a single time step."""
        joined = torch.cat((input, hidden), dim=1)
        return self.softmax(self.i2h(joined))

In [15]:
class LstmCell(nn.Module):
    """Single LSTM step built from four linear layers over [input, hidden].

    Gates (all computed from the concatenation of input and previous hidden state):
        forget  = sigmoid(i2f)
        update  = sigmoid(i2u)
        output  = sigmoid(i2o)
        cell~   = tanh(i2c)

    Args:
        in_size: width of the input vector.
        hidden_size: width of the hidden and cell states.
    """

    def __init__(self, in_size, hidden_size):
        super().__init__()
        self.in_size, self.hidden_size = in_size, hidden_size
        joined_size = in_size + hidden_size
        # Layers are created in the order f, u, o, c.
        self.i2f = nn.Linear(joined_size, hidden_size)
        self.i2u = nn.Linear(joined_size, hidden_size)
        self.i2o = nn.Linear(joined_size, hidden_size)
        self.i2c = nn.Linear(joined_size, hidden_size)
        # Kept as an attribute; forward() below does not use it.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, cell):
        """Advance one time step.

        Returns:
            (new_hidden, new_cell) for the next step.
        """
        joined = torch.cat((input, hidden), dim=1)

        forget_gate = torch.sigmoid(self.i2f(joined))
        update_gate = torch.sigmoid(self.i2u(joined))
        output_gate = torch.sigmoid(self.i2o(joined))
        candidate = torch.tanh(self.i2c(joined))

        # Keep part of the old memory and blend in the gated candidate.
        new_cell = forget_gate * cell + update_gate * candidate
        return output_gate * torch.tanh(new_cell), new_cell

In [16]:
class RnnNet(nn.Module):
    """Sequence classifier: runs an RnnCell over the time steps and classifies the final hidden state.

    Args:
        in_size: width of each input vector (one-hot letter).
        hidden_size: width of the recurrent hidden state.
        out_size: number of output classes.
    """

    def __init__(self, in_size, hidden_size, out_size):
        super().__init__()
        self.in_size = in_size
        self.hidden_size = hidden_size
        self.out_size = out_size
        self.output_layer = nn.Linear(hidden_size, out_size)
        self.rnn_cell = RnnCell(in_size, hidden_size)

    def forward(self, input_tensor):
        """Return raw class scores for a sequence.

        Args:
            input_tensor: tensor of shape (seq_len, batch, in_size). Tensors produced by
                `name2tensor` have batch == 1.

        Returns:
            Tensor of shape (batch, out_size) holding unnormalised scores (no softmax
            is applied here).
        """
        # Build the initial state from the input so that it follows the input's batch
        # size, dtype and device instead of being fixed to one row on the default device.
        hidden = input_tensor.new_zeros(input_tensor.size(1), self.hidden_size)
        for line in input_tensor:
            hidden = self.rnn_cell(line, hidden)
        return self.output_layer(hidden)

In [17]:
class LstmNet(nn.Module):
    """Sequence classifier: runs an LstmCell over the time steps, then classifies from
    the last input vector concatenated with the final hidden state.

    Args:
        in_size: width of each input vector (one-hot letter).
        hidden_size: width of the hidden and cell states.
        out_size: number of output classes.
    """

    def __init__(self, in_size, hidden_size, out_size):
        super().__init__()
        self.in_size = in_size
        self.hidden_size = hidden_size
        self.out_size = out_size
        self.output_layer = nn.Linear(in_size + hidden_size, out_size)
        self.lstm_cell = LstmCell(in_size, hidden_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_tensor):
        """Return log-probabilities over the classes for a sequence.

        Args:
            input_tensor: tensor of shape (seq_len, batch, in_size) with seq_len >= 1.
                Tensors produced by `name2tensor` have batch == 1.

        Returns:
            Tensor of shape (batch, out_size) of log-softmax outputs (pair with NLLLoss).

        Raises:
            ValueError: if the sequence has no time steps; the output layer needs the
                last input vector, which would otherwise be undefined.
        """
        if input_tensor.size(0) == 0:
            raise ValueError('LstmNet needs at least one time step (got an empty sequence)')

        # Build the initial states from the input so they follow its batch size, dtype
        # and device instead of being fixed to one row on the default device.
        batch_size = input_tensor.size(1)
        hidden = input_tensor.new_zeros(batch_size, self.hidden_size)
        cell = input_tensor.new_zeros(batch_size, self.hidden_size)
        for line in input_tensor:
            hidden, cell = self.lstm_cell(line, hidden, cell)
        # `line` still refers to the last time step here.
        output = self.softmax(self.output_layer(torch.cat([line, hidden], dim=1)))
        return output

In [18]:
# Positional split of the shuffled dataset: the first 1000 rows train the model,
# every remaining row is held out for testing.
dataset_train, dataset_test = dataset.iloc[:1000], dataset.iloc[1000:]

In [13]:
# Rnn
# Seed the weight initialisation so that repeated runs start from the same parameters.
torch.manual_seed(0)

hidden_size = 200
n_classes = 3
model = RnnNet(in_size=n_letters, hidden_size=hidden_size, out_size=n_classes)
# RnnNet returns raw scores, so CrossEntropyLoss (log-softmax + NLL in one) is the matching loss.
loss_fun = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

n_epochs = 30
# One entry per epoch: mean loss and accuracy (in percent) on each split.
train_loss, test_loss, train_acc, test_acc = [], [], [], []
N_names = len(dataset_train)
for epoch_i in range(n_epochs):
    # ---- training pass: one optimizer step per name ----
    model.train()
    name_loss = []
    name_acc = []
    # enumerate() gives the position within the epoch; the DataFrame index label is
    # not guaranteed to run 0..N-1, so it must not be used to index the lists below.
    for ii, (_, row) in enumerate(dataset_train.iterrows()):
        label = torch.tensor([int(row.label)])
        tensor = row.input_tensor
        logits = model(tensor)
        loss = loss_fun(logits, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        name_loss.append(loss.item())
        name_acc.append(float((torch.argmax(logits, dim=1) == label).item()))

        # 20-character progress bar plus the last loss and the running epoch accuracy.
        # (The accuracy is shown in percent; a single 0/1 hit flag with '%' was misleading.)
        done = int(20 * (ii + 1) / N_names)
        running_acc = 100 * sum(name_acc) / len(name_acc)
        print(f'name:{ii + 1}/{N_names}|{done * "="}{(20 - done) * "-"}'
              f'|loss:{name_loss[-1]:0.3f}|accuracy:{running_acc:0.2f}%', end='\r')
    train_loss.append(np.mean(name_loss))
    train_acc.append(100 * np.mean(name_acc))

    # ---- evaluation pass: no gradients, no parameter updates ----
    model.eval()
    with torch.no_grad():
        name_loss = []
        name_acc = []
        for _, row in dataset_test.iterrows():
            label = torch.tensor([int(row.label)])
            tensor = row.input_tensor
            logits = model(tensor)
            loss = loss_fun(logits, label)
            name_loss.append(loss.item())
            name_acc.append(float((torch.argmax(logits, dim=1) == label).item()))
        test_loss.append(np.mean(name_loss))
        test_acc.append(100 * np.mean(name_acc))
    print(f'epoch:{epoch_i + 1}/{n_epochs}|loss Train/Test: {train_loss[epoch_i]:0.3f}/{test_loss[epoch_i]:0.3f}|'
          f'accuracy Train/Test: {train_acc[epoch_i]:0.2f}%/{test_acc[epoch_i]:0.2f}%')

epoch:1/30|loss Train/Test: 1.453/1.099|accuracy Train/Test: 45.00%/48.74%
epoch:2/30|loss Train/Test: 1.084/1.082|accuracy Train/Test: 48.40%/48.74%
epoch:3/30|loss Train/Test: 1.035/1.066|accuracy Train/Test: 50.50%/49.16%
epoch:4/30|loss Train/Test: 1.002/1.051|accuracy Train/Test: 52.50%/50.00%
epoch:5/30|loss Train/Test: 0.975/1.038|accuracy Train/Test: 53.50%/50.42%
epoch:6/30|loss Train/Test: 0.952/1.026|accuracy Train/Test: 55.40%/51.26%
epoch:7/30|loss Train/Test: 0.932/1.016|accuracy Train/Test: 56.00%/52.10%
epoch:8/30|loss Train/Test: 0.914/1.008|accuracy Train/Test: 57.40%/54.62%
epoch:9/30|loss Train/Test: 0.899/1.002|accuracy Train/Test: 58.50%/55.46%
epoch:10/30|loss Train/Test: 0.887/0.997|accuracy Train/Test: 59.10%/57.14%
epoch:11/30|loss Train/Test: 0.877/0.992|accuracy Train/Test: 59.60%/57.98%
epoch:12/30|loss Train/Test: 0.869/0.989|accuracy Train/Test: 60.60%/57.98%
epoch:13/30|loss Train/Test: 0.862/0.986|accuracy Train/Test: 60.90%/58.40%
epoch:14/30|loss Trai

In [20]:
# Lstm
# Seed the weight initialisation so that repeated runs start from the same parameters.
torch.manual_seed(0)

hidden_size = 120
n_classes = 3
model = LstmNet(in_size=n_letters, hidden_size=hidden_size, out_size=n_classes)
# LstmNet already applies log-softmax to its output, so NLLLoss is the matching loss
# (CrossEntropyLoss would apply log-softmax a second time).
loss_fun = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

n_epochs = 50
# One entry per epoch: mean loss and accuracy (in percent) on each split.
train_loss, test_loss, train_acc, test_acc = [], [], [], []
N_names = len(dataset_train)
for epoch_i in range(n_epochs):
    # ---- training pass: one optimizer step per name ----
    model.train()
    name_loss = []
    name_acc = []
    # enumerate() gives the position within the epoch; the DataFrame index label is
    # not guaranteed to run 0..N-1, so it must not be used to index the lists below.
    for ii, (_, row) in enumerate(dataset_train.iterrows()):
        label = torch.tensor([int(row.label)])
        tensor = row.input_tensor
        logits = model(tensor)  # log-probabilities for this model
        loss = loss_fun(logits, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        name_loss.append(loss.item())
        name_acc.append(float((torch.argmax(logits, dim=1) == label).item()))

        # 20-character progress bar plus the last loss and the running epoch accuracy.
        # (The accuracy is shown in percent; a single 0/1 hit flag with '%' was misleading.)
        done = int(20 * (ii + 1) / N_names)
        running_acc = 100 * sum(name_acc) / len(name_acc)
        print(f'name:{ii + 1}/{N_names}|{done * "="}{(20 - done) * "-"}'
              f'|loss:{name_loss[-1]:0.3f}|accuracy:{running_acc:0.2f}%', end='\r')
    train_loss.append(np.mean(name_loss))
    train_acc.append(100 * np.mean(name_acc))

    # ---- evaluation pass: no gradients, no parameter updates ----
    model.eval()
    with torch.no_grad():
        name_loss = []
        name_acc = []
        for _, row in dataset_test.iterrows():
            label = torch.tensor([int(row.label)])
            tensor = row.input_tensor
            logits = model(tensor)
            loss = loss_fun(logits, label)
            name_loss.append(loss.item())
            name_acc.append(float((torch.argmax(logits, dim=1) == label).item()))
        test_loss.append(np.mean(name_loss))
        test_acc.append(100 * np.mean(name_acc))
    print(f'epoch:{epoch_i + 1}/{n_epochs}|loss Train/Test: {train_loss[epoch_i]:0.3f}/{test_loss[epoch_i]:0.3f}|'
          f'accuracy Train/Test: {train_acc[epoch_i]:0.2f}%/{test_acc[epoch_i]:0.2f}%')

epoch:1/50|loss Train/Test: 0.876/0.884|accuracy Train/Test: 59.60%/61.76%
epoch:2/50|loss Train/Test: 0.756/0.870|accuracy Train/Test: 67.20%/62.18%
epoch:3/50|loss Train/Test: 0.713/0.913|accuracy Train/Test: 68.60%/63.45%
epoch:4/50|loss Train/Test: 0.676/0.953|accuracy Train/Test: 71.00%/63.45%
epoch:5/50|loss Train/Test: 0.646/0.991|accuracy Train/Test: 72.30%/61.34%
epoch:6/50|loss Train/Test: 0.622/1.027|accuracy Train/Test: 73.20%/60.92%
epoch:7/50|loss Train/Test: 0.603/1.065|accuracy Train/Test: 74.60%/60.92%
epoch:8/50|loss Train/Test: 0.583/1.112|accuracy Train/Test: 75.20%/62.61%
epoch:9/50|loss Train/Test: 0.565/1.147|accuracy Train/Test: 75.40%/62.18%
epoch:10/50|loss Train/Test: 0.542/1.186|accuracy Train/Test: 76.10%/62.18%
epoch:11/50|loss Train/Test: 0.521/1.244|accuracy Train/Test: 77.40%/60.08%
epoch:12/50|loss Train/Test: 0.500/1.302|accuracy Train/Test: 79.50%/57.98%
epoch:13/50|loss Train/Test: 0.466/1.382|accuracy Train/Test: 81.00%/57.14%
epoch:14/50|loss Trai

In [71]:
# Number of training rows whose label is 0 (the class assigned to the boys list).
dataset_train['label'].eq(0).sum()

359

# Web Scraping

In [2]:
# Imports
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

https://babynames.baby-land.co.il/namelist/?_sft_name_gender=boy&_sft_name_letter=%d7%90

In [5]:
# Fetch one listing page (boys, first letter alef) to explore its HTML structure.
names_url = 'https://babynames.baby-land.co.il/namelist/?_sft_name_gender=boy&_sft_name_letter=א'
# A timeout keeps the cell from hanging indefinitely if the server does not answer.
names_response = requests.get(names_url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page as if it were the listing.
names_response.raise_for_status()
names_soup = BeautifulSoup(names_response.text, 'html.parser')
names_soup.title

<title>כל השמות | אתר השמות הגדול</title>

In [4]:
# Text content of the parsed page's <title> tag.
names_soup.title.string

'אתר השמות הגדול | by Babyland'

In [38]:
# Scrape every (gender, first letter) listing page and collect the names by link class.
base_url = 'https://babynames.baby-land.co.il/namelist/?_sft_name_gender='
base_letter_url = "&_sft_name_letter="
first_letter = "אבגדהוזחטיכלמנסעפצקרשת"
gender_url = ['boy', 'girl', 'unisex']

# Collect straight into sets: duplicates are dropped as they arrive and the names keep
# one type for the whole cell (the result is a set, exactly as before).
boys = set()
girls = set()
unisex = set()
# Pages that did not return a successful HTTP status; reported after the loop so that
# missing data is visible instead of silently producing fewer names.
failed_pages = []

# One Session reuses the underlying connection across all requests.
with requests.Session() as session:
    for gender in gender_url:
        for letter in first_letter:
            print(f'gender: {gender}, letter = {letter}', end='\r')
            url = base_url + gender + base_letter_url + letter
            # The timeout prevents a single stalled request from hanging the whole scrape.
            names_response = session.get(url, timeout=30)
            if not names_response.ok:
                failed_pages.append((gender, letter, names_response.status_code))
                continue
            names_soup = BeautifulSoup(names_response.text, 'html.parser')
            # Every page is scanned for all three link classes, whatever gender was requested.
            boys.update(name.text for name in names_soup.find_all('a', {'class': 'boys'}))
            girls.update(name.text for name in names_soup.find_all('a', {'class': 'girl'}))
            unisex.update(name.text for name in names_soup.find_all('a', {'class': 'unisex'}))

if failed_pages:
    print(f'\n{len(failed_pages)} page(s) could not be fetched: {failed_pages}')

gender: unisex, letter = ת

In [39]:
# Number of distinct unisex names collected by the scrape.
len(unisex)

388

In [40]:
# Number of distinct boy names collected by the scrape.
len(boys)

514

In [41]:
# Number of distinct girl names collected by the scrape.
len(girls)

695

In [42]:
# Display the scraped boy names. NOTE(review): this prints the entire set; a short
# preview such as sorted(boys)[:20] would keep the notebook output compact.
boys

{'אבא',
 'אבטמו',
 'אביגדור',
 'אבידן',
 'אביהו',
 'אביחי',
 'אבימלך',
 'אבינדב',
 'אבינועם',
 'אבינעם',
 'אביעד',
 'אביצור',
 'אביר',
 'אבירם',
 'אבישי',
 'אבישר',
 'אביתר',
 'אבנר',
 'אברהם',
 'אבשלום',
 'אדי',
 'אדיר',
 'אדם',
 'אהוד',
 'אהרון',
 'אודי',
 'אוהד',
 'אולג',
 'אוליבייה',
 'אומרי',
 'און',
 'אוניל',
 'אחיה',
 'אחינדב',
 'איאן',
 'אידו',
 'אייזיק',
 'איל',
 'איליי',
 'אילן',
 'אילעאי',
 'אילעי',
 'איציק',
 'איתי',
 'איתיאל',
 'איתם',
 'איתמר',
 'איתן',
 'אל',
 'אלאור',
 'אלדד',
 'אלדר',
 'אלון',
 'אלחי',
 'אלחנן',
 'אלטר',
 'אליאב',
 'אליאל',
 'אליאם',
 'אליאס',
 'אליהב',
 'אליהו',
 'אליחי',
 'אלימלך',
 'אליסף',
 'אליעד',
 'אליעוז',
 'אליעזר',
 'אליעם',
 'אליקים',
 'אלירוי',
 'אלירון',
 'אלירם',
 'אלירן',
 'אלישי',
 'אלישיב',
 'אלישמע',
 'אלישע',
 'אלכסנדר',
 'אלן',
 'אלנתן',
 'אלעאי',
 'אלעד',
 'אלעזר',
 'אלקיים',
 'אלקנה',
 'אלרואי',
 'אלרוי',
 'אלרום',
 'אלרועי',
 'אלתר',
 'אמוץ',
 'אמיל',
 'אמיר',
 'אמיתי',
 'אמנון',
 'אמציה',
 'אנוש',
 'אנטוני',
 'אסא',
 'אסיף',
 'א