Since the model was trained on GoogleColab, this is the code for model training.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

Scrapping training data:

In [3]:
import requests
from bs4 import BeautifulSoup

names = []

for key in ['a', 'b', 'c', 'c-2', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
            'm', 'n', 'o', 'p', 'r', 's', 's-2', 't', 'u', 'v', 'z', 'z-2']:
    url = f'https://vardai.vlkk.lt/sarasas/{key}/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    male_links = soup.find_all('a', class_='names_list__links names_list__links--man')
    for link in male_links:
        names.append({'name': link.text, 'gender': 'male'})

    female_links = soup.find_all('a', class_='names_list__links names_list__links--woman')
    for link in female_links:
        names.append({'name': link.text, 'gender': 'female'})

df = pd.DataFrame(names)
df.to_csv('names_dataset.csv', index=False)


In [2]:
class NameDataset(Dataset):
    def __init__(self, csv_file):
        # Load CSV file 
        data = pd.read_csv(csv_file)
        self.names = data['name'].values
        self.genders = data['gender'].values

        # Create character set and mappings
        self.chars = sorted(list(set(''.join(self.names) + ' ')))
        self.char_to_int = {c: i for i, c in enumerate(self.chars)}
        self.int_to_char = {i: c for c, i in self.char_to_int.items()}
        self.vocab_size = len(self.chars)

        # Map genders to integers (0 for male, 1 for female)
        self.gender_to_int = {'male': 0, 'female': 1}
        self.int_to_gender = {0: 'male', 1: 'female'}

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):

        name = self.names[idx] + ' '  # Add padding character (space) at the end
        gender = self.genders[idx]

        encoded_name = [self.char_to_int[char] for char in name]
        encoded_gender = self.gender_to_int[gender]

        return torch.tensor(encoded_name), torch.tensor(encoded_gender)

In [None]:
dataset = NameDataset('/content/names_dataset.csv')

In [8]:
def pad_collate(batch):
    names, genders = zip(*batch)

    padded_seqs = pad_sequence(names, batch_first=True, padding_value=0)

    input_seq = padded_seqs[:, :-1]
    target_seq = padded_seqs[:, 1:]

    genders = torch.stack(genders)

    return input_seq, target_seq, genders

dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=pad_collate)

In [11]:
class MinimalTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, forward_expansion, gender_size):
        super(MinimalTransformer, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.gender_embed = nn.Embedding(gender_size, embed_size)  # Embedding for gender
        self.positional_encoding = nn.Parameter(torch.randn(1, 100, embed_size))
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_size, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1)
        self.output_layer = nn.Linear(embed_size, vocab_size)

    def forward(self, x, gender):
        # Embed gender and add it to the input embedding
        gender_emb = self.gender_embed(gender).unsqueeze(1).expand(-1, x.size(1), -1)  # Repeat gender embedding for each timestep
        positions = torch.arange(0, x.size(1)).unsqueeze(0)
        x = self.embed(x) + self.positional_encoding[:, :x.size(1), :] + gender_emb
        x = self.transformer_encoder(x)
        x = self.output_layer(x)
        return x


In [12]:
def train_model(model, dataloader, epochs=10):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    for epoch in range(epochs):
        model.train() 
        total_loss = 0.0
        batch_count = 0

        for _, (input_seq, target_seq, gender) in enumerate(dataloader):
            optimizer.zero_grad()
            output = model(input_seq, gender) 
            loss = criterion(output.transpose(1, 2), target_seq)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            batch_count += 1

        average_loss = total_loss / batch_count
        print(f'Epoch {epoch+1}, Average Loss: {average_loss}')

In [None]:
model = MinimalTransformer(
    vocab_size=dataset.vocab_size,
    embed_size=128,
    num_heads=8,
    forward_expansion=4,
    gender_size=2
)

In [19]:
train_model(model, dataloader, epochs=200)

Epoch 1, Average Loss: 1.259397326486384
Epoch 2, Average Loss: 1.2493830662471033
Epoch 3, Average Loss: 1.2500016917824275
Epoch 4, Average Loss: 1.2487285853374617
Epoch 5, Average Loss: 1.2438808934019487
Epoch 6, Average Loss: 1.2470497547873396
Epoch 7, Average Loss: 1.2499032272651733
Epoch 8, Average Loss: 1.240061286409853
Epoch 9, Average Loss: 1.235540660473669
Epoch 10, Average Loss: 1.2398626076374129
Epoch 11, Average Loss: 1.2360095355821692
Epoch 12, Average Loss: 1.234760382194293
Epoch 13, Average Loss: 1.2351019568594077
Epoch 14, Average Loss: 1.2338980902796206
Epoch 15, Average Loss: 1.231826863741215
Epoch 16, Average Loss: 1.2326393221677998
Epoch 17, Average Loss: 1.2316390134600312
Epoch 18, Average Loss: 1.2252049957339473
Epoch 19, Average Loss: 1.2304602632880681
Epoch 20, Average Loss: 1.229437719456292
Epoch 21, Average Loss: 1.2323143359700681
Epoch 22, Average Loss: 1.2272004732501365
Epoch 23, Average Loss: 1.2292890247148958
Epoch 24, Average Loss: 1.

Testing the trained model:

In [25]:
def sample(model, dataset, start_str='a', max_length=20, temperature=1.0, gender='male'):
    assert temperature > 0, "Temperature must be greater than 0"
    model.eval()  
    with torch.no_grad():

        chars = [dataset.char_to_int[c] for c in start_str]
        input_seq = torch.tensor(chars).unsqueeze(0)  # Add batch dimension
        gender_tensor = torch.tensor([dataset.gender_to_int[gender]])  # Gender encoding

        output_name = start_str
        for _ in range(max_length - len(start_str)):
            output = model(input_seq, gender_tensor)

            logits = output[0, -1] / temperature
            probabilities = torch.softmax(logits, dim=0)

            next_char_idx = torch.multinomial(probabilities, 1).item()
            next_char = dataset.int_to_char[next_char_idx]

            if next_char == ' ': 
                break

            output_name += next_char
            input_seq = torch.cat([input_seq, torch.tensor([[next_char_idx]])], dim=1)

        return output_name


In [None]:

print("Conservative male names:")
for _ in range(5):
    print(sample(model, dataset, start_str='R', temperature=0.5, gender='male')) 

print("\nCreative female names:")
for _ in range(5):
    print(sample(model, dataset, start_str='S', temperature=1.5, gender='female'))


In [28]:
torch.save(model, 'namesformer_model.pt')