In [65]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

import transformers
import torch.optim.lr_scheduler as lr_scheduler

import numpy as np
random.seed(42)

In [13]:
Female = pd.read_csv(r'C:\Names dataset\Indian-Female-Names.csv')
Male = pd.read_csv(r'C:\Names dataset\Indian-Male-Names.csv')

df = pd.concat([Female, Male], axis=0)
df.reset_index(drop=True, inplace=True)
df = df.applymap(lambda x : x.strip() if isinstance(x, str) else x)
df.dropna(subset=['name'], inplace=True)

df['name'] = df['name'].str.replace('@', '')
df['name'] = df['name'].str.replace('.', '')

df['name'] = df['name'].str.split('s/o').str[0]
df['name'] = df['name'].str.split('d/o').str[0]
df['name'] = df['name'].str.split('r/o').str[0]
df['name'] = df['name'].str.split('w/o').str[0]
df['name'] = df['name'].str.split('c/o').str[0]
#df['name'] = df['name'].str.split(' ').str[0]

df['name_mod'] = (
    df['name'].astype(str) + '.'
)

words = df['name_mod'].values

  df = df.applymap(lambda x : x.strip() if isinstance(x, str) else x)


In [14]:
vocab = sorted(list(set(''.join(words))))

encode = {s:i for i,s in enumerate(vocab)}

decode = {i:s for s,i in encode.items()}

vocab_size = len(vocab)

In [15]:
block_size = 8

def build_dataset(words):  
    X, Y = [], []
  
    for w in words:
        context = [0] * block_size
        for ch in w:
            ix = encode[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr,  Ytr  = build_dataset(words[:n1])     
Xdev, Ydev = build_dataset(words[n1:n2])
Xte,  Yte  = build_dataset(words[n2:])

torch.Size([238059, 8]) torch.Size([238059])
torch.Size([29594, 8]) torch.Size([29594])
torch.Size([29631, 8]) torch.Size([29631])


In [115]:
embd = 30
hidden_dim = 4 * embd
learning_rate = 1e-3
batch_size = 64
n_layers = 4
n_epochs = 5 + 5

g = torch.Generator().manual_seed(2147483647)

In [116]:
class CustomDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        x = self.X[idx]
        y = self.Y[idx]
        return x, y

dataset = CustomDataset(Xtr, Ytr)

data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [117]:
class FFNBlock(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        
        self.ln = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.bn = nn.BatchNorm1d(hidden_dim)
        self.act = torch.nn.GELU(approximate='tanh')

        torch.nn.init.kaiming_normal_(self.ln.weight)
    
    def forward(self, x):
        
        out = self.act(self.bn(self.ln(x)))
        
        return out

In [118]:
class LanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.emblayer = nn.Embedding(vocab_size, embd)
        
        self.ln1 = nn.Linear(embd * block_size, hidden_dim, bias=False)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.act1 = torch.nn.GELU(approximate='tanh')
        
        self.FFNblocks = nn.Sequential(*[FFNBlock(hidden_dim) for _ in range(n_layers)])
        
        self.ln2 = nn.Linear(hidden_dim, vocab_size, bias=False)
        self.bn2 = nn.BatchNorm1d(vocab_size)
        self.act2 = torch.nn.GELU(approximate='tanh')

        torch.nn.init.kaiming_normal_(self.ln1.weight)
        torch.nn.init.kaiming_normal_(self.ln2.weight)
        
    def forward(self, x, y=None):
        
        batch_size, block_size = x.shape
        
        embeddings = self.emblayer(x)
        embeddings = embeddings.view(embeddings.shape[0], -1)
        logits = self.act1(self.bn1(self.ln1(embeddings)))
        logits = self.FFNblocks(logits)
        logits = self.act2(self.bn2(self.ln2(logits)))        
        
        if y is None:
            loss = None
        else:
            loss = F.cross_entropy(logits, y)

        return logits, loss
    
    def n_parameters(self):

        return sum(p.numel() for p in model.parameters())
    
    @torch.no_grad
    def generate(self, n_samples):
        
        for _ in range(n_samples):
            out = []
            context = [0] * block_size
            logits, loss = self(torch.tensor(context).view(1, -1))
            out_probs = F.softmax(logits, dim=-1)
            
            idx = torch.multinomial(out_probs, num_samples=1, generator=g).item()
        
            context = context[1:] + [idx]
            out.append(idx)
            
            if idx == 1:
                break
                
            return (''.join(decode[i] for i in out))

In [156]:
device = 'cuda'
model = LanguageModel()
print(model.n_parameters())

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
warmup_lr = transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=n_epochs * len(data_loader))

91856


In [157]:
%%time
for epoch in tqdm(range(n_epochs)):
    for _, (X, y) in tqdm(enumerate(data_loader), total=len(data_loader)):

        model.train()
        
        logits, loss = model(X, y)
        
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        warmup_lr.step()
    model.eval()
    print(f"Training Loss at epoch {epoch}: ", split_loss('train'))
    print(f"Validation Loss at epoch {epoch}: ", split_loss('val'))

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/3720 [00:00<?, ?it/s]

Training Loss at epoch 0:  1.5861452
Validation Loss at epoch 0:  1.6232734


  0%|          | 0/3720 [00:00<?, ?it/s]

Training Loss at epoch 1:  1.4858178
Validation Loss at epoch 1:  1.5315468


  0%|          | 0/3720 [00:00<?, ?it/s]

Training Loss at epoch 2:  1.4272082
Validation Loss at epoch 2:  1.486963


  0%|          | 0/3720 [00:00<?, ?it/s]

Training Loss at epoch 3:  1.3873935
Validation Loss at epoch 3:  1.4603183


  0%|          | 0/3720 [00:00<?, ?it/s]

Training Loss at epoch 4:  1.3483722
Validation Loss at epoch 4:  1.4330293


  0%|          | 0/3720 [00:00<?, ?it/s]

Training Loss at epoch 5:  1.3211188
Validation Loss at epoch 5:  1.415827


  0%|          | 0/3720 [00:00<?, ?it/s]

Training Loss at epoch 6:  1.2985774
Validation Loss at epoch 6:  1.4024539


  0%|          | 0/3720 [00:00<?, ?it/s]

Training Loss at epoch 7:  1.2836396
Validation Loss at epoch 7:  1.3953862


  0%|          | 0/3720 [00:00<?, ?it/s]

Training Loss at epoch 8:  1.275153
Validation Loss at epoch 8:  1.3917046


  0%|          | 0/3720 [00:00<?, ?it/s]

Training Loss at epoch 9:  1.2751048
Validation Loss at epoch 9:  1.3932405
CPU times: total: 34min 35s
Wall time: 5min 8s


In [168]:
model.eval()
for _ in range(30):
    out = []
    context = [0] * block_size
    while True:
        logits, loss = model(torch.tensor(context).view(1, -1))
        out_probs = F.softmax(logits, dim=-1)

        idx = torch.multinomial(out_probs, num_samples=1, generator=g).item()

        context = context[1:] + [idx]
        out.append(idx)

        if idx == 1:
            break

    print(''.join(decode[i] for i in out).strip('.'))

pooja
laxmi shregar
suman devi
ram khatun
jyoti
renu
sony sharma
sanjay kumar
radhika maloge
sangeeta
subhaj kumar
reshma ram
heena
ranarayan meena
puja
sunny baira
omparkash
rawav kuyar
ramesh chand
rajkao
archana
gulu ram
pinki hhawaraj
payak
altesh pandey kumar
kumag
shahna
komal
neha
narender kumar


In [171]:
@torch.no_grad() # this decorator disables gradient tracking
def split_loss(split):
    x,y = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xte, Yte),
  }[split]
    
    dataset = CustomDataset(x, y)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    losses = []
    for X, y in data_loader:
        logits, loss = model(X)
        loss = F.cross_entropy(logits, y)
        losses.append(loss)
        
    return np.mean(losses)
        

# put layers into eval mode
for layer in model.children():
    layer.training = False
print(split_loss('train'))
print(split_loss('val'))

1.2750894
1.3933704


In [170]:
torch.save(model.state_dict(), "C:\Character level language model\model.pth")