In [1]:
import pandas as pd
import torch
import torch.nn as nn
import re
from collections import Counter
import numpy as np
import random
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence,pack_padded_sequence, pad_packed_sequence
import os
# Seed the three RNGs imported above (python `random`, numpy, torch) with one
# shared value so the stochastic steps in later cells repeat on a fresh run.
seed = 1234

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x299b3730>

In [2]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    One linear layer produces queries, keys and values in a single pass; they
    are split into ``heads`` heads of size ``emd_dim // heads``, attended
    independently, merged back and passed through an output projection.

    Args:
        emd_dim: Size of the last input dimension; must be divisible by ``heads``.
        heads: Number of attention heads.
        dropout: Dropout probability applied to the attention probabilities.
    """

    def __init__(self, emd_dim, heads=4, dropout = 0.2):
        super().__init__()
        assert emd_dim % heads == 0
        self.heads = heads
        self.head_dim = emd_dim//heads
        self.scale = self.head_dim ** -0.5
        # Fused projection: the output is chunked into q, k, v in forward().
        self.multiHead = nn.Linear(emd_dim, emd_dim*3)
        self.output = nn.Linear(emd_dim,emd_dim)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def add_masking(attn_scores, padding_mask):
        """Set the scores of masked-out key positions to ``-inf`` (in place).

        Args:
            attn_scores: Scores of shape ``(B, heads, T, T)``.
            padding_mask: ``(B, T)`` tensor; entries equal to 0/False mark key
                positions that must not receive attention.

        Returns:
            The same ``attn_scores`` tensor, modified in place.
        """
        # Broadcast over heads and query positions: only key columns are masked.
        col_mask = padding_mask[:, None, None, :]
        attn_scores.masked_fill_((col_mask == 0), float('-inf'))
        return attn_scores

    def forward(self, x, padding_mask=None, attn_mask=False, kv_cache = None):
        """Apply self-attention to ``x``.

        Args:
            x: Input of shape ``(B, T, emd_dim)``.
            padding_mask: Optional ``(B, T)`` mask; 0/False marks padded keys.
            attn_mask: When true, apply a causal mask so position ``t`` only
                attends to positions ``<= t``.
            kv_cache: Accepted for interface compatibility; currently ignored.

        Returns:
            Tensor of shape ``(B, T, emd_dim)``.
        """
        B, T, C = x.shape
        qkv = self.multiHead(x)
        q, k, v = torch.chunk(qkv, 3, dim=-1)
        # (B, T, C) -> (B, heads, T, head_dim)
        q, k, v = (
            t.view(B, T, self.heads, self.head_dim).permute(0, 2, 1, 3)
            for t in (q, k, v)
        )
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        if attn_mask:
            # Build the causal mask on the input's device; a default-device
            # mask raises a device-mismatch error when x is not on the CPU.
            future = torch.triu(
                torch.ones(T, T, dtype=torch.bool, device=x.device), diagonal=1
            )
            attn_scores = attn_scores.masked_fill(future, float('-inf'))
        if padding_mask is not None:
            attn_scores = self.add_masking(
                attn_scores, padding_mask.to(attn_scores.device)
            )
        attn_probs = torch.softmax(attn_scores, dim=-1)
        attn_probs_drop = self.dropout(attn_probs)
        attn_output = torch.matmul(attn_probs_drop, v)
        # (B, heads, T, head_dim) -> (B, T, C)
        fn_attn_output = attn_output.permute(0, 2, 1, 3).reshape(B, T, C)
        return self.output(fn_attn_output)


In [3]:
class LayerNorm1D(nn.Module):
  """Layer normalisation over the last dimension with a learned scale and shift.

  Args:
    dim: Size of the last dimension of the inputs.
    eps: Constant added to the variance before the square root.
  """

  def __init__(self, dim, eps=1e-5):
    super().__init__()
    self.eps = eps
    # gamma starts at one and beta at zero, so the initial output is the plain normalised input.
    self.gamma = nn.Parameter(torch.ones(dim))
    self.beta = nn.Parameter(torch.zeros(dim))

  def forward(self, x):
    """Normalise ``x`` along its last axis, then apply ``gamma`` and ``beta``."""
    centered = x - x.mean(-1, keepdim=True)
    # Biased (population) variance.
    variance = x.var(-1, unbiased=False, keepdim=True)
    normalised = centered / torch.sqrt(variance + self.eps)
    return (self.gamma * normalised) + self.beta

In [4]:
class FeedForward(nn.Module):
  """Position-wise feed-forward network: Linear -> ReLU -> Linear -> Dropout.

  Args:
    input_dim: Size of the last dimension of the input.
    hidden_dim: Width of the intermediate layer.
    output_dim: Size of the last dimension of the output.
    dropout: Dropout probability applied after the second linear layer.
  """

  def __init__(self, input_dim, hidden_dim, output_dim, dropout = 0.2):
    super().__init__()
    # The linears are created in the same order as their Sequential slots (0 and 2).
    expand = nn.Linear(input_dim, hidden_dim)
    project = nn.Linear(hidden_dim, output_dim)
    self.feed_forward_layer = nn.Sequential(
      expand,
      nn.ReLU(),
      project,
      nn.Dropout(dropout),
    )

  def forward(self, x):
    """Run ``x`` through the stacked layers."""
    result = self.feed_forward_layer(x)
    return result

In [5]:
class EncoderBlock(nn.Module):
    """Pre-norm transformer encoder block: self-attention then feed-forward.

    Each sub-layer is applied to a layer-normalised copy of the input and its
    result is added back to the input (residual connection).

    Args:
        embed_dim: Size of the last dimension of the inputs.
        heads: Number of attention heads.
    """

    def __init__(self,embed_dim, heads=4):
        super().__init__()
        self.layer_norm1 = LayerNorm1D(embed_dim)
        self.layer_norm2 = LayerNorm1D(embed_dim)
        self.multi_head_attn =  MultiHeadAttention(embed_dim, heads)
        # The hidden width of the feed-forward network is 4x the embedding size.
        self.feed_forward_layer = FeedForward(embed_dim, embed_dim*4, embed_dim)

    def forward(self, x, padding_mask=None):
        """Apply the block.

        Args:
            x: Input of shape ``(B, T, embed_dim)``.
            padding_mask: Optional ``(B, T)`` mask forwarded to the attention
                layer. Defaults to ``None`` (no masking), matching
                ``Encoder.forward`` and ``MultiHeadAttention.forward``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        x = x + self.multi_head_attn(self.layer_norm1(x), padding_mask)
        x = x + self.feed_forward_layer(self.layer_norm2(x))
        return x


In [6]:
class Encoder(nn.Module):
    """A stack of ``num_layers`` encoder blocks applied one after another.

    Args:
        embed_dim: Size of the last dimension of the inputs.
        src_max_length: Accepted but not used by this class.
        heads: Number of attention heads in every block.
        num_layers: Number of stacked blocks.
    """

    def __init__(self, embed_dim, src_max_length, heads = 4, num_layers=4):
        super().__init__()
        self.encoder_blocks = nn.ModuleList(
            EncoderBlock(embed_dim, heads) for _ in range(num_layers)
        )

    def forward(self, x, padding_mask = None):
        """Feed ``x`` through every block in order, passing ``padding_mask`` to each."""
        hidden = x
        for layer in self.encoder_blocks:
            hidden = layer(hidden, padding_mask = padding_mask)
        return hidden

In [7]:
class EmbeddingBlock(nn.Module):
    """Token embedding plus learned positional embedding.

    Args:
        embed_dim: Size of each embedding vector.
        vocab_size: Number of rows in the token embedding table.
        max_length: Number of positions the positional table can encode.
        segment_needed: Placeholder flag for a segment embedding that is not
            implemented yet; it currently has no effect on the output.
    """

    def __init__(self, embed_dim, vocab_size, max_length, segment_needed=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_embedding = nn.Embedding(max_length, embed_dim)
        self.segment_needed = segment_needed
        if self.segment_needed:
            
            #self.segmentation_embedding
            pass

    def forward(self, x):
        """Embed a batch of token ids.

        Args:
            x: Integer tensor of shape ``(B, T)``.

        Returns:
            Tensor of shape ``(B, T, embed_dim)``.

        Raises:
            IndexError: If ``T`` exceeds the number of positions in the
                positional embedding table.
        """
        seq_len = x.shape[1]
        max_positions = self.positional_embedding.num_embeddings
        if seq_len > max_positions:
            # Fail early with a readable message instead of an opaque lookup error.
            raise IndexError(
                f"sequence length {seq_len} exceeds max_length {max_positions}"
            )
        x_emd = self.embedding(x)
        # Position ids must live on the same device as the input ids.
        positions = torch.arange(seq_len, device=x.device)
        x_pos_emd = self.positional_embedding(positions)
        x = x_emd + x_pos_emd
        if self.segment_needed:
            #self.segmentation_embedding
            pass
        return x
        

In [8]:
class BERT(nn.Module):
    """Encoder-only transformer that predicts a vocabulary distribution per position.

    Args:
        embed_dim: Embedding / hidden size.
        vocab_size: Number of tokens in the vocabulary (also the output size).
        max_length: Number of positions the positional embedding can encode.
        heads: Number of attention heads in every encoder block.
        num_layers: Number of encoder blocks.
    """

    def __init__(self, embed_dim, vocab_size, max_length, heads = 4, num_layers = 4):
        super().__init__()
        self.embedding = EmbeddingBlock(embed_dim, vocab_size, max_length)
        # Forward the constructor arguments; they were previously ignored in
        # favour of hard-coded values of 4.
        self.encoder = Encoder(embed_dim, max_length, heads = heads, num_layers = num_layers)
        self.linear = nn.Linear(embed_dim, vocab_size)

    def forward(self, src, mask=None):
        """Compute per-position logits.

        Args:
            src: Integer tensor of token ids, shape ``(B, T)``.
            mask: Optional ``(B, T)`` padding mask passed to the encoder;
                0/False marks padded positions.

        Returns:
            Logits of shape ``(B, T, vocab_size)``.
        """
        src = self.embedding(src)
        encoder_outputs = self.encoder(src, mask)
        output = self.linear(encoder_outputs)
        return output

In [9]:
# NOTE(review): absolute path on a single machine; the r'' prefix combined with
# doubled backslashes yields literal double separators - confirm this is intended.
path = r'C:\\Users\\harish-4072\\Downloads\\eng_french.csv'
# header=0 together with names=[...] replaces the file's own header row with these column names.
df = pd.read_csv(path, names=['English','French'], header=0)

In [10]:
def preprocess_text(text):
    """Lower-case ``text``, strip everything except a-z and whitespace, and split it.

    Returns:
        The list of tokens, or ``None`` when fewer than five tokens remain.
    """
    words = re.sub(r'[^a-z\s]', '', text.lower()).split()
    if len(words) < 5:
        return None
    return words


In [11]:
# Keep only the English column, tokenise it, and drop the rows preprocess_text rejected.
df = df['English'].dropna().apply(preprocess_text).dropna()

# Ids 0 and 1 are reserved for the special tokens, so corpus tokens are numbered from 2.
vocab = Counter(token for sentence in df for token in sentence)
token_to_id = {token: idx for idx, token in enumerate(vocab, start=2)}
token_to_id.update({'<PAD>': 0, '<MASK>': 1})
id_to_token = {idx: token for token, idx in token_to_id.items()}
vocab_size = len(id_to_token)


In [12]:
def tokenize_text(tokens, token_to_id, unk_id=0):
    """Map a sequence of tokens to their integer ids.

    Args:
        tokens: Iterable of token strings.
        token_to_id: Mapping from token string to integer id.
        unk_id: Id used for tokens missing from ``token_to_id``. Defaults to
            0, the value this function used before the parameter existed.

    Returns:
        List of integer ids, one per input token.
    """
    return [token_to_id.get(token, unk_id) for token in tokens]

# Convert every token list in `df` into a list of integer ids via `token_to_id`.
df_sentences = df.apply(lambda x: tokenize_text(x, token_to_id))

In [13]:
# Renumber the index 0..n-1 and discard the old labels (drop=True), which have gaps after the dropna calls.
df_sentences = df_sentences.reset_index(drop=True)

In [14]:
# Two hand-written example sentences, cleaned and converted to id lists.
sentence1, sentence2 = (
    tokenize_text(preprocess_text(raw_sentence), token_to_id)
    for raw_sentence in (
        "Hello, this is a good thing in life.",
        "I went to play cricket in rainy weather",
    )
)


In [15]:
# NOTE(review): plain assignment binds a second name to the SAME list, so the
# writes below also change sentence1 / sentence2 - confirm this is intended.
sentence1_masked = sentence1
sentence2_masked = sentence2
# Id 1 is the id that token_to_id assigns to '<MASK>'.
sentence1_masked[2]= 1
sentence2_masked[4] = 1 

In [16]:
class SentencesDataset(Dataset):
    """Masked-token dataset: each access returns a freshly, randomly masked sentence.

    Args:
        sequences: Indexable collection of token-id sequences.
        mask_prob: Independent probability of masking each token; must lie in
            ``[0, 1]``.
        mask_token_id: Id written into masked positions. Defaults to 1, the
            value this class used before the parameter existed.

    Raises:
        ValueError: If ``mask_prob`` is outside ``[0, 1]``.
    """

    def __init__(self, sequences, mask_prob = 0.20, mask_token_id = 1):
        if not 0.0 <= mask_prob <= 1.0:
            raise ValueError(f"mask_prob must be in [0, 1], got {mask_prob}")
        self.sequences = sequences
        self.mask_prob = mask_prob
        self.mask_token_id = mask_token_id

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self,idx):
        """Return ``(masked_ids, original_ids, mask)`` for sequence ``idx``.

        ``mask`` is a bool tensor that is True where a token was replaced by
        ``mask_token_id``. The mask is re-sampled on every call, and it may
        select no position at all.
        """
        sequence = self.sequences[idx]
        # Explicit float dtype so an integer mask_prob (0 or 1) is still a
        # valid input for torch.bernoulli.
        probs = torch.full((len(sequence),), self.mask_prob, dtype=torch.float)
        masked_idx_bool = torch.bernoulli(probs).bool()
        X = torch.tensor(sequence, dtype=torch.long)
        X_masked = X.masked_fill(masked_idx_bool, self.mask_token_id)
        return X_masked, X, masked_idx_bool

In [17]:
# for i,j,k in SentencesDataset(df_sentences):
#     print(i, j, type(k))
#     break

In [18]:
def collate_fn(batch):
    """Pad a list of ``(masked_ids, original_ids, mask)`` samples into batch tensors.

    Returns:
        ``(X_padded, y_padded, padding_mask, mask)``: the id tensors are padded
        with 0 and ``mask`` with False; ``padding_mask`` is True wherever the
        padded masked input is non-zero.
    """
    masked_inputs, targets, mlm_flags = zip(*batch)

    def pad(tensors, fill):
        return pad_sequence(tensors, batch_first=True, padding_value=fill)

    X_padded = pad(masked_inputs, 0)
    return X_padded, pad(targets, 0), X_padded != 0, pad(mlm_flags, False)

In [19]:
# df_sentences = df_sentences[:50000].reset_index(drop=True)

In [20]:
# Wrap the id sequences in the masking dataset and split them 80% / 20% into train and validation.
dataset = SentencesDataset(df_sentences)
train_size = int(0.8 * len(dataset))
# The remainder goes to validation so the two sizes always add up to len(dataset).
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
# Only the training loader shuffles; both pad their batches through collate_fn.
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True,collate_fn = collate_fn)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False,collate_fn = collate_fn)


In [21]:
# for X_padded, y_padded, padding_mask, masked in train_loader:
#     print(X_padded, y_padded, padding_mask, type(masked))
#     logits = torch.randn(2,X_padded.shape[1],10)
#     labels = torch.randint(1,2,(2, X_padded.shape[1]))
#     # print(logits.shape, labels.shape, type(masked))
#     # masked_logits = logits[masked]  
#     # masked_labels = labels[masked]
#     # print(masked_logits, masked_labels)
#     mask_tensor = torch.stack(masked)  

#     indices = torch.nonzero(mask_tensor, as_tuple=True)
#     print(mask_tensor, indices)
    
#     logits = logits[indices[0], indices[1]]  # Shape [num_selected, 10]
#     labels = y_padded[indices[0], indices[1]] 
#     print(logits, labels)
#     break

In [22]:
# Model / data hyperparameters.
# NOTE(review): HIDDEN_DIM, NUM_LAYERS and DROPOUT are not referenced by any of
# the cells visible here - confirm whether they are still needed.
EMBEDDING_DIM = 128
HIDDEN_DIM = 128
NUM_LAYERS = 4
DROPOUT = 0.5
VOCAB_SIZE = vocab_size  
PAD_IDX = 0 
# Longest tokenised sentence in the corpus; used as the size of the positional embedding table.
MAX_LEN = max(df_sentences.apply(len))


In [23]:
# Build the model, resume from the checkpoint when one exists, and create the loss and optimizer.
model = BERT( embed_dim = EMBEDDING_DIM,  vocab_size = VOCAB_SIZE, max_length = MAX_LEN, heads = 4, num_layers = 4)
if os.path.exists("Bert_mask_pred_model.pth"):
    # weights_only=True restricts unpickling to tensors and plain containers, so
    # a tampered checkpoint cannot run arbitrary code. map_location="cpu" lets a
    # checkpoint saved on another device load here.
    model.load_state_dict(
        torch.load("Bert_mask_pred_model.pth", map_location="cpu", weights_only=True)
    )
# Targets equal to the padding id do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(model.parameters(),lr=1e-3)

  model.load_state_dict(torch.load("Bert_mask_pred_model.pth"))


In [24]:
def train(model :nn.Module, criterion: nn.Module, optimizer: torch.optim.Optimizer, train_data: DataLoader, epochs: int = 4):
    """Train ``model`` on the masked-token objective.

    The loss is computed only at positions flagged in each batch's ``masked``
    tensor. After every epoch the weights are written to
    ``Bert_mask_pred_model.pth`` and the average loss is printed.

    Args:
        model: Module called as ``model(X, padding_mask)`` that returns logits
            of shape ``(B, T, vocab)``.
        criterion: Loss applied to ``(selected_logits, selected_labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        train_data: Iterable of ``(X, y, padding_mask, masked)`` batches.
        epochs: Number of passes over ``train_data``.
    """
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0
        counted_batches = 0
        for X, y, padding_mask, masked in train_data:
            masked = masked.bool()
            # A batch without any masked position gives an empty selection;
            # the mean loss over nothing is NaN and would poison the average.
            if not masked.any():
                continue
            optimizer.zero_grad()
            outputs = model(X, padding_mask)
            # Boolean indexing keeps only the masked positions: (N, vocab) and (N,).
            logits = outputs[masked]
            labels = y[masked]
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            counted_batches += 1
        torch.save(model.state_dict(), "Bert_mask_pred_model.pth")
        # max(..., 1) avoids a ZeroDivisionError when no batch contributed.
        print(f"Epoch: {epoch + 1}/{epochs}, Loss: {epoch_loss / max(counted_batches, 1):.4f}")


In [25]:
# Run 3 epochs of masked-token training; the checkpoint file is rewritten after each epoch.
train(model, criterion, optimizer, train_loader, 3)

Epoch: 1/3, Loss: 3.6335
Epoch: 2/3, Loss: 3.5493
Epoch: 3/3, Loss: 3.4445


In [26]:
def val(model :nn.Module, criterion: nn.Module, optimizer: torch.optim.Optimizer, val_data: DataLoader):
    """Print the average masked-token loss of ``model`` over ``val_data``.

    Args:
        model: Module called as ``model(X, padding_mask)`` that returns logits
            of shape ``(B, T, vocab)``.
        criterion: Loss applied to ``(selected_logits, selected_labels)``.
        optimizer: Unused; kept so the signature mirrors ``train``.
        val_data: Iterable of ``(X, y, padding_mask, masked)`` batches.
    """
    model.eval()
    val_loss = 0.0
    counted_batches = 0
    with torch.no_grad():
        for X, y, padding_mask, masked in val_data:
            masked = masked.bool()
            # Skip batches with no masked position: the mean loss over an
            # empty selection is NaN and would poison the average.
            if not masked.any():
                continue
            outputs = model(X, padding_mask)
            logits = outputs[masked]
            labels = y[masked]
            loss = criterion(logits, labels)
            val_loss += loss.item()
            counted_batches += 1
        # max(..., 1) avoids a ZeroDivisionError when no batch contributed.
        print(f"Loss: {val_loss / max(counted_batches, 1):.4f}")

In [27]:
# Report the average masked-token loss on the held-out split (val does not use the optimizer argument).
val(model, criterion, optimizer, val_loader)

Loss: 3.5335


In [28]:
import torch
import torch.nn as nn

def generate(model: nn.Module, input_ids: torch.Tensor, padding_mask: torch.Tensor = None, top_k: int = 5, mask_token_id: int = 1):
    """Return the top-k predicted token ids for every masked position.

    Args:
        model: Module called as ``model(input_ids, padding_mask)`` that returns
            logits of shape ``(B, T, vocab)``.
        input_ids: Integer tensor of shape ``(B, T)``.
        padding_mask: Optional ``(B, T)`` mask forwarded to the model.
        top_k: Number of candidates per masked position; clamped to the
            vocabulary size.
        mask_token_id: Id that marks positions to predict. Defaults to 1, the
            value this function used before the parameter existed.

    Returns:
        Integer tensor of shape ``(num_masked, k)`` holding candidate ids
        ordered from most to least likely; rows follow the row-major order of
        the masked positions in ``input_ids``.
    """
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, padding_mask)
        masked_positions = (input_ids == mask_token_id)
        masked_logits = outputs[masked_positions]
        # torch.topk raises when k exceeds the size of the selected dimension.
        k = min(top_k, masked_logits.shape[-1])
        top_k_preds = torch.topk(masked_logits, k, dim=-1)
        return top_k_preds.indices



In [29]:
# Stack the two masked example sentences into one (2, T) batch; torch.tensor needs both id lists to have the same length.
input_ids = torch.tensor([sentence1_masked, sentence2_masked])

In [30]:
# Predict the masked token of the first example. Pass the masked list
# explicitly rather than relying on `sentence1` being the same list object.
generate(model, torch.tensor([sentence1_masked]))

tensor([[  21,  920,   48,   19, 1267]])

In [31]:
# Look up the token string for one id.
# NOTE(review): 1316 is not among the ids in the prediction output displayed
# above - presumably left over from another run; confirm.
id_to_token[1316]

'these'

In [32]:
# Rebind sentence1 to a new example; sentence1_masked still refers to the earlier list.
sentence1 = tokenize_text(preprocess_text("Are you an idiot in life"), token_to_id)
# sentence2 = tokenize_text(preprocess_text("I went to play cricket in rainy weather"), token_to_id)

In [33]:
# Display the two masked example id lists (id 1 marks the masked position).
sentence1_masked, sentence2_masked

([2295, 81, 1, 4, 23, 220, 7, 267], [2, 290, 10, 128, 1, 7, 4525, 2495])

In [34]:
# Replace the second token with id 1, the id token_to_id assigns to '<MASK>'.
sentence1[1] = 1