In [1]:
import pandas as pd
import torch
import torch.nn as nn
import re
from collections import Counter
import numpy as np
import random
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence,pack_padded_sequence, pad_packed_sequence
import os
seed = 1234

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x295cf770>

In [2]:
class MultiHeadAttention(nn.Module):
    def __init__(self, emd_dim, heads=4, dropout = 0.2):
        super().__init__()
        assert emd_dim % heads == 0
        self.heads = heads
        self.head_dim = emd_dim//heads
        self.scale = self.head_dim ** -0.5
        self.multiHead = nn.Linear(emd_dim, emd_dim*3)
        self.output = nn.Linear(emd_dim,emd_dim)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def add_masking(attn_scores, padding_mask):
        col_mask = padding_mask[:, None, None, :]
        attn_scores.masked_fill_((col_mask == 0), float('-inf'))
        return attn_scores

    def forward(self, x, padding_mask=None, attn_mask=False, kv_cache = None):
        B, T, C = x.shape
        qkv = self.multiHead(x)
        q, k, v = torch.chunk(qkv,3,dim=-1)
        q = q.view(B, T, self.heads, self.head_dim).permute(0, 2, 1, 3)
        k = k.view(B, T, self.heads, self.head_dim).permute(0, 2, 1, 3)
        v = v.view(B, T, self.heads, self.head_dim).permute(0, 2, 1, 3)
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        if attn_mask:
            tril = torch.tril(torch.ones(T,T))
            attn_scores = attn_scores.masked_fill(tril==0, float('-inf'))
        if padding_mask is not None:
            attn_scores = self.add_masking(attn_scores, padding_mask)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        attn_probs_drop = self.dropout(attn_probs)
        attn_output = torch.matmul(attn_probs_drop,v)
        fn_attn_output = attn_output.permute(0, 2, 1, 3).reshape(B, T, C)
        return self.output(fn_attn_output)


In [3]:
class LayerNorm1D(nn.Module):
  def __init__(self, dim, eps=1e-5):
    super(LayerNorm1D, self).__init__()
    self.gamma = nn.Parameter(torch.ones(dim))
    self.beta = nn.Parameter(torch.zeros(dim))
    self.eps = eps

  def forward(self, x):
    mean = x.mean(-1,keepdim=True)
    var = x.var(-1, unbiased=False, keepdim=True)
    xhat = (x-mean)/torch.sqrt(var+self.eps)
    return (self.gamma * xhat) +self.beta

In [4]:
class FeedForward(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim, dropout = 0.5):
    super().__init__()
    self.feed_forward_layer = nn.Sequential(
      nn.Linear(input_dim, hidden_dim),
      nn.ReLU(),
      nn.Linear(hidden_dim, output_dim),
      nn.Dropout(dropout)
    )

  def forward(self, x):
    return self.feed_forward_layer(x)

In [5]:
class EncoderBlock(nn.Module):
    def __init__(self,embed_dim, heads=4):
        super().__init__()
        self.layer_norm1 = LayerNorm1D(embed_dim)
        self.layer_norm2 = LayerNorm1D(embed_dim)
        self.multi_head_attn =  MultiHeadAttention(embed_dim, heads)
        self.feed_forward_layer = FeedForward(embed_dim, embed_dim*4, embed_dim)
    
    def forward(self, x, padding_mask):
        x = x + self.multi_head_attn(self.layer_norm1(x), padding_mask)
        x = x + self.feed_forward_layer(self.layer_norm2(x))
        return x


In [6]:
class Encoder(nn.Module):
    def __init__(self, embed_dim, heads = 4, num_layers=4):
        super().__init__()
        self.encoder_blocks = nn.ModuleList([EncoderBlock(embed_dim,heads) for _ in range(num_layers)])

    def forward(self, x, padding_mask = None):
        for block in self.encoder_blocks:
            x = block(x, padding_mask = padding_mask) 
        return x

In [7]:
class EmbeddingBlock(nn.Module):
    def __init__(self, embed_dim, vocab_size, max_length, segment_needed=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_embedding = nn.Embedding(max_length, embed_dim)
        self.segment_needed = segment_needed
        if self.segment_needed:
            self.segmentation_embedding = nn.Embedding(2, embed_dim)

    def forward(self, x, segment_ids):
        x_emd = self.embedding(x)
        x_pos_emd = self.positional_embedding(torch.arange(x.shape[1]))
        x = x_emd + x_pos_emd
        if self.segment_needed:
            x = x + self.segmentation_embedding(segment_ids)
        return x
        

In [8]:
class BERT(nn.Module):
    def __init__(self, embed_dim, vocab_size, max_length, heads = 4, num_layers = 4, dropout = 0.2):
        super().__init__()
        self.embedding = EmbeddingBlock(embed_dim, vocab_size, max_length, True)
        self.encoder = Encoder(embed_dim, heads = 4, num_layers = 4)
        self.dropout = nn.Dropout(dropout) 
        self.linear = nn.Linear(embed_dim, 1)

    def forward(self, x, segment_ids, mask):
        x = self.embedding(x, segment_ids)
        encoder_outputs = self.encoder(x, mask)
        cls_output = encoder_outputs[:,0,:]
        return self.linear(cls_output)


In [9]:
path = r'nsp_dataset.csv'
df = pd.read_csv(path, header=0)

In [10]:
def preprocess_text(text):
    text = text.lower()  
    text = re.sub(r'[^a-z\s]', '', text)  
    tokens = text.split()  
    return tokens

In [11]:
df['sentence_A'] = df['sentence_A'].apply(preprocess_text)
df['sentence_B'] = df['sentence_B'].apply(preprocess_text)


In [12]:
df = df[(df['sentence_A'].apply(len).between(6, 30)) & (df['sentence_B'].apply(len).between(6, 30))].reset_index(drop=True)


In [13]:
sentences_A = df['sentence_A']
sentences_B = df['sentence_B']

In [14]:
all_tokens = []
for tokens in sentences_A:
    all_tokens.extend(tokens)  

for tokens in sentences_B:
    all_tokens.extend(tokens)  
vocab = Counter(all_tokens)

In [15]:
token_to_id = {token: idx + 3 for idx, token in enumerate(vocab)} 
token_to_id['<PAD>'] = 0
token_to_id['<CLS>'] = 1
token_to_id['<SEG>'] = 2

id_to_token= {value:key for key,value in token_to_id.items()}
vocab_size = len(id_to_token)

In [16]:
def tokenize_text(tokens,token_to_id, is_first=False):
    tokenized_texts = [token_to_id.get(token,0) for token in tokens]
    if is_first:
        tokenized_texts = [1] + tokenized_texts
    else:
        tokenized_texts = [2] + tokenized_texts + [2]
    return tokenized_texts

sentences_A = sentences_A.apply(lambda x: tokenize_text(x, token_to_id, True))
sentences_B = sentences_B.apply(lambda x: tokenize_text(x, token_to_id))

In [17]:
labels = df['label']=="IsNext"

In [18]:
class SentencesDataset(Dataset):
    def __init__(self, first_sequences, second_sequences, output):
        self.first_seq = first_sequences
        self.second_seq = second_sequences
        self.output = output
        
    def __len__(self):
        return len(self.first_seq)

    def __getitem__(self,idx):
        sentence = self.first_seq[idx] + self.second_seq[idx]
        # segment_ids = torch.cat((torch.zeros(1, len(self.first_seq[idx])), torch.ones(1, len(self.second_seq[idx]))), dim=1)
        segment_ids = torch.cat((
            torch.zeros(len(self.first_seq[idx]), dtype=torch.long),
            torch.ones(len(self.second_seq[idx]), dtype=torch.long)
        ), dim=0)
        return torch.tensor(sentence), segment_ids.squeeze(0), torch.tensor(self.output[idx])

In [19]:
def collate_fn(batch):
    X, segment_ids, y = zip(*batch)
    X_padded = pad_sequence(X, batch_first=True, padding_value=0)
    segment_ids_padded = pad_sequence(segment_ids, batch_first=True, padding_value=0)
    padding_mask = (X_padded != 0) 
    return X_padded, torch.tensor(y, dtype=torch.float32).unsqueeze(1), segment_ids_padded, padding_mask

In [20]:
sentences_A = sentences_A[:10000]
sentences_B = sentences_B[:10000]
labels = labels[:10000]

In [21]:
dataset = SentencesDataset(sentences_A, sentences_B, labels)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True,collate_fn = collate_fn)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False,collate_fn = collate_fn)


In [22]:
EMBEDDING_DIM = 128
HIDDEN_DIM = 128
NUM_LAYERS = 4
DROPOUT = 0.5
VOCAB_SIZE = vocab_size  
PAD_IDX = 0 
MAX_LEN = max((sentences_A+sentences_B).apply(len))


In [23]:
model = BERT( embed_dim = EMBEDDING_DIM,  vocab_size = VOCAB_SIZE, max_length = MAX_LEN, heads = 4, num_layers = 4)
# if os.path.exists("Bert_next_sentence_pred_model.pth"):
#     model.load_state_dict(torch.load("Bert_next_sentence_pred_model.pth")) 
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=1e-3)

In [24]:
def train(model :nn.Module, criterion: nn.Module, optimizer: torch.optim, train_data: DataLoader, val_data: DataLoader, epochs: int = 4):
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        for X, y, segment_ids, padding_mask in train_data:
            optimizer.zero_grad()
            outputs = model(X, segment_ids, padding_mask)
            loss = criterion(outputs, y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            epoch_loss += loss.item()

        val_loss = 0
        correct = 0
        total = 0
        model.eval()
        with torch.no_grad():
            for X, y, segment_ids, padding_mask in val_data:
                outputs = model(X, segment_ids, padding_mask)
                val_loss += criterion(outputs, y).item()
                preds = (torch.sigmoid(outputs) > 0.5).float()
                correct += (preds == y).sum().item()
                total += y.size(0)
        
        avg_train_loss = epoch_loss / len(train_data)
        avg_val_loss = val_loss / len(val_data)
        accuracy = correct / total
        torch.save(model.state_dict(), "Bert_next_sentence_pred_model.pth")
        print(f"Epoch: {epoch + 1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}")


In [25]:
train(model, criterion, optimizer, train_loader, val_loader, 100)

  return torch.tensor(sentence), segment_ids.squeeze(0), torch.tensor(self.output[idx])


Epoch: 1/100, Train Loss: 0.8212, Val Loss: 0.6975, Accuracy: 0.4875
Epoch: 2/100, Train Loss: 0.6994, Val Loss: 0.6932, Accuracy: 0.5130
Epoch: 3/100, Train Loss: 0.6944, Val Loss: 0.7087, Accuracy: 0.5125
Epoch: 4/100, Train Loss: 0.6872, Val Loss: 0.7159, Accuracy: 0.5190
Epoch: 5/100, Train Loss: 0.6762, Val Loss: 0.7237, Accuracy: 0.5135
Epoch: 6/100, Train Loss: 0.6935, Val Loss: 0.7417, Accuracy: 0.5180
Epoch: 7/100, Train Loss: 0.5880, Val Loss: 1.0739, Accuracy: 0.5055
Epoch: 8/100, Train Loss: 0.5038, Val Loss: 0.9546, Accuracy: 0.5025
Epoch: 9/100, Train Loss: 0.4544, Val Loss: 1.2830, Accuracy: 0.5110
Epoch: 10/100, Train Loss: 0.3688, Val Loss: 1.1574, Accuracy: 0.4965
Epoch: 11/100, Train Loss: 0.2809, Val Loss: 1.3453, Accuracy: 0.5015
Epoch: 12/100, Train Loss: 0.2073, Val Loss: 1.5812, Accuracy: 0.4970
Epoch: 13/100, Train Loss: 0.1646, Val Loss: 1.8606, Accuracy: 0.5070


KeyboardInterrupt: 

In [29]:
from transformers import BertTokenizer

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Sample text
text = "Endpoint Central running,Service Shutdown Verification,Stop the Endpo.int Central service and verify it's completely stopped,Service stop command,All related processes should be terminated"
tokens = tokenizer.tokenize(text)

print(tokens)


['end', '##point', 'central', 'running', ',', 'service', 'shut', '##down', 'verification', ',', 'stop', 'the', 'end', '##po', '.', 'int', 'central', 'service', 'and', 'verify', 'it', "'", 's', 'completely', 'stopped', ',', 'service', 'stop', 'command', ',', 'all', 'related', 'processes', 'should', 'be', 'terminated']


In [30]:
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)


[2203, 8400, 2430, 2770, 1010, 2326, 3844, 7698, 22616, 1010, 2644, 1996, 2203, 6873, 1012, 20014, 2430, 2326, 1998, 20410, 2009, 1005, 1055, 3294, 3030, 1010, 2326, 2644, 3094, 1010, 2035, 3141, 6194, 2323, 2022, 12527]


In [38]:
tokens = tokenizer.tokenize("BERT is amazing!")
tokens = ["[CLS]"] + tokens + ["[SEP]"] + [i for i in "Would love to use it for bigger tasks".split(" ")]

print(tokens)


['[CLS]', 'bert', 'is', 'amazing', '!', '[SEP]', 'Would', 'love', 'to', 'use', 'it', 'for', 'bigger', 'tasks']


In [39]:
encoded = tokenizer(tokens, padding="max_length", truncation=False, max_length=5, return_tensors="pt")

print(encoded)


{'input_ids': tensor([[  101,   101,   102,     0,     0],
        [  101, 14324,   102,     0,     0],
        [  101,  2003,   102,     0,     0],
        [  101,  6429,   102,     0,     0],
        [  101,   999,   102,     0,     0],
        [  101,   102,   102,     0,     0],
        [  101,  2052,   102,     0,     0],
        [  101,  2293,   102,     0,     0],
        [  101,  2000,   102,     0,     0],
        [  101,  2224,   102,     0,     0],
        [  101,  2009,   102,     0,     0],
        [  101,  2005,   102,     0,     0],
        [  101,  7046,   102,     0,     0],
        [  101,  8518,   102,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0,

In [33]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

text = "BERT is an amazing transformer model used for various NLP tasks, including question answering, sentiment analysis, and named entity recognition."

encoded = tokenizer(text, padding="max_length", max_length=10, truncation=False)


In [34]:
encoded

{'input_ids': [101, 14324, 2003, 2019, 6429, 10938, 2121, 2944, 2109, 2005, 2536, 17953, 2361, 8518, 1010, 2164, 3160, 10739, 1010, 15792, 4106, 1010, 1998, 2315, 9178, 5038, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}