In [1]:
import pandas as pd

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import precision_recall_fscore_support

In [4]:
class CustomLSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(CustomLSTMCell, self).__init__()
        self.hidden_size = hidden_size

        self.forget_gate = nn.Linear(input_size + hidden_size, hidden_size)
        self.input_gate = nn.Linear(input_size + hidden_size, hidden_size)
        self.output_gate = nn.Linear(input_size + hidden_size, hidden_size)
        self.cell_gate = nn.Linear(input_size + hidden_size, hidden_size)

    def forward(self, x, hidden):
        h_prev, c_prev = hidden
        combined = torch.cat((x, h_prev), 1)   # concatinate two matrices
        
        f_t = torch.sigmoid(self.forget_gate(combined))
        i_t = torch.sigmoid(self.input_gate(combined))
        o_t = torch.sigmoid(self.output_gate(combined))
        c_tilde = torch.tanh(self.cell_gate(combined))

        c_t = f_t * c_prev + i_t * c_tilde
        h_t = o_t * torch.tanh(c_t)

        return h_t, c_t

class CustomBLSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(CustomBLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm_cell_fwd = CustomLSTMCell(input_size, hidden_size)
        self.lstm_cell_bwd = CustomLSTMCell(input_size, hidden_size)

    def forward(self, x):
        batch_size, seq_len, _ = x.size()
        
        # Initialize hidden and cell states
        h_fwd = torch.zeros(batch_size, self.hidden_size, device=x.device)
        c_fwd = torch.zeros(batch_size, self.hidden_size, device=x.device)
        h_bwd = torch.zeros(batch_size, self.hidden_size, device=x.device)
        c_bwd = torch.zeros(batch_size, self.hidden_size, device=x.device)

        # Forward LSTM 
        outputs_fwd = []
        for t in range(seq_len):
            h_fwd, c_fwd = self.lstm_cell_fwd(x[:, t, :], (h_fwd, c_fwd))
            outputs_fwd.append(h_fwd)

        # Backward LSTM 
        outputs_bwd = []
        for t in reversed(range(seq_len)):
            h_bwd, c_bwd = self.lstm_cell_bwd(x[:, t, :], (h_bwd, c_bwd))
            outputs_bwd.append(h_bwd)
        
        # Concatenate forward and backward
        outputs_bwd.reverse()  # Match idx with forward
        outputs = torch.cat([torch.stack(outputs_fwd, dim=1), torch.stack(outputs_bwd, dim=1)], dim=2)
        
        return outputs  # Shape of output is: (batch_size, seq_len, hidden_size * 2)

class BLSTMCNNNER(nn.Module):
    def __init__(self, word_vocab_size, word_embedding_dim, char_vocab_size, char_embedding_dim,
                 char_out_channels, char_kernel_size, hidden_dim, output_dim, dropout, learning_rate=1e-3):
        super(BLSTMCNNNER, self).__init__()

        # Word-level Embeddings
        self.word_embedding = nn.Embedding(word_vocab_size, word_embedding_dim)
        
        # Character-level CNN Embeddings
        self.char_embedding = nn.Embedding(char_vocab_size, char_embedding_dim)
        self.conv = nn.Conv1d(char_embedding_dim, char_out_channels, char_kernel_size, padding=1)
        self.max_pool = nn.AdaptiveMaxPool1d(1)
        
        # BLSTM
        self.blstm = CustomBLSTM(word_embedding_dim + char_out_channels, hidden_dim)
        
        # Create a fully connected layer in this to convert out ouput of bLSTM to one hot vector size for prediction
        self.fc = nn.Linear(hidden_dim * 2, 256)
        self.fc_out = nn.Linear(256, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.log_softmax = nn.LogSoftmax(dim=-1)
        
        # Create optimizer and Loss function
        self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        self.criterion = nn.NLLLoss(ignore_index=-1) 

    def forward(self, word_inputs, char_inputs):
        batch_size, seq_len = word_inputs.size()
        
        # Word Embeddings
        word_embeds = self.word_embedding(word_inputs)
        
        # Character-level CNN Embedding
        char_embeds = self.char_embedding(char_inputs.view(-1, char_inputs.size(-1)))
        char_embeds = char_embeds.permute(0, 2, 1)  # for Conv1d input format
        char_out = F.relu(self.conv(char_embeds))
        char_out = self.max_pool(char_out).squeeze(-1)
        char_out = char_out.view(batch_size, seq_len, -1)
        
        # Concatenate word and character embeddings
        embeds = torch.cat((word_embeds, char_out), dim=-1)
        embeds = self.dropout(embeds)
        
        # print(embeds.shape)

        # BLSTM
        lstm_out = self.blstm(embeds)  # (batch_size, seq_len, hidden_dim * 2)
        
        # Fully connected layer
        tag_scores = self.log_softmax(self.fc_out(self.dropout(self.fc(lstm_out))))  # (batch_size, seq_len, output_dim)
        
        return tag_scores

    def fit(self, train_dataloader, val_dataloader, num_epochs=5, clip=1.0):
        self.train()

        # self.evaluate(val_dataloader)
        for epoch in range(num_epochs):
            total_loss = 0
            for word_inputs, char_inputs, labels in train_dataloader:
                self.optimizer.zero_grad()
                
                # Forward pass
                outputs = self.forward(word_inputs, char_inputs)
                
                # Reshape outputs and labels for NLLLoss
                outputs = outputs.view(-1, outputs.shape[-1])  # Flatten for loss calculation
                labels = labels.view(-1)  # Flatten target labels
                
                # Compute loss
                loss = self.criterion(outputs, labels)
                
                ### optimize
                loss.backward() # Back propagation
                nn.utils.clip_grad_norm_(self.parameters(), clip) 
                self.optimizer.step()  # Gradient descent
                
                total_loss += loss.item()
            
            avg_loss = total_loss / len(train_dataloader)
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")
            
            # validation
            self.evaluate(val_dataloader) 

    def evaluate(self, test_dataloader):
        self.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for word_inputs, char_inputs, labels in test_dataloader:
                outputs = self.forward(word_inputs, char_inputs)

                _, predicted = torch.max(outputs, dim=-1)  # Get predicted labels
                
                # print(predicted, hau) 
                for i, label in enumerate(labels):
                    for j, alabel in enumerate(label):
                        if alabel != -1 and alabel != 0:
                            total += 1
                            if alabel == predicted[i, j]:
                                correct += 1
        
        accuracy = correct / total * 100
        print(f"Correct: {correct}, Total: {total}, Accuracy: {accuracy:.2f}%")
        return accuracy



In [5]:
data = pd.read_csv('ner_dataset.csv', encoding= 'unicode_escape')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [6]:
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
    
    idx2tok = {idx + 1:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx + 1 for  idx, tok in enumerate(vocab)}

    # PADDING AND UNK
    tok2idx['PAD'] = 0
    idx2tok[0] = 'PAD' 
    tok2idx['UNK'] = len(vocab) + 1
    idx2tok[len(vocab) + 1] = 'UNK'
    
    return tok2idx, idx2tok

token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

In [7]:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,Thousands,NNS,O,15369,11
1,,of,IN,O,23521,11
2,,demonstrators,NNS,O,17526,11
3,,have,VBP,O,20967,11
4,,marched,VBN,O,32354,11


### Count tag_idx, type

In [8]:
data.tail()

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
1048570,,they,PRP,O,34010,11
1048571,,responded,VBD,O,8236,11
1048572,,to,TO,O,11396,11
1048573,,the,DT,O,702,11
1048574,,attack,NN,O,281,11


In [9]:
cnt = {}
for x in data['Tag_idx']:
    if x not in cnt:
        cnt[x] = 1
    else:
        cnt[x] += 1

for item in cnt.items():
    print("Tag:", idx2tag[item[0]], "- idx:", item[0], "cnt:", item[1])

Tag: O - idx: 11 cnt: 887908
Tag: B-geo - idx: 5 cnt: 37644
Tag: B-gpe - idx: 12 cnt: 15870
Tag: B-per - idx: 10 cnt: 16990
Tag: I-geo - idx: 15 cnt: 7414
Tag: B-org - idx: 17 cnt: 20143
Tag: I-org - idx: 9 cnt: 16784
Tag: B-tim - idx: 16 cnt: 20333
Tag: B-art - idx: 14 cnt: 402
Tag: I-art - idx: 6 cnt: 297
Tag: I-per - idx: 13 cnt: 17251
Tag: I-gpe - idx: 2 cnt: 198
Tag: I-tim - idx: 1 cnt: 6528
Tag: B-nat - idx: 3 cnt: 201
Tag: B-eve - idx: 7 cnt: 308
Tag: I-eve - idx: 4 cnt: 253
Tag: I-nat - idx: 8 cnt: 51


In [10]:
# Fill na
data_fillna = data.fillna(method='ffill', axis=0)
data_fillna.head()

  data_fillna = data.fillna(method='ffill', axis=0)


Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,Thousands,NNS,O,15369,11
1,Sentence: 1,of,IN,O,23521,11
2,Sentence: 1,demonstrators,NNS,O,17526,11
3,Sentence: 1,have,VBP,O,20967,11
4,Sentence: 1,marched,VBN,O,32354,11


In [11]:
# Group by
data_group = data_fillna.groupby('Sentence #', as_index=False).agg(list)
data_group.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[15369, 23521, 17526, 20967, 32354, 13001, 319...","[11, 11, 11, 11, 11, 11, 5, 11, 11, 11, 11, 11..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[1794, 24359, 11844, 34010, 10016, 11396, 231,...","[12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[22017, 18741, 31394, 30590, 5851, 27676, 2246...","[11, 11, 16, 11, 11, 11, 11, 11, 5, 11, 11, 11..."
3,Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]","[2387, 13755, 1752, 22952, 29097, 19348, 9318,...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]"
4,Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo...","[15411, 6692, 12937, 34136, 6361, 9787, 21920,...","[5, 11, 11, 10, 13, 11, 16, 11, 5, 11, 12, 11,..."


In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

class NERDataset(Dataset):
    def __init__(self, dataframe, char_vocab, max_word_len=10):
        self.dataframe = dataframe
        self.char_vocab = char_vocab
        self.max_word_len = max_word_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        
        # Word-level indices
        word_indices = torch.tensor(row['Word_idx'], dtype=torch.long)
        
        # Character-level indices
        char_indices = [
            [self.char_vocab.get(char, 0) for char in word[:self.max_word_len]]
            for word in row['Word']
        ]
        char_indices = [chars + [0] * (self.max_word_len - len(chars)) for chars in char_indices]
        char_indices = torch.tensor(char_indices, dtype=torch.long)
        
        # Tag labels
        tag_indices = torch.tensor(row['Tag_idx'], dtype=torch.long)
        
        return word_indices, char_indices, tag_indices

def collate_fn(batch):
    word_inputs = [item[0] for item in batch]
    char_inputs = [item[1] for item in batch]
    labels = [item[2] for item in batch]
    
    # Pad word and label sequences to the same length
    word_inputs = torch.nn.utils.rnn.pad_sequence(word_inputs, batch_first=True, padding_value=0)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-1)
    
    # Pad character sequences to match word lengths
    max_word_len = char_inputs[0].size(1)
    char_inputs = [F.pad(chars, (0, 0, 0, word_inputs.size(1) - chars.size(0))) for chars in char_inputs]
    char_inputs = torch.stack(char_inputs)
    
    return word_inputs, char_inputs, labels

In [13]:
from sklearn.model_selection import train_test_split

char_vocab = {}
i = 1
for val in range(ord('a'), ord('z') + 1):
    char_vocab[chr(val)] = i
    i += 1

for val in range(ord('A'), ord('Z') + 1):
    char_vocab[chr(val)] = i
    i += 1

# Split the data into training and validation sets
train_data, val_data = train_test_split(data_group, test_size=0.2, random_state=42)

train_dataset = NERDataset(train_data, char_vocab)
val_dataset = NERDataset(val_data, char_vocab)

train_data_loader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_fn, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn, shuffle=True)

In [14]:
print(char_vocab)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 34, 'I': 35, 'J': 36, 'K': 37, 'L': 38, 'M': 39, 'N': 40, 'O': 41, 'P': 42, 'Q': 43, 'R': 44, 'S': 45, 'T': 46, 'U': 47, 'V': 48, 'W': 49, 'X': 50, 'Y': 51, 'Z': 52}


In [15]:
print(train_dataset)

<__main__.NERDataset object at 0x000001FB84E21EE0>


In [19]:
"""
import torch_directml

device = torch_directml.device()

for word_inputs, char_inputs, labels in train_data_loader:
    word_inputs, char_inputs, labels = word_inputs.to(device), char_inputs.to(device), labels.to(device)

for word_inputs, char_inputs, labels in val_data_loader:
    word_inputs, char_inputs, labels = word_inputs.to(device), char_inputs.to(device), labels.to(device)
"""
output_dim = max(data_group['Tag_idx'].explode()) + 1

# Example usage with model
model = BLSTMCNNNER(
    word_vocab_size=50000,          # size of word vocabulary
    word_embedding_dim=100,         # word embedding dimension
    char_vocab_size=100,            # size of character vocabulary
    char_embedding_dim=30,          # character embedding dimension
    char_out_channels=50,           # number of CNN output channels for character embeddings
    char_kernel_size=3,             # kernel size for CNN over character embeddings
    hidden_dim=128,                 # hidden size for BiLSTM
    output_dim=output_dim,                  # number of output tags
    dropout=0.5                     # dropout rate
)

# model = model.to(device)  

# Training loop example
model.fit(train_dataloader=train_data_loader, val_dataloader=val_data_loader, num_epochs=4)

Epoch [1/4], Loss: 0.3378
Accuracy: 75.42%
Epoch [2/4], Loss: 0.1205
Accuracy: 81.81%
Epoch [3/4], Loss: 0.0919
Accuracy: 82.66%
Epoch [4/4], Loss: 0.0754
Accuracy: 83.60%
