In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
# Define a custom dataset class for the NER data
class NERDataset(Dataset):
    def __init__(self, filepath):
        self.data = pd.read_csv(filepath, encoding='ISO-8859-1').fillna(method='ffill')
        
        # Create a list of unique words and tags
        self.words = list(set(self.data["Word"].values))
        self.tags = list(set(self.data["Tag"].values))
        
        # Create dictionaries for mapping words and tags to integers
        self.word2idx = {w: i+1 for i, w in enumerate(self.words)}
        self.tag2idx = {t: i for i, t in enumerate(self.tags)}
        
        self.sentences = self._get_sentences()
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, index):
        sentence = self.sentences[index]
        words = [word[0] for word in sentence]
        tags = [word[1] for word in sentence]
        
        # Convert words and tags to numerical values using the dictionaries
        x = [self.word2idx[w] for w in words]
        y = [self.tag2idx[t] for t in tags]
        
        return torch.tensor(x), torch.tensor(y)
    
    def _get_sentences(self):
        # Group the data by sentence number
        grouped = self.data.groupby("Sentence #")
        sentences = []
        for _, group in grouped:
            words = group["Word"].values.tolist()
            tags = group["Tag"].values.tolist()
            sentence = list(zip(words, tags))
            sentences.append(sentence)
        return sentences

In [3]:
# Define a simple LSTM-based NER model
class NERModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.lstm(embedded)
        output = self.fc(output)
        return output

In [4]:
# Define hyperparameters and train the model
BATCH_SIZE = 32
EMBEDDING_DIM = 32
HIDDEN_DIM = 64
LEARNING_RATE = 0.1
EPOCHS = 10

dataset = NERDataset(r"C:\Python_code\practise_code\data\ner_datasetreference.csv")

  self.data = pd.read_csv(filepath, encoding='ISO-8859-1').fillna(method='ffill')


In [5]:
# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])


In [6]:
def collate_fn(batch):
    x = [item[0] for item in batch]
    y = [item[1] for item in batch]
    x_lengths = [len(seq) for seq in x]
    y_lengths = [len(seq) for seq in y]
    
    # Pad the sequences to the same length
    x = nn.utils.rnn.pad_sequence(x, batch_first=True)
    y = nn.utils.rnn.pad_sequence(y, batch_first=True)
    
    # Create a mask to ignore padding values in the loss calculation
    x_mask = torch.arange(x.size(1))[None, :] < torch.tensor(x_lengths)[:, None]
    y_mask = torch.arange(y.size(1))[None, :] < torch.tensor(y_lengths)[:, None]
    
    return x, y, x_mask, y_mask


In [7]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,collate_fn = collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)


In [8]:
for batch in train_loader:
    print(batch[0],batch[1])
    break

tensor([[  158, 31773,  2901,  ...,     0,     0,     0],
        [34342, 17737, 33672,  ...,     0,     0,     0],
        [31201, 29435,  2901,  ...,     0,     0,     0],
        ...,
        [19581, 33832, 33137,  ...,     0,     0,     0],
        [ 7349, 21237, 16129,  ...,     0,     0,     0],
        [21464, 11071,  1294,  ..., 10624, 23541, 26852]]) tensor([[13, 14, 13,  ...,  0,  0,  0],
        [ 1, 13, 13,  ...,  0,  0,  0],
        [ 1,  3, 13,  ...,  0,  0,  0],
        ...,
        [13, 13, 13,  ...,  0,  0,  0],
        [ 1,  3, 13,  ...,  0,  0,  0],
        [11, 13, 13,  ...,  1, 13, 13]])


In [9]:
model = NERModel(len(dataset.words) + 1, EMBEDDING_DIM, HIDDEN_DIM, len(dataset.tags))

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)


In [10]:
for epoch in range(EPOCHS):
    train_loss = 0.0
    val_loss = 0.0
    model.train()
    for batch in train_loader:
        x, y, x_mask, y_mask = batch
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output.view(-1, len(dataset.tags)), y.view(-1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * x.size(0)
        
    train_loss /= len(train_dataset)
    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            x, y, x_mask, y_mask = batch
            output = model(x)
            loss = criterion(output.view(-1, len(dataset.tags)), y.view(-1))
            val_loss += loss.item() * x.size(0)
        val_loss /= len(val_dataset)

    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")


Epoch 1, Train Loss: 0.1389, Val Loss: 0.1229
Epoch 2, Train Loss: 0.1176, Val Loss: 0.1246
Epoch 3, Train Loss: 0.1183, Val Loss: 0.1250
Epoch 4, Train Loss: 0.1154, Val Loss: 0.1227
Epoch 5, Train Loss: 0.1145, Val Loss: 0.1252
Epoch 6, Train Loss: 0.1140, Val Loss: 0.1274
Epoch 7, Train Loss: 0.1144, Val Loss: 0.1236
Epoch 8, Train Loss: 0.1135, Val Loss: 0.1238
Epoch 9, Train Loss: 0.1122, Val Loss: 0.1240
Epoch 10, Train Loss: 0.1149, Val Loss: 0.1278


In [11]:
text = "This is Imphal and is giving out a demo of the trained NER model. Bangalore Bangalore Here We Come"

# Split the text into words
words = text.split()

# Convert words to numerical values using the word2idx dictionary
x = [dataset.word2idx.get(word, 0) for word in words]

# Convert the numerical values to a tensor and add a batch dimension
x = torch.tensor(x).unsqueeze(0)

# Pass the tensor through the model to get the predicted tags
model.eval()
with torch.no_grad():
    output = model(x)
    _, predicted_tags = torch.max(output, dim=2)

# Convert the predicted tags back to their corresponding tag labels using the idx2tag dictionary
predicted_tags = predicted_tags.squeeze().tolist()
predicted_labels = [dataset.tags[idx] for idx in predicted_tags]
print(predicted_labels)


['O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-eve', 'O', 'O', 'O', 'B-eve', 'B-eve', 'I-org', 'I-org', 'O', 'O', 'O']
