In [1]:
import torch
import torchtext

In [2]:
# train_data = [
#     ["EU", "B-ORG"],
#     ["rejects", "O"],
#     ["German", "B-MISC"],
#     ["call", "O"],
#     ["to", "O"],
#     ["boycott", "O"],
#     ["British", "B-MISC"],
#     ["lamb", "O"],
#     [".", "O"],
#     ["Peter", "B-PER"],
#     ["Blackburn", "I-PER"],
# ]

# test_data = [
#     ["EU", "B-ORG"],
#     ["rejects", "O"],
#     ["German", "B-MISC"],
#     ["call", "O"],
#     ["to", "O"],
#     ["boycott", "O"],
#     ["British", "B-MISC"],
#     ["lamb", "O"],
#     [".", "O"],
#     ["Peter", "B-PER"],
#     ["Blackburn", "I-PER"],
# ]

In [76]:
import pandas as pd

train_data = [
    ["EU", "B-ORG"],
    ["rejects", "O"],
    ["German", "B-MISC"],
    ["call", "O"],
    ["to", "O"],
    ["boycott", "O"],
    ["British", "B-MISC"],
    ["lamb", "O"],
    [".", "O"],
    ["Peter", "B-PER"],
    ["Blackburn", "I-PER"],
]

test_data = [
    ["EU", "B-ORG"],
    ["rejects", "O"],
    ["German", "B-MISC"],
    ["call", "O"],
    ["to", "O"],
    ["boycott", "O"],
    ["British", "B-MISC"],
    ["lamb", "O"],
    [".", "O"],
    ["Peter", "B-PER"],
    ["Blackburn", "I-PER"],
]

# Write train_data and test_data to CSV files
train_df = pd.DataFrame(train_data, columns=["word", "tag"])
train_df.to_csv("train_data.csv", index=False)

test_df = pd.DataFrame(test_data, columns=["word", "tag"])
test_df.to_csv("test_data.csv", index=False)


In [77]:
# Define the fields for the dataset
WORD = torchtext.legacy.data.Field(sequential=True, batch_first=True)
TAG = torchtext.legacy.data.Field(sequential=True, batch_first=True,is_target=True)

# Load the sample data
train = torchtext.legacy.data.TabularDataset(
    path="train_data.csv",
    format="csv",
    fields=[("word", WORD), ("tag", TAG)],
)
test = torchtext.legacy.data.TabularDataset(
    path="test_data.csv",
    format="csv",
    fields=[("word", WORD), ("tag", TAG)],
)

In [78]:
# Build the vocabulary for the fields
WORD.build_vocab(train, test)
TAG.build_vocab(train, test)

In [79]:
# Define the model
class NERModel(torch.nn.Module):
    def __init__(self, vocab_size, tag_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embeddings = torch.nn.Embedding(vocab_size, embedding_dim)
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = torch.nn.Linear(hidden_dim, tag_size)

    def forward(self, words):
        embeds = self.embeddings(words)
        lstm_out, _ = self.lstm(embeds)
        tag_scores = self.hidden2tag(lstm_out)
        return tag_scores

# Initialize the model
model = NERModel(
    vocab_size=len(WORD.vocab),
    tag_size=len(TAG.vocab),
    embedding_dim=100,
    hidden_dim=128,
)

In [80]:
# Specify the loss function and the optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [93]:

# Train the model
for epoch in range(1):
    for example in train:
        # Clear the gradients
        words = example.word
        tags = example.tag
      
        model.zero_grad()

        # Convert words and tags to tensors
        words = WORD.process([words]) # The argument must be a list of lists
        tags = TAG.process([tags])

        # Compute the forward pass
        tag_scores = model(words)

        # Compute the loss
        loss = loss_function(tag_scores.view(-1, len(TAG.vocab)), tags.view(-1)) # Reshape the inputs to be 2D and 1D, respectively
        # Compute the gradients
        loss.backward()

        # Update the model parameters
        optimizer.step()
        
    print("epoch: ", epoch, " loss: ", loss.item())



epoch:  0  loss:  1.7783843278884888


In [88]:
train[2].word

['rejects']

In [89]:
 WORD.process([train[2].word])

tensor([[11]])

In [90]:
model( WORD.process([train[2].word]))

tensor([[[-0.1385, -0.1008,  0.6222,  0.0114,  0.0293, -0.1196, -0.1297,
          -0.0929]]], grad_fn=<AddBackward0>)

In [95]:
# Evaluate the model
accuracy = 0
for example in test:
    words = example.word
    tags = example.tag
    # Convert words and tags to tensors
    words = WORD.process([words]) # The argument must be a list of lists
    tags = TAG.process([tags])
    # Compute the forward pass
    tag_scores = model(words)

    # Get the predicted tags
    predicted_tags = tag_scores.argmax()

    # Update the accuracy
    accuracy += (predicted_tags == tags).sum().item()

# Print the average accuracy
print(f"Average accuracy: {accuracy / len(test)}")

Average accuracy: 1.0


In [110]:
# Evaluate the model
accuracy = 0
for example in test:
    words = example.word
    tags = example.tag
    # Convert words and tags to tensors
    words = WORD.process([words]) # The argument must be a list of lists
    tags = TAG.process([tags])
    # Compute the forward pass
    
    tag_scores = model(words)

    # Get the predicted tags
    _, predicted_tags = torch.max(tag_scores, dim=2)
    
        # Convert the tensor to a list of integers
    orig_tags = tags.tolist()[0]
    pred_tags = predicted_tags.tolist()[0]
    word = words.tolist()[0]

    # Convert each integer to its corresponding string representation
    orig_tags = [TAG.vocab.itos[i] for i in orig_tags]
    pred_tags = [TAG.vocab.itos[i] for i in pred_tags]
    word = [WORD.vocab.itos[i] for i in word]
    
    #print prediction Vs real
    print("word: ", word[0], "     test_tag: ", orig_tags[0], "     pred_tag: ", pred_tags[0])

    # Update the accuracy
    correct_predictions = (predicted_tags == tags).sum().item()
    total_predictions = len(predicted_tags)
    accuracy += correct_predictions / total_predictions

# Print the average accuracy
print(f"Average accuracy: {accuracy / len(test)}")


word:  word      test_tag:  tag      pred_tag:  tag
word:  EU      test_tag:  B-ORG      pred_tag:  B-ORG
word:  rejects      test_tag:  O      pred_tag:  O
word:  German      test_tag:  B-MISC      pred_tag:  B-MISC
word:  call      test_tag:  O      pred_tag:  O
word:  to      test_tag:  O      pred_tag:  O
word:  boycott      test_tag:  O      pred_tag:  O
word:  British      test_tag:  B-MISC      pred_tag:  B-MISC
word:  lamb      test_tag:  O      pred_tag:  O
word:  .      test_tag:  O      pred_tag:  O
word:  Peter      test_tag:  B-PER      pred_tag:  B-PER
word:  Blackburn      test_tag:  I-PER      pred_tag:  I-PER
Average accuracy: 1.0


In [100]:
len(predicted_tags)

1

In [103]:
tags

tensor([[6]])

In [104]:
decoded_tags = TAG.decode(tags)

AttributeError: 'Field' object has no attribute 'decode'

In [105]:
# Convert the tensor to a list of integers
tags = tags.tolist()[0]

# Convert each integer to its corresponding string representation
tags = [TAG.vocab.itos[i] for i in tags]

In [106]:
tags

['I-PER']

In [111]:
for example in test:
    words = example.word
    tags = example.tag
# Convert tags to tensor
tags = TAG.process([tags])

# Convert tensor back to strings
tag_strings = TAG.reverse(tags)

# The returned value is a list of lists, so you need to get the first element to get the list of tags
tag_strings = tag_strings[0]

tag_strings

AttributeError: 'Field' object has no attribute 'reverse'