In [14]:
import pandas as pd

df=pd.read_csv("data/recognition_data.csv",encoding='utf-8',sep=',',encoding_errors='ignore')
df.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [15]:
df["Tag"].value_counts()

Tag
O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: count, dtype: int64

**Label Explanation**: 
- `O` = Outside (not an entity), 
- `B-*` = Beginning of entity, 
- `I-*` = Inside/continuation of entity, where `*` represents the entity type: 
- `geo` (geopolitical), 
- `tim` (time), 
- `org` (organization), 
- `per` (person), 
- `gpe` (geopolitical entity), 
- `art` (artifact), 
- `eve` (event), 
- `nat` (natural).

In [16]:
df.shape

(1048575, 4)

In [17]:
df["Tag"]

0          O
1          O
2          O
3          O
4          O
          ..
1048570    O
1048571    O
1048572    O
1048573    O
1048574    O
Name: Tag, Length: 1048575, dtype: object

# Target:
sentences = [
    ["Thousands", "of", "demonstrators", "have", ...],
    ["Families", "of", "soldiers", "killed", ...]
]

labels = [
    ["O", "O", "O", "O", "B-geo", ...],
    ["O", "O", "O", "O", ...]
]

In [18]:
df["Sentence #"].fillna(method="ffill",inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Sentence #"].fillna(method="ffill",inplace=True)
  df["Sentence #"].fillna(method="ffill",inplace=True)


In [19]:
df.groupby("Sentence #")["Word"].apply(list).values[:4]

array([list(['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']),
       list(['Iranian', 'officials', 'say', 'they', 'expect', 'to', 'get', 'access', 'to', 'sealed', 'sensitive', 'parts', 'of', 'the', 'plant', 'Wednesday', ',', 'after', 'an', 'IAEA', 'surveillance', 'system', 'begins', 'functioning', '.']),
       list(['Helicopter', 'gunships', 'Saturday', 'pounded', 'militant', 'hideouts', 'in', 'the', 'Orakzai', 'tribal', 'region', ',', 'where', 'many', 'Taliban', 'militants', 'are', 'believed', 'to', 'have', 'fled', 'to', 'avoid', 'an', 'earlier', 'military', 'offensive', 'in', 'nearby', 'South', 'Waziristan', '.']),
       list(['They', 'left', 'after', 'a', 'tense', 'hour-long', 'standoff', 'with', 'riot', 'police', '.'])],
      dtype=object)

In [20]:
sentences=df.groupby("Sentence #")["Word"].apply(list).values.tolist()
labels=df.groupby("Sentence #")["Tag"].apply(list).values.tolist()

sentences[:2], labels[:2]

([['Thousands',
   'of',
   'demonstrators',
   'have',
   'marched',
   'through',
   'London',
   'to',
   'protest',
   'the',
   'war',
   'in',
   'Iraq',
   'and',
   'demand',
   'the',
   'withdrawal',
   'of',
   'British',
   'troops',
   'from',
   'that',
   'country',
   '.'],
  ['Iranian',
   'officials',
   'say',
   'they',
   'expect',
   'to',
   'get',
   'access',
   'to',
   'sealed',
   'sensitive',
   'parts',
   'of',
   'the',
   'plant',
   'Wednesday',
   ',',
   'after',
   'an',
   'IAEA',
   'surveillance',
   'system',
   'begins',
   'functioning',
   '.']],
 [['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-geo',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-geo',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-gpe',
   'O',
   'O',
   'O',
   'O',
   'O'],
  ['B-gpe',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-tim',
   'O',
   'O',
   'O',
   'B-org',
   'O',
   'O',
   'O',
  

In [21]:
from collections import defaultdict

word2idx = {"<PAD>": 0, "<UNK>": 1}
tag2idx = {"<PAD>": 0}

for sent in sentences:
    for w in sent:
        if w not in word2idx:
            word2idx[w] = len(word2idx)

for tag_seq in labels:
    for t in tag_seq:
        if t not in tag2idx:
            tag2idx[t] = len(tag2idx)

idx2tag = {v: k for k, v in tag2idx.items()}


In [22]:
def encode(sentences, labels, word2idx, tag2idx):
    X, y = [], []
    for sent, labels in zip(sentences, labels):
        X.append([word2idx.get(word,word2idx["<UNK>"]) for word in sent]) #Return the value for key if key is in the dictionary, else default.
        y.append([tag2idx[t] for t in labels])
    return X, y

X, y = encode(sentences, labels, word2idx, tag2idx)


In [23]:
import torch

MAX_LEN = max(len(s) for s in X)

def pad(seq, max_len):
    return seq + [0] * (max_len - len(seq))

X = torch.tensor([pad(s, MAX_LEN) for s in X])
y = torch.tensor([pad(s, MAX_LEN) for s in y])


In [24]:
import torch.nn as nn

class NERModel(nn.Module):
    def __init__(self, vocab_size, target_size, embedding_dim, hidden_dim, num_layers):
        super(NERModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, target_size) # since bidirectional we multiply by 2 
        
    def forward(self, x):
        embed = self.embedding(x)
        lstm_output, _ = self.lstm(embed)
        x = self.fc(lstm_output)
        return x
    

In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model = NERModel(vocab_size=len(word2idx), target_size=len(tag2idx), embedding_dim=128, hidden_dim=256, num_layers=2)
model = model.to(device)
model

Using device: cpu


NERModel(
  (embedding): Embedding(35171, 128, padding_idx=0)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=18, bias=True)
)

In [26]:
loss_fn = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
from tqdm import tqdm

EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for i in tqdm(range(len(X))):
        
        optimizer.zero_grad()
        x_batch = X[i].unsqueeze(0).to(device)  # Move to GPU
        y_batch = y[i].to(device)  # Move to GPU
        outputs = model(x_batch)  # Add batch dimension as lstm requires 3D input
        loss = loss_fn(outputs.view(-1, len(tag2idx)), y_batch.view(-1)) # Reshape for loss computation
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

In [27]:
import torch

def predict(sentence):
    # Load the model properly
    loaded_model = NERModel(vocab_size=len(word2idx), target_size=len(tag2idx), embedding_dim=128, hidden_dim=256, num_layers=2)
    loaded_model.load_state_dict(torch.load("ner_model.pth", map_location=torch.device('cpu')))
    loaded_model = loaded_model.to(device)
    loaded_model.eval()
    
    with torch.no_grad():
        tokens = sentence.split()
        encoded = [word2idx.get(w, word2idx["<UNK>"]) for w in tokens]
        padded = encoded + [0] * (MAX_LEN - len(encoded))
        x = torch.tensor([padded]).to(device)

        logits = loaded_model(x)
        preds = torch.argmax(logits, dim=-1)[0]

        return [(w, idx2tag[p.item()]) for w, p in zip(tokens, preds)]


In [None]:
# torch.save(model.state_dict(), "ner_model.pth")

In [31]:
import spacy
from spacy import displacy
from IPython.display import HTML

def visualize_predictions(sentence):
    """Visualize BiLSTM NER predictions using spaCy displacy style"""
    predictions = predict(sentence)
    
    # Create spaCy-style doc format
    tokens = sentence.split()
    entities = []
    char_pos = 0
    
    for i, (token, tag) in enumerate(predictions):
        if tag != 'O':  # Only include actual entities
            start = sentence.find(token, char_pos)
            end = start + len(token)
            entities.append({
                'start': start,
                'end': end,
                'label': tag
            })
            char_pos = end
    
    # Create doc object for displacy
    doc_dict = {
        'text': sentence,
        'ents': entities,
        'title': None
    }
    
    # Force jupyter=False to avoid IPython display import issues
    html = displacy.render(doc_dict, style='ent', manual=True, jupyter=False)
    return HTML(html)


In [34]:
visualize_predictions("The Airline industry is facing challenges in 2023. This affects companies like Delta and American Airlines.")