# Reconhecimento de Entidades Nomeadas usando LSTM

In [1]:
!pip install -q datasets gensim

In [2]:
import numpy as np
import pandas as pd

## Carregando os datasets de treinamento

In [3]:
from datasets import load_dataset

In [4]:
train_dataset = load_dataset("conll2003", split="train")

In [5]:
words = train_dataset['tokens']
ner_tags = train_dataset['ner_tags']

In [6]:
df_train = pd.DataFrame(train_dataset)

In [7]:
df_train[['tokens','ner_tags']].head()

Unnamed: 0,tokens,ner_tags
0,"[EU, rejects, German, call, to, boycott, Briti...","[3, 0, 7, 0, 0, 0, 7, 0, 0]"
1,"[Peter, Blackburn]","[1, 2]"
2,"[BRUSSELS, 1996-08-22]","[5, 0]"
3,"[The, European, Commission, said, on, Thursday...","[0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, ..."
4,"[Germany, 's, representative, to, the, Europea...","[5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, ..."


In [8]:
int2tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}

In [9]:
tag2int = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}

In [10]:
tag2tensor = {'O':      [1,0,0,0,0,0,0,0,0],
              'B-PER':  [0,1,0,0,0,0,0,0,0], 
              'I-PER':  [0,0,1,0,0,0,0,0,0], 
              'B-ORG':  [0,0,0,1,0,0,0,0,0], 
              'I-ORG':  [0,0,0,0,1,0,0,0,0], 
              'B-LOC':  [0,0,0,0,0,1,0,0,0], 
              'I-LOC':  [0,0,0,0,0,0,1,0,0], 
              'B-MISC': [0,0,0,0,0,0,0,1,0], 
              'I-MISC': [0,0,0,0,0,0,0,0,1]}

In [11]:
words = train_dataset['tokens']
ner_tags = train_dataset['ner_tags']

for i in range(0,len(words[0])):
    print(f'Word: {words[0][i]} | text label: {int2tag[ner_tags[0][i]]} | one-hot label {tag2tensor[int2tag[ner_tags[0][i]]]}')

Word: EU | text label: B-ORG | one-hot label [0, 0, 0, 1, 0, 0, 0, 0, 0]
Word: rejects | text label: O | one-hot label [1, 0, 0, 0, 0, 0, 0, 0, 0]
Word: German | text label: B-MISC | one-hot label [0, 0, 0, 0, 0, 0, 0, 1, 0]
Word: call | text label: O | one-hot label [1, 0, 0, 0, 0, 0, 0, 0, 0]
Word: to | text label: O | one-hot label [1, 0, 0, 0, 0, 0, 0, 0, 0]
Word: boycott | text label: O | one-hot label [1, 0, 0, 0, 0, 0, 0, 0, 0]
Word: British | text label: B-MISC | one-hot label [0, 0, 0, 0, 0, 0, 0, 1, 0]
Word: lamb | text label: O | one-hot label [1, 0, 0, 0, 0, 0, 0, 0, 0]
Word: . | text label: O | one-hot label [1, 0, 0, 0, 0, 0, 0, 0, 0]


## Usando word2vec para os embeddings

In [12]:
import gensim.downloader as api

In [13]:
word2vec = api.load("word2vec-google-news-300", )

In [14]:
def tokens2vec(tokens):
    vectors = []
    for token in tokens:
        try:
            vector = word2vec[token]
            vectors.append(vector)
        except KeyError:
            vectors.append(np.zeros(word2vec.vector_size))
    return vectors

In [15]:
words_vectors = [tokens2vec(word) for word in words]

In [16]:
for i in range(0,len(words[0])):
    print(f'Word: {words[0][i]} | text label: {int2tag[ner_tags[0][i]]} | one-hot label {tag2tensor[int2tag[ner_tags[0][i]]]} | word2vec {words_vectors[0][i][:3]}')

Word: EU | text label: B-ORG | one-hot label [0, 0, 0, 1, 0, 0, 0, 0, 0] | word2vec [ 0.03735352 -0.203125    0.21289062]
Word: rejects | text label: O | one-hot label [1, 0, 0, 0, 0, 0, 0, 0, 0] | word2vec [0.00982666 0.2265625  0.28125   ]
Word: German | text label: B-MISC | one-hot label [0, 0, 0, 0, 0, 0, 0, 1, 0] | word2vec [0.30664062 0.11035156 0.16699219]
Word: call | text label: O | one-hot label [1, 0, 0, 0, 0, 0, 0, 0, 0] | word2vec [-0.11816406  0.08154297  0.15039062]
Word: to | text label: O | one-hot label [1, 0, 0, 0, 0, 0, 0, 0, 0] | word2vec [0. 0. 0.]
Word: boycott | text label: O | one-hot label [1, 0, 0, 0, 0, 0, 0, 0, 0] | word2vec [-0.04272461  0.08203125  0.12207031]
Word: British | text label: B-MISC | one-hot label [0, 0, 0, 0, 0, 0, 0, 1, 0] | word2vec [-0.06542969 -0.02038574  0.01452637]
Word: lamb | text label: O | one-hot label [1, 0, 0, 0, 0, 0, 0, 0, 0] | word2vec [-0.08007812  0.15625    -0.35351562]
Word: . | text label: O | one-hot label [1, 0, 0, 0,

## Criando o DataLoader

In [17]:
import torch
from torch.utils.data import TensorDataset, DataLoader

In [18]:
X_train, y_train = [], []
for vectors, tags in zip(words_vectors, ner_tags):
    for vec, tag in zip(vectors, tags):
        X_train.append(torch.tensor(vec,  dtype=torch.float32))
        y_train.append(torch.tensor(tag2tensor[int2tag[tag]],  dtype=torch.float32))

In [19]:
dataset = TensorDataset(torch.stack(X_train), torch.stack(y_train))

In [20]:
data_loader = DataLoader(dataset, batch_size=16, shuffle=True)

## Classe da rede LSTM

In [21]:
import torch.nn as nn
import torch.optim as optim

In [22]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = x.unsqueeze(1)  
        lstm_out, _ = self.lstm(x)
        last_hidden_state = lstm_out[:, -1, :]
        output = self.fc(last_hidden_state)        
        return output

## Instanciando a rede LSTM

In [23]:
model = LSTMModel(input_size=300, hidden_size=128, output_size=9)

In [24]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Treinando a rede LSTM

In [25]:
num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in data_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(data_loader):.4f}')

  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch [1/10], Loss: 0.2506
Epoch [2/10], Loss: 0.1929
Epoch [3/10], Loss: 0.1764
Epoch [4/10], Loss: 0.1657
Epoch [5/10], Loss: 0.1582
Epoch [6/10], Loss: 0.1527
Epoch [7/10], Loss: 0.1484
Epoch [8/10], Loss: 0.1455
Epoch [9/10], Loss: 0.1429
Epoch [10/10], Loss: 0.1408


In [26]:
model.eval()

LSTMModel(
  (lstm): LSTM(300, 128)
  (fc): Linear(in_features=128, out_features=9, bias=True)
)

## Testando a rede LSTM

In [27]:
from datasets import Dataset

In [28]:
test = pd.read_csv('test.csv')

In [29]:
test.head()

Unnamed: 0,ID,tokens
0,2619,"['(', '52.76', '/', '53.18', ')']"
1,456,"['WESTERN', 'CONFERENCE']"
2,102,"['Wasim', 'Akram', 'b', 'Harris', '4']"
3,3037,"['Mansfield', '21', '5', '9', '7', '21', '22',..."
4,1126,"['--', 'New', 'York', 'Commodities', 'Desk', '..."


In [30]:
test_dataset = Dataset.from_pandas(test)

In [31]:
words = test_dataset['tokens']

In [32]:
words = [eval(text) for text in words]

In [33]:
words[0]

['(', '52.76', '/', '53.18', ')']

In [34]:
words_vectors = [tokens2vec(word) for word in words]

In [35]:
X_test = []
for vectors in words_vectors:
    for vec in vectors:
        X_test.append(torch.tensor(vec,  dtype=torch.float32))

In [36]:
dataset_test = TensorDataset(torch.stack(X_test))

In [37]:
test_loader = DataLoader(dataset_test, batch_size=16, shuffle=True)

In [38]:
predicted_list = []
with torch.no_grad():
    for inputs in test_loader:
        outputs = model(*inputs)
        _, predicted = torch.max(outputs, 1) 
        predicted_list.extend(predicted.numpy())

In [39]:
predicted_list = np.array(predicted_list)

In [40]:
predicted_list[:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 0, 5, 2, 3, 0, 0, 0, 0, 0, 5, 0, 3,
       3, 2, 0, 3, 0, 0, 6, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0,
       0, 0, 2, 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3,
       8, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 5, 0, 3, 0, 0])

In [41]:
y_pred = []
for word in words:
    aux = len(word)
    y_pred.append(predicted_list[:aux])
    predicted_list = predicted_list[aux:]

In [42]:
y_pred = [array.tolist() for array in y_pred]

In [43]:
pred = pd.DataFrame({'ner_tags': y_pred}, index=test.ID) 

In [44]:
pred.to_csv('ner-solution.csv')