In [2]:
from data_preprocessing import *

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.nn.functional as F

In [4]:
Xtrain = torch.tensor(X_train[:50],dtype=torch.long)
Xtest = torch.tensor(X_test[:50],dtype=torch.long)
ytrain = torch.tensor(y_train[:50],dtype=torch.long)
ytest = torch.tensor(y_test[:50], dtype=torch.long)
Xdev = torch.tensor(X_dev[:50],dtype=torch.long)
ydev = torch.tensor(y_dev[:50],dtype=torch.long)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [7]:
# the pre-processing steps for data before beginning training the model

loader_train = data.DataLoader(data.TensorDataset(Xtrain, ytrain), shuffle=True, batch_size=4)
loader_dev = data.DataLoader(data.TensorDataset(Xdev, ydev), shuffle=True, batch_size=4)
loader_test = data.DataLoader(data.TensorDataset(Xtest, ytest), shuffle=True, batch_size=4)

In [16]:
class BahdanauAttention(nn.Module):
    
    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim=hidden_dim
        self.w1=nn.Linear(hidden_dim, hidden_dim)
        self.w2=nn.Linear(hidden_dim, hidden_dim)
        self.V=nn.Linear(hidden_dim,1)
    
    def forward (self, hidden, encoder_outputs):
        seq_len=encoder_outputs.shape[0]
       
        hidden=hidden.unsqueeze(1).repeat(1, seq_len, 1)
        encoder_outputs = encoder_outputs.permute(1,0,2)
        
        scores = self.V(torch.tanh(self.w1(hidden) + self.w2(encoder_outputs)))
        attention = scores.squeeze(2)
        return F.softmax(attention,dim=-1)

In [17]:
class RNNAttentionNER(nn.Module):

    def __init__(self, input_dim, embedding_dim, hidden_size, output_dim):

        super(RNNAttentionNER, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, num_layers=2, dropout=0.5)
        self.attention = BahdanauAttention(hidden_size)
        self.fc = nn.Linear(hidden_size * 2, output_dim)
        
    def forward(self, text):
        
        embedded = self.embedding(text)
        outputs, hidden = self.rnn(embedded)
        #print("Das ist hidden",outputs.shape)
        seq_len, batch_size, _ = outputs.size()
        logits = []
        
        for i in range(seq_len):
            
            attention_weights = self.attention(hidden[1], outputs)
            
            #context = torch.bmm(attention_weights, outputs)
            context = torch.bmm(attention_weights.unsqueeze(1), outputs.permute(1, 0, 2)).squeeze(1)
            logit = self.fc(torch.cat((hidden[1], context), dim=1))
            logits.append(logit)
        logits = torch.stack(logits, dim=0)
        return logits

In [10]:
EMBEDDING_DIM = 100

HIDDEN_DIM = 128

OUTPUT_DIM = TAG_COUNT

In [18]:
model_rnn=RNNAttentionNER(len_uniq_words,EMBEDDING_DIM,HIDDEN_DIM,OUTPUT_DIM).to(device)
criterion = nn.CrossEntropyLoss()
optimizer_rnn = optim.Adam(model_rnn.parameters())

In [19]:
num_epochs=10
for epoch in range(num_epochs):
    model_rnn.train()
    train_loss=0.0
    #print(epoch)
    for x_batch, y_batch in loader_train:
        
        optimizer_rnn.zero_grad()
        tag_scores = model_rnn(x_batch)
        predictions=tag_scores.view(-1,tag_scores.shape[-1])
        
        tags=y_batch.view(-1)
        #print("true tag",tags)
        loss = criterion(predictions, tags)
        loss.backward()
        optimizer_rnn.step()
        train_loss += loss.item()
    train_loss/=len(loader_train)
    
    print(f"Epoch {epoch + 1}, Train Loss: {train_loss}")

Epoch 1, Train Loss: 1.2523417656238263
Epoch 2, Train Loss: 0.6063597179376162
Epoch 3, Train Loss: 0.4307935833930969
Epoch 4, Train Loss: 0.31366980648957765
Epoch 5, Train Loss: 0.30020449597101945
Epoch 6, Train Loss: 0.2959983085210507
Epoch 7, Train Loss: 0.2743983303125088
Epoch 8, Train Loss: 0.2678031852612129
Epoch 9, Train Loss: 0.278996358697231
Epoch 10, Train Loss: 0.26663968654779285
