# Neural Network

This is the core of our project. For running this, you need to have created three accordingly formatted splits of the data by running these files from the repository:
* preprocessing.sh
* splitting_data.py

You don't need to run the following files because this Notebook accesses them directly:
* data_loading.py
* tokenizer.py
* embeddings.py

## Imports

In [1]:
import datasets
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from tokenizer import tokenize_and_encode, encode_pos
from embeddings import embedding_model

## Data
Please insert the names of the three splits below:

In [2]:
training_file = "sample_train.tsv"
validation_file = "sample_val.tsv"
test_file = "sample_test.tsv"

## Loading the data

In [4]:
data_files = {
            "train": training_file,
            "test": test_file,
            "validation": validation_file
        }

data_sets = datasets.load_dataset("data_loading.py", data_files=data_files)

Using custom data configuration default-eb9e404412238167
Reusing dataset sample (C:\Users\Luisa\.cache\huggingface\datasets\sample\default-eb9e404412238167\0.0.0\b17a92fd5558fd4c73932ea4beb2edc3fd0b9014fe60a7232b02b67dc71bf1b0)


## Tokenizing the data

In [6]:
# tokens
tokenized_train = tokenize_and_encode("train", data_sets)
tokenized_test = tokenize_and_encode("test", data_sets)
tokenized_validation = tokenize_and_encode("validation", data_sets)
# lists of dictionaries with keys: "input_ids", "token_type_ids", "attention_mask"

# pos-tags
pos_encoded_train = encode_pos("train", data_sets)
pos_encoded_test = encode_pos("test", data_sets)
pos_encoded_validation = encode_pos("validation", data_sets)

print(tokenized_train[0]) # first sentence, tokenized
print(tokenized_train[0]["input_ids"].shape)
print(pos_encoded_train[0]) # first sentence, pos, tokenized
print(pos_encoded_train[0]["input_ids"].shape)

{'input_ids': tensor([[  101, 20802, 15624,  6718,  3128,  5571,   117, 19869, 10753,   117,
          1347,  2518,   117,  1118, 13509, 18745,  8745,  4553, 26197,  1105,
         20802, 13284,  1179, 26197,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 0, 0, 0, 0, 0

## Embeddings
!Attention: This takes very long and I am not sure whether it is even needed or better done inside the network! (so maybe don't run it)

In [None]:
embeddings_train = list()
for sentence in tokenized_train:
    embeddings_train.append(embedding_model(sentence["input_ids"]))
    
embeddings_test = list()
for sentence in tokenized_train:
    embeddings_test.append(embedding_model(sentence["input_ids"]))
    
embeddings_validation = list()
for sentence in tokenized_validation:
    embeddings_validation.append(embedding_model(sentence["input_ids"]))
    
embeddings_train[0] # embeddings for first sentence

In [None]:
print(embeddings_train[0][0].shape)
embeddings_train[0][1].shape

## Defining Hyperparameters

In [25]:
sequence_length = 73
# others will be extracted from the code below in the end

## Network architecture
For the architecture of the network, we decided to go for an LSTM because it work well for sequence processing while at the same time handling vanishing and exploding gradients better than a regular RNN.<br>

I found the following architecture online. <br>
My problem is that I don't know where to do the embeddings.
Many implementations I have seen have an embedding layer inside the network and they use nn.Embedding. But this doesn't exactly go with my embeddings implementation (see Embeddings above). How am I supposed to incorporate the embeddings?<br>

Also, I have no idea how to decide on any of the hyperparameters. (I am very unexperienced as you can probably already tell)<br>

And: How do I include the tracking tool wandb?<br>

Note: The below implementation does not work. I haven't had time to debug yet. Any suggestions are very welcome.

In [7]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        ''' Initialize the layers of this model.'''
        super(LSTMTagger, self).__init__()
        
        self.hidden_dim = hidden_dim

        # embedding layer that turns words into a vector of a specified size
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # the LSTM takes embedded word vectors (of a specified size) as inputs 
        # and outputs hidden states of size hidden_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # the linear layer that maps the hidden state output dimension 
        # to the number of tags we want as output, tagset_size (in this case this is 3 tags)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        
        # initialize the hidden state (see code below)
        self.hidden = self.init_hidden()

        
    def init_hidden(self):
        ''' At the start of training, we need to initialize a hidden state;
           there will be none because the hidden state is formed based on perviously seen data.
           So, this function defines a hidden state with all zeroes and of a specified size.'''
        # The axes dimensions are (n_layers, batch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        ''' Define the feedforward behavior of the model.'''
        # create embedded word vectors for each word in a sentence
        embeds = self.word_embeddings(sentence)
        
        # get the output and hidden state by passing the lstm over our word embeddings
        # the lstm takes in our embeddings and hiddent state
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        
        # get the scores for the most likely tag for a word
        tag_outputs = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_outputs, dim=1)
        
        return tag_scores

## Model instantiation

In [20]:
EMBEDDING_DIM = 64
HIDDEN_DIM = 6

# instantiate our model
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, sequence_length, sequence_length)

# define our loss and optimizer
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

## Training loop

In [24]:
n_epochs = 3

for epoch in range(n_epochs):
    
    epoch_loss = 0.0
    
    # get all sentences and corresponding tags in the training data
    for n, sentence in enumerate(tokenized_train):
        
        # zero the gradients
        model.zero_grad()

        # zero the hidden state of the LSTM, this detaches it from its history
        model.hidden = model.init_hidden()

        # prepare the inputs for processing by out network
        x = sentence["input_ids"]
        print(x)
        print(x.shape)
        y_gold = pos_encoded_train[n]["input_ids"]
        print(y_gold)
        print(y_gold.shape)

        # forward pass to get tag scores
        tag_scores = model(x) # doesn't work for whatever reason

        # compute the loss, and gradients 
        loss = loss_function(tag_scores, y_gold)
        epoch_loss += loss.item()
        loss.backward()
        
        # update the model parameters with optimizer.step()
        optimizer.step()

tensor([[  101, 20802, 15624,  6718,  3128,  5571,   117, 19869, 10753,   117,
          1347,  2518,   117,  1118, 13509, 18745,  8745,  4553, 26197,  1105,
         20802, 13284,  1179, 26197,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0]])
torch.Size([1, 73])
tensor([[101,  18,  18,  18,   1,  18,   1,  18,  17,   1,  12,  20,  18,  18,
           6,  18,  18, 102,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,


IndexError: index out of range in self