# To-Do

[X] Load the data  
[ ] Design the LSTM model  
  
    [X] Embedding layer. 
    [X] Linear Layer  
[ X ] Train function  
   
    [X] Cross Entropy loss  
    [X] Adam Optmizer  
    
[__] COMET


#### Evaluation
[  ] Tagging accuracy on a given sentence

[  ] Accuracy on __Development__ Corpus after __each__ epoch

[  ] Accuracy on __Trainning__ Corpus after __each__ epoch

[  ] Accuracy on __Test__ Corpus

In [1]:
# importing comet first
from comet_ml import Experiment

experiment = Experiment(project_name="LSTM-PosTagger")

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/lflage/lstm-postagger/9548949d0273489eadde2c80936127aa



In [2]:
import data
import torch
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm
from torch.nn import functional
from torch.nn import Module
from torch.utils.data import Dataset, DataLoader


##
# Loading the corpus into the following variables
#    train_dataloader           DataLoader for iterating over the training data
#    dev_dataloader             DataLoader for iterating over the development data
#    test_dataloader            DataLoader for iterating over the test data
#    vocabulary                 Vocabulary of words in the sentences in the data
#    tagset                     Vocabulary of POS tags in the data
#    pretrained_embeddings      Pretrained fasttext word embeddings 
##

train_dataloader,dev_dataloader,test_dataloader,vocabulary,tagset,pretrained_embeddings = data.load(
'corpus/de_gsd-ud-train.conllu',
'corpus/de_gsd-ud-dev.conllu',
'corpus/de_gsd-ud-test.conllu'
)

In [4]:
##
# Print information of Embeddings, Tagset and Vocab
##

print("\nPretrained Embeddings shape:")
print(pretrained_embeddings.shape)

print("\nTagset size:{}\n".format(len(tagset)))
print('Tags:\n{}'.format(tagset.lookup_tokens(range(0,len(tagset)))))

print("\nVocab size: {}".format(len(vocabulary)))

print("\nVocab sample:")
print(vocabulary.lookup_tokens(range(0,50)))




Pretrained Embeddings shape:
torch.Size([50506, 300])

Tagset size:18

Tags:
['NOUN', 'PUNCT', 'DET', 'ADP', 'PROPN', 'VERB', 'ADJ', 'ADV', 'PRON', 'AUX', 'CCONJ', 'NUM', '_', 'PART', 'SCONJ', 'X', 'SYM', 'INTJ']

Vocab size: 50506

Vocab sample:
['<unk>', '<pad>', '</s>', ',', 'der', 'die', 'in', 'und', 'dem', '-', 'von', 'zu', 'den', 'das', 'im', 'mit', 'ist', 'er', 'des', 'an', '"', ')', '(', 'ein', 'eine', 'auf', 'als', 'für', 'sich', 'wurde', 'auch', 'war', 'nach', 'bei', 'sie', 'es', 'nicht', 'aus', 'bis', 'sind', 'einer', 'werden', 'zum', 'durch', 'wird', 'ich', 'am', 'einen', 'einem', 'zur']


In [5]:
for minibatch in train_dataloader:

    print('Shape of the batch Tensor objects: {}\n'.format(minibatch.size()))
    print('\nFirst POS Tags:\n')
    print(' '.join(tagset.lookup_tokens(minibatch[0][1].flatten().tolist())))
    print('\nFirst Sentence:\n')
    print(' '.join(vocabulary.lookup_tokens(minibatch[0][0].flatten().tolist())))
    break

Shape of the batch Tensor objects: torch.Size([1, 2, 16])


First POS Tags:

ADV ADJ NOUN PUNCT ADJ NOUN DET NOUN PUNCT ADV VERB PRON PRON NOUN ADP PUNCT

First Sentence:

sehr gute beratung , schnelle behebung der probleme , so stelle ich mir kundenservice vor </s>


In [6]:
class LSTMPosTagger(nn.Module):
    def __init__(self,
        embeddings,
        hidden_dim,
        tagset_size):
        super(LSTMPosTagger, self).__init__()
        
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding.from_pretrained(embeddings)

        # The LSTM receives the word embedding vector a input and outputs
        # a vector of size int(hidden_dim). This size can be changed to test
        # it's influence on the model performance
        self.lstm = nn.LSTM(self.word_embeddings.embedding_dim, hidden_dim)

        # A Linear layer that receives the output of the LSTM model with
        # size int(hidden_dim) and outputs a vector of size int(tagset_size) 
        self.hid_to_tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        '''sentence is a list of indices for the words in the pre trained embedding
        model. Embeds '''
        embeds = self.word_embeddings(sentence)
        ''''''
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_scores = self.hid_to_tag(lstm_out.view(len(sentence), -1))
        
        return tag_scores

In [7]:
def train_model_comet(model: Module, 
                train_data: DataLoader,
                num_epochs: int,
                optimizer_type,
                loss_function,
                learning_rate: float) -> None:
    """
    runs one commplete training run, i.e. trains the model on your training data for
    :param model: a pytorch model
    :param train_data: a dataloader for getting the training instances
    :param num_epochs: the number of epochs to train
    :param optimizer_type: the type of optimizer to use for training
    :param loss_function: the type of loss function to use
    :param learning_rate: the learning rate for the optimizer
    :return:
    """
    
    
    print(f'--------- Start Training ------------')

    # Important: bring model into training mode
    model.train()
    experiment = Experiment()
    
    optimizer = optimizer_type(params=model.parameters(), lr=learning_rate)

    # run training for specified number of epochs; use tqdm to keep track of progress / estimated run time 
    with experiment.train():
        step=0

        for epoch in tqdm(range(num_epochs), desc='Classifier Training\n'):
            # Cummulative loss per batch
            cum_loss = 0
            # Number of correct predictions
            correct = 0
            # Number of total tokens predicted
            total = 0
            
            print(f'---------- Started Epoch {epoch} -----------')

            for batch in train_data:
                # get the input instances 
                input_attributes = batch[0][0].to(device)
                # get the corresponding labels
                gold_labels = batch[0][1].to(device)
                
                # compute model predictions with current model parameters
                model_output = model(input_attributes)
    
                # Compute Loss for current batch
                loss = loss_function(model_output, gold_labels)
                cum_loss += loss.item()
                  
                #Important: otherwise you add up your gradients for all batches and for all epochs
                optimizer.zero_grad()
    
                loss.backward()
    
                # Update parameters
                optimizer.step()
            
                ##################################################
                # COMET
                # Compute train accuracy
                # Torch.max returns a namedtuple where of (value,indices)
                # where ```values``` is the maximum value of each rou of the
                # input tensor in the given dimension 
                _, predicted = torch.max(model_output.data, 1)

                batch_total = gold_labels.size(0)
                total += batch_total

                batch_correct = (predicted == gold_labels.data).sum()
                correct += batch_correct

                 # Log batch_accuracy to Comet.ml; step is each batch
            step+=1
            experiment.log_metric("mean_epoch_accuracy", batch_correct / batch_total, step=step)
                ################################################
                
            mean_loss_per_epoch = cum_loss/len(train_data)
            experiment.log_metric('Mean_loss_per_epoch',mean_loss_per_epoch,step)
            print(mean_loss_per_epoch)

### Hyperparameters

In [8]:
# Hyper Parameters
HIDDEN_SIZE = 300

NUM_EPOCHS = 100

LEARNING_RATE = 0.01

# Inputs
EMBEDDINGS = pretrained_embeddings

OPTIMIZER = optim.Adam

LOSS_FUNCTION = nn.functional.cross_entropy


In [10]:
HyperParameters = {'HiddenSize': HIDDEN_SIZE,
                   'NumEpochs': NUM_EPOCHS,
                   'LearningRate': LEARNING_RATE
                  }

experiment.log_parameters(HyperParameters)

### Initializing the model

In [11]:
POS_TAGGER = LSTMPosTagger(pretrained_embeddings,HIDDEN_SIZE,len(tagset))

device = torch.device('cpu')

# Training the model on Development Data

In [None]:
train_model_comet(POS_TAGGER,
            dev_dataloader,
            NUM_EPOCHS,
            OPTIMIZER,
            LOSS_FUNCTION,
            LEARNING_RATE)

torch.save(POS_TAGGER.state_dict(),'./LSTM_PosTagger_DEV.pt')

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/lflage/general/e0f7f4bb701d46d4a8028fa08ffa4059
COMET INFO:   Uploads:
COMET INFO:     environment details      : 1
COMET INFO:     filename                 : 1
COMET INFO:     git metadata             : 1
COMET INFO:     git-patch (uncompressed) : 1 (56 KB)
COMET INFO:     installed packages       : 1
COMET INFO:     os packages              : 1
COMET INFO: ---------------------------


--------- Start Training ------------


COMET INFO: Experiment is live on comet.ml https://www.comet.ml/lflage/general/a81873a349a846f2ba14cb9516df5b0e



Classifier Training
:   0%|          | 0/100 [00:00<?, ?it/s][A[A

---------- Started Epoch 0 -----------


In [None]:
# Saving the Model

torch.save(POS_TAGGER.state_dict(),'./LSTM_PosTagger_DEV.pt')

# Test Eval

In [None]:
def compute_accuracy(y_true, y_pred):
    correct_predictions = 0
    # iterate over each label and check
    for true, predicted in zip(y_true, y_pred):
        if true == predicted:
            correct_predictions += 1
    # compute the accuracy
    accuracy = correct_predictions/len(y_true)
    return accuracy


In [None]:
POS_TAGGER.eval()

for batch in test_dataloader:
    input_attributes = batch[0][0].to(device)
    gold_labels = batch[0][1].to(device)
    
    model_output = POS_TAGGER(input_attributes)
    
    print(model_output.data)
    print('\n\n\nPREDICTED')
    _, predicted = torch.max(model_output.data, 1)
    print(predicted)

    break

In [3]:
POS_TAGGER.eval()

POS_TAGGER('')

with experiment.test():
    with torch.no_grad():  
        correct=0
        total=0
           
        test_predictions = None
        test_targets = None
        
        for batch in test_dataloader:
            print(f'--------- Evaluate Model ------------')
            input_attributes = batch[0][0].to(device)
            # get the corresponding labels
            gold_labels = batch[0][1].to(device)

            model_output = POS_TAGGER(input_attributes)
            
            print(gold_labels.size())
            break

        # run trained model on test instances

        # compute evaluation metrics to evaluate the model performance based on predictions of the model

NameError: name 'POS_TAGGER' is not defined

In [None]:
POS_TAGGER.eval()

for batch in test_dataloader:
    input_attributes = batch[0][0].to(device)
    gold_labels = batch[0][1].to(device)
    
    model_output = POS_TAGGER(input_attributes)
    
    print(model_output.data)
    print('\n\n\nPREDICTED')
    _, predicted = torch.max(model_output.data, 1)
    print(predicted)
    print( '\n\n\nGOLD LABEL SIZE:')
    print(gold_labels.size())
    break