In [62]:
%pip install transformers



In [63]:
%pip install seqeval



In [64]:
import json
import pickle
import time
import datetime
import random
import os
import csv

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from seqeval.metrics import f1_score

import matplotlib.pyplot as plt

device = torch.device("cpu")

SEED_VAL = 42

random.seed(SEED_VAL)
np.random.seed(SEED_VAL)
torch.manual_seed(SEED_VAL);  # Semicolon prevents jupyter from displaying last line as output

In [65]:
from google.colab import drive

drive.mount('/content/drive/')


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [108]:
SNIPS_PATH = "drive/MyDrive/datasets"
TRAIN_PATH = f"{SNIPS_PATH}/train_filtered.csv"
VAL_PATH = f"{SNIPS_PATH}/val_filtered.csv"
TEST_PATH = f"{SNIPS_PATH}/test_filtered.csv"
#df = pd.read_csv(TEST_PATH,sep=',')

# def load_snips_file(file_path):
#     list_pair =[]
#     with open(file_path,'r',encoding="utf8") as f:
#         for line in f:
#             split_line = line.split(',')
#             pair = split_line[1],split_line[2].strip()
#             list_pair.append(pair)
#     return list_pair


In [109]:
df_train = pd.read_csv(TRAIN_PATH, skipinitialspace=True)
df_valid = pd.read_csv(VAL_PATH, skipinitialspace=True)
df_test = pd.read_csv(TEST_PATH, skipinitialspace=True)

In [110]:
df_train['Subject'] = df_train['Subject'].apply(lambda x: None if x == 'me' else x)
df_valid['Subject'] = df_valid['Subject'].apply(lambda x: None if x == 'me' else x)
df_test['Subject'] = df_test['Subject'].apply(lambda x: None if x == 'me' else x)

df_train.dropna(inplace=True)
df_train = df_train.reset_index(drop=True)
df_valid.dropna(inplace=True)
df_valid = df_valid.reset_index(drop=True)
df_test.dropna(inplace=True)
df_test = df_test.reset_index(drop=True)

df_train

Unnamed: 0,Command,Subject,Target
0,get me the fork,fork,me
1,Bring me the soap,soap,me
2,give me the apple,apple,me
3,pass me the fork.,fork,me
4,Hand me the apple.,apple,me
...,...,...,...
380,Go to the bathroom cabinet and bring me the it...,item,"cabinet, me, right, melon"
381,When you see the person raising their right ar...,"arm, melon","bedroom, person"
382,go to the bathtub and find the small dish and ...,"dish, it","bathtub, Robin, cabinet"
383,"Head to the counter, locate the pickles, and b...","pickles, them","counter, Hayden, drawer"


In [111]:
df_train = df_train.replace({r'\r|\n': ''}, regex=True)
df_valid = df_valid.replace({r'\r|\n': ''}, regex=True)
df_test = df_test.replace({r'\r|\n': ''}, regex=True)

In [70]:
# df_train.drop('id', axis=1, inplace=True)
# df_test.drop('id', axis=1, inplace=True)
# df_valid.drop('id', axis=1, inplace=True)

In [112]:
df_train.head()

Unnamed: 0,Command,Subject,Target
0,get me the fork,fork,me
1,Bring me the soap,soap,me
2,give me the apple,apple,me
3,pass me the fork.,fork,me
4,Hand me the apple.,apple,me


In [72]:
# df_train = df_train[df_train['logical_form'].str.contains('bring|put', na=False)].reset_index()
# df_test = df_test[df_test['logical_form'].str.contains('bring|put', na=False)].reset_index()
# df_valid = df_valid[df_valid['logical_form'].str.contains('bring|put', na=False)].reset_index()

## Intent ID Lookup

In order to input the intent labels into our model we will need to **Create a dictionary mapping each intent name to an integer ID (starting with 0), and assign the dictionary to intent_labeltoid**.

In [113]:
#ograniczamy się do bring i put

intents = df_train['Target'].unique().tolist()
intents_valid = df_valid['Target'].unique().tolist()
intents_test = df_test['Target'].unique().tolist()

intents = [*intents, *intents_valid, *intents_test]

#usuwanie duplikatów
intents = list(set(intents))

intent_labeltoid = {intents[i]: i  for i in range(len(intents))}
intent_labeltoid

{'me, cereal': 0,
 'bedroom, sink': 1,
 'person, pointing': 2,
 'MICROWAVE, TO, ALEX, TABLE': 3,
 'me, armchair': 4,
 'bathroom, person, left': 5,
 'cupboard, him, stove': 6,
 'me, top, cookies, shower': 7,
 'cabinet, bar': 8,
 'me, dresser, apple': 9,
 'table, banana, me': 10,
 'nightstand': 11,
 'me, side, counter': 12,
 'me, napkin, armchair': 13,
 'me, couch, tv': 14,
 'drawer': 15,
 'stove, Robin': 16,
 'chair, him': 17,
 'me, bathroom': 18,
 'drawer, bathroom': 19,
 'table, dishwasher': 20,
 'me, desk': 21,
 'microwave, bathtub': 22,
 'human, right, bathroom': 23,
 'me, chips, chair': 24,
 'me, table': 25,
 'dresser': 26,
 'me, spoon': 27,
 'table, Tracy, bed': 28,
 'knife, top, machine, me': 29,
 'bathroom': 30,
 'machine, choco': 31,
 'me, nightstand': 32,
 'bed': 33,
 'sofa': 34,
 'Jordan, nightstand': 35,
 'for, them': 36,
 'rack, shower': 37,
 'me, counter': 38,
 'me, top, tea, airchair': 39,
 'dresser, me': 40,
 'microwave, me': 41,
 'Michael': 42,
 'me, top, machine': 43,


In [114]:
df_train_list = df_train.values.tolist()
df_test_list = df_test.values.tolist()
df_valid_list = df_valid.values.tolist()
for lista in [df_train_list, df_test_list, df_valid_list]:
    for item in lista:
        item.reverse()
        #do trenowania subject
        # item.pop(0)
        #do trenowania target
        item.pop(1)
df_valid_list

[['me', 'bring me the soap'],
 ['me', 'Bring me the apple.'],
 ['sofa', 'bring the pear to the sofa'],
 ['chair', 'Put a coke on the high chair.'],
 ['shower', 'Put the big dish on the shower.'],
 ['table', 'Move the apple to the side table'],
 ['sink', 'get the large dishes from the sink'],
 ['me, sink', 'Bring me the shampoo from the sink.'],
 ['microwave', 'Take the fork over to the microwave.'],
 ['cabinet, bar', 'take the bag from the cabinet to the bar'],
 ['cabiinet', 'Move the cereal to the bathroom cabiinet'],
 ['me, sofa', 'Bring me the lightest objects on the sofa.'],
 ['bed, drawer', 'relocate knife from bed to cutlery drawer.'],
 ['me, stove', 'Bring me the rightmost object on the stove'],
 ['table', 'Please take that melon to that coffee  table'],
 ['me, bathroom', 'bring me the biggest thing from the bathroom'],
 ['me', 'I need you to retrieve the tuna fish for me.'],
 ['tv, her', 'Find Robin near the tv and give her the sponge.'],
 ['table, room', 'take this melon to th

In [115]:
#How many training examples are there for each intent?
df_train['Target'].value_counts()

Target
me                         31
me, table                  14
table                       8
me, sink                    6
me, dresser                 6
                           ..
right, glass, couch, me     1
side, flakes, sofa, me      1
knife, top, machine, me     1
table, dishwasher           1
stove                       1
Name: count, Length: 234, dtype: int64

In [76]:
# def create_mini_training_set(examples_per_intent):
#     intent_array = np.array(df_train_list)[:,0]
#     mini_batch =[]
#     for intent in intents:
#         add = intent_array[intent_array==intent]
#         shuffled_indicies=np.random.RandomState(seed=42).permutation(len(add))
#         class_indicies=shuffled_indicies[:examples_per_intent]
#         sampled_set = np.array(df_train_list)[class_indicies]
#         mini_batch.append(sampled_set)
#     mini_batch = np.array(mini_batch)
#     mini_set = mini_batch.transpose(1,0,2).reshape(-1,mini_batch.shape[2])
#     return mini_set

In [116]:
import re

def get_pad_length():
    all_train_examples_sentences = df_train['Command']
    word_length = []
    for sentence in all_train_examples_sentences:
        number_words = len(re.findall(r'\b\w+\b|<\w+>', sentence))
        word_length.append(number_words)
    return max(word_length)

PAD_LEN = get_pad_length()

In [117]:
PAD_LEN

20

In [118]:
INTENT_DIM = 302

## BERT Tokenizer


In [119]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

test_utterance = "give <object> to me"

print(tokenizer.encode_plus(
            test_utterance, add_special_tokens=True, max_length=PAD_LEN, pad_to_max_length=True,
            truncation=True, return_attention_mask=True, return_tensors='pt'
    ))

{'input_ids': tensor([[ 101, 2507, 1026, 4874, 1028, 2000, 2033,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}




In [120]:
def examples_to_dataset(examples):
    input_ids = []
    attention_masks = []
    labels = []
    for instance in examples:
        token_dict = tokenizer.encode_plus(
                instance[1], add_special_tokens=True, max_length=PAD_LEN, pad_to_max_length=True,
                truncation=True, return_attention_mask=True, return_tensors='pt')
        input_ids.append(token_dict['input_ids'])
        attention_masks.append(token_dict['attention_mask'])
        labels.append(torch.tensor(intent_labeltoid[instance[0]]).type(torch.LongTensor))

    input_ids = torch.cat(input_ids)
    attention_masks = torch.cat(attention_masks)
    labels = torch.stack(labels)


    dataset = TensorDataset(input_ids, attention_masks, labels)

    return dataset

In [121]:
#prepare the validation/test dataloaders
val_dataset = examples_to_dataset(df_valid_list)
test_dataset = examples_to_dataset(df_test_list)
BATCH_SIZE = 50
validation_dataloader = DataLoader(val_dataset, sampler=RandomSampler(df_valid_list), batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(df_test_list), batch_size=BATCH_SIZE)

# Training Setup

BERT will output a matrix tensor of shape batch size by #intents, which consists of the intent probabilities for each utterance in the batch. In order to compute the accuracy of the predictions, this prediction matrix is compared with the tensor of correct label IDs (a tensor of size #intents).



**The method `accuracy()`, which takes in the predictions (shape batch size by #intents) as a numpy array, and the correct labels as a numpy array (shape #intents), and returns the floating point accuracy of those predictions in the range \[0,1\]**

In [122]:
def get_accuracy(preds, labels):
    pred_convd = np.argmax(preds,1).flatten()
    labels_flat = labels.flatten()
    correct_labels = np.equal(pred_convd,labels_flat).sum()
    accuracy_value = correct_labels/len(labels)
    return accuracy_value

In [123]:
# Quick tests for the implementation of accuracy.

preds1 = np.array([[1,2,3], [1,3,2], [3,2,1]])

assert get_accuracy(preds1, np.array([2,1,0])) == 1.0
assert get_accuracy(preds1, np.array([2,2,0])) == 2/3
assert get_accuracy(preds1, np.array([3,2,1])) == 0.0

In [124]:
count=0
for batch in tqdm(list(test_dataloader)):
    a,b,c= batch[0],batch[1],batch[2]
    count+=1
    if count ==1:
        break
print(a);print(b);print(c)
print(batch)
print(len(list(test_dataloader)))

  0%|          | 0/3 [00:00<?, ?it/s]

tensor([[  101,  2131,  2033,  1996,  5442,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  3288,  2033,  1996,  4524,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2507,  2033,  1996, 25742,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  3531,  3413,  2033,  1996, 14757,  1012,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2404,  1996, 24857,  2006,  1996,  2793,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  3288,  2033,  2019,  6207,  3531,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  3413,  2745,  1037,  5572, 15

In [125]:
def evaluate(model, dataloader):
    model.eval()

    accuracy=[]

    for batch in tqdm(list(dataloader)):
        b_input_ids,b_input_mask,b_labels = batch


        with torch.no_grad():
            (loss, logits) = model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels, return_dict=False)

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        #logit_probability =torch.nn.Softmax(logits)
        batch_accuracy = get_accuracy(logits, label_ids)
        accuracy.append(batch_accuracy)
    avg_accuracy = np.mean(accuracy)  # TODO Compute final accuracy
    print("Validation Accuracy: {}".format(avg_accuracy))
    return avg_accuracy


- The input IDs, input mask, and labels are obtained from the dataloader. These inputs are passed through the model to get a prediction. After which the loss is computed for each batch.
- Thus the batch losses over time is monitored to compute the average training loss for each epoch. After every 5 batches,the validation accuracy is computed.

In [126]:

def train(model, dataloader, epochs):
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    for epoch_i in range(0, EPOCHS):
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

        model.train()
        #n_iteration = 0
        accuracy = []
        total_train_loss = []

        for step, batch in tqdm(list(enumerate(train_dataloader))):
            # get input IDs, input mask, and labels from batch
            b_input_ids,b_input_mask,b_labels = batch

            model.zero_grad()
            #pass inputs through model
            (loss, logits) = model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels, return_dict=False)
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            # Add to total_train_loss
            total_train_loss.append(loss)
            #logit_probability =torch.nn.Softmax(logits)
            batch_accuracy = get_accuracy(logits, label_ids)
            accuracy.append(batch_accuracy)
            #n_iteration += 1
        # Compute average train loss
        new_loss = [x.cpu().detach().numpy() for x in total_train_loss]
        avg_train_loss = np.mean(new_loss)
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Average Training accuracy: {0:.2f}".format(np.mean(accuracy)))
    #validation_accuracy =evaluate(bert_model, validation_dataloader)



In [127]:
BATCH_SIZE = 16

EPOCHS = 10


train_dataset = examples_to_dataset(df_train_list)
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=BATCH_SIZE)


bert_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = INTENT_DIM,
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

train(bert_model, train_dataloader, EPOCHS)

print("Evaluating on test set:")
print("Test accuracy:", evaluate(bert_model, test_dataloader))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.






  0%|          | 0/25 [00:00<?, ?it/s]

  Average training loss: 5.75
  Average Training accuracy: 0.01


  0%|          | 0/25 [00:00<?, ?it/s]

  Average training loss: 5.51
  Average Training accuracy: 0.07


  0%|          | 0/25 [00:00<?, ?it/s]

  Average training loss: 5.28
  Average Training accuracy: 0.12


  0%|          | 0/25 [00:00<?, ?it/s]

  Average training loss: 5.13
  Average Training accuracy: 0.13


  0%|          | 0/25 [00:00<?, ?it/s]

  Average training loss: 5.10
  Average Training accuracy: 0.10


  0%|          | 0/25 [00:00<?, ?it/s]

  Average training loss: 5.02
  Average Training accuracy: 0.12


  0%|          | 0/25 [00:00<?, ?it/s]

  Average training loss: 4.92
  Average Training accuracy: 0.12


  0%|          | 0/25 [00:00<?, ?it/s]

  Average training loss: 4.89
  Average Training accuracy: 0.14


  0%|          | 0/25 [00:00<?, ?it/s]

  Average training loss: 4.78
  Average Training accuracy: 0.17


  0%|          | 0/25 [00:00<?, ?it/s]

  Average training loss: 4.80
  Average Training accuracy: 0.14
Evaluating on test set:


  0%|          | 0/3 [00:00<?, ?it/s]

Validation Accuracy: 0.07333333333333333
Test accuracy: 0.07333333333333333
