In [1]:
# Install Hugging Face Transformers, which provides the BertTokenizer and BertModel classes imported below.
!pip install transformers



In [2]:
# Fetch the dataset repository into ./data (relative to the current working directory).
# NOTE(review): later cells read from /kaggle/working/data, so this presumably runs with
# /kaggle/working as the working directory — confirm.
!git clone https://github.com/itsZiang/data.git

Cloning into 'data'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 11 (delta 2), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (11/11), 3.46 MiB | 4.74 MiB/s, done.
Resolving deltas: 100% (2/2), done.


In [3]:
import os
import pandas as pd
import numpy as np
import shutil
import sys
import tqdm.notebook as tq
from collections import defaultdict

import torch
import torch.nn as nn

from transformers import BertTokenizer, BertModel

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
import json

# Function to load the list of unique acts from the text file
def load_acts_from_txt(file_path):
    """Read the act names listed one per line in a text file.

    Args:
        file_path: Path to a text file containing one act name per line.

    Returns:
        list[str]: The act names in file order, with surrounding whitespace
        removed and blank lines skipped.
    """
    # An explicit encoding keeps the result independent of the platform's
    # default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        # A blank line would otherwise become an empty-named label that no
        # utterance can ever match, i.e. an always-zero target column.
        return [name for name in stripped_lines if name]

# Function to convert a list of acts to a one-hot encoded vector
def convert_to_one_hot(acts, all_acts):
    """Encode the acts of one utterance as a 0/1 vector over ``all_acts``.

    Args:
        acts: Iterable of act names attached to the utterance.
        all_acts: Ordered sequence of every known act name; position ``i`` of
            the result corresponds to ``all_acts[i]``.

    Returns:
        list[int]: ``len(all_acts)`` zeros and ones. An act that does not
        appear in ``all_acts`` is ignored; if a name occurs more than once in
        ``all_acts`` only its first position is set.
    """
    encoded = [0 for _ in all_acts]
    for name in acts:
        try:
            position = all_acts.index(name)
        except ValueError:
            # Unknown act: leave the vector untouched
            continue
        encoded[position] = 1
    return encoded

# Function to load and process the JSON files into DataFrames
def load_json_data(json_path, all_acts):
    """Load an act-detection JSON file into a one-row-per-utterance DataFrame.

    The file is read as a sequence of entries, each indexed by the keys
    ``"utterance"`` and ``"acts"``.

    Args:
        json_path: Path to the JSON file.
        all_acts: Ordered sequence of every known act name; it defines the
            label columns and their order.

    Returns:
        pandas.DataFrame: Column ``"utterance"`` followed by one 0/1 column
        per entry of ``all_acts``.

    Raises:
        KeyError: If a dict entry lacks the ``"utterance"`` or ``"acts"`` key.
    """
    # Read as UTF-8 explicitly so non-ASCII utterances decode the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # One row per entry: the utterance text followed by its 0/1 act vector
    rows = [
        [entry["utterance"]] + convert_to_one_hot(entry["acts"], all_acts)
        for entry in json_data
    ]

    # list(...) lets all_acts be any sequence, not only a list
    column_names = ["utterance"] + list(all_acts)
    return pd.DataFrame(rows, columns=column_names)

# Locations of the act vocabulary and of the three dataset splits
acts_txt_path = '/kaggle/working/data/acts_name.txt'
train_json_path = '/kaggle/working/data/data_act_detection_train.json'
test_json_path = '/kaggle/working/data/data_act_detection_test.json'
dev_json_path = '/kaggle/working/data/data_act_detection_dev.json'

# The act vocabulary fixes the label columns shared by every split
all_acts = load_acts_from_txt(acts_txt_path)

# Build one DataFrame per split, loaded in train / test / dev order
df_train, df_test, df_valid = (
    load_json_data(split_path, all_acts)
    for split_path in (train_json_path, test_json_path, dev_json_path)
)

In [5]:
# Sanity check: (rows, columns) of each split; the columns are "utterance" plus one per act.
print(f"Train: {df_train.shape}, Test: {df_test.shape}, Valid: {df_valid.shape}")

Train: (113552, 37), Test: (14744, 37), Valid: (14748, 37)


In [6]:
# Lift pandas' column-count and line-width display limits (set globally for the session)
# so that every label column is shown in the preview.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# Preview the first row of the validation split
df_valid.head(1)

Unnamed: 0,utterance,Booking-NoBook,Police-Request,Attraction-Inform,Booking-Inform,general-greet,Hospital-Inform,Hotel-Select,general-thank,Train-Request,Train-OfferBooked,Hotel-Recommend,Train-OfferBook,Restaurant-NoOffer,Hospital-Request,Booking-Request,Attraction-Select,Restaurant-Recommend,general-reqmore,Attraction-Recommend,Taxi-Inform,Taxi-Request,general-welcome,general-bye,Train-Inform,Hotel-NoOffer,Hotel-Inform,Train-NoOffer,Restaurant-Select,Hotel-Request,Attraction-NoOffer,Police-Inform,Attraction-Request,Restaurant-Inform,Booking-Book,Restaurant-Request,Train-Select
0,I'm looking for a local place to dine in the c...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [7]:
# Hyperparameters
MAX_LEN = 64  # token length each utterance is padded/truncated to by the dataset's tokenizer call
TRAIN_BATCH_SIZE = 64  # batch size of the training DataLoader
VALID_BATCH_SIZE = 32  # batch size of the validation DataLoader
TEST_BATCH_SIZE = 32  # batch size of the test DataLoader
EPOCHS = 6  # number of full passes over the training split
LEARNING_RATE = 1e-05
THRESHOLD = 0.5 # threshold for the sigmoid

In [8]:
# Tokenizer for the 'bert-base-uncased' checkpoint (the same checkpoint name the model class loads)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [9]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenized utterances with their multi-label targets.

    Tokenization happens lazily, one utterance per ``__getitem__`` call.
    """

    def __init__(self, df, tokenizer, max_len, target_list):
        """Store the frame, the tokenizer and the extracted texts/targets.

        Args:
            df: DataFrame with an ``'utterance'`` column and one column per
                name in ``target_list``.
            tokenizer: Object exposing ``encode_plus`` (called with the
                keyword arguments used in ``__getitem__``).
            max_len: Value passed to the tokenizer as ``max_length``.
            target_list: Names of the label columns, in output order.
        """
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.utterance = df['utterance'].tolist()
        self.targets = df[target_list].to_numpy()

    def __len__(self):
        """Return the number of utterances."""
        return len(self.utterance)

    def __getitem__(self, index):
        """Tokenize utterance ``index`` and return it with its target vector.

        Returns:
            dict: ``'input_ids'``, ``'attention_mask'`` and
            ``'token_type_ids'`` (each flattened to 1-D), ``'targets'`` (float
            tensor of the row's labels) and ``'utterance'`` (the
            whitespace-normalized text).
        """
        # Collapse every run of whitespace (newlines included) to one space
        text = " ".join(str(self.utterance[index]).split())

        encoded = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Flatten each encoded tensor to 1-D (with return_tensors='pt' the
        # tokenizer output presumably carries a leading batch axis — confirm).
        sample = {
            key: encoded[key].reshape(-1)
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        sample['targets'] = torch.FloatTensor(self.targets[index])
        sample['utterance'] = text
        return sample

In [10]:
# The label columns are the act names; this binds the same list object as all_acts (no copy).
target_list = all_acts
# Display the label names
target_list

['Booking-NoBook',
 'Police-Request',
 'Attraction-Inform',
 'Booking-Inform',
 'general-greet',
 'Hospital-Inform',
 'Hotel-Select',
 'general-thank',
 'Train-Request',
 'Train-OfferBooked',
 'Hotel-Recommend',
 'Train-OfferBook',
 'Restaurant-NoOffer',
 'Hospital-Request',
 'Booking-Request',
 'Attraction-Select',
 'Restaurant-Recommend',
 'general-reqmore',
 'Attraction-Recommend',
 'Taxi-Inform',
 'Taxi-Request',
 'general-welcome',
 'general-bye',
 'Train-Inform',
 'Hotel-NoOffer',
 'Hotel-Inform',
 'Train-NoOffer',
 'Restaurant-Select',
 'Hotel-Request',
 'Attraction-NoOffer',
 'Police-Inform',
 'Attraction-Request',
 'Restaurant-Inform',
 'Booking-Book',
 'Restaurant-Request',
 'Train-Select']

In [11]:
# Wrap each split in a CustomDataset sharing the tokenizer, length limit and label columns
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(split_frame, tokenizer, MAX_LEN, target_list)
    for split_frame in (df_train, df_valid, df_test)
)

In [12]:
# Only the training split is shuffled; validation and test are read in dataset order.
train_data_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    num_workers=4,
    shuffle=True,
)

val_data_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    num_workers=4,
    shuffle=False,
)

test_data_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=TEST_BATCH_SIZE,
    num_workers=4,
    shuffle=False,
)

In [13]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear head, one logit per label.

    The head returns raw logits; no sigmoid is applied inside the model.

    Args:
        num_labels: Number of output logits. Defaults to 36, the number of
            acts this notebook trains on.
        pretrained_name: Checkpoint name or path passed to
            ``BertModel.from_pretrained``.
        dropout: Dropout probability applied to the pooled encoder output.
    """

    def __init__(self, num_labels=36, pretrained_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(pretrained_name, return_dict=True)
        self.dropout = torch.nn.Dropout(dropout)
        # Size the head from the encoder's own configuration instead of
        # hard-coding 768, so other checkpoints work without edits.
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return the label logits for a batch of tokenized utterances.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attn_mask: Attention mask passed as ``attention_mask``.
            token_type_ids: Segment id tensor passed to the encoder.

        Returns:
            torch.Tensor: Output of the linear head applied to the
            dropout-regularized ``pooler_output``.
        """
        encoded = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        pooled = self.dropout(encoded.pooler_output)
        return self.linear(pooled)

# Instantiate the classifier (this loads the pretrained encoder weights)
model = BERTClass()

# Move the parameters to the selected device; as the cell's last expression
# this also displays the module structure.
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [14]:
def loss_fn(outputs, targets):
    """Binary cross-entropy between raw logits and 0/1 targets.

    Args:
        outputs: Logits (no sigmoid applied); same shape as ``targets``.
        targets: Float tensor of 0/1 labels.

    Returns:
        torch.Tensor: Scalar loss from ``BCEWithLogitsLoss`` with its default
        settings.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [15]:

# ``transformers.AdamW`` is deprecated and no longer shipped by recent
# transformers releases, so import the PyTorch implementation under the same name.
from torch.optim import AdamW

# define the optimizer
# eps and weight_decay are pinned to the values the transformers optimizer
# defaulted to (1e-6 and 0.0); torch's own defaults are 1e-8 and 0.01.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)



In [16]:
# Train the model for a single epoch
def train_model(training_loader, model, optimizer):
    """Run one optimization pass over ``training_loader``.

    Args:
        training_loader: Sized iterable of batches; each batch is a mapping
            with 'input_ids', 'attention_mask', 'token_type_ids' and
            'targets' tensors.
        model: Module called as ``model(ids, mask, token_type_ids)`` and
            returning logits.
        optimizer: Optimizer stepped once per batch.

    Returns:
        tuple: ``(model, accuracy, mean_loss)`` where ``accuracy`` is the
        fraction of individual label entries whose rounded sigmoid output
        equals the target, and ``mean_loss`` is the mean of the per-batch
        losses.
    """
    batch_losses = []
    n_correct = 0
    n_elements = 0

    # Training mode (enables dropout)
    model.train()

    # Progress bar over the batches
    progress = tq.tqdm(
        enumerate(training_loader),
        total=len(training_loader),
        leave=True,
        colour='steelblue',
    )
    for _, batch in progress:
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        labels = batch['targets'].to(device, dtype=torch.float)

        # Forward pass: one logit per (sample, label)
        logits = model(ids, mask, token_type_ids)
        loss = loss_fn(logits, labels)
        batch_losses.append(loss.item())

        # Training accuracy: sigmoid, then round to 0/1, compared element-wise
        predicted = torch.sigmoid(logits).detach().cpu().numpy().round()
        expected = labels.detach().cpu().numpy()
        n_correct += (predicted == expected).sum()
        n_elements += expected.size  # every entry of the 2-D label array counts

        # Backward pass with gradient-norm clipping, then the update step
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    return model, float(n_correct) / n_elements, np.mean(batch_losses)

In [17]:
def eval_model(validation_loader, model, optimizer=None):
    """Evaluate ``model`` on every batch of ``validation_loader`` without gradients.

    Args:
        validation_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask', 'token_type_ids' and 'targets'
            tensors.
        model: Module called as ``model(ids, mask, token_type_ids)`` and
            returning logits.
        optimizer: Unused. Accepted (and optional) only so that existing
            three-argument calls keep working.

    Returns:
        tuple: ``(accuracy, mean_loss)`` where ``accuracy`` is the fraction of
        individual label entries whose rounded sigmoid output equals the
        target, and ``mean_loss`` is the mean of the per-batch losses.
    """
    losses = []
    correct_predictions = 0
    num_samples = 0

    # Evaluation mode (disables dropout)
    model.eval()

    with torch.no_grad():
        for data in validation_loader:
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            outputs = model(ids, mask, token_type_ids)
            losses.append(loss_fn(outputs, targets).item())

            # The model emits logits (the loss applies the sigmoid itself), so
            # apply sigmoid here and round to get 0/1 predictions.
            predicted = torch.sigmoid(outputs).cpu().numpy().round()
            expected = targets.cpu().numpy()
            correct_predictions += np.sum(predicted == expected)
            num_samples += expected.size  # every entry of the 2-D label array counts

    return float(correct_predictions) / num_samples, np.mean(losses)

In [18]:
!mkdir /kaggle/working/output

In [19]:
# Root directory; the best-model checkpoint is written to and read from <data_dir>/output/best_model.bin.
data_dir = "/kaggle/working"

In [20]:
# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc', 'val_loss'
history = defaultdict(list)
# Highest validation accuracy seen so far; used to decide when to checkpoint
best_accuracy = 0

for epoch in range(1, EPOCHS+1):
    print(f'Epoch {epoch}/{EPOCHS}')
    # One training pass, then an evaluation pass over the validation split
    model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
    val_acc, val_loss = eval_model(val_data_loader, model, optimizer)

    print(f'train_loss={train_loss:.4f}, val_loss={val_loss:.4f} train_acc={train_acc:.4f}, val_acc={val_acc:.4f}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    # Save the weights whenever validation accuracy strictly improves,
    # overwriting the previous best checkpoint
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), os.path.join(data_dir,"output","best_model.bin"))
        best_accuracy = val_acc

Epoch 1/6


  0%|          | 0/1775 [00:00<?, ?it/s]

train_loss=0.1331, val_loss=0.0577 train_acc=0.9658, val_acc=0.9849
Epoch 2/6


  0%|          | 0/1775 [00:00<?, ?it/s]

train_loss=0.0486, val_loss=0.0363 train_acc=0.9871, val_acc=0.9898
Epoch 3/6


  0%|          | 0/1775 [00:00<?, ?it/s]

train_loss=0.0346, val_loss=0.0295 train_acc=0.9898, val_acc=0.9909
Epoch 4/6


  0%|          | 0/1775 [00:00<?, ?it/s]

train_loss=0.0288, val_loss=0.0269 train_acc=0.9911, val_acc=0.9913
Epoch 5/6


  0%|          | 0/1775 [00:00<?, ?it/s]

train_loss=0.0256, val_loss=0.0257 train_acc=0.9917, val_acc=0.9914
Epoch 6/6


  0%|          | 0/1775 [00:00<?, ?it/s]

train_loss=0.0234, val_loss=0.0255 train_acc=0.9923, val_acc=0.9914


In [21]:
# Rebuild the architecture and restore the best checkpoint saved during training
model = BERTClass()
# map_location lets a checkpoint saved from the GPU load on a CPU-only machine;
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict needs and avoids torch.load's FutureWarning.
model.load_state_dict(
    torch.load(
        os.path.join(data_dir, "output", "best_model.bin"),
        map_location=device,
        weights_only=True,
    )
)
model = model.to(device)


  model.load_state_dict(torch.load(os.path.join(data_dir,"output","best_model.bin")))


In [22]:
# Evaluate the restored model on the test split
test_acc, test_loss = eval_model(test_data_loader, model, optimizer)

In [23]:
# Element-wise accuracy: the fraction of individual (utterance, act) label entries
# predicted correctly, not the fraction of utterances whose whole label vector is right.
test_acc

0.9913599505636944

In [24]:
from sklearn.metrics import confusion_matrix, classification_report

In [25]:
def get_predictions(model, data_loader):
    """Run the model over a data loader and collect its predictions.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)`` and
            returning logits.
        data_loader: Iterable of batches; each batch is a mapping with
            'utterance', 'input_ids', 'attention_mask', 'token_type_ids' and
            'targets' entries.

    Returns:
        tuple: ``(utterances, predictions, prediction_probs, target_values)``

        - ``utterances``: list of the batch 'utterance' items, in loader order.
        - ``predictions``: CPU tensor of the sigmoid outputs rounded to 0/1.
        - ``prediction_probs``: CPU tensor of the sigmoid outputs.
        - ``target_values``: CPU tensor of the targets.

        The three tensors are the per-batch results concatenated along the
        first axis (assumes the model output is (batch, labels) — confirm).
    """
    model = model.eval()

    utterances = []
    pred_batches = []
    prob_batches = []
    target_batches = []

    with torch.no_grad():
        for data in data_loader:
            ids = data["input_ids"].to(device, dtype=torch.long)
            mask = data["attention_mask"].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data["targets"].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)
            # The model emits logits (the loss applies the sigmoid itself), so
            # apply sigmoid here to obtain probabilities.
            probs = torch.sigmoid(logits).detach().cpu()

            utterances.extend(data["utterance"])
            # Rounding thresholds the probabilities at 0.5
            pred_batches.append(probs.round())
            prob_batches.append(probs)
            target_batches.append(targets.detach().cpu())

    # Concatenate whole batches rather than stacking row-by-row tensors
    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    target_values = torch.cat(target_batches)

    return utterances, predictions, prediction_probs, target_values

In [26]:
# Collect the test-split texts, 0/1 predictions, probabilities and targets for reporting
utterances, predictions, prediction_probs, target_values = get_predictions(model, test_data_loader)


In [27]:
# Per-act precision / recall / F1 on the test split. Acts with zero support in this
# split (e.g. Police-Request in the output below) are reported as 0.00, and
# scikit-learn emits warnings for them.
print(classification_report(target_values, predictions, target_names=target_list))

                      precision    recall  f1-score   support

      Booking-NoBook       0.98      0.96      0.97       131
      Police-Request       0.00      0.00      0.00         0
   Attraction-Inform       0.89      0.91      0.90      1522
      Booking-Inform       0.94      0.89      0.92       564
       general-greet       1.00      0.01      0.02       240
     Hospital-Inform       0.00      0.00      0.00         0
        Hotel-Select       0.64      0.70      0.67        80
       general-thank       0.98      0.94      0.96       940
       Train-Request       0.90      0.90      0.90      1077
   Train-OfferBooked       0.92      0.81      0.86       297
     Hotel-Recommend       0.77      0.63      0.69       140
     Train-OfferBook       0.92      0.85      0.88       380
  Restaurant-NoOffer       0.91      0.91      0.91       111
    Hospital-Request       0.00      0.00      0.00         0
     Booking-Request       0.94      0.94      0.94       321
   Attr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
