## Part 2 : Text classification

## 0. Initialization

In [None]:
import os
import torch
import numpy as np
import pandas as pd
from utils import *
import torch.nn as nn
from tqdm import tqdm
import torch.optim as optim
from model.classifier import Classifier
from torch.utils.data import DataLoader
from dataset.PhraseDataset import PhraseDataset
from transformers import BertModel, BertTokenizer, BertForSequenceClassification

%load_ext autoreload
%autoreload 2

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Root folders of the notebook; the trailing slash is kept because paths
# are built from these constants by plain string concatenation
DATA_DIR = './data/'
RESULTS_DIR = './results/'
# The cached sentence embeddings live inside the data folder
EMBEDDINGS_DIR = f"{DATA_DIR}embeddings/"

In [None]:
# Load the recordings overview table; the columns read by this notebook are
# split, file_name, phrase, prompt and label
record_df = pd.read_csv(f"{DATA_DIR}overview-of-recordings-label.csv")

## 1. Embeddings

> The objective in this section is to leverage the BERT tokenizer and model to generate embeddings for the text phrases. Subsequently, a custom classifier will be employed to predict the label for each phrase.

In [None]:
# The encoder and its tokenizer are loaded from the same pretrained checkpoint
model = BertModel.from_pretrained('bert-large-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

#### 1.1 Create embeddings

In [None]:
# Create the embeddings for the train, validate and test sets if they don't exist.
# The existence check is done per file (not per directory), so a run that was
# interrupted half-way is completed on the next execution instead of silently
# leaving an incomplete embeddings folder behind.

# Create the output directories if they don't exist
for split in ['train', 'validate', 'test']:
    os.makedirs(os.path.join(EMBEDDINGS_DIR, split), exist_ok=True)

# Inference only: switch to evaluation mode once, before the loop
model.eval()

num_created = 0
for _, row in tqdm(record_df.iterrows(), total=len(record_df)):
    # Save the embeddings in a folder structure that is similar to the original data
    save_path = os.path.join(EMBEDDINGS_DIR, row.split, row.file_name.replace(".wav", ".pt"))
    if os.path.exists(save_path):
        # Already computed by a previous run
        continue

    # NOTE(review): tokenize() does not add the [CLS]/[SEP] special tokens.
    # This is kept as-is so that new embeddings stay comparable with the ones
    # already on disk; confirm whether that is intended.
    tokenized_text = tokenizer.tokenize(row.phrase)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    tokens_tensor = torch.tensor([indexed_tokens])
    # Every token is a real token (there is no padding), so the mask is all ones.
    # It is passed by keyword because the second positional argument of the
    # model is the attention mask, not the segment ids.
    attention_mask = torch.ones_like(tokens_tensor)

    # Get hidden states
    with torch.no_grad():
        outputs = model(tokens_tensor, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

    # Take the mean of the token embeddings over the whole sentence
    sentence_embedding = torch.mean(hidden_states[0], dim=0)

    # !! NOTE: the label is appended at the end of the embedding. For
    # bert-large-uncased the saved vector therefore holds 1024 embedding values
    # (768 for the base model) followed by 1 value for the label.
    # The label is created with the embedding dtype so that the concatenation
    # does not depend on implicit type promotion.
    label = torch.tensor([row.label], dtype=sentence_embedding.dtype)
    sentence_embedding_with_label = torch.cat((sentence_embedding, label))

    # Save the embedding
    torch.save(sentence_embedding_with_label, save_path)
    num_created += 1

if num_created == 0:
    print("Embeddings already exist")

#### 1.2 Load embeddings

In [None]:
# Load the cached embeddings and labels of each split from its own sub-folder
(train_embeddings, train_labels), (valid_embeddings, valid_labels), (test_embeddings, test_labels) = (
    load_embeddings(os.path.join(EMBEDDINGS_DIR, split_name))
    for split_name in ("train", "validate", "test")
)

# Report the shapes of what was loaded, one line per split
for _title, _embeddings, _labels in (
    ("Train", train_embeddings, train_labels),
    ("Valid", valid_embeddings, valid_labels),
    ("Test", test_embeddings, test_labels),
):
    print(f"{_title} embeddings shape: ", _embeddings.shape, f"{_title} labels shape: ", _labels.shape)

#### 1.3 Initialize classifier and learning

In [None]:
NUM_EPOCH = 5000
lr = 0.0001
embeddings_trained = False

# Create the classifier from the second dimension of the training embeddings
# and the number of distinct prompts
# (presumably the input size and the number of classes; see model/classifier.py)
classifier = Classifier(train_embeddings.shape[1], len(record_df.prompt.unique())).to(device)

# Reuse the weights of a previous run when they are available
if os.path.exists(RESULTS_DIR + 'embeddings.pt'):
    embeddings_trained = True
    # map_location=device lets a checkpoint saved on a GPU be loaded on a
    # CPU-only machine (and the other way around)
    classifier.load_state_dict(torch.load(RESULTS_DIR + 'embeddings.pt', map_location=device))
    print("Loaded classifier from disk")

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=lr)

#### 1.4 Train classifier

In [None]:
# Training is skipped when a saved classifier was loaded from disk
if not embeddings_trained:
    train_losses, valid_losses, valid_accs = train_classifier(
        classifier,
        criterion,
        optimizer,
        NUM_EPOCH,
        train_embeddings,
        train_labels,
        valid_embeddings,
        valid_labels,
        device,
    )

#### 1.5 Test classifier

In [None]:
# Evaluate the classifier on the held-out test embeddings; the two returned
# values are saved below as test_acc.npy and cm.npy
test_acc, cm = test_classifier(
    classifier,
    test_embeddings,
    test_labels,
    device,
)

#### 1.6 Save results

In [None]:
# Make sure the output folders exist BEFORE anything is written: creating
# RESULTS_DIR/embeddings also creates RESULTS_DIR itself, so saving the model
# cannot fail on a fresh checkout after the training has finished.
os.makedirs(RESULTS_DIR + "embeddings", exist_ok=True)

if not embeddings_trained:
    # Save the model
    torch.save(classifier.state_dict(), RESULTS_DIR + "embeddings.pt")

    # Save the losses and accuracies as numpy arrays
    # (they only exist when the training cell actually ran)
    np.save(RESULTS_DIR + "embeddings/tr_losses.npy", np.array(train_losses))
    np.save(RESULTS_DIR + "embeddings/val_accs.npy", np.array(valid_accs))
    np.save(RESULTS_DIR + "embeddings/val_losses.npy", np.array(valid_losses))

# The test results are always (re)written, even for a model loaded from disk
np.save(RESULTS_DIR + "embeddings/test_acc.npy", np.array(test_acc))
np.save(RESULTS_DIR + "embeddings/cm.npy", np.array(cm))

## 2. BERT

> The goal in this section is to classify the phrases directly by fine-tuning a well-known pre-trained transformer model: BERT.

In [None]:
# Tokenizer and sequence-classification model from the same pretrained
# checkpoint; the classification head gets one output per distinct prompt
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-large-uncased",
    num_labels=len(record_df.prompt.unique()),
)
model = model.to(device)

#### 2.1 Load dataset

In [None]:
# max_seq_length is handed to PhraseDataset
# (presumably the padding/truncation length; see dataset/PhraseDataset.py)
max_seq_length = 37
batch_size = 256

# One sub-frame per split, selected through the "split" column
train_df, valid_df, test_df = (
    record_df[record_df.split == split_name]
    for split_name in ('train', 'validate', 'test')
)


def _as_dataset(split_df):
    """Wrap the phrases and labels of one split in a PhraseDataset."""
    return PhraseDataset(list(split_df.phrase), split_df.label.values, tokenizer, max_seq_length, device)


train_dataset = _as_dataset(train_df)
valid_dataset = _as_dataset(valid_df)
test_dataset = _as_dataset(test_df)

# Only the training data is shuffled; validation and test keep their order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader, test_loader = (
    DataLoader(dataset, batch_size=batch_size, shuffle=False)
    for dataset in (valid_dataset, test_dataset)
)

#### 2.2 Initialize learning

In [None]:
NUM_EPOCH = 15
lr = 1e-5
large_bert_trained = False

# Reuse the fine-tuned weights of a previous run when they are available
if os.path.exists(RESULTS_DIR + 'large_bert.pt'):
    large_bert_trained = True
    # map_location=device lets a checkpoint saved on a GPU be loaded on a
    # CPU-only machine (and the other way around)
    model.load_state_dict(torch.load(RESULTS_DIR + 'large_bert.pt', map_location=device))
    print("Loaded large bert from disk")

# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=lr)

#### 2.3 Train model

In [None]:
# Fine-tuning is skipped when saved weights were loaded from disk
if not large_bert_trained:
    train_losses, valid_losses, valid_accs = train_transformer(
        model,
        optimizer,
        NUM_EPOCH,
        train_loader,
        valid_loader,
        device,
    )

#### 2.4 Test model

In [None]:
# Evaluate the model on the test loader; the two returned values are saved
# below as test_acc.npy and cm.npy
test_acc, cm = test_transformer(
    model,
    test_loader,
    device,
)

#### 2.5 Save results

In [None]:
# Make sure the output folders exist BEFORE anything is written: creating
# RESULTS_DIR/large_bert also creates RESULTS_DIR itself, so saving the model
# cannot fail on a fresh checkout after the fine-tuning has finished.
os.makedirs(RESULTS_DIR + "large_bert", exist_ok=True)

if not large_bert_trained:
    # Save the model
    torch.save(model.state_dict(), RESULTS_DIR + "large_bert.pt")

    # Save the losses and accuracies as numpy arrays
    # (they only exist when the training cell actually ran)
    np.save(RESULTS_DIR + "large_bert/tr_losses.npy", np.array(train_losses))
    np.save(RESULTS_DIR + "large_bert/val_accs.npy", np.array(valid_accs))
    np.save(RESULTS_DIR + "large_bert/val_losses.npy", np.array(valid_losses))

# The test results are always (re)written, even for a model loaded from disk
np.save(RESULTS_DIR + "large_bert/test_acc.npy", np.array(test_acc))
np.save(RESULTS_DIR + "large_bert/cm.npy", np.array(cm))