In [1]:
import os 
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoConfig
from torch.utils.data import Dataset, DataLoader, random_split
import pickle
import whisper
from tqdm import tqdm

# Speech to emotion

For this part of the project we used the dataset "IEMOCAP: Interactive emotional dyadic motion capture database."
The dataset contains conversations audio, transcription, video and motion-capture.
We discarded the transcription, video and motion-capture features and we used only the audio.

We performed a transcription of the audio using Whisper.

In [4]:
speech_to_text = whisper.load_model("base.en", in_memory=True)
def parse_audio(path):
    audio = whisper.load_audio(path)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(speech_to_text.device)
    options = whisper.DecodingOptions(fp16=False, language="en")
    result = whisper.decode(speech_to_text, mel, options)
    text = result.text
    return [text, mel]

In [3]:
sessions = ["Session1","Session2","Session3","Session4", "Session5"]
missing_sessions = []
for i in sessions:
    if not os.path.exists(f"./{i}.pkl"):
        missing_sessions.append(i)

for i in missing_sessions:
    dir = f"./{i}/dialog/EmoEvaluation/"
    files = os.listdir(dir)
    files_dictionary = {}
    for i in os.listdir(dir):
        if i.endswith(".txt"):
            file = open(dir + i, "r")
            for line in file:
                if line.startswith("["):
                    line = line.split()
                    path = "_".join(line[3].split("_")[:-1])
                    if path in files_dictionary:
                        files_dictionary[path][line[3]] = [line[4]]
                    else:
                        files_dictionary[path] = {line[3]: [line[4]]}
            file.close()
    audio_path = f"./{i}/sentences/wav/"
    for key in tqdm(files_dictionary):
        for file in files_dictionary[key]:
            path = audio_path + key + "/" + file + ".wav"
            files_dictionary[key][file] = files_dictionary[key][file] + parse_audio(path)
    print(f"Saving {i}")
    pickle.dump(files_dictionary, open(f"{i}.pkl", "wb"))
    

In [2]:
session1 = pickle.load(open("Session1.pkl", "rb"))
session2 = pickle.load(open("Session2.pkl", "rb"))
session3 = pickle.load(open("Session3.pkl", "rb"))
session4 = pickle.load(open("Session4.pkl", "rb"))
session5 = pickle.load(open("Session5.pkl", "rb"))

# Data analysis

We are only interested in emotions that are clearly identifiable.
We proceeded in removing emotions that are not common and we removed al the emotions flagged as "other".

In [3]:
emotions = {}
for i in [session1, session2, session3, session4, session5]:
    for j in i.keys():
        for k in i[j].keys():
            if i[j][k][0] not in emotions.keys():
                emotions[i[j][k][0]] = 1
            else:
                emotions[i[j][k][0]] += 1
print("All emotions")
print("-" * 50)
for i in emotions.keys():
    print(i, emotions[i])
print("-" * 50)

print("filtered emotions")
print("-" * 50)
emotions_to_remove = []
for i in emotions.keys():
    if emotions[i] < 150:
        emotions_to_remove.append(i)
        emotions["xxx"] += emotions[i]
for i in emotions_to_remove:
    del emotions[i]
for i in emotions.keys():
    print(i, emotions[i])
print("-" * 50)
for i in [session1, session2, session3, session4, session5]:
    for j in i.keys():
        for k in i[j].keys():
            if i[j][k][0] in emotions_to_remove:
                i[j][k][0] = "xxx"
emotions = {}
for i in [session1, session2, session3, session4, session5]:
    for j in i.keys():
        for k in i[j].keys():
            if i[j][k][0] not in emotions.keys():
                emotions[i[j][k][0]] = 1
            else:
                emotions[i[j][k][0]] += 1

All emotions
--------------------------------------------------
ang 1103
xxx 2507
fru 1849
neu 1708
sur 107
sad 1084
exc 1041
hap 595
fea 40
dis 2
oth 3
--------------------------------------------------
filtered emotions
--------------------------------------------------
ang 1103
xxx 2659
fru 1849
neu 1708
sad 1084
exc 1041
hap 595
--------------------------------------------------


In [4]:
all_data = []
for i in [session1, session2, session3, session4, session5]:
    for j in i.keys():
        for k in i[j].keys():
            if i[j][k][0] != "xxx":
                all_data.append(i[j][k])
print("Total data points", len(all_data))

Total data points 7380


# Model

As in the previous part we extracted features from the text using BERT, we are using the output of the last four layers of the model and we are combining them together as input to the part of the network responsible for the text.
For the part of the model responsible in extracting information from the audio we pass the mel spectrogram.

In [5]:
class TextAudioDataloader(Dataset):
    def __init__(self, text_model, tokenizer, data):
        self.text_model = text_model
        self.tokenizer = tokenizer
        self.data = data
        self.emotion_to_one_hot = {
            "ang": torch.tensor([1, 0, 0, 0, 0, 0], dtype=torch.float32),
            "fru": torch.tensor([0, 1, 0, 0, 0, 0], dtype=torch.float32),
            "neu": torch.tensor([0, 0, 1, 0, 0, 0], dtype=torch.float32),
            "sad": torch.tensor([0, 0, 0, 1, 0, 0], dtype=torch.float32),
            "exc": torch.tensor([0, 0, 0, 0, 1, 0], dtype=torch.float32),
            "hap": torch.tensor([0, 0, 0, 0, 0, 1], dtype=torch.float32),
        }

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        with torch.no_grad():
            emotion, text, audio = self.data[idx]
            encoded_input = self.tokenizer(text, return_tensors="pt").to(
                self.text_model.device
            )
            output_text_features = self.text_model(**encoded_input)
            all_hidden_states = torch.stack(output_text_features["hidden_states"])
            concatenate_pooling = torch.cat(
                (
                    all_hidden_states[-1],
                    all_hidden_states[-2],
                    all_hidden_states[-3],
                    all_hidden_states[-4],
                ),
                -1,
            )
            concatenate_pooling = concatenate_pooling[:, 0]
            return (
                concatenate_pooling.squeeze(0).float(),
                audio.unsqueeze(0),
                self.emotion_to_one_hot[emotion],
            )

In [6]:
config = AutoConfig.from_pretrained("bert-base-uncased")
config.update({"output_hidden_states": True})
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", config=config)
text_to_embeddings = AutoModel.from_pretrained(
    "bert-base-uncased", config=config, torch_dtype=torch.float16
)
text_to_embeddings = text_to_embeddings.to("mps")

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


In [7]:
dataset = TextAudioDataloader(text_to_embeddings, tokenizer, all_data)

In [8]:
train_dataset, validation_dataset, test_dataset = random_split(
    dataset, [0.7, 0.2, 0.1]
)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

We implemented a simple neural network that given the audio and the text performs a classification of the emotion.

We are using a CNN to extract features from the mel spectrogram and we are applying and a MLP for the output produced by BERT.
The output of the two networks are then concatenated and an MLP is used to perform classification.

We are going to test different combinations of the network:
- Only audio
- Only text
- Text and audio
- Text and audio, with a 20% probability of masking the audio during training.

In [21]:
import pytorch_lightning as pl
from torch import nn
import torch.nn.functional as F
import random

class AudioAndTextModel(pl.LightningModule):
    def __init__(self, num_classes, dropout=0.0, text=False, audio=False, probability_removing_audio=0.0):
        super().__init__()
        self.save_hyperparameters()
        self.num_classes = num_classes
        self.text = text
        self.audio = audio
        self.probability_removing_audio = probability_removing_audio

        if audio:
            self.audio_model = nn.Sequential(
                nn.Conv2d(1, 16, kernel_size=(3, 3), stride=(2, 2)),
                nn.SiLU(),
                nn.MaxPool2d(kernel_size=(4, 4)),
                nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2)),
                nn.SiLU(),
                nn.MaxPool2d(kernel_size=(4, 4)),
                nn.Flatten(),
                nn.Linear(1472, 512),
                nn.Dropout(dropout),
                nn.SiLU(),
                nn.Linear(512, 256),
                nn.Dropout(dropout),
                nn.SiLU(),
            )

        if text:
            self.text_linear = nn.Sequential(
                nn.Linear(3072, 256),
                nn.Dropout(dropout),
                nn.SiLU(),
            )

        if text and audio:
            self.linear = nn.Sequential(
                nn.Linear(512, 256),
                nn.Dropout(dropout),
                nn.SiLU(),
                nn.Linear(256, num_classes),
            )
        else:
            self.linear = nn.Sequential(
                nn.Linear(256, num_classes),
            )

    def forward(self, audio, text):
        if self.text and not self.audio:
            features = self.text_linear(text)
        elif self.audio and not self.text:
            features = self.audio_model(audio)
        else:
            text_features = self.text_linear(text)
            if random.random() < self.probability_removing_audio and self.training:
                audio_features = torch.zeros_like(text_features)
            else:
                audio_features = self.audio_model(audio)
            features = torch.cat((audio_features, text_features), dim=1)

        return self.linear(features)

    def training_step(self, batch, batch_idx):
        text, audio, labels = batch
        logits = self(audio, text)

        loss = F.cross_entropy(logits, labels, label_smoothing=0.3)
        self.log(
            "train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )

        accuracy = self.accuracy(logits, labels)
        self.log(
            "train_acc",
            accuracy,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        return loss

    def validation_step(self, batch, batch_idx):
        text, audio, labels = batch
        logits = self(audio, text)

        loss = F.cross_entropy(logits, labels)
        self.log(
            "val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )

        accuracy = self.accuracy(logits, labels)
        self.log(
            "val_acc", accuracy, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )
        top_2_accuracy = self.top_k_accuracy(logits, labels, k=2)
        self.log(
            "val_top_2_acc", top_2_accuracy, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )
    
    def test_step(self, batch, batch_idx):
        text, audio, labels = batch
        logits = self(audio, text)

        loss = F.cross_entropy(logits, labels)
        self.log(
            "test_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )

        accuracy = self.accuracy(logits, labels)
        self.log(
            "test_acc", accuracy, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )

        top_2_accuracy = self.top_k_accuracy(logits, labels, k=2)
        self.log(
            "test_top_2_acc", top_2_accuracy, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )
    
    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        text, audio, labels = batch
        logits = self(audio, text)
        return torch.softmax(logits, dim=1)

    def accuracy(self, logits, labels):
        return torch.sum(
            torch.argmax(logits, dim=1) == torch.argmax(labels, dim=1)
        ).item() / len(labels)
    
    def top_k_accuracy(self, logits, labels, k=2):
        y_true = torch.argmax(labels, dim=1).detach().cpu().numpy()
        # print(y_true[:, np.newaxis])
        y_pred = logits.detach().cpu().numpy()
        y_pred = np.argsort(y_pred, axis=-1)[:, -k:]
        # print(y_pred)
        return np.mean(np.isin(y_true[:, np.newaxis], y_pred))

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=5e-4)
        return optimizer

In [22]:
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

model = AudioAndTextModel(6, text=True)
early_stopping = EarlyStopping(monitor="val_acc", mode="max", patience=10)
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="{epoch}-{val_acc:.2f}-{val_loss:.2f}.ckpt",
    save_top_k=1,
    monitor="val_acc",
    mode="max",
    save_last=True,
)
trainer = pl.Trainer(
    accelerator="mps",
    max_epochs=1000,
    accumulate_grad_batches=4,
    callbacks=[early_stopping, checkpoint_callback],
)

trainer.fit(model, train_loader, test_loader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/opt/homebrew/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /Users/vladimirkarpenko/Documents/git/ATML-NLP/checkpoints exists and is not empty.

  | Name        | Type       | Params
-------------------------------------------
0 | text_linear | Sequential | 786 K 
1 | linear      | Sequential | 1.5 K 
-------------------------------------------
788 K     Trainable params
0         Non-trainable params
788 K     Total params
3.153     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/homebrew/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/opt/homebrew/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [23]:
trainer.test(ckpt_path=checkpoint_callback.best_model_path, dataloaders=test_loader, verbose=True)

Restoring states from the checkpoint path at /Users/vladimirkarpenko/Documents/git/ATML-NLP/checkpoints/epoch=11-val_acc=0.51-val_loss=1.30.ckpt.ckpt


Loaded model weights from the checkpoint at /Users/vladimirkarpenko/Documents/git/ATML-NLP/checkpoints/epoch=11-val_acc=0.51-val_loss=1.30.ckpt.ckpt
/opt/homebrew/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss_epoch': 1.3008297681808472,
  'test_acc_epoch': 0.5094850659370422,
  'test_top_2_acc_epoch': 0.9932249188423157}]

In [24]:
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

model = AudioAndTextModel(6, audio=True)
early_stopping = EarlyStopping(monitor="val_acc", mode="max", patience=10)
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints_audio",
    filename="{epoch}-{val_acc:.2f}-{val_loss:.2f}.ckpt",
    save_top_k=1,
    monitor="val_acc",
    mode="max",
    save_last=True,
)
trainer = pl.Trainer(
    accelerator="mps",
    max_epochs=1000,
    accumulate_grad_batches=4,
    callbacks=[early_stopping, checkpoint_callback],
)

trainer.fit(model, train_loader, test_loader)
trainer.test(ckpt_path=checkpoint_callback.best_model_path, dataloaders=test_loader, verbose=True)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name        | Type       | Params
-------------------------------------------
0 | audio_model | Sequential | 890 K 
1 | linear      | Sequential | 1.5 K 
-------------------------------------------
891 K     Trainable params
0         Non-trainable params
891 K     Total params
3.567     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Restoring states from the checkpoint path at /Users/vladimirkarpenko/Documents/git/ATML-NLP/checkpoints_audio/epoch=14-val_acc=0.41-val_loss=1.48.ckpt.ckpt
Loaded model weights from the checkpoint at /Users/vladimirkarpenko/Documents/git/ATML-NLP/checkpoints_audio/epoch=14-val_acc=0.41-val_loss=1.48.ckpt.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss_epoch': 1.4836596250534058,
  'test_acc_epoch': 0.4146341383457184,
  'test_top_2_acc_epoch': 0.9471544623374939}]

In [25]:
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

model = AudioAndTextModel(6, audio=True, text=True)
early_stopping = EarlyStopping(monitor="val_acc", mode="max", patience=10)
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints_audio_text",
    filename="{epoch}-{val_acc:.2f}-{val_loss:.2f}.ckpt",
    save_top_k=1,
    monitor="val_acc",
    mode="max",
    save_last=True,
)
trainer = pl.Trainer(
    accelerator="mps",
    max_epochs=1000,
    accumulate_grad_batches=4,
    callbacks=[early_stopping, checkpoint_callback],
)

trainer.fit(model, train_loader, test_loader)
trainer.test(ckpt_path=checkpoint_callback.best_model_path, dataloaders=test_loader, verbose=True)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name        | Type       | Params
-------------------------------------------
0 | audio_model | Sequential | 890 K 
1 | text_linear | Sequential | 786 K 
2 | linear      | Sequential | 132 K 
-------------------------------------------
1.8 M     Trainable params
0         Non-trainable params
1.8 M     Total params
7.239     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

/opt/homebrew/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...
Restoring states from the checkpoint path at /Users/vladimirkarpenko/Documents/git/ATML-NLP/checkpoints_audio_text/epoch=6-val_acc=0.57-val_loss=1.19.ckpt.ckpt
Loaded model weights from the checkpoint at /Users/vladimirkarpenko/Documents/git/ATML-NLP/checkpoints_audio_text/epoch=6-val_acc=0.57-val_loss=1.19.ckpt.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

: 

In [None]:
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

model = AudioAndTextModel(6, audio=True, text=True, probability_removing_audio=0.2)
early_stopping = EarlyStopping(monitor="val_acc", mode="max", patience=10)
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints_audio_text_prob_0.2",
    filename="{epoch}-{val_acc:.2f}-{val_loss:.2f}.ckpt",
    save_top_k=1,
    monitor="val_acc",
    mode="max",
    save_last=True,
)
trainer = pl.Trainer(
    accelerator="mps",
    max_epochs=1000,
    accumulate_grad_batches=4,
    callbacks=[early_stopping, checkpoint_callback],
)

trainer.fit(model, train_loader, test_loader)
trainer.test(ckpt_path=checkpoint_callback.best_model_path, dataloaders=test_loader, verbose=True)