# CM3070, Exploratory Data Analysis

```
University of London
BSc Computer Science
CM3070, Final Project
Hudson Leonardo MENDES
hlm12@student.london.ac.uk
```


## 1. Environment


### 1.1. Dependencies


In [None]:
!cat ../setup.cfg

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import logging

# Configure the root logger so INFO-level (and more severe) records are emitted
# for the rest of the notebook.
logging.basicConfig(level=logging.INFO)

In [None]:
import nltk

# Fetch the NLTK resources needed by later cells: the tokenizer data used by
# `word_tokenize` and the stopword lists read via `nltk.corpus.stopwords`.
nltk.download("punkt")
nltk.download("stopwords")

### 1.2. Paths & Locations

In [None]:
import pathlib

# Data folder, relative to the notebook's working directory. It is handed to the
# ETL step as its destination and every later cell resolves file paths against it.
dir_data = pathlib.Path("../data")

## 2. Extraction, Transformation & Load


In [None]:
%%capture
%pip install -e '..[etl]'

### 2.1. [E]xtraction


In [None]:
from hlm12erc.etl import ETL, KaggleDataset

# Describe the Kaggle dataset to pull (owner, dataset name and the sub-directory to read from),
# then run the ETL with `dir_data` as the destination.
# NOTE(review): `etl` is bound to whatever `.into(...)` returns, not to the `ETL` instance
# itself — confirm that is intended.
ds_kaggle = KaggleDataset(owner="zaber666", name="meld-dataset", subdir="MELD-RAW/MELD.Raw")
etl = ETL(dataset=ds_kaggle).into(uri_or_folderpath=dir_data)

In [None]:
!ls {str(dir_data)} | head -n 10

In [None]:
import pandas as pd

# Load `train.csv` from the data folder, using the first CSV column as the index.
df_raw = pd.read_csv(dir_data / "train.csv", index_col=0)
# Trailing bare expression so the notebook renders the DataFrame as the cell output.
df_raw

In [None]:
import io
import base64
from IPython.display import display, HTML
from PIL import Image

import html

# Draw up to three random examples per emotion label and order them by label
df_sample = df_raw.groupby(["label"], group_keys=False).apply(lambda x: x.sample(min(len(x), 3)))
df_sample = df_sample.sort_values(["label"])

# Build one HTML table row per sampled example: speaker, text, image strip, audio player, label
table_rows = []
for _, row in df_sample.iterrows():
    # Dataset-provided strings are escaped so characters such as "<" or "&" cannot break the markup
    speaker_cell = f'<td>{html.escape(str(row["speaker"]))}</td>'
    text_cell = f'<td>{html.escape(str(row["x_text"]))}</td>'

    image_path = dir_data / row["x_visual"]
    with Image.open(image_path) as img:
        # Keep only a horizontal band around the vertical centre (20% of the image height)
        width, height = img.size
        crop_top = height // 2 - height // 10
        crop_bottom = height // 2 + height // 10
        # JPEG cannot store palette/alpha modes, so normalise to RGB before encoding
        img_cropped = img.crop((0, crop_top, width, crop_bottom)).convert("RGB")
        buffer = io.BytesIO()
        img_cropped.save(buffer, format="JPEG")
    # Inline the image as a base64 data URI so the table is self-contained
    image_data = base64.b64encode(buffer.getvalue()).decode()
    image_cell = f'<td><img src="data:image/jpeg;base64,{image_data}" width="100"></td>'

    # <audio> is not a void element: it needs an explicit closing tag
    audio_src = html.escape(str(dir_data / row["x_audio"]))
    audio_cell = f'<td><audio controls src="{audio_src}"></audio></td>'

    label_cell = f'<td>{html.escape(str(row["label"]))}</td>'
    table_rows.append(f"<tr>{speaker_cell}{text_cell}{image_cell}{audio_cell}{label_cell}</tr>")

table_html = (
    "<table><tr><th>Speaker</th><th>Text</th><th>Image</th><th>Audio</th><th>Emotion</th></tr>"
    + "".join(table_rows)
    + "</table>"
)
display(HTML(table_html))


## 3. Statistical Analysis


### 3.1. Basic Data Features

In [None]:
from nltk.tokenize import word_tokenize

# Summarise the text column (longest utterance in characters and in tokens)
# followed by the number of examples per emotion label.
summary = {
    "max_char_length": max(df_raw.x_text.map(len)),
    "max_token_count": max(df_raw.x_text.map(lambda utterance: len(word_tokenize(utterance)))),
}
for emotion in ("neutral", "joy", "sadness", "fear", "anger", "surprise", "disgust"):
    summary[f"count_label_{emotion}"] = len(df_raw[df_raw.label == emotion])

# One row per statistic; the trailing expression is rendered as the cell output
pd.DataFrame.from_dict(summary, orient="index")


### 3.2. Measures of Spread


### 3.3. Types of Distribution


## 4. Data Visualization


### 4.1. Wordclouds (with & without Stopwords)


In [None]:
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords

# Get the list of stopwords
stop_words = set(stopwords.words("english"))

# Combine all text into a single string and remove special chars
text = re.sub(r"[^\w\s\d]", "", " ".join(df_raw.x_text.tolist()).lower())

# Generate the word cloud with stopwords.
# WordCloud falls back to its own built-in stopword list when `stopwords` is left as None,
# so an explicitly empty set is required for the stopwords to actually appear in this cloud.
wordcloud_with_stopwords = WordCloud(
    width=750,
    height=1000,
    background_color="white",
    stopwords=set(),
).generate(text)

# Remove stopwords from the text
text_without_stopwords = " ".join([word for word in text.split() if word not in stop_words])

# Generate the word cloud without stopwords (WordCloud's built-in list is applied on top)
wordcloud_without_stopwords = WordCloud(width=750, height=1000, background_color="white").generate(
    text_without_stopwords
)

# Display the word clouds side by side
fig, axs = plt.subplots(1, 2, figsize=(15, 10))
axs[0].imshow(wordcloud_with_stopwords, interpolation="bilinear")
axs[0].set_title("with Stopwords")
axs[0].axis("off")
axs[1].imshow(wordcloud_without_stopwords, interpolation="bilinear")
axs[1].set_title("WITHOUT Stopwords")
axs[1].axis("off")
plt.show()

### 4.2. Sentence Length Distribution

In [None]:
import matplotlib.pyplot as plt
import nltk

# Tokenize the sentences using the nltk tokenizer
df_raw["tokens"] = df_raw["x_text"].apply(nltk.word_tokenize)
df_raw["sentence_length"] = df_raw["tokens"].apply(len)

# Get the unique emotion labels
labels = df_raw["label"].unique()

# Lay out one panel per label plus one for the combined distribution, four per row.
# Sizing the grid from the labels alone would leave no free panel for the combined
# plot whenever the number of labels is a multiple of four.
num_cols = 4
num_panels = len(labels) + 1
num_rows = (num_panels + num_cols - 1) // num_cols
# squeeze=False keeps `axs` two-dimensional even when everything fits on a single row
fig, axs = plt.subplots(num_rows, num_cols, figsize=(20, 5 * num_rows), squeeze=False)
panels = axs.ravel()

# Every panel shares the same x-axis range so the distributions are directly comparable
max_length = df_raw["sentence_length"].max()

# One histogram per emotion label
for ax, label in zip(panels, labels):
    ax.hist(df_raw.loc[df_raw["label"] == label, "sentence_length"], bins=50)
    ax.set_title(label)
    ax.set_xlabel("Sentence Length (Number of Tokens)")
    ax.set_ylabel("Frequency")
    ax.set_xlim([0, max_length])

# Plot the combined distribution in the panel right after the last label
ax_all = panels[len(labels)]
ax_all.hist(df_raw["sentence_length"], bins=50, color="orange")
ax_all.set_title("[all emotions]")
ax_all.set_xlabel("Sentence Length (Number of Tokens)")
ax_all.set_ylabel("Frequency")
ax_all.set_xlim([0, max_length])

# Remove the unused subplots
for ax in panels[num_panels:]:
    ax.set_visible(False)

# Adjust the spacing between subplots
plt.subplots_adjust(hspace=0.5, wspace=0.3)

# Show the plot
plt.show()

### 4.3. LDA-based Topic Modelling

In [None]:
import logging
import pyLDAvis.gensim_models
import gensim
import nltk
import re
from nltk.corpus import stopwords

# Only let warnings (and worse) through from gensim's LDA trainer
gensim.models.ldamodel.logger.setLevel(logging.WARNING)

# Prepare the utterances: remove special chars, tokenize with nltk, lowercase, drop stopwords
stop_words = set(stopwords.words("english"))
utterances = [re.sub(r"[^\w\s\d]", "", utterance) for utterance in df_raw["x_text"]]
tokens = []
for utterance in utterances:
    lowered = [word.lower() for word in nltk.word_tokenize(utterance)]
    tokens.append([word for word in lowered if word not in stop_words])

# Map every distinct token to an integer id
dictionary = gensim.corpora.Dictionary(tokens)

# Encode each utterance as a bag-of-words over that dictionary
corpus = list(map(dictionary.doc2bow, tokens))

# Fit a 10-topic LDA model; the seed is fixed so the topics are reproducible
lda_settings = dict(num_topics=10, random_state=42, passes=10, per_word_topics=True)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, **lda_settings)

# Render the interactive topic/keyword visualization as the cell output
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

### 4.4. 3D PCA Embedding Representations

In [None]:
import torch
import tensorflow_hub as hub
import torchvision.transforms as transforms
from transformers import AutoTokenizer, AutoModel

# Loading USE4
use4_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Loading SimCSE
simcse_model_name = "ZurichNLP/unsup-simcse-xlm-roberta-base"
simcse_tokenizer = AutoTokenizer.from_pretrained(simcse_model_name)
simcse_model = AutoModel.from_pretrained(simcse_model_name)

# Loading ResNet50
resnet50_model = torch.hub.load("pytorch/vision:v0.9.0", "resnet50", pretrained=True)
# The network is only used for inference: switch it to evaluation mode so BatchNorm relies on
# its pretrained running statistics instead of the statistics of each (tiny) input batch,
# and so those running statistics are not altered by forward passes.
resnet50_model.eval()
# Preprocessing applied to every image before it is fed to ResNet50.
# NOTE(review): the mean/std below are not the usual ImageNet statistics — presumably
# computed on this dataset; confirm.
resnet50_transforms = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.2706, 0.2010, 0.1914], std=[0.1857, 0.1608, 0.1667]),
    ]
)


In [None]:
import itertools
import numpy as np
from sklearn.decomposition import PCA
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences

from tensorflow.python.ops.numpy_ops import np_config

np_config.enable_numpy_behavior()


def plot_3d_embeddings_using(
    ax,
    title: str,
    df: pd.DataFrame,
    p: str,
    emb_fn: callable,
    bsz: int,
):
    """Embed one column of `df`, reduce it to 3D with PCA and scatter it on `ax`.

    Args:
        ax: 3D matplotlib axes to draw on.
        title: Axes title; also used as the progress-bar description.
        df: Examples to plot. Needs the column named by `p` and a `label` column.
        p: Name of the column whose values are handed to `emb_fn`.
        emb_fn: Called with a list of up to `bsz` column values. Must return one
            embedding per value, in an object that offers `.tolist()`.
        bsz: Number of rows embedded per call to `emb_fn`.

    Raises:
        KeyError: If `df.label` holds a value missing from the colour map below.
    """
    # Embed the column batch by batch
    batches = []
    for start in tqdm(range(0, len(df), bsz), desc=title):
        batches.append(emb_fn(list(df[p][start : start + bsz])))

    # Flatten the batches into one embedding per example; embeddings coming from
    # different batches may differ in length, so pad them at the end to a common width
    embeddings = list(itertools.chain.from_iterable(batch.tolist() for batch in batches))
    embeddings = pad_sequences(embeddings, dtype="float32", padding="post")

    # Reduce the embeddings to 3 dimensions with PCA
    pca = PCA(n_components=3)
    pca_embeddings = pca.fit_transform(embeddings)

    # Project the reduced points onto the surface of the unit sphere
    norms = np.linalg.norm(pca_embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # a point exactly at the origin stays there instead of becoming NaN
    normalized_embeddings = pca_embeddings / norms

    # Colour of each emotion label in the scatter plot
    colors = dict(
        neutral="gray",
        surprise="yellow",
        fear="black",
        sadness="blue",
        joy="green",
        disgust="purple",
        anger="red",
    )
    point_colors = [colors[label] for label in df.label]

    # Draw every point in a single call rather than creating one artist per point.
    # depthshade=False keeps the points fully opaque, which is how they render when
    # each point is scattered on its own.
    ax.scatter(
        normalized_embeddings[:, 0],
        normalized_embeddings[:, 1],
        normalized_embeddings[:, 2],
        c=point_colors,
        depthshade=False,
    )

    ax.set_title(title)

In [None]:
import torch
import matplotlib.pyplot as plt
import librosa
import numpy as np
from keras.preprocessing.sequence import pad_sequences

# USE4 Embeddings
# The loaded USE4 module is itself passed as the embedding function below, where it is
# called directly on a batch (list) of utterances.
embedding_fn_use4 = use4_model


# SimCSE Embeddings
def embedding_fn_simcse(x):
    """Embed a batch of sentences with the SimCSE model.

    The batch `x` is tokenized (padded, truncated to at most 512 tokens) and run
    through the model without gradient tracking. The hidden state at the first
    token position of the last layer is returned per sentence, as a NumPy array.
    """
    encoded = simcse_tokenizer(x, padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        last_hidden = simcse_model(**encoded).last_hidden_state
    first_token_states = last_hidden[:, 0, :]
    return first_token_states.numpy()


# ResNet50 Embeddings
def embedding_fn_resnet50(image_paths):
    """Embed a batch of images with ResNet50.

    Args:
        image_paths: Image paths relative to `dir_data`.

    Returns:
        A 2D NumPy array with one L2-normalised row of ResNet50 outputs per image.
        The result stays 2D even for a single image, so callers can always treat
        it as "one row per input".
    """
    img_tensors = []
    for image_path in image_paths:
        # Close each file as soon as it has been converted to a tensor
        with Image.open(str(dir_data / image_path)) as img:
            # The normalisation in the transform expects exactly three channels
            img_tensors.append(resnet50_transforms(img.convert("RGB")))
    with torch.no_grad():
        embeddings = resnet50_model(torch.stack(img_tensors, dim=0))
    # No squeeze here: dropping the batch axis for a one-image batch would turn the
    # row into a flat vector, which callers would then mistake for many scalar embeddings
    embeddings = torch.nn.functional.normalize(embeddings, dim=1)
    return embeddings.numpy()


# MEL Feature Audio Embeddings
def embedding_fn_audiomel(audio_paths):
    """Turn a batch of audio files into flattened log-mel feature vectors.

    Each path (relative to `dir_data`) is loaded at 16 kHz and converted to a
    128-band mel spectrogram, expressed in dB relative to its own maximum and
    flattened. The vectors of the batch are then padded at the end to the length
    of the longest one and returned as a float32 array.
    """

    def log_mel_vector(audio_path):
        # Load one clip and compute its flattened log-mel spectrogram
        y, sr = librosa.load(str(dir_data / audio_path), sr=16000)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512, n_mels=128)
        return librosa.power_to_db(mel_spec, ref=np.max).flatten()

    features = [log_mel_vector(audio_path) for audio_path in audio_paths]
    return pad_sequences(features, padding="post", dtype="float32")


# Plotting PCA vector endpoints for a batch of sentences: a 2x2 grid of 3D axes,
# one per representation
fig = plt.figure(figsize=(15, 15))
ax1 = fig.add_subplot(2, 2, 1, projection="3d")
ax2 = fig.add_subplot(2, 2, 2, projection="3d")
ax3 = fig.add_subplot(2, 2, 3, projection="3d")
ax4 = fig.add_subplot(2, 2, 4, projection="3d")

# Embed a quarter of the data. The seed is fixed so that re-running the notebook
# reproduces the same plots (the same seed is used for the LDA model).
df_sample = df_raw.sample(len(df_raw) // 4, random_state=42)
plot_3d_embeddings_using(ax1, title="USE4", df=df_sample, p="x_text", emb_fn=embedding_fn_use4, bsz=64)
plot_3d_embeddings_using(ax2, title="SimCSE", df=df_sample, p="x_text", emb_fn=embedding_fn_simcse, bsz=16)
plot_3d_embeddings_using(ax3, title="ResNet50", df=df_sample, p="x_visual", emb_fn=embedding_fn_resnet50, bsz=4)
plot_3d_embeddings_using(ax4, title="AudioMEL", df=df_sample, p="x_audio", emb_fn=embedding_fn_audiomel, bsz=32)
plt.show()


## 5. Conclusions

The visualisation of representations produced by out-of-the-box feature extraction mechanisms
clearly demonstrates how the data examples are scattered in the feature space and gives a strong
indication that simplistic separation approaches are unlikely to successfully separate each
emotion class.

SimCSE seems to offer, out of the box, superior representations for the text modality and, as
discussed by the SPCL-ERC-CL[3] paper, it can be further improved by designing a customised
objective function that is more suitable for the task at hand, as well as by fine-tuning it on the
MELD training data.

The visual representations provided _[TODO: conclude about ResNet50 representations]_.

MEL Features to represent the audio are _[TODO: conclude about MEL audio representations]_.

Without further experimentation, it is not possible to conclude whether the representations
employed during this exploratory data analysis are sufficient to separate the emotion classes
or whether alternative representations can outperform them significantly.

As set out in the project design, both the quality of the representations and the
quality of the model that fuses these representations will be evaluated, which will hopefully
address this challenge.

## References

[1] Poria, Soujanya, et al. ‘MELD: A Multimodal Multi-Party Dataset for Emotion Recognition in Conversations’. ArXiv [Cs.CL], 2019, http://arxiv.org/abs/1810.02508. arXiv.

[2] Chen, Sheng-Yeh, et al. ‘EmotionLines: An Emotion Corpus of Multi-Party Conversations’. ArXiv [Cs.CL], 2018, http://arxiv.org/abs/1802.08379. arXiv.

