# Generate Utterance Embeddings Using BERT
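This notebook generates one embedding per utterance from the dataset's JSON files using a pre-trained BERT checkpoint (`bhadresh-savani/bert-base-uncased-emotion`). Each utterance is encoded as `"{speaker}: {text}"`, the last-layer token states are mean-pooled into a single 768-dimensional vector, and the result is saved as a `.npy` file named after the utterance's video.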

In [1]:
# Mount Google Drive at /content/drive; every dataset path used in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torch transformers numpy

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [3]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import json
import os

In [4]:
# Load the training split. `data` is iterated further down as a list of
# conversations, each holding its utterances under the "conversation" key.
file_path = '/content/drive/MyDrive/CSCI535 Project/Dataset/text/train.json'

# JSON text is UTF-8; pass the encoding explicitly instead of relying on the
# platform's locale default so non-ASCII utterances decode the same everywhere.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [5]:
# The tokenizer and the encoder are loaded from the same Hugging Face Hub
# checkpoint, so the identifier is written once and shared by both calls.
CHECKPOINT = 'bhadresh-savani/bert-base-uncased-emotion'

tokenizer = BertTokenizer.from_pretrained(CHECKPOINT)
model = BertModel.from_pretrained(CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/935 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [6]:
def generate_embeddings(text):
    """Encode text with the notebook's BERT tokenizer/model and mean-pool it.

    Uses the module-level ``tokenizer`` and ``model``. Inputs longer than 512
    tokens are truncated.

    Args:
        text: A single string, or a list of strings that is encoded as one
            padded batch.

    Returns:
        A NumPy array with one pooled vector per input, i.e. shape
        ``(batch, hidden_size)``; a single string yields one row.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # padding=True adds [PAD] positions whenever a batch mixes lengths. Weight
    # every position by the attention mask so padding does not dilute the mean.
    # For a single string the mask is all ones, which is the plain mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * mask).sum(dim=1)
    # clamp guards against a division by zero for a row with an all-zero mask.
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = (summed / token_counts).numpy()
    return embeddings

In [7]:
# Embed every utterance of the training split and cache each result on Drive
# as "<video name without .mp4>.npy".
train_output_dir = "/content/drive/MyDrive/CSCI535 Project/Dataset/Processed/text/train-emotion"
# np.save does not create missing directories, so make sure the target exists.
os.makedirs(train_output_dir, exist_ok=True)

for conversation in data:
    for utterance in conversation["conversation"]:
        speaker = utterance["speaker"]
        text = utterance['text']
        video_name = utterance['video_name']
        # The speaker name is prepended to the utterance text before encoding.
        input_text = f"{speaker}: {text}"
        file_path = os.path.join(train_output_dir, f"{video_name.replace('.mp4', '')}.npy")

        # Skip utterances whose file already exists so an interrupted run can
        # be resumed without recomputing them.
        if not os.path.exists(file_path):
            embeddings = generate_embeddings(input_text)
            np.save(file_path, embeddings)

In [8]:
# Count the entries in the training-split embedding directory.
!ls -1 /content/drive/MyDrive/CSCI535\ Project/Dataset/Processed/text/train-emotion | wc -l

11053


In [9]:
# Load the test split. `test_data` is iterated further down as a list of
# conversations, each holding its utterances under the "conversation" key.
file_path = '/content/drive/MyDrive/CSCI535 Project/Dataset/text/test.json'

# JSON text is UTF-8; pass the encoding explicitly instead of relying on the
# platform's locale default so non-ASCII utterances decode the same everywhere.
with open(file_path, 'r', encoding='utf-8') as file:
    test_data = json.load(file)

In [10]:
# Embed every utterance of the test split and cache each result on Drive
# as "<video name without .mp4>.npy".
test_output_dir = "/content/drive/MyDrive/CSCI535 Project/Dataset/Processed/text/test-emotion"
# np.save does not create missing directories, so make sure the target exists.
os.makedirs(test_output_dir, exist_ok=True)

for conversation in test_data:
    for utterance in conversation["conversation"]:
        speaker = utterance["speaker"]
        text = utterance['text']
        video_name = utterance['video_name']
        # The speaker name is prepended to the utterance text before encoding.
        input_text = f"{speaker}: {text}"
        file_path = os.path.join(test_output_dir, f"{video_name.replace('.mp4', '')}.npy")

        # Skip utterances whose file already exists so an interrupted run can
        # be resumed without recomputing them.
        if not os.path.exists(file_path):
            embeddings = generate_embeddings(input_text)
            np.save(file_path, embeddings)

In [11]:
# Count the entries in the test-split embedding directory.
!ls -1 /content/drive/MyDrive/CSCI535\ Project/Dataset/Processed/text/test-emotion | wc -l

2566


In [12]:
import numpy as np

# Reload one saved training embedding from Drive as a spot check of the output.
file_path = '/content/drive/MyDrive/CSCI535 Project/Dataset/Processed/text/train-emotion/dia1001utt1.npy'

embeddings = np.load(file_path)

In [13]:
# Display the values of the reloaded embedding array.
embeddings

array([[ 1.31882057e-01,  1.17628254e-01,  2.86628157e-01,
        -3.67349759e-02, -3.03158253e-01, -3.24291646e-01,
        -3.18960398e-01, -4.31475812e-04, -2.70649679e-02,
         4.07138020e-01,  6.28090231e-03, -4.63601887e-01,
        -7.35488057e-01, -4.09481339e-02, -3.30650598e-01,
         2.12377995e-01,  2.83184946e-01, -3.06289554e-01,
        -7.66448975e-02,  2.70541787e-01,  2.90697515e-01,
         3.39344084e-01,  3.49191993e-01,  3.24189886e-02,
         4.78806615e-01,  1.65704235e-01, -1.75203726e-01,
         4.42673355e-01,  2.92807192e-01, -8.89715850e-01,
        -4.40962501e-02, -2.60006368e-01,  2.13796437e-01,
         4.30529658e-03, -3.29857618e-01,  1.75298676e-01,
        -4.94188815e-01,  1.38984784e-03, -2.06953511e-01,
        -7.42530823e-02, -1.62269875e-01, -6.59087479e-01,
        -7.80163884e-01,  1.18921474e-01,  2.84846783e-01,
         4.26717728e-01,  6.86971605e-01, -6.72645792e-02,
        -1.07410746e-02, -1.30038619e+00, -3.06505471e-0

In [14]:
# Check the array shape: one row per utterance, one column per hidden unit.
embeddings.shape

(1, 768)