# Text Extraction

In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
import h5py
import numpy as np


In [4]:
# Source table; later cells read its video_id, clip_id, annotation, split and
# text columns.
path_to_csv = r'../train_test_validate_split.csv'
df = pd.read_csv(filepath_or_buffer=path_to_csv)

# Load the pretrained tokenizer and encoder once at module level so that every
# call to the feature extractor reuses the same objects.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

# Function to encode text to BERT features with variable max_length
def encode_text_for_bert(text, max_length):
    """Encode text into a mean-pooled BERT feature vector.

    The text is tokenized with the module-level ``tokenizer`` (truncated to
    ``max_length`` tokens), passed through the module-level ``model`` without
    gradient tracking, and the final hidden states are averaged over the
    token axis.

    Args:
        text: The string to encode (anything the tokenizer accepts).
        max_length: Maximum number of tokens kept after truncation.

    Returns:
        A NumPy array holding the pooled features, with singleton dimensions
        squeezed out.

    Note:
        ``padding=True`` pads only up to the longest sequence in the call, so
        ``max_length`` changes the result only for inputs that tokenize to
        more than ``max_length`` tokens.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    # The result is brought back with .cpu() below, so the model may live on an
    # accelerator; the inputs must be on the same device or the forward fails.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state

    # Average only over real tokens. For a single string there is no padding
    # and this equals a plain mean; for batched input it keeps padded
    # positions from diluting the embedding.
    mask = inputs["attention_mask"].unsqueeze(-1).to(embeddings.dtype)
    token_sum = (embeddings * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    feature_vector = token_sum / token_count
    return feature_vector.squeeze().cpu().numpy()

In [7]:
# Process the dataframe and store results in an HDF5 file
# Write one HDF5 group per (video_id, clip_id) pair. Each group carries the
# label, split and raw text as attributes, plus one BERT feature dataset per
# truncation length.
with h5py.File(r'text_features.h5', 'w') as h5f:
    for index, row in df.iterrows():
        # Define group name based on video_id and clip_id
        group_name = f"{row['video_id']}_{row['clip_id']}"
        if group_name not in h5f:
            grp = h5f.create_group(group_name)

            # Store annotation, split, and original text as attributes
            grp.attrs['label'] = row['annotation']
            grp.attrs['split'] = row['split']
            grp.attrs['text'] = row['text']
        else:
            # Repeated (video_id, clip_id): reuse the group created by the
            # first occurrence, whose attributes are kept.
            grp = h5f[group_name]

        # Loop over desired max_lengths, extract features, and store in datasets
        for max_length in [128, 256, 512]:
            dataset_name = f'bert_text_features_{max_length}'
            # create_dataset raises if the name already exists, which would
            # abort the whole run on a repeated row. Keep the first
            # occurrence's features and skip the BERT forward pass.
            if dataset_name in grp:
                continue
            bert_features = encode_text_for_bert(row['text'], max_length)
            grp.create_dataset(dataset_name, data=bert_features, dtype=np.float64)
        print(f"Processed {group_name}.")

print("Feature extraction and storage process completed.")

Processed 201582_3.
Processed j1m6ctAgjsM_22.
Processed UK_IXtJ2BqI_17.
Processed 283935_4.
Processed dqragS38hCk_6.
Processed mxFU6TrHChY_2.
Processed 25640_6.
Processed ShgrdU9WAJE_6.
Processed Gljee9uq_Rc_20.
Processed muOYAxkG-Zo_7.
Processed npIVLL_fTf0_11.
Processed 238063_10.
Processed z441aDJvAcU_12.
Processed 259260_3.
Processed Ua4g9q0r-dI_2.
Processed pfCPogxnUfw_24.
Processed JDgqyOkzXHw_1.
Processed 252177_3.
Processed 261902_7.
Processed 243981_11.
Processed mW8eL4e7Wrg_11.
Processed yKdIZR5xfcc_10.
Processed 44780_9.
Processed 234053_10.
Processed EmmuWoCUgXs_12.
Processed 241172_0.
Processed sqADHmnM164_1.
Processed RST6PgpsLws_6.
Processed BmDybDBTe7o_7.
Processed vRhj2bLo1ho_5.
Processed 238100_6.
Processed PwapK9d8IGk_4.
Processed cnllFPRyBFs_5.
Processed hyazktfsZew_15.
Processed 252177_8.
Processed TM7cHOHfF70_5.
Processed EEUGfVTyTQM_6.
Processed Rt9rN1ntS3E_8.
Processed 6G8JJ69aN6o_4.
Processed 65kkuNV921k_22.
Processed ZtocGyL3Tfc_24.
Processed oW1OEsP7Dds_8.
Pr