### Run Emotion-English-DistilRoBERTa-base on multiple text documents

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os

# Specify the path to the directory containing the text files
folder_path = "/content/drive/MyDrive/HardDisk_ Neuroscience Data (PhD)/Laboratory/Dataset & Stimuli Collection/All Text Files/"

# Create an empty dictionary to store the text files
text_data = {}

# Iterate through all files in the specified folder
for filename in os.listdir(folder_path):
    if filename.endswith(".srt"):  # Process only srt files
        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, 'r', encoding='latin-1') as f:
                text_data[filename] = f.read()
        except Exception as e:
            print(f"Could not read file {filename} with latin-1 encoding: {e}")


# Display the keys (filenames) to verify the files have been loaded
print("Loaded files:")
for key in text_data.keys():
    print(key)

# You can access the content of a file like this:
# print(text_data["your_filename.srt"])

Loaded files:
01_500_Days_of_Summer.srt
03_The_Usual_Suspects.srt
02_Citizen_Four.srt
08_Split.srt
07_Back_to_the_Future.srt
10_12_Years_A_Slave.srt
05_The_Shawshank_Redemption.srt
09_Little_Miss_Sunshine.srt
04_Pulp_Fiction.srt
06_The_Prestige.srt
Merlin.S01E01.RiVER.English.srt


In [4]:
import re
import pandas as pd
from datetime import datetime, timedelta

parsed_dataframes = {}

for filename, subtitle_content in text_data.items():
    lines = subtitle_content.strip().split('\n')

    subtitles = []
    current_subtitle = {}
    for line in lines:
        if line.isdigit():
            if current_subtitle:
                subtitles.append(current_subtitle)
            current_subtitle = {'index': int(line)}
        elif '-->' in line:
            times = line.split(' --> ')
            start_time_str = times[0].replace(',', '.')
            end_time_str = times[1].replace(',', '.')

            # Convert to datetime objects to calculate duration
            dummy_date = datetime(2000, 1, 1) # Use a dummy date for calculation
            try:
                start_time = datetime.strptime(start_time_str, '%H:%M:%S.%f')
                end_time = datetime.strptime(end_time_str, '%H:%M:%S.%f')
                duration = (end_time - start_time).total_seconds()
            except ValueError as e:
                print(f"Could not parse time in file {filename}: {line} - {e}")
                start_time_str = None
                end_time_str = None
                duration = None


            current_subtitle['start_time'] = start_time_str
            current_subtitle['end_time'] = end_time_str
            current_subtitle['duration'] = duration
            current_subtitle['text'] = ''
        elif line.strip() == '':
            continue
        else:
            if 'text' in current_subtitle:
                if current_subtitle['text']:
                    current_subtitle['text'] += ' ' + line.strip()
                else:
                    current_subtitle['text'] = line.strip()

    if current_subtitle:
        subtitles.append(current_subtitle)

    # Create a pandas DataFrame for the current file
    df = pd.DataFrame(subtitles)

    # Store the DataFrame in the dictionary
    parsed_dataframes[filename] = df

# You can now access the DataFrame for each file, for example:
# display(parsed_dataframes['01_500_Days_of_Summer.srt'].head())

In [None]:
from datetime import datetime, timedelta
import pandas as pd
import os

# Function to convert HH:MM:SS.fff string to total seconds
def time_to_seconds(time_str):
    if time_str is None:
        return None
    # Handle potential comma instead of dot for milliseconds
    time_str = time_str.replace(',', '.')
    try:
        # Split hours, minutes, seconds and milliseconds
        parts = time_str.split(':')
        hours = int(parts[0])
        minutes = int(parts[1])
        seconds_parts = parts[2].split('.')
        seconds = int(seconds_parts[0])
        milliseconds = int(seconds_parts[1]) if len(seconds_parts) > 1 else 0

        total_seconds = (hours * 3600) + (minutes * 60) + seconds + (milliseconds / 1000)
        return total_seconds
    except Exception as e:
        print(f"Could not convert time string to seconds: {time_str} - {e}")
        return None


# Iterate through the parsed_dataframes dictionary and add the 'start_time_seconds' column
for filename, df in parsed_dataframes.items():
    if 'start_time' in df.columns:
        df['start_time_seconds'] = df['start_time'].apply(time_to_seconds)
        # Ensure duration is in seconds (it should already be from the previous parsing)
        if 'duration' in df.columns:
            # If duration needs re-calculation based on new time format, do it here
            # For now, assuming it was correctly calculated as total_seconds
            pass
        else:
            print(f"Warning: 'duration' column not found in {filename}. It was expected from previous steps.")
    else:
        print(f"Warning: 'start_time' column not found in {filename}. Cannot convert to seconds.")

# Specify the directory to save the files. This is the same directory the files were loaded from.
save_directory = "/content/drive/MyDrive/HardDisk_ Neuroscience Data (PhD)/Laboratory/Dataset & Stimuli Collection/All Text Files/"

# Iterate through the modified parsed_dataframes dictionary and save the DataFrames to CSV files
for filename, df in parsed_dataframes.items():
    # Create a new filename for the saved CSV, e.g., "01_500_Days_of_Summer_parsed.csv"
    # Replace the original extension (.srt) with _parsed.csv
    base_filename = os.path.splitext(filename)[0]
    save_filename = f"{base_filename}_parsed.csv"
    save_path = os.path.join(save_directory, save_filename)

    # Save the DataFrame to a CSV file
    try:
        df.to_csv(save_path, index=False)
        print(f"Successfully saved {save_filename} to {save_directory}")
    except Exception as e:
        print(f"Could not save {save_filename}: {e}")

In [7]:
# install the transformers library
!pip install transformers



In [8]:
# import required packages
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

# Create class for data preparation
class SimpleDataset:
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        return len(self.tokenized_texts["input_ids"])

    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.tokenized_texts.items()}

In [9]:
# load tokenizer and model, create trainer
model_name = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
trainer = Trainer(model=model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

In [10]:
# specify your filename - this is not used when processing all files from parsed_dataframes
# file_name = "/content/YOUR_FILENAME.csv"
# text_column = "text"  # select the column in your csv that contains the text to be classified

# Create an empty list to hold all subtitle texts
all_pred_texts = []

# Iterate through the parsed_dataframes dictionary
for filename, df in parsed_dataframes.items():
    # Extract the 'text' column, drop any missing values, convert to string, and extend the list
    if 'text' in df.columns:
        all_pred_texts.extend(df['text'].dropna().astype('str').tolist())
    else:
        print(f"Warning: 'text' column not found in DataFrame for {filename}. Skipping this file.")


# Assign the combined list to pred_texts, which is used by subsequent cells
pred_texts = all_pred_texts

print(f"Collected {len(pred_texts)} subtitles from all files.")

Collected 15624 subtitles from all files.


### Classify texts with model

In [11]:
# Tokenize texts and create prediction data set
tokenized_texts = tokenizer(pred_texts,truncation=True,padding=True)
pred_dataset = SimpleDataset(tokenized_texts)

In [12]:
# Run predictions
predictions = trainer.predict(pred_dataset)



  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkhawajamuhammadalizahid[0m ([33mkhawajamuhammadalizahid-institute-of-space-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
# Transform predictions to labels
preds = predictions.predictions.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)
scores = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1,keepdims=True)).max(1)

In [14]:
# scores raw
temp = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1,keepdims=True))

In [15]:
# work in progress
# container
anger = []
disgust = []
fear = []
joy = []
neutral = []
sadness = []
surprise = []

# extract scores (as many entries as exist in pred_texts)
for i in range(len(pred_texts)):
  anger.append(temp[i][0])
  disgust.append(temp[i][1])
  fear.append(temp[i][2])
  joy.append(temp[i][3])
  neutral.append(temp[i][4])
  sadness.append(temp[i][5])
  surprise.append(temp[i][6])

In [16]:
# Create DataFrame with texts, predictions, labels, and scores
df = pd.DataFrame(list(zip(pred_texts,preds,labels,scores,  anger, disgust, fear, joy, neutral, sadness, surprise)), columns=['text','pred','label','score', 'anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'])
df.head()

Unnamed: 0,text,pred,label,score,anger,disgust,fear,joy,neutral,sadness,surprise
0,www.DHIYAFARIS.com,4,neutral,0.893475,0.003203,0.001218,0.002056,0.018282,0.893475,0.021051,0.060716
1,This is a story of boy meets girl.,4,neutral,0.884392,0.00366,0.010187,0.003164,0.056003,0.884392,0.005289,0.037304
2,"The boy, Tom Hansen of Margate, New Jersey,",3,joy,0.565535,0.004914,0.010432,0.002232,0.565535,0.343215,0.023237,0.050436
3,grew up believing that he'd never truly be happy...,5,sadness,0.706395,0.040543,0.01589,0.014621,0.016283,0.051291,0.706395,0.154976
4,"until the day he met ""the one.""",4,neutral,0.37887,0.032521,0.128443,0.205363,0.062095,0.37887,0.015266,0.177441


### Export results

In [17]:
import os

# Specify the directory to save the files. This is the same directory the files were loaded from.
# This variable should be defined in a previous cell, e.g., where you loaded the files.
# Assuming 'save_directory' is defined and contains the correct path.
save_directory = "/content/drive/MyDrive/HardDisk_ Neuroscience Data (PhD)/Laboratory/Dataset & Stimuli Collection/All Text Files/" # Ensure this variable is correctly set

# Define the filename for your output CSV
YOUR_FILENAME = "distilROBERTA_TextEmotionLabels_AllFiles.csv"

# Construct the full save path
save_path = os.path.join(save_directory, YOUR_FILENAME)

# save results to csv
try:
    df.to_csv(save_path, index=False)
    print(f"Successfully saved {YOUR_FILENAME} to {save_directory}")
except Exception as e:
    print(f"Could not save {YOUR_FILENAME}: {e}")

Successfully saved distilROBERTA_TextEmotionLabels_AllFiles.csv to /content/drive/MyDrive/HardDisk_ Neuroscience Data (PhD)/Laboratory/Dataset & Stimuli Collection/All Text Files/


In [20]:
# download file
from google.colab import files
files.download(save_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>