In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install necessary packages
!pip install -q transformers
!pip install -q spacy
!pip install -q sentencepiece


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Import required libraries
import os,shutil
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
import random
import plotly.express as px
import plotly.graph_objs as go
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import torch

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
  torch.cuda.manual_seed_all(42)

Path of the folder containing transcript files

In [None]:
# Set the directory containing the transcription files
directory = "/content/drive/MyDrive/Transcription"

In [None]:
# Initialize a list to store file data
file_data = []

# Loop through each file in the directory for .text file
for file in os.listdir(directory):
    if file.endswith("txt"):
        file_path = os.path.join(directory, file)
        # Open the file and read its content
        with open(file_path, "r") as f: 
            text_doc = f.read()
            docs = re.split(r'\n\n',text_doc) 
            team = file.split(".")[0]
            
            # Combine the smaller documents so that each combined document is less than 512 tokens
            combined_docs = [] 
            current_doc = "" 
            for doc in docs: 
                if len(current_doc + doc) < 512:
                    current_doc += doc
                else:
                    combined_docs.append(current_doc) 
                    current_doc = doc
            combined_docs.append(current_doc)
            
            # Add the combined documents and their corresponding team names to the file_data list
            for i, doc in enumerate(combined_docs): #enumerate = loop, it add all the lines based on the filename, 
                file_data.append({'Filenames': team, 'Texts': doc}) #it saves as dictionary, Filenames = key, Texts = Value

# Create a DataFrame from the file_data list
df = pd.DataFrame(file_data)
print(df.shape)
print(df.Filenames.unique())
# Display the first 10 rows of the DataFrame
df.head(10)

(324, 2)
['Audio A' 'Audio B' 'Audio C' 'Audio D' 'Audio E' 'Audio F' 'Audio G'
 'Audio H' 'Audio I' 'Audio J' 'Audio K' 'Audio L' 'Audio M']


Unnamed: 0,Filenames,Texts
0,Audio A,"Good morning, scholars. Good morning, everyone..."
1,Audio A,hero. This is a figure that I think is a lot m...
2,Audio A,come back from that. The only thing about his ...
3,Audio A,It's a little bit of a serious book. You got t...
4,Audio A,the Zook and the Dauphin. Probably the goofbal...
5,Audio A,is a very common character as well. What you l...
6,Audio A,works of literature then class. Drama or trage...
7,Audio A,little gems of wisdom. So the irony here is th...
8,Audio A,"higher ones, when they consider themselves bet..."
9,Audio A,of comments? How can he do it? How can he get ...


## T5

In [None]:
# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-emotion")
model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-emotion")

Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
# Create a deep copy (to create a different file) of the original DataFrame
df1 = df.copy(deep=True)
# Initialize lists to store emotions and confidence scores
emotions = []
confidence = []

# Loop through each row in the DataFrame
for idx in tqdm(range(len(df1))): #idx = index number, tqdm = to show progress bar
    # Extract the speech text
    speech = df1.iloc[idx,1] #iloc = it take the text from column 2, ([1] = 2)
    try:
        # Tokenize the speech and generate an output label
        input_ids = tokenizer.encode_plus(speech, max_length=512, truncation=True, padding='max_length', return_tensors='pt') #truncation = to remove the text that is more than 512, padding = fill the remaining text if it is less than 512, return_tensors='pt' = converting the text into PyTorch format
        output_label = model.generate(input_ids=input_ids['input_ids'], attention_mask=input_ids['attention_mask'], max_length=2)
        dec = [tokenizer.decode(ids) for ids in output_label]
        label = re.sub('<pad>','',dec[0])
        emotions.append(label.strip())
        # Calculate confidence scores for the emotion prediction
        output = model(input_ids=input_ids['input_ids'], attention_mask=input_ids['attention_mask'], decoder_input_ids=torch.ones((1, 1)).long())
        logits = output.logits.squeeze(0)
        probs = torch.softmax(logits, dim=-1)
        max_value, max_index = torch.max(probs, dim=1)
        score = round(max_value.item(),6)
        confidence.append(score)
    except Exception as e:
        # In case of any errors, append empty string and 0.0 for emotion and confidence, respectively
        emotions.append("")
        confidence.append(0.0)

# Add the predicted emotions and confidence scores to the DataFrame
df1["Emotions_T5"] = emotions
df1['Emotions_T5'].replace({'joy': 'happiness', 'love': 'disgust'}, inplace=True)
df1["Confidence_T5"] = confidence
df['Emotions_T5'] = df1['Emotions_T5']
df["Confidence_T5"] = df1["Confidence_T5"]

# Group the DataFrame by filenames and emotions
df1 = df1.groupby(['Filenames', 'Emotions_T5']).size().reset_index(name='Counts')
# Print the null values, DataFrame shape and unique emotions
print(df.isnull().sum(),df.shape,df1.isnull().sum(),df1.shape)


100%|██████████| 324/324 [27:34<00:00,  5.11s/it]

Filenames        0
Texts            0
Emotions_T5      0
Confidence_T5    0
dtype: int64 (324, 4) Filenames      0
Emotions_T5    0
Counts         0
dtype: int64 (45, 3)





In [None]:

# df['Emotions_T5'].replace({'joy': 'happiness', 'love': 'disgust'}, inplace=True)

In [None]:
df1.Emotions_T5.unique()

array(['anger', 'fear', 'happiness', 'sadness', 'surprise', 'disgust'],
      dtype=object)

In [None]:
# Define color scale for emotion categories
color_scale = {
    'disgust': 'whitesmoke',
    'happiness': 'springgreen',
    'surprise': 'torquise',
    'anger': 'crimson',
    'sadness': 'black',
    'fear': 'darkviolet'
}

# Create and display a pie chart of the T5 emotion distribution by filenames
fig = px.pie(df1, values='Counts', names='Emotions_T5', color='Emotions_T5', color_discrete_map=color_scale,
             labels={'Emotions_T5': 'Emotions_T5', 'Counts': 'Counts', 'Filenames': 'Filenames'},
             title='T5 Emotion Distribution by Filenames')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(showlegend=False)
fig.show()


In [None]:
# Create and display a 3D scatter plot of the T5 emotion distribution by filenames
fig = px.scatter_3d(df1, x='Filenames', y='Emotions_T5', z='Counts',size='Counts',
                    color='Emotions_T5',title='T5 Emotion Prediction by Filenames')
fig.show()

In [None]:
# Create and display a 3D scatter plot of the T5 emotion confidence distribution by filenames
fig = px.scatter_3d(df, x='Filenames', y='Emotions_T5', z='Confidence_T5',size='Confidence_T5',
                    color='Emotions_T5',title='T5 Emotion Confidence Score by Filenames')
fig.show()

In [None]:
# Create and display a histogram of the T5 emotion distribution by filenames
fig = px.histogram(df, x="Emotions_T5", color="Filenames", title="T5 Emotion Prediction by Filenames",
                   labels={"Emotions_T5": "Emotions", "Counts": "Counts"}, width=800, 
                   category_orders={"Emotions_T5": df['Emotions_T5'].value_counts().index})

fig.update_layout(barmode="group", bargap=0.1, bargroupgap=0, xaxis={'showgrid': False, 'zeroline': False})

fig.show()

## BERT

In [None]:
# Import necessary libraries and models
from transformers import RobertaTokenizerFast, TFRobertaForSequenceClassification, pipeline
from tqdm import tqdm
import tensorflow as tf

# Load tokenizer and model
tokenizer = RobertaTokenizerFast.from_pretrained("arpanghoshal/EmoRoBERTa")
model = TFRobertaForSequenceClassification.from_pretrained("arpanghoshal/EmoRoBERTa")
# Create an emotion classification pipeline
emotion = pipeline('sentiment-analysis', model='arpanghoshal/EmoRoBERTa')


Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/501M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at arpanghoshal/EmoRoBERTa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at arpanghoshal/EmoRoBERTa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [None]:
# Initialize lists to store emotions and confidence values
emotions = []
confidence = []

# Create a copy of the original dataframe
df1 = df.copy(deep=True)
# Process each text in the dataframe and predict emotions and confidence
for idx in tqdm(range(len(df1))):
    speech = df1.iloc[idx,1]
    # Prepare input data, Perform predictions and store the results
    input_ids = []
    attention_masks = []
    for i in range(0, len(speech), 512):
        chunk = speech[i:i+512]
        encoded = tokenizer.encode_plus(chunk, add_special_tokens=True, max_length=512, truncation=True, padding='max_length', return_attention_mask=True, return_tensors='tf')
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    
    if len(input_ids) > 1:
        input_ids = tf.concat(input_ids, axis=0)
        attention_masks = tf.concat(attention_masks, axis=0)
    elif len(input_ids) == 1:
        input_ids = input_ids[0]
        attention_masks = attention_masks[0]
    else:
        emotions.append("")
        confidence.append(0.0)
        continue

    try:
        outputs = model({'input_ids': input_ids, 'attention_mask': attention_masks})[0]
        predicted_class = tf.argmax(outputs, axis=1).numpy()[0]
        predicted_scores = tf.nn.softmax(outputs, axis=1).numpy()[0]
        emotions.append(model.config.id2label[predicted_class])
        confidence.append(predicted_scores[predicted_class])
    except:
        emotions.append("")
        confidence.append(0.0)
    
# Map the predicted emotions to a unified set
mappings = {
    'neutral' : 'fear',
    'admiration': 'happiness',
    'amusement': 'happiness',
    'anger': 'anger',
    'annoyance': 'anger',
    'approval': 'happiness',
    'caring': 'sadness',
    'confusion': 'surprise',
    'curiosity': 'surprise',
    'desire': 'happiness',
    'disappointment': 'sadness',
    'disapproval': 'anger',
    'disgust': 'disgust',
    'embarrassment': 'disgust',
    'excitement': 'happiness',
    'fear': 'fear',
    'gratitude': 'happiness',
    'grief': 'sadness',
    'joy': 'happiness',
    'love': 'happiness',
    'nervousness': 'fear',
    'optimism': 'happiness',
    'pride': 'happiness',
    'realization': 'surprise',
    'relief': 'happiness',
    'remorse': 'sadness',
    'sadness': 'sadness',
    'surprise': 'surprise'
}

# Update the emotions list with the mapped values
emotions = [*map(mappings.get,emotions)]
# Add the predicted emotions and confidence values to the dataframe
df1["Emotions_BERT"] = np.array(emotions)
df1["Confidence_BERT"] = np.array(confidence)
df['Emotions_BERT'] = df1['Emotions_BERT']
df['Confidence_BERT'] = df1['Confidence_BERT']
# Group the dataframe by filenames, emotions, and confidence values
df1 = df1.groupby(['Filenames','Emotions_BERT','Confidence_BERT']).size().reset_index(name='Counts')
# Check for missing values and dataframe shapes
print(df.isnull().sum(),df.shape,df1.isnull().sum(),df1.shape)

100%|██████████| 324/324 [17:00<00:00,  3.15s/it]

Filenames          0
Texts              0
Emotions_T5        0
Confidence_T5      0
Emotions_BERT      0
Confidence_BERT    0
dtype: int64 (324, 6) Filenames          0
Emotions_BERT      0
Confidence_BERT    0
Counts             0
dtype: int64 (324, 4)





In [None]:
# Define a color scale for visualizations
color_scale = {
    'fear': 'whitesmoke',
    'happiness': 'springgreen',
    'surprise': 'torquise',
    'anger': 'crimson',
    'sadness': 'black',
    'disgust': 'darkviolet'
}

# Create and display a pie chart of the BERT emotion distribution by filenames
fig = px.pie(df1, values='Counts', names='Emotions_BERT', color='Emotions_BERT', 
             color_discrete_map=color_scale,color_discrete_sequence=px.colors.qualitative.Pastel,
             labels={'Emotions_BERT': 'Emotions_BERT', 'Counts': 'Counts', 'Filenames': 'Filenames'},
             title='BERT Emotion Distribution by Filenames')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Create and display a 3D scatter plot of the BERT emotion distribution by filenames
fig = px.scatter_3d(df1, x='Filenames', y='Emotions_BERT', z='Counts',size='Counts',
                    color='Emotions_BERT',title='BERT Emotion Prediction by Filenames')
fig.show()

In [None]:
# Create and display a 3D scatter plot of the BERT emotion confidence distribution by filenames
fig = px.scatter_3d(df, x='Filenames', y='Emotions_BERT', z='Confidence_BERT',size='Confidence_BERT',
                    color='Emotions_BERT',title='BERT Emotion Confidence Score by Filenames')
fig.show()

In [None]:
# Create and display a histogram of the BERT emotion distribution by filenames
fig = px.histogram(df, x="Emotions_BERT", color="Filenames", title="BERT Emotion Distribution by Filenames",
                   labels={"Emotions_BERT": "Emotions", "Counts": "Counts"}, width=800, 
                   category_orders={"Emotions_BERT": df['Emotions_BERT'].value_counts().index})

fig.update_layout(barmode="group", bargap=0.1, bargroupgap=0, xaxis={'showgrid': False, 'zeroline': False})

fig.show()

## XLNet

In [None]:
# Import necessary libraries and models
from transformers import XLNetTokenizer, TFXLNetForSequenceClassification, pipeline

# Load tokenizer and model
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = TFXLNetForSequenceClassification.from_pretrained('xlnet-base-cased')

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/565M [00:00<?, ?B/s]


The initializer TruncatedNormal is unseeded and being called multiple times, which will return identical values each time (even if the initializer is unseeded). Please update your code to provide a seed to the initializer, or avoid using the same initalizer instance more than once.

Some layers from the model checkpoint at xlnet-base-cased were not used when initializing TFXLNetForSequenceClassification: ['lm_loss']
- This IS expected if you are initializing TFXLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFXLNetForSequenceClassification were not initialized from the mo

In [None]:
# Create a copy of the original dataframe
df1 = df.copy(deep=True)
# Initialize lists to store emotions and confidence values
emotions_probs = []
emotion_labels = []

# Process each text in the dataframe and predict emotions and confidence
for idx in tqdm(range(len(df1))):
    speech = df1.iloc[idx,1]
    # Perform predictions and store the results
    try:
        input_ids = tokenizer.encode(speech,add_special_tokens=True,max_length=512,truncation=True,return_tensors='tf')
        output = model(input_ids=input_ids, training=False)
        dec = output.logits
        label = tf.argmax(dec, axis=1).numpy()[0]
        if label==0:
            emotions_probs.append(dec[0][0].numpy())
        else:
            emotions_probs.append(dec[0][1].numpy())
        label_pos_neg = {0:"negetive",1:"positive"}
        emotion_labels.append(label_pos_neg[label])
    except:
        emotion_labels.append("")
        emotions_probs.append(0.0)

# Add the predicted emotions and confidence values to the dataframe
df1["Emotions_XLNet"] = np.array(emotion_labels)
df1["Confidence_XLNet"] = np.array(emotions_probs)
df['Emotions_XLNet'] = df1['Emotions_XLNet']
df['Confidence_XLNet'] = df1['Confidence_XLNet']
# Group the dataframe by filenames, emotions, and confidence values
df1 = df1.groupby(['Filenames','Emotions_XLNet','Confidence_XLNet']).size().reset_index(name='Counts')

# Check for missing values and dataframe shapes
print(df.isnull().sum(),df.shape,df1.isnull().sum(),df1.shape)

100%|██████████| 324/324 [06:09<00:00,  1.14s/it]

Filenames           0
Texts               0
Emotions_T5         0
Confidence_T5       0
Emotions_BERT       0
Confidence_BERT     0
Emotions_XLNet      0
Confidence_XLNet    0
dtype: int64 (324, 8) Filenames           0
Emotions_XLNet      0
Confidence_XLNet    0
Counts              0
dtype: int64 (324, 4)





In [None]:
# Define a color scale for visualizations
color_scale = {
    'positive': 'whitesmoke',
    'negative': 'crimson'
}

# Create and display a pie chart of the XLNet emotion detection confidence based on each emotion category
fig = px.pie(df1, values='Counts', names='Emotions_XLNet', color='Emotions_XLNet', color_discrete_map=color_scale,
             labels={'Emotions_XLNet': 'Emotions_XLNet', 'Counts': 'Counts', 'Confidence_XLNet': 'Confindence_XLNet'},
             title='XLNet Emotion Detection Confidance based on Each Emotion Category')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Create and display a 3D scatter plot of the XLNet emotion distribution by filenames
fig = px.scatter_3d(df1, x='Filenames', y='Emotions_XLNet', z='Counts',size='Counts',
                    color='Emotions_XLNet',title='XLNet Emotion Prediction by Filenames')
fig.show()

In [None]:
# Create and display a 3D scatter plot of the XLNet emotion confidence distribution by filenames
fig = px.scatter_3d(df, x='Filenames', y='Emotions_XLNet', z='Confidence_XLNet',
                    color='Emotions_XLNet',title='XLNet Emotion Confidence Score by Filenames')
fig.show()

In [None]:
# Create and display a histogram of the XLNet emotion distribution by filenames
fig = px.histogram(df, x="Emotions_XLNet", color="Filenames", title="XLNet Emotion Prediction by Filenames",
                   labels={"Emotions_XLNet": "Emotions", "Counts": "Counts"}, width=800, 
                   category_orders={"Emotions_XLNet": df['Emotions_XLNet'].value_counts().index})

fig.update_layout(barmode="group", bargap=0.1, bargroupgap=0, xaxis={'showgrid': False, 'zeroline': False})

fig.show()

In [None]:
# Save the final dataframe to an Excel file
df.to_excel('final_dataframe.xlsx', index=False)
# Read the saved Excel file and display the last 15 rows
df2 = pd.read_excel("final_dataframe.xlsx")
df2.tail(15)

Unnamed: 0,Filenames,Texts,Emotions_T5,Confidence_T5,Emotions_BERT,Confidence_BERT,Emotions_XLNet,Confidence_XLNet
309,Audio M,if the base is stronger?The top will be stable...,happiness,0.472825,sadness,0.783162,negetive,0.370691
310,Audio M,Make sure every teammate is getting a chanceto...,happiness,0.984692,surprise,0.71866,negetive,0.380967
311,Audio M,or you may walk around to see what other group...,happiness,0.978525,surprise,0.441104,negetive,0.273903
312,Audio M,About two and a half centimeters.I think it's ...,happiness,0.97969,surprise,0.916611,negetive,0.433965
313,Audio M,"It's pretty light.You know he's...Jose, rememb...",happiness,0.908424,sadness,0.913711,negetive,0.425988
314,Audio M,"Now, gentlemen, you've got six minutes.What is...",happiness,0.9571,surprise,0.835219,negetive,0.53773
315,Audio M,"You need to communicate better than help me, o...",happiness,0.956536,sadness,0.980424,negetive,0.595219
316,Audio M,Five minute warning.Five minute warning.Gentle...,happiness,0.376339,surprise,0.570149,positive,0.228957
317,Audio M,What do you need to do in the next four minute...,happiness,0.991547,sadness,0.9043,negetive,0.714944
318,Audio M,"So the interesting thing is,she does.So the in...",happiness,0.601173,happiness,0.742564,positive,0.209331


In [None]:
# Create and display a 3D scatter plot of all emotion distributions by filenames
fig = px.scatter_3d(df, x='Emotions_T5', y='Emotions_XLNet', z='Emotions_BERT',
                    color='Filenames',title='All Emotion Prediction by Filenames')
fig.show()

In [None]:
#