In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install necessary packages
!pip install -q transformers
!pip install -q spacy
!pip install -q sentencepiece


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Import required libraries
import os,shutil
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
import random
import plotly.express as px
import plotly.graph_objs as go
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import torch

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
  torch.cuda.manual_seed_all(42)

Path of the folder containing transcript files

In [4]:
# Set the directory containing the transcription files
directory = "/content/drive/MyDrive/transcription_files"

In [5]:
# Initialize a list to store file data
file_data = []

# Loop through each file in the directory
for file in os.listdir(directory):
    # Check if the file has a .txt extension
    if file.endswith("txt") :
        # Construct the full file path
        file_path = os.path.join(directory, file)
        # Open the file and read its content
        with open(file_path, "r") as f:
            text_doc = f.read()
            # Split the text document into smaller documents
            docs = re.split(r'\n\n',text_doc)
            # Extract the team name from the file name
            team = file.split(".")[0]
            
            # Combine the smaller documents so that each combined document is less than 512 tokens
            combined_docs = []
            current_doc = ""
            for doc in docs:
                if len(current_doc + doc) < 512:
                    current_doc += doc
                else:
                    combined_docs.append(current_doc)
                    current_doc = doc
            combined_docs.append(current_doc)
            
            # Add the combined documents and their corresponding team names to the file_data list
            for i, doc in enumerate(combined_docs):
                file_data.append({'Filenames': team, 'Texts': doc})

# Create a DataFrame from the file_data list
df = pd.DataFrame(file_data)
# Print DataFrame shape and unique file names
print(df.shape)
print(df.Filenames.unique())
# Display the first 10 rows of the DataFrame
df.head(10)

(14, 2)
['March19' 'April25' 'March28' 'April19' 'April16' 'April4' 'Apri2']


Unnamed: 0,Filenames,Texts
0,March19,
1,March19,Today is March 19th. 47. 48. I got it. I apolo...
2,April25,
3,April25,April 25th 2019 47 48 Is that the case that ha...
4,March28,
5,March28,"Let's say. March 28, 2019. 48. 47. Shuffle the..."
6,April19,
7,April19,"April 16, 2019. Say your numbers. 47. 48. So p..."
8,April16,
9,April16,"April 16, 2019. Say your numbers. 47. 48. So p..."


## T5

In [6]:
# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-emotion")
model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-emotion")

Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [7]:
# Create a deep copy of the original DataFrame
df1 = df.copy(deep=True)
# Initialize lists to store emotions and confidence scores
emotions = []
confidence = []

# Loop through each row in the DataFrame
for idx in tqdm(range(len(df1))):
    # Extract the speech text
    speech = df1.iloc[idx,1]
    try:
        # Tokenize the speech and generate an output label
        input_ids = tokenizer.encode_plus(speech, max_length=512, truncation=True, padding='max_length', return_tensors='pt')
        output_label = model.generate(input_ids=input_ids['input_ids'], attention_mask=input_ids['attention_mask'], max_length=2)
        dec = [tokenizer.decode(ids) for ids in output_label]
        label = re.sub('<pad>','',dec[0])
        emotions.append(label.strip())
        # Calculate confidence scores for the emotion prediction
        output = model(input_ids=input_ids['input_ids'], attention_mask=input_ids['attention_mask'], decoder_input_ids=torch.ones((1, 1)).long())
        logits = output.logits.squeeze(0)
        probs = torch.softmax(logits, dim=-1)
        max_value, max_index = torch.max(probs, dim=1)
        score = round(max_value.item(),6)
        confidence.append(score)
    except Exception as e:
        # In case of any errors, append empty string and 0.0 for emotion and confidence, respectively
        emotions.append("")
        confidence.append(0.0)

# Add the predicted emotions and confidence scores to the DataFrame
df1["Emotions_T5"] = emotions
df1['Emotions_T5'].replace({'joy': 'happiness', 'love': 'disgust'}, inplace=True)
df1["Confidence_T5"] = confidence
df['Emotions_T5'] = df1['Emotions_T5']
df["Confidence_T5"] = df1["Confidence_T5"]

# Group the DataFrame by filenames and emotions
df1 = df1.groupby(['Filenames', 'Emotions_T5']).size().reset_index(name='Counts')
# Print the null values, DataFrame shape and unique emotions
print(df.isnull().sum(),df.shape,df1.isnull().sum(),df1.shape)


100%|██████████| 14/14 [01:29<00:00,  6.41s/it]

Filenames        0
Texts            0
Emotions_T5      0
Confidence_T5    0
dtype: int64 (14, 4) Filenames      0
Emotions_T5    0
Counts         0
dtype: int64 (10, 3)





In [8]:

# df['Emotions_T5'].replace({'joy': 'happiness', 'love': 'disgust'}, inplace=True)

In [9]:
df1.Emotions_T5.unique()

array(['fear', 'happiness', 'sadness'], dtype=object)

In [10]:
# Define color scale for emotion categories
color_scale = {
    'disgust': 'whitesmoke',
    'happiness': 'springgreen',
    'surprise': 'torquise',
    'anger': 'crimson',
    'sadness': 'black',
    'fear': 'darkviolet'
}

# Create and display a pie chart of the T5 emotion distribution by filenames
fig = px.pie(df1, values='Counts', names='Emotions_T5', color='Emotions_T5', color_discrete_map=color_scale,
             labels={'Emotions_T5': 'Emotions_T5', 'Counts': 'Counts', 'Filenames': 'Filenames'},
             title='T5 Emotion Distribution by Filenames')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(showlegend=False)
fig.show()


In [11]:
# Create and display a 3D scatter plot of the T5 emotion distribution by filenames
fig = px.scatter_3d(df1, x='Filenames', y='Emotions_T5', z='Counts',size='Counts',
                    color='Emotions_T5',title='T5 Emotion Distribution by Filenames')
fig.show()

In [12]:
# Create and display a 3D scatter plot of the T5 emotion confidence distribution by filenames
fig = px.scatter_3d(df, x='Filenames', y='Emotions_T5', z='Confidence_T5',size='Confidence_T5',
                    color='Emotions_T5',title='T5 Emotion Confidence Distribution by Filenames')
fig.show()

In [13]:
# Create and display a histogram of the T5 emotion distribution by filenames
fig = px.histogram(df, x="Emotions_T5", color="Filenames", title="T5 Emotion Distribution by Filenames",
                   labels={"Emotions_T5": "Emotions", "Counts": "Counts"}, width=800, 
                   category_orders={"Emotions_T5": df['Emotions_T5'].value_counts().index})

fig.update_layout(barmode="group", bargap=0.1, bargroupgap=0, xaxis={'showgrid': False, 'zeroline': False})

fig.show()

## BERT

In [17]:
# Import necessary libraries and models
from transformers import RobertaTokenizerFast, TFRobertaForSequenceClassification, pipeline
from tqdm import tqdm
import tensorflow as tf

# Load tokenizer and model
tokenizer = RobertaTokenizerFast.from_pretrained("arpanghoshal/EmoRoBERTa")
model = TFRobertaForSequenceClassification.from_pretrained("arpanghoshal/EmoRoBERTa")
# Create an emotion classification pipeline
emotion = pipeline('sentiment-analysis', model='arpanghoshal/EmoRoBERTa')


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at arpanghoshal/EmoRoBERTa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at arpanghoshal/EmoRoBERTa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [21]:
# Initialize lists to store emotions and confidence values
emotions = []
confidence = []

# Create a copy of the original dataframe
df1 = df.copy(deep=True)
# Process each text in the dataframe and predict emotions and confidence
for idx in tqdm(range(len(df1))):
    speech = df1.iloc[idx,1]
    # Prepare input data, Perform predictions and store the results
    input_ids = []
    attention_masks = []
    for i in range(0, len(speech), 512):
        chunk = speech[i:i+512]
        encoded = tokenizer.encode_plus(chunk, add_special_tokens=True, max_length=512, truncation=True, padding='max_length', return_attention_mask=True, return_tensors='tf')
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    
    if len(input_ids) > 1:
        input_ids = tf.concat(input_ids, axis=0)
        attention_masks = tf.concat(attention_masks, axis=0)
    elif len(input_ids) == 1:
        input_ids = input_ids[0]
        attention_masks = attention_masks[0]
    else:
        emotions.append("")
        confidence.append(0.0)
        continue

    try:
        outputs = model({'input_ids': input_ids, 'attention_mask': attention_masks})[0]
        predicted_class = tf.argmax(outputs, axis=1).numpy()[0]
        predicted_scores = tf.nn.softmax(outputs, axis=1).numpy()[0]
        emotions.append(model.config.id2label[predicted_class])
        confidence.append(predicted_scores[predicted_class])
    except:
        emotions.append("")
        confidence.append(0.0)
    
# Map the predicted emotions to a unified set
mappings = {
    'neutral' : 'fear',
    'admiration': 'happiness',
    'amusement': 'happiness',
    'anger': 'anger',
    'annoyance': 'anger',
    'approval': 'happiness',
    'caring': 'sadness',
    'confusion': 'surprise',
    'curiosity': 'surprise',
    'desire': 'happiness',
    'disappointment': 'sadness',
    'disapproval': 'anger',
    'disgust': 'disgust',
    'embarrassment': 'disgust',
    'excitement': 'happiness',
    'fear': 'fear',
    'gratitude': 'happiness',
    'grief': 'sadness',
    'joy': 'happiness',
    'love': 'happiness',
    'nervousness': 'fear',
    'optimism': 'happiness',
    'pride': 'happiness',
    'realization': 'surprise',
    'relief': 'happiness',
    'remorse': 'sadness',
    'sadness': 'sadness',
    'surprise': 'surprise'
}

# Update the emotions list with the mapped values
emotions = [*map(mappings.get,emotions)]
# Add the predicted emotions and confidence values to the dataframe
df1["Emotions_BERT"] = np.array(emotions)
df1["Confidence_BERT"] = np.array(confidence)
df['Emotions_BERT'] = df1['Emotions_BERT']
df['Confidence_BERT'] = df1['Confidence_BERT']
# Group the dataframe by filenames, emotions, and confidence values
df1 = df1.groupby(['Filenames','Emotions_BERT','Confidence_BERT']).size().reset_index(name='Counts')
# Check for missing values and dataframe shapes
print(df.isnull().sum(),df.shape,df1.isnull().sum(),df1.shape)

100%|██████████| 14/14 [11:47<00:00, 50.55s/it]

Filenames          0
Texts              0
Emotions_T5        0
Confidence_T5      0
Emotions_BERT      7
Confidence_BERT    0
dtype: int64 (14, 6) Filenames          0
Emotions_BERT      0
Confidence_BERT    0
Counts             0
dtype: int64 (7, 4)





In [23]:
# Define a color scale for visualizations
color_scale = {
    'fear': 'whitesmoke',
    'happiness': 'springgreen',
    'surprise': 'torquise',
    'anger': 'crimson',
    'sadness': 'black',
    'disgust': 'darkviolet'
}

# Create and display a pie chart of the BERT emotion distribution by filenames
fig = px.pie(df1, values='Counts', names='Emotions_BERT', color='Emotions_BERT', 
             color_discrete_map=color_scale,color_discrete_sequence=px.colors.qualitative.Pastel,
             labels={'Emotions_BERT': 'Emotions_BERT', 'Counts': 'Counts', 'Filenames': 'Filenames'},
             title='BERT Emotion Distribution by Filenames')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(showlegend=False)
fig.show()

In [24]:
# Create and display a 3D scatter plot of the BERT emotion distribution by filenames
fig = px.scatter_3d(df1, x='Filenames', y='Emotions_BERT', z='Counts',size='Counts',
                    color='Emotions_BERT',title='BERT Emotion Distribution by Filenames')
fig.show()

In [25]:
# Create and display a 3D scatter plot of the BERT emotion confidence distribution by filenames
fig = px.scatter_3d(df, x='Filenames', y='Emotions_BERT', z='Confidence_BERT',size='Confidence_BERT',
                    color='Emotions_BERT',title='BERT Emotion Confidence Distribution by Filenames')
fig.show()

In [26]:
# Create and display a histogram of the BERT emotion distribution by filenames
fig = px.histogram(df, x="Emotions_BERT", color="Filenames", title="BERT Emotion Distribution by Filenames",
                   labels={"Emotions_BERT": "Emotions", "Counts": "Counts"}, width=800, 
                   category_orders={"Emotions_BERT": df['Emotions_BERT'].value_counts().index})

fig.update_layout(barmode="group", bargap=0.1, bargroupgap=0, xaxis={'showgrid': False, 'zeroline': False})

fig.show()

## XLNet

In [27]:
# Import necessary libraries and models
from transformers import XLNetTokenizer, TFXLNetForSequenceClassification, pipeline

# Load tokenizer and model
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = TFXLNetForSequenceClassification.from_pretrained('xlnet-base-cased')

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/565M [00:00<?, ?B/s]


The initializer TruncatedNormal is unseeded and being called multiple times, which will return identical values each time (even if the initializer is unseeded). Please update your code to provide a seed to the initializer, or avoid using the same initalizer instance more than once.

Some layers from the model checkpoint at xlnet-base-cased were not used when initializing TFXLNetForSequenceClassification: ['lm_loss']
- This IS expected if you are initializing TFXLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFXLNetForSequenceClassification were not initialized from the mo

In [28]:
# Create a copy of the original dataframe
df1 = df.copy(deep=True)
# Initialize lists to store emotions and confidence values
emotions_probs = []
emotion_labels = []

# Process each text in the dataframe and predict emotions and confidence
for idx in tqdm(range(len(df1))):
    speech = df1.iloc[idx,1]
    # Perform predictions and store the results
    try:
        input_ids = tokenizer.encode(speech,add_special_tokens=True,max_length=512,truncation=True,return_tensors='tf')
        output = model(input_ids=input_ids, training=False)
        dec = output.logits
        label = tf.argmax(dec, axis=1).numpy()[0]
        if label==0:
            emotions_probs.append(dec[0][0].numpy())
        else:
            emotions_probs.append(dec[0][1].numpy())
        label_pos_neg = {0:"negetive",1:"positive"}
        emotion_labels.append(label_pos_neg[label])
    except:
        emotion_labels.append("")
        emotions_probs.append(0.0)

# Add the predicted emotions and confidence values to the dataframe
df1["Emotions_XLNet"] = np.array(emotion_labels)
df1["Confidence_XLNet"] = np.array(emotions_probs)
df['Emotions_XLNet'] = df1['Emotions_XLNet']
df['Confidence_XLNet'] = df1['Confidence_XLNet']
# Group the dataframe by filenames, emotions, and confidence values
df1 = df1.groupby(['Filenames','Emotions_XLNet','Confidence_XLNet']).size().reset_index(name='Counts')

# Check for missing values and dataframe shapes
print(df.isnull().sum(),df.shape,df1.isnull().sum(),df1.shape)

100%|██████████| 14/14 [00:35<00:00,  2.56s/it]

Filenames           0
Texts               0
Emotions_T5         0
Confidence_T5       0
Emotions_BERT       7
Confidence_BERT     0
Emotions_XLNet      0
Confidence_XLNet    0
dtype: int64 (14, 8) Filenames           0
Emotions_XLNet      0
Confidence_XLNet    0
Counts              0
dtype: int64 (14, 4)





In [29]:
# Define a color scale for visualizations
color_scale = {
    'positive': 'whitesmoke',
    'negative': 'crimson'
}

# Create and display a pie chart of the XLNet emotion detection confidence based on each emotion category
fig = px.pie(df1, values='Counts', names='Emotions_XLNet', color='Emotions_XLNet', color_discrete_map=color_scale,
             labels={'Emotions_XLNet': 'Emotions_XLNet', 'Counts': 'Counts', 'Confidence_XLNet': 'Confindence_XLNet'},
             title='XLNet Emotion Detection Confidance based on Each Emotion Category')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(showlegend=False)
fig.show()

In [30]:
# Create and display a 3D scatter plot of the XLNet emotion distribution by filenames
fig = px.scatter_3d(df1, x='Filenames', y='Emotions_XLNet', z='Counts',size='Counts',
                    color='Emotions_XLNet',title='XLNet Emotion Distribution by Filenames')
fig.show()

In [31]:
# Create and display a 3D scatter plot of the XLNet emotion confidence distribution by filenames
fig = px.scatter_3d(df, x='Filenames', y='Emotions_XLNet', z='Confidence_XLNet',
                    color='Emotions_XLNet',title='XLNet Emotion Confidence Distribution by Filenames')
fig.show()

In [32]:
# Create and display a histogram of the XLNet emotion distribution by filenames
fig = px.histogram(df, x="Emotions_XLNet", color="Filenames", title="XLNet Emotion Distribution by Filenames",
                   labels={"Emotions_XLNet": "Emotions", "Counts": "Counts"}, width=800, 
                   category_orders={"Emotions_XLNet": df['Emotions_XLNet'].value_counts().index})

fig.update_layout(barmode="group", bargap=0.1, bargroupgap=0, xaxis={'showgrid': False, 'zeroline': False})

fig.show()

In [33]:
# Save the final dataframe to an Excel file
df.to_excel('final_dataframe.xlsx', index=False)
# Read the saved Excel file and display the last 15 rows
df2 = pd.read_excel("final_dataframe.xlsx")
df2.tail(15)

Unnamed: 0,Filenames,Texts,Emotions_T5,Confidence_T5,Emotions_BERT,Confidence_BERT,Emotions_XLNet,Confidence_XLNet
0,March19,,fear,0.788205,,0.0,negetive,-0.124668
1,March19,Today is March 19th. 47. 48. I got it. I apolo...,sadness,0.969682,surprise,0.951585,negetive,-0.012146
2,April25,,fear,0.788205,,0.0,negetive,-0.124668
3,April25,April 25th 2019 47 48 Is that the case that ha...,happiness,0.972224,happiness,0.904033,negetive,0.091641
4,March28,,fear,0.788205,,0.0,negetive,-0.124668
5,March28,"Let's say. March 28, 2019. 48. 47. Shuffle the...",happiness,0.995478,fear,0.998229,negetive,0.150446
6,April19,,fear,0.788205,,0.0,negetive,-0.124668
7,April19,"April 16, 2019. Say your numbers. 47. 48. So p...",fear,0.984622,surprise,0.996351,negetive,-0.00248
8,April16,,fear,0.788205,,0.0,negetive,-0.124668
9,April16,"April 16, 2019. Say your numbers. 47. 48. So p...",fear,0.984622,surprise,0.996351,negetive,-0.00248


In [34]:
# Create and display a 3D scatter plot of all emotion distributions by filenames
fig = px.scatter_3d(df, x='Emotions_T5', y='Emotions_XLNet', z='Emotions_BERT',
                    color='Filenames',title='All Emotion Distribution by Filenames')
fig.show()

In [35]:
#