# Imports and Functions

In [1]:
import os, re, time, pickle, collections, importlib, datetime, torch
import pandas as pd, numpy as np, pickle

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data (if you haven't already)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from chardet import detect
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import defaultdict, Counter
from wordebd import WORDEBD
from vocab import Vocab, Vectors
from munch import Munch
from cnnlstmseq import CNNLSTMseq
%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def get_encoding_type(file):
    with open(file, 'rb') as f:
        rawdata = f.read(),
    return detect(rawdata['encoding'])

def detect_misspelling(source):
    pass
def replace_spelling(source):
    return re.sub("Åf", "'", source)

In [3]:
# referenced from DialogueGCN, mastodon code
def preprocess_text(x):
    for punct in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\'':
        x = x.replace(punct, ' ')
    x = ' '.join(x.split())
    x = x.lower()

    return x

def load_pretrained_glove():
    print("Loading GloVe...")
    glv_vector = {}
    f = open('/embed/glove/glove.840B.300d.txt', encoding='utf-8')
    for line in f:
        values = line.split()
        word, coefs = values[0], np.asarray(values[1:], dtype='float')
        try:
            glv_vector[word] = coefs

        except ValueError:
            continue
    f.close()
    start_time = time.time()
    print(f"Took {time.time() - start_time} seconds to load pretrained GloVe model.")
    return glv_vector


def encode_labels(encoder, l):
    return encoder[l]

def load_data_from_npy(file_path):
    try:
        data = np.load(file_path, allow_pickle=True)
        if isinstance(data, np.ndarray):
            if data.ndim == 2:
                return pd.DataFrame(data)
            else:
                raise ValueError("The loaded array is not two-dimensional.")
        else:
            raise TypeError("The loaded object is not a NumPy array.")
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except Exception as e:
        print(f"Error: An exception occurred - {str(e)}")
        return None

def _read_words(data, convmode=None):
    '''    
        Count the occurrences of all words
        @param convmode: str, None for non conversational scope, 'naive' for classic or naive approach, 'conv' for conversation depth into account (one additional dim and nested values)
        @param data: list of examples
        @return words: list of words (with duplicates)
    '''    
    words = []
    if convmode is None:
        for example in data:
            words += example.split()     
    return words

def _data_to_nparray(data, vocab, args):
    '''
        Convert the data into a dictionary of np arrays for speed.
    '''
    raw = np.array([e for e in data["Utterance"]], dtype=object)
    doc_label = np.array([x for x in data["Emotion"]], dtype=np.int64)

    # compute the max text length
    text_len = np.array([len(e) for e in data["Utterance"]])
    max_text_len = max(text_len)
    ids = np.array([e for e in data['Dialogue_ID']])
    ids2 = np.array([e for e in data['Utterance_ID']])

    # initialize the big numpy array by <pad>
    text = vocab.stoi['<pad>'] * np.ones([len(data), max_text_len],
                                     dtype=np.int64)

    del_idx = []
    # convert each token to its corresponding id
    for i in range(len(X_train)):
        text[i, :len(X_train['Utterance'][i])] = [vocab.stoi[x] if x in vocab.stoi else vocab.stoi['<unk>']
                for x in X_train['Utterance'][i]]

        # filter out document with only unk and pad
        if np.max(text[i]) < 2:
            del_idx.append(i)

    vocab_size = vocab.vectors.size()[0]


    ## Curation for padding (string instead of list of list)
    raw = [ ["<pad>" if m == ["<pad>", "<pad>", "<pad>", "<pad>", "<pad>"] else m for m in c ] for c in raw ]

    ids, ids2, text_len, text, doc_label, raw = _del_by_idx( [ids, ids2, text_len, text, doc_label, raw], del_idx, 0)
    new_data = {
        'ids': ids,
        'ids2': ids2,
        'text': text,
        'text_len': text_len,
        'label': doc_label,
#         'raw': raw,
        'vocab_size': vocab_size,
    }

    return new_data

def _del_by_idx(array_list, idx, axis):

    '''        
        Delete the specified index for each array in the array_lists",

        @params: array_list: list of np arrays
        @params: idx: list of int
        @params: axis: int

        @return: res: tuple of pruned np arrays
    '''
    if type(array_list) is not list:
        array_list = [array_list]

    # modified to perform operations in place
    for i, array in enumerate(array_list):
        array_list[i] = np.delete(array, idx, axis)

    if len(array_list) == 1:
        return array_list[0]
    else:
        return array_list

def find_value_ranges(lst):
    value_ranges = []
    start_index = 0

    for i in range(1, len(lst)):
        if lst[i] != lst[i - 1]:
            value_ranges.append((start_index, i - 1))
            start_index = i

    # Add the last range
    value_ranges.append((start_index, len(lst) - 1))

    return value_ranges


In [4]:
os.listdir()

['.git',
 '.gitignore',
 'cnnlstmseq.py',
 'context_encoder - with classifier.ipynb',
 'context_encoder.ipynb',
 'data',
 'embed',
 'emotionClassifier.ipynb',
 'GAT.py',
 'graph.html',
 'lib',
 'LICENSE',
 'README.md',
 'relationtype_encoder.ipynb',
 'runs',
 'utils',
 'vocab.py',
 'wordebd.py',
 '__pycache__']

# Data Preparation

## Initial data cleaning

Get X_train via the CSV file in data

In [5]:
X_train = pd.read_csv('data/train_sent_emo_dya.csv', encoding='MacRoman')
X_train.head()

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Old_Dialogue_ID,Old_Utterance_ID,Season,Episode,StartTime,EndTime
0,also I was the point person on my companyÅfs t...,Chandler,neutral,neutral,0,0,0,0,8,21,"00:16:16,059","00:16:21,731"
1,You mustÅfve had your hands full.,The Interviewer,neutral,neutral,0,1,0,1,8,21,"00:16:21,940","00:16:23,442"
2,That I did. That I did.,Chandler,neutral,neutral,0,2,0,2,8,21,"00:16:23,442","00:16:26,389"
3,So letÅfs talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,0,3,8,21,"00:16:26,820","00:16:29,572"
4,My duties? All right.,Chandler,surprise,positive,0,4,0,4,8,21,"00:16:34,452","00:16:40,917"


Drop Old_Dialogue_ID, Old_Utterance_ID, Season, Episode, StarTime, and EndTime

In [6]:
drop_features = list(X_train.keys()[6:])
X_train = X_train.drop(drop_features, axis=1)

y_train = pd.DataFrame()
y_train["Emotion"] = X_train["Emotion"].copy()
y_train["Dialogue_ID"] = X_train["Dialogue_ID"].copy()
X_train.head()

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID
0,also I was the point person on my companyÅfs t...,Chandler,neutral,neutral,0,0
1,You mustÅfve had your hands full.,The Interviewer,neutral,neutral,0,1
2,That I did. That I did.,Chandler,neutral,neutral,0,2
3,So letÅfs talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3
4,My duties? All right.,Chandler,surprise,positive,0,4


In [7]:
y_train.head()

Unnamed: 0,Emotion,Dialogue_ID
0,neutral,0
1,neutral,0
2,neutral,0
3,neutral,0
4,surprise,0


In [8]:
X_train["Utterance"] = X_train["Utterance"].apply(lambda x: replace_spelling(x))
X_train.head()

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID
0,also I was the point person on my company's tr...,Chandler,neutral,neutral,0,0
1,You must've had your hands full.,The Interviewer,neutral,neutral,0,1
2,That I did. That I did.,Chandler,neutral,neutral,0,2
3,So let's talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3
4,My duties? All right.,Chandler,surprise,positive,0,4


Check if there are null values

In [9]:
print(X_train.isnull().sum())

Utterance       0
Speaker         0
Emotion         0
Sentiment       0
Dialogue_ID     0
Utterance_ID    0
dtype: int64


### Standardizing Text

In [10]:
def standardize_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove leading and trailing spaces
    text = text.strip()

    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    return text

# Apply the standardize_text function to the 'Utterance' column
X_train['Utterance'] = X_train['Utterance'].apply(standardize_text)
X_train.head()

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID
0,also i was the point person on my company's tr...,Chandler,neutral,neutral,0,0
1,you must've had your hands full.,The Interviewer,neutral,neutral,0,1
2,that i did. that i did.,Chandler,neutral,neutral,0,2
3,so let's talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3
4,my duties? all right.,Chandler,surprise,positive,0,4


### Lemmatizer

In [11]:
# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to expand contractions
def expand_contractions(text):
    # Dictionary of English contractions
    contractions_dict = {
        "n't": " not",
        "'re": " are",
        "'s": " is",
        "'d": " would",
        "'ll": " will",
        "'ve": " have",
        "'m": " am"
    }
    # Regular expression for finding contractions
    contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))
    
    def replace(match):
        return contractions_dict[match.group(0)]
    
    return contractions_re.sub(replace, text)

# Function to preprocess text data
def preprocess_text(text):
    # Expand contractions in the text
    text = expand_contractions(text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Lemmatize words
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Re-join tokens into a string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Apply the preprocess_text function to the 'Utterance' column
New_Utterances = X_train['Utterance'].apply(preprocess_text)

In [12]:
New_Utterances.head()

0    also point person company transition kl5 gr6 s...
1                                       must hand full
2                                                     
3                             let talk little bit duty
4                                           duty right
Name: Utterance, dtype: object

In [13]:
X_train['Utterance'] = New_Utterances

In [14]:
X_train.head()

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID
0,also point person company transition kl5 gr6 s...,Chandler,neutral,neutral,0,0
1,must hand full,The Interviewer,neutral,neutral,0,1
2,,Chandler,neutral,neutral,0,2
3,let talk little bit duty,The Interviewer,neutral,neutral,0,3
4,duty right,Chandler,surprise,positive,0,4


### Changing labels per speaker

In [15]:
def label_speakers_by_order(df):
    # Create an empty column for the new speaker labels
    df['Speaker_Label'] = None
    
    # Iterate through each unique dialogue
    for dialogue_id in df['Dialogue_ID'].unique():
        # Filter the DataFrame by the current dialogue_id
        dialogue_df = df[df['Dialogue_ID'] == dialogue_id]
        
        # Get the unique speakers in order of appearance
        speakers_order = dialogue_df['Speaker'].unique()
        
        # Assign labels to speakers based on their order
        speaker_labels = {speaker: idx for idx, speaker in enumerate(speakers_order)}
        
        # Apply the labels to the 'Speaker_Label' column
        for speaker in speaker_labels:
            df.loc[(df['Dialogue_ID'] == dialogue_id) & (df['Speaker'] == speaker), 'Speaker_Label'] = speaker_labels[speaker]
    
    return df

# Apply the function to your DataFrame
X_train = label_speakers_by_order(X_train)

Random check

In [16]:
X_train[X_train['Dialogue_ID'] == 25]

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Speaker_Label
186,guess,Charlie,sadness,negative,25,0,0
187,hum another reason thought time end joey,Charlie,neutral,neutral,25,1,0
188,started realize feeling someone else,Charlie,neutral,neutral,25,2,0
189,ok geller last day conference know happens key...,Paleontologist,neutral,neutral,25,3,1


Drop speaker

In [17]:
X_train = X_train.drop('Speaker', axis=1)

In [18]:
X_train.head()

Unnamed: 0,Utterance,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Speaker_Label
0,also point person company transition kl5 gr6 s...,neutral,neutral,0,0,0
1,must hand full,neutral,neutral,0,1,1
2,,neutral,neutral,0,2,0
3,let talk little bit duty,neutral,neutral,0,3,1
4,duty right,surprise,positive,0,4,0


### Check balance of emotion data

In [19]:
X_train['Emotion'].value_counts()

Emotion
neutral     5960
joy         2312
anger       1500
surprise    1490
sadness      876
disgust      364
fear         338
Name: count, dtype: int64

## Getting label encoders

In [20]:
File1Exists = os.path.isfile("data/dump/label_encoder.pkl")
File2Exists = os.path.isfile("data/dump/label_decoder.pkl")
MakeNewFiles = False

if not(File1Exists) or not(File2Exists) or MakeNewFiles is True:
    print("Files do not exist: Making label_encoder and label_decoder files")
    labels = y_train["Emotion"].unique()
    label_encoder = {label: i for i, label in enumerate(labels)}
    label_decoder = {i: label for i, label in enumerate(labels)}

    pickle.dump(label_encoder, open('data/dump/label_encoder.pkl', 'wb'))
    pickle.dump(label_decoder, open('data/dump/label_decoder.pkl', 'wb'))
    
else:
    print("Files exist: opening label_encoder and label_decoder")
    file1 = open('data/dump/label_encoder.pkl', 'rb')
    file2 = open('data/dump/label_decoder.pkl', 'rb')
    label_encoder = pickle.load(file1)
    label_decoder = pickle.load(file2)
    file1.close()
    file2.close()

Files exist: opening label_encoder and label_decoder


Get the label encoders for each emotion

In [21]:
y_train["Emotion"] = y_train["Emotion"].apply(lambda x: encode_labels(label_encoder, x))

In [22]:
# Print the label_encoder dictionary to see the mapping
print("Label Encoder Mapping (Emotion -> Encoded Label):")
for label, encoded_label in label_encoder.items():
    print(f"{label}: {encoded_label}")

Label Encoder Mapping (Emotion -> Encoded Label):
neutral: 0
surprise: 1
fear: 2
sadness: 3
joy: 4
disgust: 5
anger: 6


From 0 to 14, the Dialogue ID is 0. Emotions are shown as label encoders

In [23]:
y_train[:15]

Unnamed: 0,Emotion,Dialogue_ID
0,0,0
1,0,0
2,0,0
3,0,0
4,1,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


Apply the encoded emotion in X_train

In [24]:
X_train["Emotion"] = y_train["Emotion"].copy()
X_train.head()

Unnamed: 0,Utterance,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Speaker_Label
0,also point person company transition kl5 gr6 s...,0,neutral,0,0,0
1,must hand full,0,neutral,0,1,1
2,,0,neutral,0,2,0
3,let talk little bit duty,0,neutral,0,3,1
4,duty right,1,positive,0,4,0


In [25]:
## save the data in pickle format ##
checkFile = os.path.isfile("data/dump/train_labels.pkl")
if checkFile is False:
    X_train["Emotion"]

    pickle.dump(X_train["Emotion"],
                open('data/dump/train_labels.pkl', 'wb'))

In [26]:
X_train["Emotion"]

0        0
1        0
2        0
3        0
4        1
        ..
12835    5
12836    5
12837    1
12838    0
12839    4
Name: Emotion, Length: 12840, dtype: int64

## Importing Word2Vec

In [27]:
file_path = os.path.join(os.getcwd(), "data/wiki-news-300d-1M.vec")
if os.path.isfile(file_path):
    print(f"{file_path} exists")
else:
    print(f"The file does not exist in the current directory.")

c:\School Files\23-24 T2\THS-ST2\ER_GAT\data/wiki-news-300d-1M.vec exists


In [28]:
vectors = Vectors(name="wiki-news-300d-1M.vec", url="data/", cache="data/")
vectors.cache(name="data/wiki-news-300d-1M.vec", url="data/", cache="data/")

In [29]:
print(vectors.vectors.shape)

torch.Size([999994, 300])


In [30]:
vocab = Vocab(counter=collections.Counter(_read_words(X_train.Utterance)),
                  vectors=vectors,
                  specials=['<pad>', '<unk>'],
                  min_freq=5)

In [31]:
# print word embedding statistics 
wv_size = vocab.vectors.size() 
print('Total num. of words: {}, word vector dimension: {}'.format( 
   wv_size[0], 
   wv_size[1]))

Total num. of words: 1403, word vector dimension: 300


# Creating an embedding

In [32]:
ebd = WORDEBD(vocab, False)
ebd

WORDEBD(
  (embedding_layer): Embedding(1403, 300)
)

In [33]:
args = Munch({
    "cnn_filter_sizes":[3,4,5],
    "cnn_num_filters":100,
    "cuda":-1,
    "mode":"train",
    "snapshot":'',
})

In [34]:
model = CNNLSTMseq(ebd, args) # ProtoSeq

In [35]:
print("{}, Building embedding".format(
    datetime.datetime.now().strftime('%y/%m/%d %H:%M:%S')), flush=True),
if args.snapshot != '':
    if args.multitask:
        print("{}, Loading pretrained embedding from {}".format(
            datetime.datetime.now().strftime('%y/%m/%d %H:%M:%S'),
            '%s_%s.ebd' % (args.snapshot, args.task),
            ))
        model.load_state_dict(  torch.load( '%s_%s.ebd' % (args.snapshot, args.task) ), strict=False  )
    
    # Load pretrained models,
    else:   
        print("{}, Loading pretrained embedding from {}".format(
            datetime.datetime.now().strftime('%y/%m/%d %H:%M:%S'),
            '{}.ebd'.format(args.snapshot)
            ))
        model.load_state_dict(  torch.load( '{}.ebd'.format(args.snapshot) ), strict=False  )

24/03/07 23:50:51, Building embedding


In [36]:
model.train()

CNNLSTMseq(
  (ebd): WORDEBD(
    (embedding_layer): Embedding(1403, 300)
  )
  (convs): ModuleList(
    (0): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
    (2): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  )
  (lstm): LSTM(300, 150, bidirectional=True)
)

In [37]:
model.eval()

CNNLSTMseq(
  (ebd): WORDEBD(
    (embedding_layer): Embedding(1403, 300)
  )
  (convs): ModuleList(
    (0): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
    (2): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  )
  (lstm): LSTM(300, 150, bidirectional=True)
)

# Updated Representation

In [38]:
ranges = find_value_ranges(X_train["Dialogue_ID"])

In [68]:
updated_representations = [] 
 
checkFile = os.path.isfile("embed/updated_representation_list.pkl") 
 
if checkFile is False: 
    for range_pair in ranges: 
        start_idx, end_idx = range_pair 
        conversation = X_train['Utterance'][start_idx:end_idx+1] 
 
        turn_indices = [torch.tensor([vocab.stoi[word] if word in vocab.stoi else vocab.stoi['<unk>'] for word in turn], dtype=torch.long) 
                    for turn in conversation] 
        max_seq_len = max(max(len(turn), 5) for turn in turn_indices) 
        padded_turns = [torch.nn.functional.pad(turn, pad=(0, max_seq_len - len(turn))) for turn in turn_indices] 
 
        # Stack the padded turns along a new dimension 
        batched_input = torch.stack(padded_turns) 
        input_data = {'Utterance': batched_input} 
        output_representation = model(input_data) 
 
        updated_representations.append(output_representation) 
     
     
    file_path = 'embed/updated_representation_list.pkl' 
    # Save the list to a file using pickle 
    with open(file_path, 'wb') as file: 
        pickle.dump(updated_representations, file) 
     
else: 
    file_path = 'embed/updated_representation_list.pkl' 
 
    # Load the list from the file using pickle 
    with open(file_path, 'rb') as file: 
        updated_representations = pickle.load(file)

In [69]:
updated_representations.__sizeof__()

19528

idk what this is for

In [41]:
# checkFile = False
# encoded_speaker_list = [] 
# if checkFile is False: 
#     for range_pair in ranges: 
#         start_idx, end_idx = range_pair 
#         speaker_per_dialog = X_train['Speaker'][start_idx:end_idx+1].copy() 
#         speaker_feature = set(speaker_per_dialog) 
#         speaker_encoder = {feature: i for i, feature in enumerate(speaker_feature)} 
#         speaker_decoder = {i: feature for i, feature in enumerate(speaker_feature)} 

 
#         encoded_speaker = speaker_per_dialog.replace(speaker_encoder) 
#         encoded_speaker_list.append(encoded_speaker) 
 
#     file_path = 'data/dump/speaker_encoder.pkl' 
#     with open(file_path, 'wb') as file: 
#         pickle.dump([encoded_speaker_list, ranges], file) 

# else: 
#     file = open('data/dump/speaker_encoder.pkl',  "rb") 
#     encoded_speaker_list = pickle.load(file) 
#     file.close()

In [70]:
flattened_representation = torch.cat(updated_representations, dim=0)
flattened_representation.shape

torch.Size([12840, 300])

In [71]:
type(flattened_representation)

torch.Tensor

In [44]:
y_train.shape

(12840, 2)

In [45]:
y_train.head()

Unnamed: 0,Emotion,Dialogue_ID
0,0,0
1,0,0
2,0,0
3,0,0
4,1,0


Save Torch Tensor Data

In [46]:
torch.save(flattened_representation, 'data/train_test_data/x_train_contextencoder.pt')

Save y_train data as csv

In [47]:
# Save y_train DataFrame to a CSV file
y_train.to_csv('data/train_test_data/y_train_contextencoder.csv', index=False)

# Testing Data

In [48]:
# Get the testing data
X_test = pd.read_csv('data/test_sent_emo_dya.csv', encoding='MacRoman')

# Drop extra features
drop_features = list(X_test.keys()[6:])
drop_features
X_test = X_test.drop(drop_features, axis=1)

# Make a y_test df
y_test = pd.DataFrame()
y_test["Emotion"] = X_test["Emotion"].copy()
y_test["Dialogue_ID"] = X_test["Dialogue_ID"].copy()

# Fix spelling
X_test["Utterance"] = X_test["Utterance"].apply(lambda x: replace_spelling(x))

In [49]:
X_test.head()

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID
0,Why do all you‚Äôre coffee mugs have numbers o...,Mark,surprise,positive,0,0
1,Oh. That‚Äôs so Monica can keep track. That wa...,Rachel,anger,negative,0,1
2,Y'know what?,Rachel,neutral,neutral,0,2
3,Okay.,Ross,neutral,neutral,1,0
4,"Ross, didn't you say that there was an elevato...",Rachel,neutral,neutral,1,1


In [50]:
y_test.head()

Unnamed: 0,Emotion,Dialogue_ID
0,surprise,0
1,anger,0
2,neutral,0
3,neutral,1
4,neutral,1


In [51]:
print(X_test.isnull().sum())

Utterance       0
Speaker         0
Emotion         0
Sentiment       0
Dialogue_ID     0
Utterance_ID    0
dtype: int64


In [52]:
# Apply the standardize_text function to the 'Utterance' column
X_test['Utterance'] = X_test['Utterance'].apply(standardize_text)

# Apply the preprocess_text function to the 'Utterance' column
X_test['Utterance'] = X_test['Utterance'].apply(preprocess_text)

# Change labeled speakers
X_test = label_speakers_by_order(X_test)

# Drop Speaker
X_test = X_test.drop('Speaker', axis=1)

X_test.head()

Unnamed: 0,Utterance,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Speaker_Label
0,youäôre coffee mug number bottom,surprise,positive,0,0,0
1,oh thatäôs monica keep track way one missing l...,anger,negative,0,1,1
2,yknow,neutral,neutral,0,2,1
3,okay,neutral,neutral,1,0,0
4,ross say elevator,neutral,neutral,1,1,1


Random check

In [53]:
X_test[X_test['Dialogue_ID'] == 35]

Unnamed: 0,Utterance,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Speaker_Label
117,whatäôs guy fighting really room mean silly,anger,negative,35,0,0
118,ross handle,neutral,neutral,35,1,1
119,well apparently canäôt stand watch two people ...,sadness,negative,35,2,0
120,mean enough silliness,anger,negative,35,3,0
121,okayokay,anger,negative,35,4,0
122,two good point look iäôve known long time iäôv...,sadness,negative,35,5,0
123,really want throw away room,sadness,negative,35,6,0
124,silly,joy,positive,35,7,0
125,whwhat important love silliness,neutral,neutral,35,8,0
126,mon wan na live chandler donäôt ya,neutral,neutral,35,9,0


Checking balance of testing data

In [54]:
X_test['Emotion'].value_counts()

Emotion
neutral     1615
anger        516
joy          495
surprise     352
sadness      263
disgust       99
fear          60
Name: count, dtype: int64

Apply label encoders from before to y_test

In [55]:
y_test["Emotion"] = y_test["Emotion"].apply(lambda x: encode_labels(label_encoder, x))

print("Label Encoder Mapping (Emotion -> Encoded Label):")
for label, encoded_label in label_encoder.items():
    print(f"{label}: {encoded_label}")

Label Encoder Mapping (Emotion -> Encoded Label):
neutral: 0
surprise: 1
fear: 2
sadness: 3
joy: 4
disgust: 5
anger: 6


In [56]:
y_test[:15]

Unnamed: 0,Emotion,Dialogue_ID
0,1,0
1,6,0
2,0,0
3,0,1
4,0,1
5,3,1
6,1,1
7,6,1
8,6,1
9,6,1


In [57]:
X_test["Emotion"] = y_test["Emotion"].copy()
X_test.head()

Unnamed: 0,Utterance,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Speaker_Label
0,youäôre coffee mug number bottom,1,positive,0,0,0
1,oh thatäôs monica keep track way one missing l...,6,negative,0,1,1
2,yknow,0,neutral,0,2,1
3,okay,0,neutral,1,0,0
4,ross say elevator,0,neutral,1,1,1


## Getting updated representation for the Testing Data

In [62]:
ranges = find_value_ranges(X_test["Dialogue_ID"])

In [74]:
updated_representations = [] 
 
checkFile = os.path.isfile("embed/updated_representation_list_testingdata.pkl") 
 
if checkFile is False: 
    for range_pair in ranges: 
        start_idx, end_idx = range_pair 
        conversation = X_test['Utterance'][start_idx:end_idx+1] 
 
        turn_indices = [torch.tensor([vocab.stoi[word] if word in vocab.stoi else vocab.stoi['<unk>'] for word in turn], dtype=torch.long) 
                    for turn in conversation] 
        max_seq_len = max(max(len(turn), 5) for turn in turn_indices) 
        padded_turns = [torch.nn.functional.pad(turn, pad=(0, max_seq_len - len(turn))) for turn in turn_indices] 
 
        # Stack the padded turns along a new dimension 
        batched_input = torch.stack(padded_turns) 
        input_data = {'Utterance': batched_input} 
        output_representation = model(input_data) 
 
        updated_representations.append(output_representation) 
     
     
    file_path = 'embed/updated_representation_list_testingdata.pkl' 
    # Save the list to a file using pickle 
    with open(file_path, 'wb') as file: 
        pickle.dump(updated_representations, file) 
     
else: 
    file_path = 'embed/updated_representation_list_testingdata.pkl' 
 
    # Load the list from the file using pickle 
    with open(file_path, 'rb') as file: 
        updated_representations = pickle.load(file)

In [75]:
updated_representations.__sizeof__()

4680

In [76]:
flattened_representation = torch.cat(updated_representations, dim=0)
print(flattened_representation.shape)
print(type(flattened_representation))

torch.Size([3400, 300])
<class 'torch.Tensor'>


In [77]:
y_test.shape

(3400, 2)

In [80]:
torch.save(flattened_representation, 'data/train_test_data/x_test_contextencoder.pt')

In [81]:
# Save y_train DataFrame to a CSV file
y_test.to_csv('data/train_test_data/y_test_contextencoder.csv', index=False)