In [1]:
#MLP on Complete Dataset

In [2]:
import os
import json
import pandas as pd

def read_json_file(file_path):
    """Reads a JSON file and returns the data."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        print(f"Successfully loaded data from {file_path}")
        return data
    except Exception as e:
        print(f"Failed to load data from {file_path}: {e}")
        return None

def extract_data_to_dataframe(data):
    """Extracts video name and emotion from the JSON data structure and returns a DataFrame."""
    video_names = []
    emotions = []
    
    if not data:
        print("No data provided to extract.")
        return pd.DataFrame()
    
    try:
        for conversation in data:
            for utterance in conversation['conversation']:
                video_name = utterance['video_name']
                emotion = utterance['emotion']
                video_names.append(video_name)
                emotions.append(emotion)
        df = pd.DataFrame({
            'video_name': video_names,
            'emotion': emotions
        })
        print("Data extraction successful.")
        return df
    except Exception as e:
        print(f"Error during data extraction: {e}")
        return pd.DataFrame()

def save_to_csv(df, file_name, directory):
    """Saves the DataFrame to a CSV file in the specified directory."""
    # Ensure the directory exists, create if it does not
    if not os.path.exists(directory):
        os.makedirs(directory)
    full_path = os.path.join(directory, file_name)
    
    try:
        df.to_csv(full_path, index=False)
        print(f"Data successfully saved to {full_path}")
    except Exception as e:
        print(f"Failed to save data to {full_path}: {e}")

def main():
    # Define paths
    train_file_path = '/project/msoleyma_1026/ecp-cause-pair/json_files/train.json'  # Update with the actual path
    test_file_path = '/project/msoleyma_1026/ecp-cause-pair/json_files/test.json'    # Update with the actual path
    save_directory = '/project/msoleyma_1026/ecp-cause-pair/script_mlp/assets'    # Define the directory to save CSV files
    
    # Load JSON data
    train_data = read_json_file(train_file_path)
    test_data = read_json_file(test_file_path)
    
    # Extract data and create DataFrames
    train_df = extract_data_to_dataframe(train_data)
    test_df = extract_data_to_dataframe(test_data)
    
    # Combine train and test datasets
    combined_df = pd.concat([train_df, test_df], ignore_index=True)
    
    # Save to CSV files in the specified directory
    save_to_csv(train_df, 'train.csv', save_directory)
    save_to_csv(test_df, 'test.csv', save_directory)
    save_to_csv(combined_df, 'video_emotion_combined.csv', save_directory)

if __name__ == "__main__":
    main()

Successfully loaded data from /project/msoleyma_1026/ecp-cause-pair/json_files/train.json
Successfully loaded data from /project/msoleyma_1026/ecp-cause-pair/json_files/test.json
Data extraction successful.
Data extraction successful.
Data successfully saved to /project/msoleyma_1026/ecp-cause-pair/script_mlp/assets/train.csv
Data successfully saved to /project/msoleyma_1026/ecp-cause-pair/script_mlp/assets/test.csv
Data successfully saved to /project/msoleyma_1026/ecp-cause-pair/script_mlp/assets/video_emotion_combined.csv


In [4]:
#loading concated train and test dataset from "split" folder
import os
import numpy as np

# train and test directories
train_dir = '/project/msoleyma_1026/ecp-cause-pair/concatenated_embedding/train_emotion'
test_dir = '/project/msoleyma_1026/ecp-cause-pair/concatenated_embedding/test_emotion'

# Function to load all .npy files from a directory
def load_embeddings(directory):
    embeddings = []
    for filename in os.listdir(directory):
        if filename.endswith('.npy'):
            file_path = os.path.join(directory, filename)
            embedding = np.load(file_path)
            embeddings.append(embedding)
            print(f'Loaded {filename}')
    return embeddings

# Load the embeddings
train_embeddings = load_embeddings(train_dir)
test_embeddings = load_embeddings(test_dir)

# Print the number of loaded embeddings
print(f'Loaded {len(train_embeddings)} train embeddings')
print(f'Loaded {len(test_embeddings)} test embeddings')

Loaded dia116utt4_concatenated.npy
Loaded dia61utt2_concatenated.npy
Loaded dia644utt8_concatenated.npy
Loaded dia21utt8_concatenated.npy
Loaded dia983utt2_concatenated.npy
Loaded dia607utt15_concatenated.npy
Loaded dia860utt4_concatenated.npy
Loaded dia690utt4_concatenated.npy
Loaded dia530utt12_concatenated.npy
Loaded dia917utt4_concatenated.npy
Loaded dia182utt2_concatenated.npy
Loaded dia409utt11_concatenated.npy
Loaded dia773utt2_concatenated.npy
Loaded dia1279utt5_concatenated.npy
Loaded dia1031utt12_concatenated.npy
Loaded dia38utt6_concatenated.npy
Loaded dia1097utt5_concatenated.npy
Loaded dia1108utt11_concatenated.npy
Loaded dia1134utt9_concatenated.npy
Loaded dia1203utt12_concatenated.npy
Loaded dia1003utt3_concatenated.npy
Loaded dia168utt16_concatenated.npy
Loaded dia509utt4_concatenated.npy
Loaded dia1220utt1_concatenated.npy
Loaded dia389utt12_concatenated.npy
Loaded dia149utt3_concatenated.npy
Loaded dia1091utt6_concatenated.npy
Loaded dia1351utt2_concatenated.npy
Loade

In [5]:


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


emotions_csv_path = '/project/msoleyma_1026/ecp-cause-pair/script_mlp/assets/video_emotion_combined.csv'
emotions_df = pd.read_csv(emotions_csv_path)
emotion_dict = {row['video_name'].split('.')[0]: row['emotion'] for _, row in emotions_df.iterrows()}

def map_embeddings_to_emotions(file_names, emotion_dict):
    emotions = []
    for file_name in file_names:
        # Extract the video name without the file extension
        video_name = file_name.split('_concatenated')[0]
        # Map the video name to its corresponding emotion
        emotion = emotion_dict.get(video_name, 'Unknown')
        emotions.append(emotion)
    return emotions

# Function to load all .npy files from a directory and return embeddings with their file names
def load_embeddings_and_names(directory):
    embeddings = []
    names = []
    for filename in os.listdir(directory):
        if filename.endswith('.npy'):
            file_path = os.path.join(directory, filename)
            embedding = np.load(file_path)
            embeddings.append(embedding)
            names.append(filename.split('_concatenated.npy')[0])
            print(f'Loaded {filename}')
    return embeddings, names

# Load the embeddings and their file names
train_embeddings, train_files = load_embeddings_and_names(train_dir)
test_embeddings, test_files = load_embeddings_and_names(test_dir)

# map the file names to emotions
Y_train = map_embeddings_to_emotions(train_files, emotion_dict)
Y_test = map_embeddings_to_emotions(test_files, emotion_dict)


Loaded dia116utt4_concatenated.npy
Loaded dia61utt2_concatenated.npy
Loaded dia644utt8_concatenated.npy
Loaded dia21utt8_concatenated.npy
Loaded dia983utt2_concatenated.npy
Loaded dia607utt15_concatenated.npy
Loaded dia860utt4_concatenated.npy
Loaded dia690utt4_concatenated.npy
Loaded dia530utt12_concatenated.npy
Loaded dia917utt4_concatenated.npy
Loaded dia182utt2_concatenated.npy
Loaded dia409utt11_concatenated.npy
Loaded dia773utt2_concatenated.npy
Loaded dia1279utt5_concatenated.npy
Loaded dia1031utt12_concatenated.npy
Loaded dia38utt6_concatenated.npy
Loaded dia1097utt5_concatenated.npy
Loaded dia1108utt11_concatenated.npy
Loaded dia1134utt9_concatenated.npy
Loaded dia1203utt12_concatenated.npy
Loaded dia1003utt3_concatenated.npy
Loaded dia168utt16_concatenated.npy
Loaded dia509utt4_concatenated.npy
Loaded dia1220utt1_concatenated.npy
Loaded dia389utt12_concatenated.npy
Loaded dia149utt3_concatenated.npy
Loaded dia1091utt6_concatenated.npy
Loaded dia1351utt2_concatenated.npy
Loade

In [6]:
Y_test

['neutral',
 'neutral',
 'surprise',
 'neutral',
 'joy',
 'neutral',
 'neutral',
 'joy',
 'surprise',
 'joy',
 'sadness',
 'joy',
 'neutral',
 'neutral',
 'neutral',
 'sadness',
 'neutral',
 'anger',
 'neutral',
 'fear',
 'joy',
 'neutral',
 'neutral',
 'joy',
 'joy',
 'joy',
 'neutral',
 'disgust',
 'neutral',
 'surprise',
 'surprise',
 'sadness',
 'neutral',
 'neutral',
 'sadness',
 'joy',
 'anger',
 'surprise',
 'neutral',
 'neutral',
 'anger',
 'neutral',
 'neutral',
 'sadness',
 'neutral',
 'neutral',
 'joy',
 'anger',
 'joy',
 'joy',
 'neutral',
 'surprise',
 'neutral',
 'surprise',
 'neutral',
 'neutral',
 'joy',
 'neutral',
 'surprise',
 'joy',
 'joy',
 'surprise',
 'joy',
 'neutral',
 'joy',
 'joy',
 'anger',
 'neutral',
 'joy',
 'joy',
 'surprise',
 'sadness',
 'anger',
 'anger',
 'joy',
 'joy',
 'neutral',
 'surprise',
 'neutral',
 'neutral',
 'joy',
 'joy',
 'sadness',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'surprise',
 'neutral',
 'neutral',
 'neutral',
 'neutral

In [7]:
# Assigning to the specified variables
X_train_emotion = train_embeddings
Y_train_emotion = Y_train
X_test_emotion = test_embeddings
Y_test_emotion = Y_test

# Printing the arrays and their shapes
print('X_train_emotion:\n', np.array(X_train_emotion))
print('Y_train_emotion:\n', Y_train_emotion)
print('X_test_emotion:\n', np.array(X_test_emotion))
print('Y_test_emotion:\n', Y_test_emotion)

print('Shape of X_train_emotion:', np.array(X_train_emotion).shape)
#print('Shape of Y_train_emotion:', Y_train_emotion.shape)
print('Shape of X_test_emotion:', np.array(X_test_emotion).shape)
#print('Shape of Y_test_emotion:', Y_test_emotion.shape)


X_train_emotion:
 [[[ 0.00610315 -0.04233586 -0.03267109 ... -0.24763183 -0.17301214
    0.14470007]]

 [[-0.06302285 -0.06895685 -0.01511406 ... -0.0931774  -0.17221317
    0.03556742]]

 [[-0.01879422  0.00930429 -0.01108927 ... -0.0219027  -0.20728719
    0.09586193]]

 ...

 [[-0.03498007 -0.02649954  0.0456823  ... -0.1198136  -0.24607512
    0.10069121]]

 [[ 0.05089436 -0.05664142 -0.01564185 ... -0.09867117 -0.15780176
    0.11430164]]

 [[-0.01059369 -0.12625754 -0.02806858 ... -0.11804007 -0.18786113
    0.11847226]]]
Y_train_emotion:
 ['sadness', 'neutral', 'surprise', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'joy', 'joy', 'surprise', 'surprise', 'joy', 'neutral', 'neutral', 'neutral', 'joy', 'neutral', 'disgust', 'anger', 'joy', 'sadness', 'neutral', 'joy', 'neutral', 'anger', 'neutral', 'neutral', 'disgust', 'neutral', 'anger', 'sadness', 'neutral', 'neutral', 'neutral', 'joy', 'neutral', 'neutral', 'joy', 'joy', 'joy', 'joy', 'neutral', 'neutral', 'joy', 'su

In [8]:
#train and test
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Ensure that X_train_emotion and X_test_emotion are numpy arrays of the correct shape
# Flatten the arrays if they are not already 2D (necessary for sklearn)
X_train_emotion = np.array(X_train_emotion)
if len(X_train_emotion.shape) > 2:
    X_train_emotion = X_train_emotion.reshape(X_train_emotion.shape[0], -1)
X_test_emotion = np.array(X_test_emotion)
if len(X_test_emotion.shape) > 2:
    X_test_emotion = X_test_emotion.reshape(X_test_emotion.shape[0], -1)

# Initialize the MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(300,100), max_iter=300, activation='relu', solver='adam', learning_rate_init=0.001, random_state=42)

# Fit the classifier on the training data
mlp.fit(X_train_emotion, Y_train_emotion)

# Predict the labels for the test data
Y_pred_test = mlp.predict(X_test_emotion)

# Calculate the accuracy for the test set
test_accuracy = accuracy_score(Y_test_emotion, Y_pred_test)

print(f'Test accuracy: {test_accuracy:.4f}')


Test accuracy: 0.5452


In [11]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Ensure that X_train_emotion and X_test_emotion are numpy arrays of the correct shape
X_train_emotion = np.array(X_train_emotion)
if len(X_train_emotion.shape) > 2:
    X_train_emotion = X_train_emotion.reshape(X_train_emotion.shape[0], -1)
X_test_emotion = np.array(X_test_emotion)
if len(X_test_emotion.shape) > 2:
    X_test_emotion = X_test_emotion.reshape(X_test_emotion.shape[0], -1)

# Initialize the MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(300, 100), max_iter=300, activation='relu', solver='adam', learning_rate_init=0.001, random_state=42)

# Fit the classifier on the training data
mlp.fit(X_train_emotion, Y_train_emotion)

# Predict the labels for the training data
Y_pred_train = mlp.predict(X_train_emotion)

# Calculate the accuracy for the training set
train_accuracy = accuracy_score(Y_train_emotion, Y_pred_train)
print(f'Train accuracy: {train_accuracy:.4f}')
train_classification_report = classification_report(Y_train_emotion, Y_pred_train)
print("Training Set Classification Report:")
print(train_classification_report)
train_f1_score = f1_score(Y_train_emotion, Y_pred_train, average='weighted')
print(f'Train F1 Score: {train_f1_score:.4f}')



# Predict the labels for the test data
Y_pred_test = mlp.predict(X_test_emotion)

# Calculate the accuracy for the test set
test_accuracy = accuracy_score(Y_test_emotion, Y_pred_test)
print(f'Test accuracy: {test_accuracy:.4f}')
test_classification_report = classification_report(Y_test_emotion, Y_pred_test)
print("Test Set Classification Report:")
print(test_classification_report)
test_f1_score = f1_score(Y_test_emotion, Y_pred_test, average='weighted')
print(f'Test F1 Score: {test_f1_score:.4f}')


Train accuracy: 0.9999
Training Set Classification Report:
              precision    recall  f1-score   support

       anger       1.00      1.00      1.00      1282
     disgust       1.00      1.00      1.00       335
        fear       1.00      1.00      1.00       317
         joy       1.00      1.00      1.00      1872
     neutral       1.00      1.00      1.00      4808
     sadness       1.00      1.00      1.00       906
    surprise       1.00      1.00      1.00      1533

    accuracy                           1.00     11053
   macro avg       1.00      1.00      1.00     11053
weighted avg       1.00      1.00      1.00     11053

Train F1 Score: 0.9999
Test accuracy: 0.5452
Test Set Classification Report:
              precision    recall  f1-score   support

       anger       0.44      0.36      0.40       333
     disgust       0.09      0.09      0.09        79
        fear       0.15      0.16      0.16        56
         joy       0.50      0.48      0.49       

In [12]:
# Print all the file names for the training data
print("Training file names:")
for file_name in train_files:
    print(file_name)


Training file names:
dia116utt4
dia61utt2
dia644utt8
dia21utt8
dia983utt2
dia607utt15
dia860utt4
dia690utt4
dia530utt12
dia917utt4
dia182utt2
dia409utt11
dia773utt2
dia1279utt5
dia1031utt12
dia38utt6
dia1097utt5
dia1108utt11
dia1134utt9
dia1203utt12
dia1003utt3
dia168utt16
dia509utt4
dia1220utt1
dia389utt12
dia149utt3
dia1091utt6
dia1351utt2
dia478utt7
dia948utt3
dia1049utt11
dia227utt12
dia1226utt2
dia1170utt12
dia1088utt8
dia985utt1
dia602utt1
dia1308utt6
dia471utt12
dia795utt11
dia911utt7
dia556utt3
dia244utt5
dia184utt1
dia775utt1
dia597utt7
dia617utt9
dia72utt9
dia32utt3
dia285utt1
dia618utt18
dia474utt1
dia503utt1
dia1009utt6
dia928utt18
dia453utt20
dia989utt7
dia1167utt8
dia1127utt2
dia1016utt11
dia248utt3
dia1319utt18
dia1050utt2
dia1224utt11
dia1302utt3
dia608utt4
dia1081utt12
dia1121utt1
dia1260utt16
dia1056utt1
dia339utt12
dia1275utt3
dia305utt16
dia456utt11
dia283utt2
dia472utt2
dia432utt8
dia1178utt5
dia360utt4
dia505utt2
dia93utt13
dia741utt6
dia945utt17
dia979utt13
dia39

In [13]:
emotion_dict

{'dia1utt1': 'neutral',
 'dia1utt2': 'neutral',
 'dia1utt3': 'surprise',
 'dia1utt4': 'surprise',
 'dia1utt5': 'anger',
 'dia1utt6': 'neutral',
 'dia1utt7': 'neutral',
 'dia1utt8': 'neutral',
 'dia2utt1': 'sadness',
 'dia2utt2': 'neutral',
 'dia2utt3': 'surprise',
 'dia3utt1': 'surprise',
 'dia3utt2': 'neutral',
 'dia3utt3': 'sadness',
 'dia3utt4': 'neutral',
 'dia3utt5': 'neutral',
 'dia3utt6': 'surprise',
 'dia3utt7': 'joy',
 'dia3utt8': 'neutral',
 'dia3utt9': 'surprise',
 'dia4utt1': 'sadness',
 'dia4utt2': 'sadness',
 'dia4utt3': 'sadness',
 'dia5utt1': 'joy',
 'dia5utt2': 'neutral',
 'dia5utt3': 'joy',
 'dia6utt1': 'neutral',
 'dia6utt2': 'neutral',
 'dia6utt3': 'surprise',
 'dia6utt4': 'neutral',
 'dia6utt5': 'neutral',
 'dia6utt6': 'sadness',
 'dia6utt7': 'neutral',
 'dia6utt8': 'joy',
 'dia6utt9': 'neutral',
 'dia6utt10': 'neutral',
 'dia7utt1': 'sadness',
 'dia7utt2': 'surprise',
 'dia7utt3': 'neutral',
 'dia7utt4': 'surprise',
 'dia7utt5': 'neutral',
 'dia7utt6': 'neutral',


In [14]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Assuming that train_embeddings, train_files, and emotions_df are already defined and correctly ordered
# Let's fit the OneHotEncoder to all unique emotions
encoder = OneHotEncoder(sparse=False)
emotions = emotions_df['emotion'].unique().reshape(-1, 1)
encoder.fit(emotions)

# Create a dictionary mapping from dialogue IDs to lists of tuples (utterance_id, embedding, emotion)
dialogues = {}
for filename, embedding in zip(train_files, train_embeddings):
    parts = filename.split('utt')
    dialogue_id = parts[0]
    utterance_id = int(parts[1])
    if dialogue_id not in dialogues:
        dialogues[dialogue_id] = []
    emotion = emotion_dict[filename]
    dialogues[dialogue_id].append((utterance_id, embedding, emotion))

# Adjust the concatenation code
X_train_cause = []

for dialogue, utterances in dialogues.items():
    utterances.sort(key=lambda x: x[0])  # Sort the utterances by ID to ensure i <= j
    for i in range(len(utterances)):
        for j in range(i, len(utterances)):
            embedding_i = utterances[i][1].reshape(1, -1)  # Reshape to a 2D array with 1 row
            embedding_j = utterances[j][1].reshape(1, -1)  # Reshape to a 2D array with 1 row
            utterance_j_key = 'dia' + dialogue.replace('dia', '') + 'utt' + str(utterances[j][0])

            if utterance_j_key in emotion_dict:
                emotion_j = emotion_dict[utterance_j_key]
                # One-hot encode the emotion
                emotion_encoded = encoder.transform([[emotion_j]])
                # Concatenate along axis 1 to place them side by side
                combined_features = np.concatenate((embedding_j, embedding_i, emotion_encoded), axis=1)
                X_train_cause.append(combined_features.flatten())  # Flatten to get a 1D array
            else:
                print(f"Key not found in emotion_dict: {utterance_j_key}")

# Convert the list to a numpy array if necessary
X_train_cause = np.array(X_train_cause)








In [15]:
X_train_cause

array([[-0.02084827, -0.06246319,  0.00747706, ...,  0.        ,
         0.        ,  0.        ],
       [-0.00999819, -0.0926351 , -0.0242518 , ...,  1.        ,
         0.        ,  0.        ],
       [-0.01629469, -0.05975849, -0.00135158, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.05378028,  0.01437453, -0.02753302, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.01150594, -0.05341088, -0.01951805, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.05389216, -0.02884472, -0.00816385, ...,  0.        ,
         0.        ,  1.        ]])

In [16]:
import json
import pandas as pd
import os

# Helper function to extract the relevant data from JSON
def extract_emotion_cause_data(json_file):
    """ Extracts cause and effect data from the JSON file. """
    with open(json_file, 'r') as file:
        data = json.load(file)
    
    rows = []
    for conversation in data:
        conversation_id = conversation['conversation_ID']
        if 'emotion-cause_pairs' in conversation:
            for pair in conversation['emotion-cause_pairs']:
                effect, cause = pair
                effect_utterance_id = int(effect.split('_')[0])  # Extract utterance ID and convert to int
                cause_utterance_id = int(cause)
                rows.append({
                    'conversation_ID': conversation_id,
                    'cause_utterance_ID': cause_utterance_id,
                    'effect_utterance_ID': effect_utterance_id
                })
    return rows

# Define the directory where CSV files will be saved
output_directory = '/project/msoleyma_1026/ecp-cause-pair/script_mlp/assets'
os.makedirs(output_directory, exist_ok=True)  # Create directory if it does not exist

# Define paths to JSON files
train_json = '/project/msoleyma_1026/ecp-cause-pair/json_files/train.json'
test_json = '/project/msoleyma_1026/ecp-cause-pair/json_files/test.json'

# Extract data
train_data = extract_emotion_cause_data(train_json)
test_data = extract_emotion_cause_data(test_json)

# Convert to pandas DataFrame
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# Define paths for the CSV files
train_csv_path = os.path.join(output_directory, 'train_cause.csv')
test_csv_path = os.path.join(output_directory, 'test_cause.csv')
combined_csv_path = os.path.join(output_directory, 'combined_cause.csv')

# Save to CSV
train_df.to_csv(train_csv_path, index=False)
test_df.to_csv(test_csv_path, index=False)

# Combine both dataframes
combined_df = pd.concat([train_df, test_df])
combined_df.to_csv(combined_csv_path, index=False)

print("CSV files created successfully at:", output_directory)


CSV files created successfully at: /project/msoleyma_1026/ecp-cause-pair/script_mlp/assets


In [17]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Load the CSV file
csv_file_path = '/project/msoleyma_1026/ecp-cause-pair/script_mlp/assets/combined_cause.csv'
emotion_cause_df = pd.read_csv(csv_file_path)

# Convert the DataFrame to a dictionary for faster lookup
emotion_cause_dict = {}
for _, row in emotion_cause_df.iterrows():
    conv_id = 'dia' + str(row['conversation_ID'])
    pair = (row['effect_utterance_ID'], row['cause_utterance_ID'])
    if conv_id not in emotion_cause_dict:
        emotion_cause_dict[conv_id] = []
    emotion_cause_dict[conv_id].append(pair)

print("Emotion-cause pairs dictionary (first 5 entries):", list(emotion_cause_dict.items())[:5])

# Assuming train_embeddings, train_files, and emotions_df are already defined and correctly ordered
encoder = OneHotEncoder(sparse=False)
emotions = emotions_df['emotion'].unique().reshape(-1, 1)
encoder.fit(emotions)

dialogues = {}
for filename, embedding in zip(train_files, train_embeddings):
    parts = filename.split('utt')
    dialogue_id = parts[0]
    utterance_id = int(parts[1])
    if dialogue_id not in dialogues:
        dialogues[dialogue_id] = []
    emotion = emotion_dict[filename]
    dialogues[dialogue_id].append((utterance_id, embedding, emotion))

print("Number of dialogues processed:", len(dialogues))

X_train_cause = []
Y_train_cause = []

for dialogue, utterances in dialogues.items():
    utterances.sort(key=lambda x: x[0])
    for i in range(len(utterances)):
        for j in range(i, len(utterances)):
            embedding_i = utterances[i][1].reshape(1, -1)
            embedding_j = utterances[j][1].reshape(1, -1)
            if (utterances[j][0], utterances[i][0]) in emotion_cause_dict.get(dialogue, []):
                label = 1
            else:
                label = 0
            utterance_j_key = dialogue + 'utt' + str(utterances[j][0])
            if utterance_j_key in emotion_dict:
                emotion_j = emotion_dict[utterance_j_key]
                emotion_encoded = encoder.transform([[emotion_j]])
                combined_features = np.concatenate((embedding_j, embedding_i, emotion_encoded), axis=1)
                X_train_cause.append(combined_features.flatten())
                Y_train_cause.append(label)


X_train_cause = np.array(X_train_cause)
Y_train_cause = np.array(Y_train_cause)

print("Shape of X_train_cause:", X_train_cause.shape)
print("Shape of Y_train_cause:", Y_train_cause.shape)



Emotion-cause pairs dictionary (first 5 entries): [('dia1', [(3, 1), (3, 3), (4, 1), (4, 3), (4, 4), (5, 1), (5, 3), (5, 4)]), ('dia2', [(1, 1), (3, 3)]), ('dia3', [(3, 3), (6, 4), (6, 5), (6, 6), (7, 5), (7, 6), (7, 7), (9, 5), (9, 8), (9, 9)]), ('dia5', [(1, 3), (3, 3)]), ('dia6', [(3, 1), (3, 2), (3, 3), (6, 4), (6, 5), (8, 7)])]
Number of dialogues processed: 1113




Shape of X_train_cause: (78200, 5127)
Shape of Y_train_cause: (78200,)


In [18]:
X_train_cause


array([[-0.02084827, -0.06246319,  0.00747706, ...,  0.        ,
         0.        ,  0.        ],
       [-0.00999819, -0.0926351 , -0.0242518 , ...,  1.        ,
         0.        ,  0.        ],
       [-0.01629469, -0.05975849, -0.00135158, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.05378028,  0.01437453, -0.02753302, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.01150594, -0.05341088, -0.01951805, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.05389216, -0.02884472, -0.00816385, ...,  0.        ,
         0.        ,  1.        ]])

In [19]:
# Assuming test_embeddings and test_files are already defined and correctly ordered
# Assuming emotion_model is your trained model for predicting emotions

# Predict emotions for all test utterances
# Flatten the test embeddings if they are not 2D
# Assume test_embeddings is an array of shape (num_samples, sequence_length, num_features)
# We want to flatten the last two dimensions
#test_embeddings_flattened = test_embeddings.reshape(test_embeddings.shape[0], -1)

# Now predict emotions using the flattened embeddings
test_emotions_predicted = mlp.predict(X_test_emotion)


# Create a dictionary mapping test filenames to predicted emotions
test_emotion_dict = {filename: emotion for filename, emotion in zip(test_files, test_emotions_predicted)}

# Initialize X_test_cause and Y_test_cause
X_test_cause = []
Y_test_cause = []

# Process test dialogues similarly to training dialogues
test_dialogues = {}
for filename, embedding in zip(test_files, test_embeddings):
    parts = filename.split('utt')
    dialogue_id = parts[0]
    utterance_id = int(parts[1])
    if dialogue_id not in test_dialogues:
        test_dialogues[dialogue_id] = []
    predicted_emotion = test_emotion_dict[filename]
    test_dialogues[dialogue_id].append((utterance_id, embedding, predicted_emotion))

# Build X_test_cause and Y_test_cause
for dialogue, utterances in test_dialogues.items():
    utterances.sort(key=lambda x: x[0])
    for i in range(len(utterances)):
        for j in range(i, len(utterances)):
            embedding_i = utterances[i][1].reshape(1, -1)
            embedding_j = utterances[j][1].reshape(1, -1)
            label = int((utterances[j][0], utterances[i][0]) in emotion_cause_dict.get(dialogue, []))
            emotion_j = utterances[j][2]
            emotion_encoded = encoder.transform([[emotion_j]])
            combined_features = np.concatenate((embedding_j, embedding_i, emotion_encoded), axis=1)
            X_test_cause.append(combined_features.flatten())
            Y_test_cause.append(label)

X_test_cause = np.array(X_test_cause)
Y_test_cause = np.array(Y_test_cause)



In [20]:
print("Shape of X_train_cause:", X_test_cause.shape)
print("Shape of Y_train_cause:", Y_test_cause.shape)

Shape of X_train_cause: (18004, 5127)
Shape of Y_train_cause: (18004,)


In [21]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Sample snippet to fit an MLPClassifier on the train and test data

# Initialize the MLPClassifier with one hidden layer of 100 units, and a maximum of 300 iterations
mlp= MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, activation='relu', solver='adam', learning_rate_init=0.001, random_state=42)

# Fit the MLPClassifier to your training data
# Replace X_train_cause and Y_train_cause with your actual training data and labels
mlp.fit(X_train_cause, Y_train_cause)

# After fitting, you can predict the test labels
Y_pred_test = mlp.predict(X_test_cause)

# Then evaluate the performance of the classifier on the test data
test_accuracy = accuracy_score(Y_test_cause, Y_pred_test)
print(f'Test Accuracy: {test_accuracy:.4f}')
# For a detailed classification report
test_report = classification_report(Y_test_cause, Y_pred_test)
print("Test Set Classification Report:")
print(test_report)
test_f1_score = f1_score(Y_test_cause, Y_pred_test, average='weighted')  # Calculate weighted F1 score
print(f'Test F1 Score: {test_f1_score:.4f}')  # Print F1 score

# Emotion-Cause Relationship Accuracy
emotion_cause_accuracy = accuracy_score(Y_test_cause, Y_pred_test)
print(f"Emotion-Cause Relationship Accuracy: {emotion_cause_accuracy:.4f}")




Test Accuracy: 0.8817
Test Set Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.94      0.93     16211
           1       0.39      0.34      0.36      1793

    accuracy                           0.88     18004
   macro avg       0.66      0.64      0.65     18004
weighted avg       0.87      0.88      0.88     18004

Test F1 Score: 0.8779
Emotion-Cause Relationship Accuracy: 0.8817


In [55]:
# Combined Accuracy
combined_accuracy = (emotion_detection_accuracy + emotion_cause_accuracy) / 2
print(f"Combined Accuracy: {combined_accuracy:.4f}")


Combined Accuracy: 0.7135
