In [1]:
import csv
import json
import os
import sys
import importlib
import random
import ast
import pandas as pd

# Make the parent directory importable so the project-local `playscript_utils`
# module can be found from this notebook's working directory.
sys.path.append('../')
# Re-running this cell in a live kernel reloads the module if it is already
# imported, so edits to the module are picked up; on a fresh kernel it is
# imported normally.
if 'playscript_utils' in sys.modules:
    importlib.reload(sys.modules['playscript_utils'])
else:
    import playscript_utils
# Bind the helpers after the (re)load so the names refer to the current module contents.
from playscript_utils import model_name_dict, emotions, filter_csv_by_columns, chat_template

In [2]:
def read_rep_e_data(rep_e_data_path, probing_data_path, emotions):
    """Create the probing CSV and fill it with the RepE emotion examples.

    The file at ``probing_data_path`` is opened in write mode, so any previous
    content is discarded. A header row is written first, followed by one row
    ``["rep_e", emotion, example, "None"]`` for every element of the JSON
    document stored in ``<rep_e_data_path>/<emotion>.json``.

    Args:
        rep_e_data_path: Directory holding one ``<emotion>.json`` file per emotion.
        probing_data_path: Path of the CSV file to (re)create.
        emotions: Iterable of emotion names.

    Raises:
        FileNotFoundError: If the JSON file for an emotion does not exist.
        json.JSONDecodeError: If a JSON file cannot be parsed.
    """
    # Explicit UTF-8: JSON is UTF-8 by specification, and the resulting CSV is
    # later appended to by pandas, whose ``to_csv`` also defaults to UTF-8.
    # Relying on the locale encoding would make the output platform-dependent.
    with open(probing_data_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["probing_dataset", "emotion", "text_example", "character"])

        for emotion in emotions:
            json_path = os.path.join(rep_e_data_path, f"{emotion}.json")

            # Load fully, then release the JSON handle before writing rows.
            with open(json_path, 'r', encoding='utf-8') as json_file:
                text_examples = json.load(json_file)

            # Presumably each file is a JSON array of strings; every element
            # becomes one row. The literal string "None" marks "no character".
            writer.writerows(
                ["rep_e", emotion, example, "None"] for example in text_examples
            )


In [3]:
def clean_custom_emotion_data(raw_data_path, cleaned_data_path, emotions):
    """Convert the raw per-emotion text files into cleaned per-emotion CSV files.

    For every emotion, ``<raw_data_path><emotion>.txt`` is read line by line and
    ``<cleaned_data_path><emotion>.csv`` is (re)written. Both path arguments are
    concatenated directly with the file name, so they must end with a path
    separator.

    Cleaning applied to each line:
      * surrounding whitespace is removed;
      * if the line starts with a digit, the first whitespace-delimited token
        (e.g. a ``"12."`` list number) is dropped;
      * surrounding straight or curly double quotes are removed;
      * lines that end up with no text (blank lines, number-only lines,
        quote-only lines) are skipped instead of producing an empty example
        or raising ``IndexError``.

    Args:
        raw_data_path: Directory prefix (with trailing separator) of the raw ``.txt`` files.
        cleaned_data_path: Directory prefix (with trailing separator) for the ``.csv`` output.
        emotions: Iterable of emotion names.

    Raises:
        FileNotFoundError: If a raw text file for an emotion does not exist.
    """
    for emotion in emotions:
        raw_data_emotion_file_path = f"{raw_data_path}{emotion}.txt"
        cleaned_data_emotion_file_path = f"{cleaned_data_path}{emotion}.csv"
        # Explicit UTF-8 so the curly quotes below are decoded as single
        # characters regardless of the platform's locale encoding.
        with open(raw_data_emotion_file_path, 'r', encoding='utf-8') as txt_file, \
                open(cleaned_data_emotion_file_path, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            # Header matches the four fields written for every row.
            writer.writerow(["probing_dataset", "emotion", "text_example", "character"])
            for line in txt_file:
                text = line.strip()
                if not text:
                    continue  # blank line
                if text[0].isdigit():
                    # Drop the leading list number ("3." / "3)"), keep the rest.
                    parts = text.split(maxsplit=1)
                    if len(parts) < 2:
                        continue  # number with no example text after it
                    text = parts[1]
                text = text.strip('"“”')
                if not text:
                    continue  # nothing left once the quotes are removed
                writer.writerow(["custom", emotion, text, "None"])


In [4]:
def combine_custom_cleaned_data(cleaned_data_path, probing_data_path, emotions):
    """Append the cleaned per-emotion CSV rows to the probing CSV.

    For every emotion, ``<cleaned_data_path><emotion>.csv`` is read, its first
    row (the header) is discarded, and all remaining rows are appended
    unchanged to ``probing_data_path``. ``cleaned_data_path`` is concatenated
    directly with the file name, so it must end with a path separator.

    Args:
        cleaned_data_path: Directory prefix (with trailing separator) of the cleaned CSV files.
        probing_data_path: CSV file to append to (opened in append mode).
        emotions: Iterable of emotion names.

    Raises:
        FileNotFoundError: If a cleaned CSV file for an emotion does not exist.
    """
    # The destination is opened once for the whole run rather than once per emotion.
    with open(probing_data_path, 'a', newline='', encoding='utf-8') as probing_data_file:
        writer = csv.writer(probing_data_file)
        for emotion in emotions:
            cleaned_data_emotion_file_path = f"{cleaned_data_path}{emotion}.csv"
            # newline='' lets the csv module handle line endings itself, which
            # matters for quoted fields that contain embedded newlines.
            with open(cleaned_data_emotion_file_path, 'r', newline='', encoding='utf-8') as cleaned_data_emotion_file:
                reader = csv.reader(cleaned_data_emotion_file)
                # Skip the header; the default avoids StopIteration on an empty file.
                next(reader, None)
                writer.writerows(reader)

In [5]:
def read_playscript_data(generated_playscripts_path, probing_data_path, premise_type, train_or_test):
    """Append individual playscript dialogue turns to the probing CSV.

    The playscripts are selected with
    ``filter_csv_by_columns(generated_playscripts_path, {"premise_type": premise_type})``.
    For every selected row the ``dialogues`` cell is parsed with
    ``ast.literal_eval`` and walked turn by turn: even positions are attributed
    to Alice, odd positions to Bob. A turn is written as
    ``[f"playscript_{premise_type}_{train_or_test}", emotion, turn, speaker]``
    unless that speaker's emotion for the row is ``"generic"``.

    Args:
        generated_playscripts_path: CSV with the generated playscripts.
        probing_data_path: CSV file to append to (opened in append mode).
        premise_type: Value of the ``premise_type`` column to keep; also part of the dataset label.
        train_or_test: Suffix of the dataset label (e.g. ``"train"`` or ``"test"``).

    Raises:
        ValueError, SyntaxError: If a ``dialogues`` cell is not a valid Python literal.
    """
    filtered_df = filter_csv_by_columns(generated_playscripts_path, {"premise_type": premise_type})
    dataset_label = f"playscript_{premise_type}_{train_or_test}"
    # Explicit UTF-8 keeps these rows consistent with the rows pandas appends
    # to the same file, since ``DataFrame.to_csv`` defaults to UTF-8.
    with open(probing_data_path, 'a', newline='', encoding='utf-8') as probing_data_file:
        writer = csv.writer(probing_data_file)

        for _, row in filtered_df.iterrows():
            # Presumably a stringified list of utterances alternating Alice/Bob,
            # starting with Alice; verify against the playscript generator.
            dialogues = ast.literal_eval(row['dialogues'])
            speakers = (
                ("Alice", row['alice_emotion']),
                ("Bob", row['bob_emotion']),
            )
            for i, dialogue in enumerate(dialogues):
                character, emotion = speakers[i % 2]
                if emotion != "generic":
                    writer.writerow([dataset_label, emotion, dialogue, character])

In [6]:
def combine_non_playscript_data(probing_data_path):
    """Append a relabelled copy of the non-test rows to the probing CSV.

    Rows whose ``probing_dataset`` is ``rep_e``, ``custom`` or
    ``playscript_neutral_train`` are selected (in that order) via
    ``filter_csv_by_columns``, concatenated, given the label
    ``combined_non_test`` and appended to ``probing_data_path`` without a
    header or index. Despite the function name, the playscript training rows
    are part of the combined set.

    Args:
        probing_data_path: CSV file that is both read from and appended to.
    """
    source_labels = ("rep_e", "custom", "playscript_neutral_train")
    # One filtered frame per source dataset, kept in the order listed above.
    frames = [
        filter_csv_by_columns(probing_data_path, {"probing_dataset": label})
        for label in source_labels
    ]
    merged = pd.concat(frames)
    merged['probing_dataset'] = 'combined_non_test'
    merged.to_csv(probing_data_path, mode='a', header=False, index=False)

In [7]:
# --- Paths (relative to the notebook's working directory) ---
# Output: the single CSV that collects every probing example.
probing_data_path = "../data/permanent/emotion_probing_data.csv"
# Inputs: per-emotion JSON files, raw per-emotion text files, and generated playscripts.
rep_e_data_path = "../representation-engineering/data/emotions/"
# The two custom-data paths are concatenated directly with the file name,
# so the trailing "/" is required.
custom_raw_data_path = "../data/permanent/raw_probing_data/"
custom_cleaned_data_path = "../data/permanent/cleaned_probing_data/"
generated_playscripts_path = "../data/permanent/generated_playscripts.csv"
generated_playscripts_train_path = "../data/permanent/generated_playscripts_train.csv"

# --- Build the probing dataset; the order of these calls matters ---
# Step 1 opens the output in write mode (truncating it and writing the header);
# every later step appends, so re-running this cell rebuilds the file from scratch.
read_rep_e_data(rep_e_data_path, probing_data_path, emotions)
# Steps 2-3: clean the raw custom examples, then append them.
clean_custom_emotion_data(custom_raw_data_path, custom_cleaned_data_path, emotions)
combine_custom_cleaned_data(custom_cleaned_data_path, probing_data_path, emotions)
# Steps 4-5: append playscript turns, labelled "playscript_neutral_train" / "playscript_neutral_test".
read_playscript_data(generated_playscripts_train_path, probing_data_path, "neutral", "train")
read_playscript_data(generated_playscripts_path, probing_data_path, "neutral", "test")
# Step 6 must run last: it re-reads the file and appends a "combined_non_test"
# copy of the rep_e, custom and playscript_neutral_train rows written above.
combine_non_playscript_data(probing_data_path)