In [51]:
import csv
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np
import importlib
import sys
import os
from openai import OpenAI
import ast

sys.path.append('../')
if 'playscript_utils' in sys.modules:
    importlib.reload(sys.modules['playscript_utils'])
else:
    import playscript_utils
from playscript_utils import model_name_dict


In [2]:
def read_csv(file_path):
    """Return the first column of a CSV file as a list of strings.

    The first record is treated as a header and skipped. In every remaining
    value the two-character escape sequence (a backslash followed by ``n``)
    is expanded into a real newline, undoing the single-line encoding used
    when multi-line text is stored in the CSV.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        list[str]: One entry per non-blank data row, taken from column 0.
    """
    data = []
    # newline='' is what the csv module expects so that it can handle line
    # endings (including ones inside quoted fields) itself.
    with open(file_path, mode='r', newline='') as file:
        reader = csv.reader(file)
        # Skip the header as one whole record, even if it spans several
        # physical lines; a no-op for an empty file.
        next(reader, None)
        for row in reader:
            # Blank lines are returned as empty lists and carry no data.
            if not row:
                continue
            # Convert single-line strings in the CSV with \\n back to newlines.
            data.append(row[0].replace('\\n', '\n'))
    return data

In [3]:
def format_prompt_generic_emotion(premise, example_playscripts_path):
    """Build the few-shot prompt used for generic-emotion playscripts.

    The prompt consists of four sections separated by blank lines: the task
    introduction, up to three example playscripts loaded from
    ``example_playscripts_path`` via ``read_csv``, the detailed generation
    instructions, and the premise itself.

    Args:
        premise: Premise text the conversation should be based on.
        example_playscripts_path: CSV file holding the example playscripts.

    Returns:
        str: The assembled prompt.
    """
    introduction = "Generate a conversation between two characters, Alice and Bob, using the given premise. Here are a few examples:"
    # Only the first three examples in the file are used as demonstrations.
    demonstrations = '\n\n'.join(read_csv(example_playscripts_path)[:3])
    instructions = "Now, generate a conversation between Alice and Bob based on the following premise. Generate exactly 3 lines of dialogue per character, alternating per character, so 6 lines of dialogue in total. Do not include stage directions like '(smirking)' or '(smiling)'. At the end of the conversation, output '[END]'."
    sections = (introduction, demonstrations, instructions, f"Premise: {premise}")
    return '\n\n'.join(sections)

In [69]:
def format_prompt_controlled_emotion(premise, alice_emotion, bob_emotion):
    """Build the zero-shot prompt used for controlled-emotion playscripts.

    An emotion instruction is added for a character unless that character's
    emotion is the sentinel string ``"generic"``.

    Args:
        premise: Premise text the conversation should be based on.
        alice_emotion: Emotion Alice's lines should convey, or ``"generic"``.
        bob_emotion: Emotion Bob's lines should convey, or ``"generic"``.

    Returns:
        str: Instructions, the expected output layout, a lead-in sentence and
        the premise, joined by blank lines.
    """
    # Collect the optional per-character emotion sentences (Alice first).
    emotion_instructions = ""
    if alice_emotion != "generic":
        emotion_instructions += f"Alice's dialogue should convey {alice_emotion}. "
    if bob_emotion != "generic":
        emotion_instructions += f"Bob's dialogue should convey {bob_emotion}. "

    instructions = (
        "Generate a conversation between Alice and Bob based on a given premise. Generate exactly 3 lines of dialogue per person, "
        f"alternating per person, so 6 lines of dialogue in total. {emotion_instructions}At the end of the conversation, output '[END]'. "
        "The format of the conversation should look like this:"
    )
    layout = "Alice: ...\nBob: ...\nAlice: ...\nBob: ...\nAlice: ...\nBob: ...\n[END]"
    lead_in = "Now, generate a conversation based on the following premise:"
    return '\n\n'.join((instructions, layout, lead_in, f"Premise: {premise}"))

In [5]:
def generate_text_HF(model, tokenizer, prompt, num_generations=4, temperature=0.7, seed=42):
    """Sample continuations of ``prompt`` from a Hugging Face causal LM.

    Args:
        model: Object exposing ``generate(...)``, e.g. the
            ``AutoModelForCausalLM`` instance loaded in this notebook.
        tokenizer: Tokenizer matching ``model``; it is called with
            ``return_tensors='pt'`` and used to decode the generated ids.
        prompt: Prompt text to continue.
        num_generations: Number of sequences to sample for the prompt.
        temperature: Sampling temperature.
        seed: Not used inside this function; kept so the signature mirrors
            ``generate_text_openai``. The playscript generation functions
            seed torch/numpy themselves before calling this.

    Returns:
        list[str]: The decoded continuations (prompt tokens removed, special
        tokens skipped, surrounding whitespace stripped), one per sequence.
    """
    # Put the prompt on the model's own device instead of assuming CUDA;
    # fall back to "cuda" for objects that do not expose `.device`.
    device = getattr(model, "device", "cuda")
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
    prompt_length = input_ids.shape[-1]
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_new_tokens=256,
            min_new_tokens=5,
            # temperature and num_return_sequences > 1 are only honoured when
            # sampling is enabled, so request it explicitly rather than
            # relying on the model's default generation config.
            do_sample=True,
            temperature=temperature,
            num_return_sequences=num_generations,
        )
    # generate() returns prompt + continuation; keep only the new tokens.
    continuations = output_ids[:, prompt_length:]
    return [
        tokenizer.decode(token_ids, skip_special_tokens=True).strip()
        for token_ids in continuations
    ]

In [6]:
def generate_text_openai(client, model_name, prompt, num_generations=1, temperature=1.0, seed=42):
    """Request chat completions for ``prompt`` and return their text.

    Args:
        client: OpenAI-style client exposing ``chat.completions.create``.
        model_name: Name of the chat model to query.
        prompt: User message content.
        num_generations: Number of choices to request (sent as ``n``).
        temperature: Sampling temperature forwarded to the API.
        seed: Seed forwarded to the API.

    Returns:
        list[str]: The message text of every returned choice. A choice whose
        content is ``None`` is returned as an empty string so that callers
        can always treat the result as text.
    """
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=256,
        n=num_generations,
        temperature=temperature,
        seed=seed
    )
    # The API types `content` as optional; normalise a missing body to "" so
    # downstream string handling (.find / .replace) does not crash.
    return [choice.message.content or "" for choice in response.choices]

In [7]:
def filter_conversation_by_character(unfiltered_conversation):
    """Split a generated conversation into Alice's and Bob's dialogue lines.

    Processing steps:
      1. Everything from the first ``[END]`` marker onwards is discarded.
      2. Blank lines are dropped.
      3. Any preamble before the first line that starts with ``"Alice: "``
         or ``"Bob: "`` is dropped.
      4. The remaining lines must strictly alternate, starting with Alice.

    Args:
        unfiltered_conversation: Raw model output.

    Returns:
        dict[str, list[str]]: ``{"Alice": [...], "Bob": [...]}`` holding each
        character's lines with the ``"<Name>: "`` prefix removed.

    Raises:
        ValueError: If a line does not start with the prefix of the character
            whose turn it is. This is the exception the playscript generation
            functions catch to record a failed parse.
    """
    speakers = ("Alice", "Bob")
    speaker_prefixes = tuple(f"{speaker}: " for speaker in speakers)

    end_index = unfiltered_conversation.find("[END]")
    text = unfiltered_conversation if end_index == -1 else unfiltered_conversation[:end_index]
    lines = [line for line in text.split('\n') if line.strip() != '']

    # Drop any preamble before the first real dialogue line.
    for i, line in enumerate(lines):
        if line.startswith(speaker_prefixes):
            lines = lines[i:]
            break

    character_dialogues = {speaker: [] for speaker in speakers}
    for i, line in enumerate(lines):
        current_character = speakers[i % 2]
        prefix = f"{current_character}: "
        # Require the prefix at the start of the line. A mere substring match
        # would let lines such as "Narrator: Bob: ..." through and then fail
        # with a KeyError, or attribute "Alice: Bob: ..." to the wrong speaker.
        if not line.startswith(prefix):
            raise ValueError(f"The current line doesn't start with {current_character}: {line}")
        character_dialogues[current_character].append(line[len(prefix):])
    return character_dialogues

In [38]:
def append_to_playscripts_csv_generic_emotion(generated_playscripts_path, model_name, temperature, seed, num_generations, premise, unfiltered_conversation, alice_dialogues, bob_dialogues, error_message):
    """Append one generic-emotion result row to the playscripts CSV.

    The file is created with a header row when it is missing or empty. When
    the file already starts with the current header, the new row is simply
    appended and existing content is left untouched. Only when the first row
    differs from the current header is the file rewritten with that first
    row replaced by the header.

    Args:
        generated_playscripts_path: CSV file to append to.
        model_name, temperature, seed, num_generations, premise: Values
            written verbatim into the corresponding columns.
        unfiltered_conversation: Raw model output; real newlines are stored
            as the two-character sequence backslash + ``n`` so the cell stays
            on one line.
        alice_dialogues: Alice's parsed lines. The generation loop passes
            ``""`` for failed parses, which yields a count of 0.
        bob_dialogues: Bob's parsed lines (same convention).
        error_message: Parse error text, or ``""`` on success.
    """
    header = ["model_name", "temperature", "seed", "num_generations", "premise", "unfiltered_conversation", "alice_dialogues", "bob_dialogues", "alice_dialogue_count", "bob_dialogue_count", "error_message"]
    unfiltered_conversation_single_line = unfiltered_conversation.replace('\n', '\\n')
    new_row = [model_name, temperature, seed, num_generations, premise, unfiltered_conversation_single_line, alice_dialogues, bob_dialogues, len(alice_dialogues), len(bob_dialogues), error_message]

    has_content = os.path.exists(generated_playscripts_path) and os.path.getsize(generated_playscripts_path) > 0
    header_is_current = False
    ends_with_newline = True
    remaining_rows = []
    if has_content:
        with open(generated_playscripts_path, mode='r', newline='') as file:
            reader = csv.reader(file)
            header_is_current = next(reader, None) == header
            if not header_is_current:
                # Stale or missing header: the data rows are needed for the rewrite.
                remaining_rows = list(reader)
        with open(generated_playscripts_path, mode='rb') as file:
            file.seek(-1, os.SEEK_END)
            ends_with_newline = file.read(1) in (b'\n', b'\r')

    if has_content and not header_is_current:
        # Rare migration path: replace the first row with the current header.
        with open(generated_playscripts_path, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(header)
            writer.writerows(remaining_rows)
            writer.writerow(new_row)
        return

    # Common path: pure append, so existing rows are never truncated or
    # rewritten and the cost per call does not grow with the file.
    with open(generated_playscripts_path, mode='a', newline='') as file:
        if not ends_with_newline:
            # Keep the new row from being glued onto an unterminated last line.
            file.write('\r\n')
        writer = csv.writer(file)
        if not has_content:
            writer.writerow(header)
        writer.writerow(new_row)

In [45]:
def append_to_playscripts_csv_controlled_emotion(generated_playscripts_path, model_name, temperature, seed, num_generations, premise, alice_emotion, bob_emotion, unfiltered_conversation, alice_dialogues, bob_dialogues, error_message):
    """Append one controlled-emotion result row to the playscripts CSV.

    The file is created with a header row when it is missing or empty. When
    the file already starts with the current header, the new row is simply
    appended and existing content is left untouched. Only when the first row
    differs from the current header is the file rewritten with that first
    row replaced by the header.

    Args:
        generated_playscripts_path: CSV file to append to.
        model_name, temperature, seed, num_generations, premise: Values
            written verbatim into the corresponding columns.
        alice_emotion: Emotion requested for Alice (or ``"generic"``).
        bob_emotion: Emotion requested for Bob (or ``"generic"``).
        unfiltered_conversation: Raw model output; real newlines are stored
            as the two-character sequence backslash + ``n`` so the cell stays
            on one line.
        alice_dialogues: Alice's parsed lines. The generation loop passes
            ``""`` for failed parses, which yields a count of 0.
        bob_dialogues: Bob's parsed lines (same convention).
        error_message: Parse error text, or ``""`` on success.
    """
    header = ["model_name", "temperature", "seed", "num_generations", "premise", "alice_emotion", "bob_emotion", "unfiltered_conversation", "alice_dialogues", "bob_dialogues", "alice_dialogue_count", "bob_dialogue_count", "error_message"]
    unfiltered_conversation_single_line = unfiltered_conversation.replace('\n', '\\n')
    new_row = [model_name, temperature, seed, num_generations, premise, alice_emotion, bob_emotion, unfiltered_conversation_single_line, alice_dialogues, bob_dialogues, len(alice_dialogues), len(bob_dialogues), error_message]

    has_content = os.path.exists(generated_playscripts_path) and os.path.getsize(generated_playscripts_path) > 0
    header_is_current = False
    ends_with_newline = True
    remaining_rows = []
    if has_content:
        with open(generated_playscripts_path, mode='r', newline='') as file:
            reader = csv.reader(file)
            header_is_current = next(reader, None) == header
            if not header_is_current:
                # Stale or missing header: the data rows are needed for the rewrite.
                remaining_rows = list(reader)
        with open(generated_playscripts_path, mode='rb') as file:
            file.seek(-1, os.SEEK_END)
            ends_with_newline = file.read(1) in (b'\n', b'\r')

    if has_content and not header_is_current:
        # Rare migration path: replace the first row with the current header.
        with open(generated_playscripts_path, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(header)
            writer.writerows(remaining_rows)
            writer.writerow(new_row)
        return

    # Common path: pure append, so existing rows are never truncated or
    # rewritten and the cost per call does not grow with the file.
    with open(generated_playscripts_path, mode='a', newline='') as file:
        if not ends_with_newline:
            # Keep the new row from being glued onto an unterminated last line.
            file.write('\r\n')
        writer = csv.writer(file)
        if not has_content:
            writer.writerow(header)
        writer.writerow(new_row)

In [34]:
def generate_playscripts_generic_emotion(premises, model_name, model, tokenizer, num_generations=4, temperature=0.7, seed=42, example_playscripts_path="../data/generic_emotion/example_playscripts.csv", generated_playscripts_path="../data/generic_emotion/generated_playscripts.csv"):
    """Generate generic-emotion playscripts with a local HF model and log them to CSV.

    For every premise a few-shot prompt is built, ``num_generations``
    conversations are generated with ``generate_text_HF`` and each one is
    written as a row to ``generated_playscripts_path``. Conversations that
    cannot be parsed are still written, with empty dialogue columns and the
    parse error in the ``error_message`` column.

    Args:
        premises: Sequence of premise strings.
        model_name: Label stored in the ``model_name`` column.
        model: Model passed through to ``generate_text_HF``.
        tokenizer: Tokenizer passed through to ``generate_text_HF``.
        num_generations: Conversations generated per premise.
        temperature: Sampling temperature.
        seed: When not ``None``, seeds torch (CPU and CUDA) and numpy once
            before generation starts.
        example_playscripts_path: CSV with the few-shot examples.
        generated_playscripts_path: CSV the results are appended to.
    """
    if seed is not None:
        # Same order as before: torch CPU RNG, numpy, then all CUDA devices.
        for seed_everything in (torch.manual_seed, np.random.seed, torch.cuda.manual_seed_all):
            seed_everything(seed)

    for index, premise in enumerate(premises):
        print(f"Generating playscript for premise {index+1} of {len(premises)}: {premise}")
        prompt = format_prompt_generic_emotion(premise, example_playscripts_path)
        conversations = generate_text_HF(model, tokenizer, prompt, num_generations=num_generations, temperature=temperature, seed=seed)
        for conversation in conversations:
            # Leading arguments shared by the success row and the error row.
            row_context = (generated_playscripts_path, model_name, temperature, seed, num_generations, premise, conversation)
            try:
                parsed = filter_conversation_by_character(conversation)
                alice_lines = parsed["Alice"]
                bob_lines = parsed["Bob"]
                append_to_playscripts_csv_generic_emotion(*row_context, alice_lines, bob_lines, "")
            except ValueError as parse_error:
                # Keep the raw output and record why it was rejected.
                append_to_playscripts_csv_generic_emotion(*row_context, "", "", str(parse_error))

In [72]:
def generate_playscripts_controlled_emotion(premises, emotions, model_name="gpt-4o-mini-2024-07-18", num_generations=1, temperature=1.0, seed=42, generated_playscripts_path="../data/controlled_emotion/generated_playscripts.csv"):
    """Generate controlled-emotion playscripts through the OpenAI API and log them to CSV.

    For every premise, one conversation set is requested for the
    ``("generic", "generic")`` baseline and one for every ordered
    (Alice emotion, Bob emotion) pair drawn from ``emotions``. Each returned
    conversation is written as a row to ``generated_playscripts_path``;
    conversations that cannot be parsed are written with empty dialogue
    columns and the parse error in the ``error_message`` column.

    Args:
        premises: Sequence of premise strings.
        emotions: Emotion names combined into Alice/Bob pairs.
        model_name: OpenAI chat model to query.
        num_generations: Choices requested per prompt.
        temperature: Sampling temperature.
        seed: When not ``None``, seeds torch and numpy; it is also forwarded
            to ``generate_text_openai``.
        generated_playscripts_path: CSV the results are appended to.
    """
    if seed is not None:
        # Same order as before: torch CPU RNG, numpy, then all CUDA devices.
        for seed_everything in (torch.manual_seed, np.random.seed, torch.cuda.manual_seed_all):
            seed_everything(seed)

    # The API key is read from the OPENAI_API_KEY environment variable.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    for index, premise in enumerate(premises):
        print(f"Generating playscript for premise {index+1} of {len(premises)}: {premise}")
        # Baseline pair first, then every ordered combination of emotions.
        emotion_pairs = [["generic", "generic"]] + [[first, second] for first in emotions for second in emotions]
        for alice_emotion, bob_emotion in emotion_pairs:
            print(f"Generating playscript with Alice's emotion {alice_emotion} and Bob's emotion {bob_emotion}")
            prompt = format_prompt_controlled_emotion(premise, alice_emotion, bob_emotion)
            conversations = generate_text_openai(client, model_name, prompt, num_generations=num_generations, temperature=temperature, seed=seed)
            for conversation in conversations:
                # Leading arguments shared by the success row and the error row.
                row_context = (generated_playscripts_path, model_name, temperature, seed, num_generations, premise, alice_emotion, bob_emotion, conversation)
                try:
                    parsed = filter_conversation_by_character(conversation)
                    alice_lines = parsed["Alice"]
                    bob_lines = parsed["Bob"]
                    append_to_playscripts_csv_controlled_emotion(*row_context, alice_lines, bob_lines, "")
                except ValueError as parse_error:
                    # Keep the raw output and record why it was rejected.
                    append_to_playscripts_csv_controlled_emotion(*row_context, "", "", str(parse_error))

In [None]:
# Generating playscripts with generic emotion
# This cell loads a local Hugging Face model and runs the generic-emotion
# pipeline, appending results to generic_emotion_generated_playscripts_path.
generic_emotion_example_playscripts_path = "../data/generic_emotion/example_playscripts.csv"
generic_emotion_generated_playscripts_path = "../data/generic_emotion/generated_playscripts.csv"
generic_emotion_test_premises_path = "../data/generic_emotion/test_premises.csv"
generic_emotion_model_name = "llama3_8b_instruct"
# model_name_dict (imported from playscript_utils) maps the short model name
# to the identifier handed to from_pretrained.
model_HF = model_name_dict[generic_emotion_model_name]
# NOTE(review): trust_remote_code=True lets code shipped with the model
# repository run locally; confirm this is required for this model.
model = AutoModelForCausalLM.from_pretrained(model_HF, device_map="auto", low_cpu_mem_usage = True, torch_dtype=torch.float16, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_HF, trust_remote_code=True)
# [0:1] restricts this run to the first premise in the file; widen the slice
# to generate for more premises.
generic_emotion_test_premises = read_csv(generic_emotion_test_premises_path)[0:1]
generate_playscripts_generic_emotion(generic_emotion_test_premises, generic_emotion_model_name, model, tokenizer, num_generations=4, temperature=0.7, seed=42, example_playscripts_path=generic_emotion_example_playscripts_path, generated_playscripts_path=generic_emotion_generated_playscripts_path)

In [54]:
# For rearranging CSVs
def _count_dialogues(cell):
    """Return how many dialogue lines a stored list literal holds (0 if it is not a list)."""
    try:
        parsed = ast.literal_eval(cell)
    except (SyntaxError, ValueError):
        # Empty cells (failed generations) and malformed literals count as 0.
        return 0
    return len(parsed) if isinstance(parsed, (list, tuple)) else 0


def reformat_csv(file_path):
    """Insert the per-character dialogue-count columns into a generic-emotion CSV, in place.

    Expected input layout: columns 0-7 end with ``alice_dialogues`` (index 6)
    and ``bob_dialogues`` (index 7), followed by ``error_message``. After the
    call every data row is ``first 8 columns + alice_dialogue_count +
    bob_dialogue_count + error_message``.

    The function is safe to run more than once: rows that already have the
    11-column layout keep their error message and only have their counts
    recomputed. A 9-column header without the count columns is extended to
    match the data rows. An empty file is left untouched, and rows with
    fewer than 9 columns are copied through unchanged.

    Args:
        file_path: CSV file to rewrite.
    """
    count_columns = ["alice_dialogue_count", "bob_dialogue_count"]

    with open(file_path, mode='r', newline='') as file:
        rows = list(csv.reader(file))
    if not rows:
        return

    header = rows[0]
    if len(header) == 9 and count_columns[0] not in header:
        # Keep the header aligned with the widened data rows.
        header = header[:8] + count_columns + header[8:]
    updated_rows = [header]

    for row in rows[1:]:
        if len(row) < 9:
            # Too short to hold the expected columns; do not guess.
            updated_rows.append(row)
            continue
        # Already-converted rows carry the error message in column 10;
        # old-layout rows carry it in column 8.
        error_message = row[10] if len(row) >= 11 else row[8]
        updated_rows.append(row[:8] + [_count_dialogues(row[6]), _count_dialogues(row[7]), error_message])

    with open(file_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(updated_rows)


In [55]:
# Manual editing of row 83, which threw an error when generating (for generic emotion with Llama3_70B)
# This cell needs generic_emotion_generated_playscripts_path, which is
# defined in the generic-emotion generation cell; run that cell first.
# NOTE(review): the index below is hard-coded for one specific file state and
# raises IndexError on a file with fewer than 83 rows.
with open(generic_emotion_generated_playscripts_path, mode='r', newline='') as file:
    reader = csv.reader(file)
    rows = list(reader)

# Replace row 83 with row 82 (index 82 with 81 in 0-based index)
# The failed row is overwritten with a copy of the row before it, so the
# edited file contains that row twice.
rows[82] = rows[81]

# Save the edited version to a new CSV file
# The original generated file is left unmodified.
generic_emotion_generated_playscripts_edited_path = "../data/generic_emotion/generated_playscripts_edited.csv"
with open(generic_emotion_generated_playscripts_edited_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(rows)


In [74]:
# Generating playscripts with controlled emotion
# NOTE(review): controlled_emotion_example_playscripts_path is not used by the
# controlled-emotion prompt (it has no few-shot examples) and is not passed to
# the generation call below.
controlled_emotion_example_playscripts_path = "../data/controlled_emotion/example_playscripts.csv"
controlled_emotion_generated_playscripts_path = "../data/controlled_emotion/generated_playscripts.csv"
controlled_emotion_test_premises_path = "../data/controlled_emotion/test_premises.csv"
# Only the first 10 premises are used for this run.
controlled_emotion_test_premises = read_csv(controlled_emotion_test_premises_path)[0:10]
emotions = ["happiness", "sadness", "anger", "fear", "disgust", "surprise"]
model_name = "gpt-4o-mini-2024-07-18"

# Generating playscripts with controlled emotion: per premise, one
# generic/generic baseline plus all 6 x 6 ordered emotion pairs
# (37 API requests per premise).
generate_playscripts_controlled_emotion(controlled_emotion_test_premises, emotions, model_name, num_generations=1, temperature=1.0, seed=42, generated_playscripts_path=controlled_emotion_generated_playscripts_path)

Generating playscript for premise 1 of 10: Alice and Bob are coworkers in an office job. They talk about what they're working on.
Generating playscript with Alice's emotion generic and Bob's emotion generic
Generating playscript with Alice's emotion happiness and Bob's emotion happiness
Generating playscript with Alice's emotion happiness and Bob's emotion sadness
Generating playscript with Alice's emotion happiness and Bob's emotion anger
Generating playscript with Alice's emotion happiness and Bob's emotion fear
Generating playscript with Alice's emotion happiness and Bob's emotion disgust
Generating playscript with Alice's emotion happiness and Bob's emotion surprise
Generating playscript with Alice's emotion sadness and Bob's emotion happiness
Generating playscript with Alice's emotion sadness and Bob's emotion sadness
Generating playscript with Alice's emotion sadness and Bob's emotion anger
Generating playscript with Alice's emotion sadness and Bob's emotion fear
Generating plays