In [1]:
import csv
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np
import importlib
import sys
import os
from openai import OpenAI
import ast
import pandas as pd

# Add the parent directory to the module search path
# (presumably where playscript_utils.py lives -- verify against the repo layout).
sys.path.append('../')
# If this kernel has already imported playscript_utils, reload it so that edits made
# to the module since then are picked up; otherwise import it for the first time.
if 'playscript_utils' in sys.modules:
    importlib.reload(sys.modules['playscript_utils'])
else:
    import playscript_utils
# Bind the names used by the rest of the notebook (after the reload, so they are fresh).
from playscript_utils import model_name_dict, emotions, filter_csv_by_columns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def read_csv(file_path):
    """Read the first column of a CSV file, skipping the header record.

    Each cell is expected to hold a playscript stored on a single line, with
    line breaks encoded as the two-character sequence ``\\n``; these are
    converted back into real newlines.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list with one string per non-empty data row (first column only).
    """
    data = []
    # newline='' lets the csv module handle line endings itself (as the csv docs require)
    with open(file_path, mode='r', newline='') as file:
        reader = csv.reader(file)
        # Skip the header record (robust even if the header spans several physical lines)
        next(reader, None)
        for row in reader:
            # csv.reader yields [] for blank lines; there is no first column to read
            if not row:
                continue
            # To convert single-line strings in the CSV with \\n to newlines
            data.append(row[0].replace('\\n', '\n'))
    return data

In [3]:
# Paired with premise_type="creative"; kind of counterintuitive, but just run with it
# Think of it as the more creative the premise, the less we need to control the emotion of the characters
def format_prompt_generic_emotion(premise, example_playscripts_path):
    """Build a few-shot prompt asking for an Alice/Bob conversation on ``premise``.

    No emotion is requested from either character. The first three playscripts
    read from ``example_playscripts_path`` are embedded as examples.

    Args:
        premise: Premise text the conversation should be based on.
        example_playscripts_path: CSV file of example playscripts (loaded via ``read_csv``).

    Returns:
        The complete prompt string, with its sections separated by blank lines.
    """
    opening_instructions = "Generate a conversation between two characters, Alice and Bob, using the given premise. Here are a few examples:"
    closing_instructions = "Now, generate a conversation between Alice and Bob based on the following premise. Generate exactly 3 lines of dialogue per character, alternating per character, so 6 lines of dialogue in total. Do not include stage directions like '(smirking)' or '(smiling)'. At the end of the conversation, output '[END]'."

    # Only the first three examples are used as few-shot demonstrations
    few_shot_examples = read_csv(example_playscripts_path)[:3]

    sections = [
        opening_instructions,
        '\n\n'.join(few_shot_examples),
        closing_instructions,
        f"Premise: {premise}",
    ]
    return '\n\n'.join(sections)

In [4]:
# Paired with premise_type="neutral"
# Think of it as the more neutral the premise, the more we can control the emotion of the characters
def format_prompt_controlled_emotion(premise, alice_emotion, bob_emotion):
    """Build a zero-shot prompt for an Alice/Bob conversation with per-character emotions.

    Args:
        premise: Premise text the conversation should be based on.
        alice_emotion: Emotion Alice's lines should convey, or "generic" for no constraint.
        bob_emotion: Emotion Bob's lines should convey, or "generic" for no constraint.

    Returns:
        The complete prompt string, with its sections separated by blank lines.
    """
    def emotion_instruction(character, emotion):
        # "generic" means the character gets no emotion instruction at all
        if emotion == "generic":
            return ""
        return f"{character}'s dialogue should convey {emotion}. "

    emotion_instructions = emotion_instruction("Alice", alice_emotion) + emotion_instruction("Bob", bob_emotion)
    instructions = (
        "Generate a conversation between Alice and Bob based on a given premise. Generate exactly 3 lines of dialogue per person, "
        f"alternating per person, so 6 lines of dialogue in total. {emotion_instructions}At the end of the conversation, output '[END]'. "
        "The format of the conversation should look like this:"
    )
    expected_format = "Alice: ...\nBob: ...\nAlice: ...\nBob: ...\nAlice: ...\nBob: ...\n[END]"
    sections = [
        instructions,
        expected_format,
        "Now, generate a conversation based on the following premise:",
        f"Premise: {premise}",
    ]
    return '\n\n'.join(sections)

In [5]:
def generate_text_HF(model, tokenizer, prompt, num_generations=4, temperature=0.7):
    """Sample ``num_generations`` completions of ``prompt`` from a Hugging Face causal LM.

    Args:
        model: Model exposing ``generate(...)`` and a ``device`` attribute.
        tokenizer: Tokenizer matching ``model``; called with ``return_tensors='pt'``.
        prompt: Prompt text to complete.
        num_generations: Number of completions to return.
        temperature: Sampling temperature.

    Returns:
        A list of ``num_generations`` decoded completions (prompt tokens removed,
        special tokens skipped, surrounding whitespace stripped).
    """
    encoded = tokenizer(prompt, return_tensors='pt')
    # Follow the model's own device instead of assuming a CUDA device is present
    input_ids = encoded.input_ids.to(model.device)
    attention_mask = encoded.attention_mask.to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=256,
            min_new_tokens=5,
            # temperature only takes effect when sampling, and greedy decoding cannot
            # return several distinct sequences, so request sampling explicitly
            do_sample=True,
            temperature=temperature,
            num_return_sequences=num_generations,
        )
    # generate() returns prompt + continuation; keep only the newly generated tokens
    completion_ids = output_ids[:, input_ids.shape[-1]:]
    return [tokenizer.decode(ids, skip_special_tokens=True).strip() for ids in completion_ids]

In [6]:
def generate_text_openai(client, model_name, prompt, num_generations=1, temperature=1.0, seed=42):
    """Request ``num_generations`` chat completions for ``prompt`` and return their texts.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        model_name: Model identifier forwarded as ``model``.
        prompt: Text sent as the user message.
        num_generations: Number of choices requested (forwarded as ``n``).
        temperature: Sampling temperature forwarded to the API.
        seed: Seed forwarded to the API.

    Returns:
        A list with the message content of every returned choice, in order.
    """
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    request_params = {
        "model": model_name,
        "messages": chat_messages,
        "max_tokens": 256,
        "n": num_generations,
        "temperature": temperature,
        "seed": seed,
    }
    response = client.chat.completions.create(**request_params)

    generations = []
    for choice in response.choices:
        generations.append(choice.message.content)
    return generations

In [7]:
def filter_playscript(unfiltered_playscript):
    """Extract the alternating Alice/Bob dialogue lines from raw model output.

    Processing steps:
      1. Discard everything from the first ``[END]`` marker onwards.
      2. Drop blank lines.
      3. Skip any preamble before the first line that starts with ``Alice: `` or ``Bob: ``.
      4. Require the remaining lines to alternate Alice, Bob, Alice, ... and strip
         the speaker label from each.

    Args:
        unfiltered_playscript: Raw text produced by the generation model.

    Returns:
        A list of dialogue strings (speaker labels removed, whitespace stripped),
        in order; empty if the text contains no non-blank lines before ``[END]``.

    Raises:
        ValueError: If a line does not start with the speaker expected at its position.
    """
    script_text = unfiltered_playscript.split("[END]", 1)[0]
    lines = [line for line in script_text.split('\n') if line.strip() != '']

    # Skip preamble such as "Here is the conversation:" preceding the first spoken line
    speaker_prefixes = ("Alice: ", "Bob: ")
    for index, line in enumerate(lines):
        if line.lstrip().startswith(speaker_prefixes):
            lines = lines[index:]
            break

    dialogues = []
    for index, line in enumerate(lines):
        current_character = "Alice" if index % 2 == 0 else "Bob"
        label = f"{current_character}: "
        candidate = line.lstrip()
        # The line must *start* with the expected speaker; merely mentioning
        # "Bob: " inside one of Alice's lines must not count as Bob's turn.
        if not candidate.startswith(label):
            raise ValueError(f"The current line doesn't start with {current_character}: {line}")
        dialogues.append(candidate[len(label):].strip())
    return dialogues

In [8]:
def append_to_playscripts_csv(generated_playscripts_path, generation_model_name, temperature, seed, num_generations, premise_type, premise, alice_emotion, bob_emotion, unfiltered_playscript, dialogues, dialogues_length, error_message):
    """Append one generated playscript record to the results CSV.

    The file is created with a header row if it is missing or empty. If the file
    already has content whose first row differs from the expected header, that
    first row is replaced by the expected header (all other rows are kept).

    Args:
        generated_playscripts_path: CSV file to append to.
        generation_model_name, temperature, seed, num_generations, premise_type,
        premise, alice_emotion, bob_emotion: Metadata stored alongside the playscript.
        unfiltered_playscript: Raw model output; newlines are stored as the literal
            two-character sequence ``\\n`` so the value stays on one line.
        dialogues: Parsed dialogue lines (written via ``str()``), or "" on failure.
        dialogues_length: Number of parsed dialogue lines, or "" on failure.
        error_message: Parsing error text, or "" on success.
    """
    header = ["generation_model", "temperature", "seed", "num_generations", "premise_type", "premise", "alice_emotion", "bob_emotion", "unfiltered_playscript", "dialogues", "dialogues_length", "error_message"]
    unfiltered_playscript_single_line = unfiltered_playscript.replace('\n', '\\n')
    new_row = [generation_model_name, temperature, seed, num_generations, premise_type, premise, alice_emotion, bob_emotion, unfiltered_playscript_single_line, dialogues, dialogues_length, error_message]

    with open(generated_playscripts_path, mode='a+', newline='') as file:
        writer = csv.writer(file)
        # Peek at the first record only; None means the file is empty
        file.seek(0)
        existing_header = next(csv.reader(file), None)

        if existing_header is not None and existing_header != header:
            # Stale header: this is the only case that needs the whole file rewritten.
            # Rewriting on every append was quadratic and could lose all previous
            # rows if interrupted between truncate() and the rewrite.
            file.seek(0)
            existing_rows = list(csv.reader(file))
            existing_rows[0] = header
            file.seek(0)
            file.truncate()
            writer.writerows(existing_rows)
        else:
            # Reposition at the end of the file before switching from reading to writing
            file.seek(0, os.SEEK_END)
            if existing_header is None:
                writer.writerow(header)

        writer.writerow(new_row)

In [9]:
def _generate_and_record_playscripts(formatted_prompt, generation_model_type, generation_model_name, num_generations, temperature, seed, generated_playscripts_path, premise_type, premise, alice_emotion, bob_emotion, model=None, tokenizer=None, client=None):
    """Generate playscripts for one prompt and append every result (parsed or failed) to the CSV."""
    if generation_model_type == "HF":
        unfiltered_playscripts = generate_text_HF(model, tokenizer, formatted_prompt, num_generations, temperature)
    else:
        unfiltered_playscripts = generate_text_openai(client, generation_model_name, formatted_prompt, num_generations, temperature, seed)

    for unfiltered_playscript in unfiltered_playscripts:
        try:
            dialogues = filter_playscript(unfiltered_playscript)
            dialogues_length = len(dialogues)
            error_message = ""
        except ValueError as e:
            # Keep unparseable generations in the CSV, with the parsing error recorded
            dialogues, dialogues_length, error_message = "", "", str(e)
        append_to_playscripts_csv(generated_playscripts_path, generation_model_name, temperature, seed, num_generations, premise_type, premise, alice_emotion, bob_emotion, unfiltered_playscript, dialogues, dialogues_length, error_message)


def generate_playscripts(premise_type, generation_model_type, generation_model_name, num_generations, temperature, seed, example_playscripts_path, generated_playscripts_path, premises_path, emotions=None, model=None, tokenizer=None, client=None):
    """Generate playscripts for every premise of ``premise_type`` and log them to a CSV.

    * ``premise_type == "creative"``: one few-shot prompt per premise, with no emotion
      control (both emotions are recorded as "generic").
    * ``premise_type == "neutral"``: one prompt per premise for the ("generic", "generic")
      pairing plus every ordered pair of ``emotions``.
    * Any other ``premise_type``: nothing is generated.

    Args:
        premise_type: Value matched against the ``premise_type`` column of the premises CSV.
        generation_model_type: "HF" (uses ``model``/``tokenizer``) or "openai" (uses ``client``).
        generation_model_name: Model name recorded in the CSV and sent to the OpenAI API.
        num_generations: Number of generations requested per prompt.
        temperature: Sampling temperature.
        seed: Seed applied to torch/numpy and forwarded to the OpenAI API; None skips local seeding.
        example_playscripts_path: CSV of few-shot examples (used for "creative" premises).
        generated_playscripts_path: CSV that results are appended to.
        premises_path: CSV with at least ``premise_type`` and ``premise`` columns.
        emotions: Emotions to combine for "neutral" premises; None is treated as no emotions.
        model, tokenizer: Hugging Face model and tokenizer (for "HF").
        client: OpenAI client (for "openai").

    Raises:
        ValueError: If ``generation_model_type`` is neither "HF" nor "openai".
    """
    if generation_model_type not in ("HF", "openai"):
        raise ValueError(f"Unknown generation_model_type: {generation_model_type!r} (expected 'HF' or 'openai')")

    if seed is not None:
        torch.manual_seed(seed)
        np.random.seed(seed)
        torch.cuda.manual_seed_all(seed)

    premises_df = pd.read_csv(premises_path)
    premise_list = premises_df.loc[premises_df['premise_type'].eq(premise_type), 'premise'].tolist()

    # The emotion pairings do not depend on the premise, so build them once
    emotion_list = list(emotions) if emotions is not None else []
    emotion_combinations = [["generic", "generic"]]
    emotion_combinations.extend([emotion1, emotion2] for emotion1 in emotion_list for emotion2 in emotion_list)

    for i, premise in enumerate(premise_list):
        # Report progress against the premises actually being processed, not the whole file
        print(f"Generating playscript for premise {i+1} of {len(premise_list)}: {premise}")
        if premise_type == "creative":
            formatted_prompt = format_prompt_generic_emotion(premise, example_playscripts_path)
            _generate_and_record_playscripts(formatted_prompt, generation_model_type, generation_model_name, num_generations, temperature, seed, generated_playscripts_path, premise_type, premise, "generic", "generic", model=model, tokenizer=tokenizer, client=client)
        elif premise_type == "neutral":
            for alice_emotion, bob_emotion in emotion_combinations:
                formatted_prompt = format_prompt_controlled_emotion(premise, alice_emotion, bob_emotion)
                _generate_and_record_playscripts(formatted_prompt, generation_model_type, generation_model_name, num_generations, temperature, seed, generated_playscripts_path, premise_type, premise, alice_emotion, bob_emotion, model=model, tokenizer=tokenizer, client=client)

In [10]:
def edit_generated_playscripts(generated_playscripts_path, edited_playscripts_path, bad_row_index=82, replacement_row_index=81):
    """Copy a CSV, overwriting one row with a copy of another row.

    The defaults reproduce the original manual fix: row 83 threw an error when
    generating (generic emotion with Llama3_70B), so it is replaced with row 82
    (indices 82 and 81 in 0-based indexing).

    Args:
        generated_playscripts_path: CSV file to read.
        edited_playscripts_path: CSV file the edited rows are written to.
        bad_row_index: 0-based index (header row is index 0) of the row to overwrite.
        replacement_row_index: 0-based index of the row to copy over it.

    Raises:
        IndexError: If either index is outside the rows of the file.
    """
    with open(generated_playscripts_path, mode='r', newline='') as file:
        rows = list(csv.reader(file))

    # Fail with an explicit message before anything is written
    for index in (bad_row_index, replacement_row_index):
        if not 0 <= index < len(rows):
            raise IndexError(f"Row index {index} is out of range for a CSV with {len(rows)} rows")

    rows[bad_row_index] = list(rows[replacement_row_index])

    # Save the edited version to a new CSV file
    with open(edited_playscripts_path, mode='w', newline='') as file:
        csv.writer(file).writerows(rows)


In [11]:
def convert_csv_format(read_file_path, write_file_path):
    """Rewrite a playscripts CSV with the ``model_name`` column renamed and a fixed column order.

    ``model_name`` (if present) becomes ``generation_model``; the output contains
    exactly the twelve columns listed below, in that order, and no index column.

    NOTE(review): the output keeps the column name 'unfiltered_conversation', while
    append_to_playscripts_csv writes 'unfiltered_playscript' -- confirm which is intended.

    Args:
        read_file_path: CSV file to read.
        write_file_path: CSV file to write.
    """
    # An extended layout with probing columns (probing_model, probing_method,
    # stimulis_format, dialogue_concatenate, emotion_scores) was previously
    # sketched here but is disabled; only the generation columns are exported.
    output_columns = [
        'generation_model',
        'temperature',
        'seed',
        'num_generations',
        'premise_type',
        'premise',
        'alice_emotion',
        'bob_emotion',
        'unfiltered_conversation',
        'dialogues',
        'dialogues_length',
        'error_message',
    ]
    source_df = pd.read_csv(read_file_path)
    converted_df = source_df.rename(columns={'model_name': 'generation_model'})[output_columns]
    converted_df.to_csv(write_file_path, index=False)


In [12]:
# Strip whitespace from each dialogue in the "dialogues" column
def strip_whitespace_from_dialogues(read_path, write_path):
    """Strip surrounding whitespace from every dialogue in the "dialogues" column.

    Each non-empty "dialogues" cell is expected to hold the ``str()`` of a list of
    strings; it is parsed, each element is stripped, and the list is written back
    in the same representation. Empty cells (rows whose generation failed) are
    left untouched.

    Args:
        read_path: CSV file to read.
        write_path: CSV file to write (no index column).
    """
    # Load the generated playscripts CSV
    df = pd.read_csv(read_path)

    def strip_dialogues(dialogues_str):
        # Failed generations are stored with an empty "dialogues" cell, which pandas
        # loads as NaN; ast.literal_eval would raise on it, so pass it through as-is
        if not isinstance(dialogues_str, str) or dialogues_str.strip() == "":
            return dialogues_str
        dialogues_list = ast.literal_eval(dialogues_str)
        return str([dialogue.strip() for dialogue in dialogues_list])

    df["dialogues"] = df["dialogues"].apply(strip_dialogues)

    # Save the modified DataFrame back to CSV
    df.to_csv(write_path, index=False)

In [13]:
# File paths shared by the generation cells below (this cell only defines paths; it generates nothing)
# Few-shot example playscripts, read by format_prompt_generic_emotion
example_playscripts_path = "../data/permanent/example_playscripts.csv"
# Output CSV that append_to_playscripts_csv appends every generated playscript to
generated_playscripts_path = "../data/temp/generated_playscripts.csv"
# Premises CSV read by generate_playscripts (uses its 'premise_type' and 'premise' columns)
premises_path = "../data/temp/premises.csv"

In [None]:
# generation_model_name = "llama3_8b_instruct"
# generation_model_HF = model_name_dict[generation_model_name]
# generation_model = AutoModelForCausalLM.from_pretrained(generation_model_HF, device_map="auto", low_cpu_mem_usage = True, torch_dtype=torch.float16, trust_remote_code=True)
# tokenizer = AutoTokenizer.from_pretrained(generation_model_HF, trust_remote_code=True)

# generate_playscripts("creative", "HF", generation_model_name, 4, 0.7, 42, example_playscripts_path, generated_playscripts_path, premises_path, emotions, generation_model, tokenizer, client=None)

In [14]:
# Controlled-emotion run: "neutral" premises, generated through the OpenAI API.
# The API key is taken from the OPENAI_API_KEY environment variable.
generation_model_name = "gpt-4o-mini-2024-07-18"
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
generate_playscripts(
    premise_type="neutral",
    generation_model_type="openai",
    generation_model_name=generation_model_name,
    num_generations=1,
    temperature=1,
    seed=42,
    example_playscripts_path=example_playscripts_path,
    generated_playscripts_path=generated_playscripts_path,
    premises_path=premises_path,
    emotions=emotions,
    model=None,
    tokenizer=None,
    client=client,
)

Generating playscript for premise 1 of 1: Alice and Bob are tourists in a city. They discuss the attractions they plan to visit.


KeyboardInterrupt: 