In [2]:
import requests
import ast
from bs4 import BeautifulSoup
import os
import pandas as pd
import re
import json

In [3]:
with open('synopsis_mappings.txt', 'r') as file:
    dict_str = file.read()
synopsis_dict = ast.literal_eval(dict_str)

def get_episode_synopsis(episode_filename: str, season: int):
    episode_number = episode_filename.split("_")[0][1:]
    if (len(episode_number) == 1):
        episode_number = "0" + episode_number
    episode_code = str(season) + episode_number
    return synopsis_dict[episode_code]
        

In [4]:
def get_str_script_from_df(df):
    # convert df into a string episode script
    df = df.iloc[:, 2:].fillna('<scene>')
    
    # Concatenate the remaining columns into a single string for each row
    df['formatted_string'] = df.apply(lambda row: ': '.join(row.astype(str)), axis=1)
    
    # Combine all rows into a single string separated by line breaks
    final_string = '\n'.join(df['formatted_string'])
    return "plot: " + synopsis + "\n" + final_string.replace("\'", "`").lower()

In [5]:
# get api key from file
API_KEY = ""
with open("key.secret", "r") as f:
    API_KEY = f.read()

from openai import OpenAI
client = OpenAI(api_key=API_KEY)
    
def is_line_offensive(line: str):
    response = client.moderations.create(input=line)
    return response.results[0].flagged

In [10]:
SEASONS = [8,9,10,11,12]
CHARACTERS = ["stan", "kyle", "cartman", "butters"]

char_descriptions = {
    "cartman": "Eric Cartman embodies both extreme negativity and occasional glimpses of a more nuanced personality. He is often described as evil, vicious, angry, self-absorbed, immature, destructive, sarcastic, snooty, loud-mouthed, lazy, and seemingly mentally unbalanced. Cartman is infamous for his foul language, racist views, and tendency to stereotype others. His dark personality often suggests a severe mental imbalance. He lacks moral responsibility and social conscience, deriving pleasure from the misfortunes of others and usually showing a lack of empathy.His extreme actions are sometimes interpreted as a defense mechanism against his insecurities and possibly confused sexuality. Cartman's actions often parody the cliché of a typical villain becoming temporarily benevolent. His seemingly good intentions often conceal ulterior motives, such as financial gain or personal vendettas, particularly against his Jewish friend Kyle. Apart from his anti-Semitism, Cartman also harbors prejudice against racial, ethnic, religious minorities, people of lower income like his friend Kenny, and red-haired, light-skinned, freckled individuals (anti-ginger sentiment). He particularly loathes hippies and liberals. Cartman is an orderly individual, using rules and order to manipulate others to his advantage. He idolizes figures who share his bigotries and aims. He employs both law and chaos for personal gain. While often cold-hearted and selfish, Cartman occasionally shows a warm-hearted, caring side, though this is usually a façade for a selfish scheme.",
    "stan": "Stan Marsh from 'South Park' is characterized as the most tender and sensitive among the main characters. He often displays a high moral compass, challenging unethical practices and confronting dishonesty. His sensitivity is highlighted through his reactions to personal losses and his deep empathy for animals, showcasing his strong animal rights stance.Stan is known for his clarity in understanding scams and corporate corruption, often seeing through falsely glorified practices and celebrities. This skepticism might stem from his distrust of adults, influenced by his experiences with his often immature and incompetent father.He shares a close bond with Kyle, but they have distinct personalities. While they sometimes interchange roles of being the more gullible or skeptical one, their friendship remains a constant in the series.A recurring theme in Stan's character is his struggle with depression. He experiences profound sadness in response to personal events, notably in relationships, and often adopts a cynical worldview during these periods.Overall, Stan is a complex character, balancing his tender-hearted nature with a sharp awareness of the world's harsh realities. His actions often reflect a mix of moral integrity, emotional depth, and a critical view of societal issues.",
    "kyle": "Kyle Broflovski from 'South Park' is characterized by his strong morals, firm beliefs, and a compassionate, if somewhat temperamental, personality. He is particularly quick to anger when interacting with his antagonist, Cartman, often finding himself coaxed into conflicts due to his emotional nature. Despite this, Kyle generally bases his decisions on his beliefs and emotions, sometimes leading to manipulation by others. Kyle is known for his moral standing, often opposing actions he deems wrong or evil, though he can be pressured by peers into participating. His morality is not infallible; he has abandoned his principles for monetary gain, only to self-reflect and change his stance upon realizing the error of his ways. Kyle's fraternal instinct is strong, especially towards his adopted brother Ike, whom he goes to great lengths to protect and care for. His compassionate side extends beyond his immediate circle, as he often shows empathy towards others, unlike many of his peers. However, Kyle can also display mean and neurotic traits. Intelligent and sensible, Kyle is usually the voice of reason among his friends, often finding logical solutions to problems, though these are sometimes overshadowed by more absurd ideas from his peers.",
    "butters": "Butters Stotch from 'South Park' stands out with his uniquely innocent and warm personality in a show known for its cynicism and mature humor. He embodies the traits of sweetness, naivety, and gullibility, contrasting sharply with the other more adult-like children in the series. Butters often displays a childlike demeanor, marked by a genuine niceness and a lack of the usual cruelty or confidence found in his peers. This makes him both endearing and vulnerable, as he can be easily taken advantage of. Unlike most characters in 'South Park', Butters rarely uses foul language, instead opting for whimsical euphemisms. He is committed to self-improvement and has talents like tap dancing, but his extremely low self-esteem often hinders him from recognizing or effectively using his abilities. This low self-esteem also leads to a perpetual fear of being grounded. Socially considered a dork, Butters is consistently seeking approval and acceptance, making him a relatable and sympathetic character."
}
manager_description = "You are a bot controlling the speaking pattern for a system that is going to be generating South Park scripts. Given input text, containing information about previous lines in the episode, your task is to respond with a single word, that is the name of the character who is going to speak next. The only characters you can say are 'cartman', 'stan', 'kyle' or 'butters'. You cannot ask multiple agents to speak simultaneously."
# _data vars are arrays of dictionaries like this:
# {"messages": [
# {"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, 
# {"role": "user", "content": "What's the capital of France?"}, 
# {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}
# ]}
# system provides agent context. user provides script context (prev_lines)
cartman_data = []
stan_data = []
kyle_data = []
butters_data = []
manager_data = []

for SEASON in SEASONS:
    # get all .csv files in correct season dir
    episodes = list(filter(lambda file: file[-4:] == ".csv", os.listdir(os.path.join(".", "episodes_csv", f"s{SEASON}"))))
    print(f"Season {SEASON}!")
    for episode in episodes:
        print("episode " + str(episode))
        df = pd.read_csv(os.path.join(".", "episodes_csv", f"s{SEASON}", episode), delimiter=';', header=None)
        synopsis = get_episode_synopsis(episode, SEASON)
        str_script = get_str_script_from_df(df)

        # Generate input-output tuples
        lines = str_script.split("\n")
        prev_lines = []  # store context
        
        for line in lines:
            try:
                split_line = line.split(": ")
                character = split_line[0]
                char_line = split_line[1]

                # check if the line is offensive. 
                # If yes, go to next episode
                if is_line_offensive(char_line):
                    break
                
                input = "\n".join(prev_lines)
                output = char_line
            
                prev_lines.append(line)
                
                if character not in char_descriptions:
                    continue
                
                message = {
                    "messages": [
                            {"role": "system", "content": char_descriptions[character]},
                            {"role": "user", "content": input},
                            {"role": "assistant", "content": output}
                        ]
                }
                manager_message = {
                    "messages": [
                            {"role": "system", "content": manager_description},
                            {"role": "user", "content": input},
                            {"role": "assistant", "content": character}
                        ]
                }
                manager_data.append(manager_message)

                if character == "cartman":
                    cartman_data.append(message)
                elif character == "stan":
                    stan_data.append(message)
                elif character == "kyle":
                    kyle_data.append(message)
                elif character == "butters":
                    butters_data.append(message)
            except:
                continue

Season 8!
episode e12_stupid-spoiled-whore-video-playset.csv
episode e7_goobacks.csv
episode e9_something-wall-mart-this-way-comes.csv
episode e11_quest-for-ratings.csv
episode e14_woodland-critter-christmas.csv
episode e5_awesom-o.csv
episode e3_the-passion-of-the-jew.csv
episode e10_pre-school.csv
episode e13_cartman-s-incredible-gift.csv
episode e1_good-times-with-weapons.csv
episode e6_the-jeffersons.csv
episode e4_you-got-f-d-in-the-a.csv
episode e8_douche-and-turd.csv
episode e2_up-the-down-steroid.csv
Season 9!
episode e4_best-friends-forever.csv
episode e11_ginger-kids.csv
episode e6_the-death-of-eric-cartman.csv
episode e9_marjorine.csv
episode e12_trapped-in-the-closet.csv
episode e7_erection-day.csv
episode e3_wing.csv
episode e2_die-hippie-die.csv
episode e5_the-losing-edge.csv
episode e1_mr-garrison-s-fancy-new-vagina.csv
episode e13_free-willzyx.csv
episode e10_follow-that-egg.csv
episode e8_two-days-before-the-day-after-tomorrow.csv
episode e14_bloody-mary.csv
Season 10!

In [11]:
# save all to file
def save_data_to_file(data: list, save_filepath: str):
    full_str = ""
    for line in data:
        json_string = json.dumps(line, separators=(',', ':'))
        full_str += json_string + "\n"
    with open(save_filepath, 'w') as file:
        file.write(full_str)

In [12]:
TUNING_EXAMPLES = 10000 
EVAL_EXAMPLES = 10000

In [13]:
# save tuning data
save_data_to_file(cartman_data[:TUNING_EXAMPLES], os.path.join("tuning_data", "cartman_it.jsonl")) 
save_data_to_file(stan_data[:TUNING_EXAMPLES], os.path.join("tuning_data", "stan_it.jsonl"))
save_data_to_file(kyle_data[:TUNING_EXAMPLES], os.path.join("tuning_data", "kyle_it.jsonl"))
save_data_to_file(butters_data[:TUNING_EXAMPLES], os.path.join("tuning_data", "butters_it.jsonl"))
save_data_to_file(manager_data[:TUNING_EXAMPLES], os.path.join("tuning_data", "manager_it.jsonl"))

In [61]:
# save evaluation data
save_data_to_file(cartman_data[TUNING_EXAMPLES + 1:TUNING_EXAMPLES + 1 + EVAL_EXAMPLES], os.path.join("tuning_data", "cartman_eval.txt")) 
save_data_to_file(stan_data[TUNING_EXAMPLES + 1:TUNING_EXAMPLES + 1 + EVAL_EXAMPLES], os.path.join("tuning_data", "stan_eval.txt"))
save_data_to_file(kyle_data[TUNING_EXAMPLES + 1:TUNING_EXAMPLES + 1 + EVAL_EXAMPLES], os.path.join("tuning_data", "kyle_eval.txt"))
save_data_to_file(butters_data[TUNING_EXAMPLES + 1:TUNING_EXAMPLES + 1 + EVAL_EXAMPLES], os.path.join("tuning_data", "butters_eval.txt"))
save_data_to_file(manager_data[TUNING_EXAMPLES + 1:TUNING_EXAMPLES + 1 + EVAL_EXAMPLES], os.path.join("tuning_data", "manager_eval.txt"))