In [1]:
import os
import pandas as pd

### Get real script lines for each character

In [2]:
def get_str_script_from_df(df):
    # convert df into a string episode script
    df = df.iloc[:, 2:].fillna('<scene>')
    
    # Concatenate the remaining columns into a single string for each row
    df['formatted_string'] = df.apply(lambda row: ': '.join(row.astype(str)), axis=1)
    
    # Combine all rows into a single string separated by line breaks
    final_string = '\n'.join(df['formatted_string'])
    return final_string.replace("\'", "`").lower()

In [3]:
SEASONS = [8,9,10,11,12]
CHARACTERS = ["stan", "kyle", "cartman", "butters"]

cartman_data = []
stan_data = []
kyle_data = []
butters_data = []

for SEASON in SEASONS:
    # get all .csv files in correct season dir
    episodes = list(filter(lambda file: file[-4:] == ".csv", os.listdir(os.path.join(".", "episodes_csv", f"s{SEASON}"))))
    print(f"Season {SEASON}!")
    for episode in episodes:
        df = pd.read_csv(os.path.join(".", "episodes_csv", f"s{SEASON}", episode), delimiter=';', header=None)
        str_script = get_str_script_from_df(df)

        lines = str_script.split("\n")
        
        for line in lines:
            try:
                split_line = line.split(": ")
                character = split_line[0]
                char_line = split_line[1]
                
                if character not in CHARACTERS:
                    continue
                    
                if character == "cartman":
                    cartman_data.append(char_line)
                elif character == "stan":
                    stan_data.append(char_line)
                elif character == "kyle":
                    kyle_data.append(char_line)
                elif character == "butters":
                    butters_data.append(char_line)
            except:
                continue

Season 8!
Season 9!
Season 10!
Season 11!
Season 12!


### Get generated script lines for each character

In [4]:
gen_eps_path = os.path.join(".", "gen_eps")
dirs = list(filter(lambda filename: filename[0] == "2", os.listdir(gen_eps_path)))

cartman_data_gen = []
stan_data_gen = []
kyle_data_gen = []
butters_data_gen = []

for dir in dirs:
    file = os.path.join(gen_eps_path, dir, "ep.txt")
    with open(file, "r") as f:
        script = f.read()
    lines = script.split("\n")
    for line in lines:
        split_line = line.split(": ")
        if len(split_line) < 2:
            continue
        character = split_line[0]
        char_line = split_line[1]

        if character == "cartman":
            cartman_data_gen.append(char_line)
        elif character == "stan":
            stan_data_gen.append(char_line)
        elif character == "butters":
            butters_data_gen.append(char_line)
        elif character == "kyle":
            kyle_data_gen.append(char_line)

In [5]:
print(f"butters: {len(butters_data_gen)}, stan: {len(stan_data_gen)}, cartman: {len(cartman_data_gen)}, kyle: {len(kyle_data_gen)}")

butters: 76, stan: 194, cartman: 301, kyle: 67


In [14]:
import re
def preprocess_text(txt: str):
    txt = txt.lower()
    # remove punctuation
    txt = re.sub(r'[^\w\s]','',txt)
    return txt

In [19]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joachimmaksim/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [53]:
from nltk.tokenize import sent_tokenize

def find_average_sentence_length(txt: str):
    sentences = sent_tokenize(txt) 
    sentence_lengths = list(map(lambda s: len(s), sentences))
    return sum(sentence_lengths) / len(sentence_lengths)

def find_average_word_length(txt: str):
    words = txt.split(" ")
    word_lengths = list(map(lambda w: len(w), words))
    return sum(word_lengths) / len(word_lengths)

In [61]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=False)

butters_gen_combined = " ".join(butters_data_gen)
cartman_gen_combined = " ".join(cartman_data_gen)
stan_gen_combined = " ".join(stan_data_gen)
kyle_gen_combined = " ".join(kyle_data_gen)

butters_combined = " ".join(butters_data)


gen_sentence_len = find_average_sentence_length(butters_gen_combined)
real_sentence_len = find_average_sentence_length(butters_combined)
print(f"sentence length (gen / real) = \t{gen_sentence_len} / {real_sentence_len} = {gen_sentence_len / real_sentence_len}")

gen_word_len = find_average_word_length(preprocess_text(butters_gen_combined))
real_word_len = find_average_word_length(preprocess_text(butters_combined))
print(f"word length (gen / real) = \t{gen_word_len} / {real_word_len} \t= {gen_word_len / real_word_len}")

print("gen Butters similarity to butters (rouge-1 precision):")
print(scorer.score(preprocess_text(butters_combined), preprocess_text(butters_gen_combined))["rouge1"].precision)

print("gen Cartman similarity to butters (rouge-1 precision):")
print(scorer.score(preprocess_text(butters_combined), preprocess_text(cartman_gen_combined))["rouge1"].precision)

print("gen Kyle similarity to butters (rouge-1 precision):")
print(scorer.score(preprocess_text(butters_combined), preprocess_text(kyle_gen_combined))["rouge1"].precision)

print("gen Stan similarity to butters (rouge-1 precision):")
print(scorer.score(preprocess_text(butters_combined), preprocess_text(stan_gen_combined))["rouge1"].precision)



sentence length (gen / real) = 	38.266129032258064 / 41.048050139275766 = 0.9322276917520159
word length (gen / real) = 	3.9554347826086955 / 4.072424407025167 	= 0.9712727327204262
gen Butters similarity to butters (rouge-1 precision):
0.816304347826087
gen Cartman similarity to butters (rouge-1 precision):
0.7084901030590545
gen Kyle similarity to butters (rouge-1 precision):
0.776685393258427
gen Stan similarity to butters (rouge-1 precision):
0.7957911145752143
