In [1]:
import os 

# Step up one directory before anything else runs. NOTE(review): presumably the
# notebook lives one level below the project root, so that the `scripts.*`
# imports and the relative `dataset_followup/...` paths used in later cells
# resolve -- confirm. Re-running this cell moves up another level each time.
os.chdir("..")

### Setting 2


For each sample in the evaluation set, provide the LLM with 5 training samples written by the same author that belong to the same cluster as the evaluation sample.

In [3]:
import pandas as pd
from tqdm import tqdm
from scripts.utils import count_words, round_up_to_nearest_10, list_writing_samples
from scripts.prompt_templates import get_prompt_template_for_writing_setting1


def create_writing_prompts_setting2(training_df_fp, 
                                    evaluation_df_fp, 
                                    genre,
                                    author_col="author", 
                                    text_col="text", 
                                    summary_col="summary", 
                                    num_exemplars=5,
                                    random_state=None):
    '''Create one writing prompt per evaluation row from same-author, same-cluster exemplars.

    For every row of the evaluation CSV, ``num_exemplars`` training texts are
    drawn at random (without replacement) from the training rows that share the
    row's author AND its ``cluster`` value. The drawn texts, the genre, the
    summary and a target length of
    ``round_up_to_nearest_10(count_words(<evaluation text>))`` are substituted
    into the template returned by ``get_prompt_template_for_writing_setting1``.

    Args:
        training_df_fp: Path of the training CSV; needs ``author_col``,
            ``text_col`` and a ``cluster`` column.
        evaluation_df_fp: Path of the evaluation CSV; needs ``author_col``,
            ``text_col``, ``summary_col`` and a ``cluster`` column.
        genre: Value substituted for the template's ``genre`` field.
        author_col: Name of the author column in both CSVs.
        text_col: Name of the text column in both CSVs.
        summary_col: Name of the summary column in the evaluation CSV.
        num_exemplars: Number of training texts to put in each prompt.
        random_state: Optional seed. ``None`` (default) leaves sampling
            unseeded, exactly as before. Any other value seeds ONE NumPy
            generator shared by all draws, so a run is reproducible while
            different evaluation rows still receive different draws.

    Returns:
        The evaluation DataFrame with two added columns:
        ``"training sample indices"`` (comma-joined training row labels, in
        the order the texts were passed to the prompt) and ``"prompt"``.

    Raises:
        AssertionError: If an author has fewer than ``num_exemplars`` training
            rows, or a required column is missing.
        ValueError: If the (author, cluster) pool of an evaluation row holds
            fewer than ``num_exemplars`` training rows.
    '''
    # Function-scope import: numpy is only needed to build the seeded generator.
    import numpy as np

    training_df = pd.read_csv(training_df_fp)
    evaluation_df = pd.read_csv(evaluation_df_fp)

    assert training_df[author_col].value_counts().min() >= num_exemplars, \
        f"Each author must have at least {num_exemplars} samples in the training set."
    
    assert summary_col in evaluation_df.columns, \
        f"Summary column '{summary_col}' not found in evaluation DataFrame."
    
    assert "cluster" in training_df.columns, \
        "Cluster column 'cluster' not found in training DataFrame."
    
    assert "cluster" in evaluation_df.columns, \
        "Cluster column 'cluster' not found in evaluation DataFrame."

    prompt_tmp = get_prompt_template_for_writing_setting1()
    rng = None if random_state is None else np.random.default_rng(random_state)

    print("Generating prompts...")
    sample_indices = []
    prompts = []
    for _, row in tqdm(evaluation_df.iterrows(), total=len(evaluation_df)):
        author = row[author_col]
        cluster = row["cluster"]

        pool = training_df.loc[
            (training_df[author_col] == author) & (training_df["cluster"] == cluster),
            text_col,
        ]
        # The per-author assertion above says nothing about the narrower
        # (author, cluster) pool, so fail here with a message naming the pair
        # instead of pandas' generic "larger sample than population" error.
        if len(pool) < num_exemplars:
            raise ValueError(
                f"Author {author!r} has only {len(pool)} training samples in cluster "
                f"{cluster!r}; {num_exemplars} are required."
            )
        samples = pool.sample(num_exemplars, random_state=rng)

        prompt = prompt_tmp.substitute(
            writing_samples=list_writing_samples(samples),
            genre=genre,
            num_words=round_up_to_nearest_10(count_words(row[text_col])),
            summary=row[summary_col],
        )
        sample_indices.append(",".join(str(i) for i in samples.index))
        prompts.append(prompt)

    # Assign whole columns at once rather than growing them cell by cell.
    evaluation_df["training sample indices"] = sample_indices
    evaluation_df["prompt"] = prompts

    return evaluation_df

In [32]:
def sanity_check(datasets=("CCAT50", "enron", "reddit", "blog"), data_dir="dataset_followup"):
    '''Check that Setting 2 exemplars come from the evaluation row's author and cluster.

    For each dataset, prompts are generated with
    ``create_writing_prompts_setting2`` and the recorded
    ``"training sample indices"`` are looked up in the training CSV. Every
    looked-up row must carry the evaluation row's ``cluster`` and ``author``.

    Args:
        datasets: Dataset name prefixes; ``<name>_train.csv`` and
            ``<name>_test.csv`` are read from ``data_dir``.
        data_dir: Directory holding the CSV files.

    Raises:
        AssertionError: On the first evaluation row whose exemplars violate a check.
    '''
    num_exemplars = 5

    for dataset in datasets:
        train_fp = f"{data_dir}/{dataset}_train.csv"
        evaluation_df = create_writing_prompts_setting2(
            training_df_fp=train_fp,
            evaluation_df_fp=f"{data_dir}/{dataset}_test.csv",
            genre="blog",
            author_col="author", 
            text_col="text", 
            summary_col="summary", 
            num_exemplars=num_exemplars
        )
        # Re-read with the default index so the recorded labels address the
        # same rows the prompt builder sampled from.
        train = pd.read_csv(train_fp)

        for _, row in evaluation_df.iterrows():
            ixes = [int(i) for i in row["training sample indices"].split(",")]
            cluster = row["cluster"]

            assert len(set(ixes)) == num_exemplars, \
                f"Expected {num_exemplars} distinct training samples for author {row['author']} " \
                f"in dataset {dataset}, got: {ixes}"

            missing = [i for i in ixes if i not in train.index]
            assert not missing, \
                f"Training sample indices {missing} not found in training set of dataset {dataset}."

            # Inspect the rows that were actually sampled; filtering the
            # training set by `cluster` first would make the check vacuous.
            sampled = train.loc[ixes]

            found_clusters = set(sampled["cluster"])
            assert found_clusters == {cluster}, \
                f"Cluster mismatch for author {row['author']} in dataset {dataset}. " \
                f"Expected cluster: {cluster}, Found clusters: {found_clusters}"

            found_authors = set(sampled["author"])
            assert found_authors == {row["author"]}, \
                f"Author mismatch in dataset {dataset}. " \
                f"Expected author: {row['author']}, Found authors: {found_authors}"

In [33]:
# Run the Setting 2 sanity check; an AssertionError means a check failed.
# NOTE(review): `sanity_check` is defined again further down for Setting 6, so
# which version this call runs depends on cell execution order.
sanity_check()

Generating prompts...


100%|██████████| 300/300 [00:00<00:00, 568.49it/s]


Generating prompts...


100%|██████████| 300/300 [00:00<00:00, 785.84it/s]


Generating prompts...


100%|██████████| 500/500 [00:00<00:00, 674.23it/s]


Generating prompts...


100%|██████████| 500/500 [00:00<00:00, 828.41it/s]


### Setting 3

In [None]:
import pandas as pd
from tqdm import tqdm
from scripts.utils import count_words, round_up_to_nearest_10, list_writing_samples
from scripts.prompt_templates import get_prompt_template_for_writing_setting1


def create_writing_prompts_setting3(training_df_fp, 
                                    evaluation_df_fp, 
                                    genre,
                                    author_col="author", 
                                    text_col="text", 
                                    summary_col="summary", 
                                    num_exemplars=5):
    '''Create writing prompts whose exemplars are closest in length to the evaluation text.

    For each row of the evaluation CSV, the ``num_exemplars`` training texts by
    the same author whose word count (``count_words``) is nearest to that of
    the evaluation text are selected, nearest first. Ties keep training-file
    order. The selected texts, the genre, the summary and a target length of
    ``round_up_to_nearest_10(<evaluation word count>)`` are substituted into
    the template returned by ``get_prompt_template_for_writing_setting1``.

    Args:
        training_df_fp: Path of the training CSV; needs ``author_col`` and ``text_col``.
        evaluation_df_fp: Path of the evaluation CSV; needs ``author_col``,
            ``text_col`` and ``summary_col``.
        genre: Value substituted for the template's ``genre`` field.
        author_col: Name of the author column in both CSVs.
        text_col: Name of the text column in both CSVs.
        summary_col: Name of the summary column in the evaluation CSV.
        num_exemplars: Number of training texts to put in each prompt.

    Returns:
        The evaluation DataFrame with two added columns:
        ``"training sample indices"`` (comma-joined training row labels,
        nearest first) and ``"prompt"``.

    Raises:
        AssertionError: If an author has fewer than ``num_exemplars`` training
            rows, or ``summary_col`` is missing from the evaluation CSV.
    '''
    
    training_df = pd.read_csv(training_df_fp)
    evaluation_df = pd.read_csv(evaluation_df_fp)
    # Kept as a separate Series so an existing "num_words" column in the
    # training CSV is never overwritten.
    train_word_counts = training_df[text_col].apply(count_words)

    assert training_df[author_col].value_counts().min() >= num_exemplars, \
        f"Each author must have at least {num_exemplars} samples in the training set."
    
    assert summary_col in evaluation_df.columns, \
        f"Summary column '{summary_col}' not found in evaluation DataFrame."

    prompt_tmp = get_prompt_template_for_writing_setting1()

    print("Generating prompts...")
    sample_indices = []
    prompts = []
    for _, row in tqdm(evaluation_df.iterrows(), total=len(evaluation_df)):
        num_words = count_words(row[text_col])

        author_mask = training_df[author_col] == row[author_col]
        wc_diff = (train_word_counts[author_mask] - num_words).abs()
        # mergesort is stable, so equally distant texts are chosen
        # deterministically (training-file order).
        nearest = wc_diff.sort_values(kind="mergesort").index[:num_exemplars]

        # Pass only the texts, as the other settings do; iterating a whole
        # DataFrame would yield its column names instead of the samples.
        samples = training_df.loc[nearest, text_col]

        prompt = prompt_tmp.substitute(
            writing_samples=list_writing_samples(samples),
            genre=genre,
            num_words=round_up_to_nearest_10(num_words),
            summary=row[summary_col],
        )
        sample_indices.append(",".join(str(i) for i in samples.index))
        prompts.append(prompt)

    evaluation_df["training sample indices"] = sample_indices
    evaluation_df["prompt"] = prompts

    return evaluation_df

### Setting 5

### Setting 6

In [66]:
import pandas as pd
from tqdm import tqdm
from scripts.utils import count_words, round_up_to_nearest_10, list_writing_samples
from scripts.prompt_templates import get_prompt_template_for_writing_setting1


def create_writing_prompts_setting6(training_df_fp, 
                                    evaluation_df_fp, 
                                    genre,
                                    author_col="author", 
                                    text_col="text", 
                                    summary_col="summary", 
                                    nums_exemplars=(2, 4, 6, 8, 10),
                                    random_state=None):
    '''Create nested prompt sets with a varying number of same-author exemplars.

    For each row of the evaluation CSV, ``max(nums_exemplars)`` training texts
    by the same author are drawn once at random (without replacement). For
    every requested count ``n`` the prompt uses the first ``n`` texts of that
    draw, so a larger count always subsumes every smaller one. Each prompt is
    built from the template returned by
    ``get_prompt_template_for_writing_setting1`` with a target length of
    ``round_up_to_nearest_10(count_words(<evaluation text>))``.

    Args:
        training_df_fp: Path of the training CSV; needs ``author_col`` and ``text_col``.
        evaluation_df_fp: Path of the evaluation CSV; needs ``author_col``,
            ``text_col`` and ``summary_col``.
        genre: Value substituted for the template's ``genre`` field.
        author_col: Name of the author column in both CSVs.
        text_col: Name of the text column in both CSVs.
        summary_col: Name of the summary column in the evaluation CSV.
        nums_exemplars: Iterable of exemplar counts; processed largest first.
        random_state: Optional seed. ``None`` (default) leaves sampling
            unseeded, exactly as before. Any other value seeds ONE NumPy
            generator shared by all draws, making the run reproducible.

    Returns:
        One DataFrame holding a copy of the evaluation rows per exemplar count
        (largest count first, fresh 0..N-1 index) with the added columns
        ``"training sample indices"``, ``"prompt"`` and ``"num_exemplars"``.

    Raises:
        ValueError: If ``nums_exemplars`` is empty.
        AssertionError: If an author has fewer than ``max(nums_exemplars)``
            training rows, or ``summary_col`` is missing from the evaluation CSV.
    '''
    # Function-scope import: numpy is only needed to build the seeded generator.
    import numpy as np

    training_df = pd.read_csv(training_df_fp)
    evaluation_df = pd.read_csv(evaluation_df_fp)

    nums_exemplars = sorted(nums_exemplars, reverse=True)
    if not nums_exemplars:
        raise ValueError("nums_exemplars must contain at least one exemplar count.")
    max_exemplars = nums_exemplars[0]

    assert training_df[author_col].value_counts().min() >= max_exemplars, \
        f"Each author must have at least {max_exemplars} samples in the training set."
    
    assert summary_col in evaluation_df.columns, \
        f"Summary column '{summary_col}' not found in evaluation DataFrame."

    prompt_tmp = get_prompt_template_for_writing_setting1()
    rng = None if random_state is None else np.random.default_rng(random_state)

    print("Generating prompts...")
    out = []
    exemplars_map = {}  # evaluation row label -> its largest draw (Series of texts)
    for num_exemplar in nums_exemplars:
        sample_indices = []
        prompts = []
        for ix, row in tqdm(evaluation_df.iterrows(), total=len(evaluation_df)):
            if ix not in exemplars_map:
                # First (largest) pass: draw once; later passes reuse prefixes.
                author_texts = training_df.loc[training_df[author_col] == row[author_col], text_col]
                exemplars_map[ix] = author_texts.sample(max_exemplars, random_state=rng)

            # Explicit positional slice: the Series keeps integer training-row
            # labels, so plain [:n] would be ambiguous to read.
            samples = exemplars_map[ix].iloc[:num_exemplar]

            prompt = prompt_tmp.substitute(
                writing_samples=list_writing_samples(samples),
                genre=genre,
                num_words=round_up_to_nearest_10(count_words(row[text_col])),
                summary=row[summary_col],
            )
            sample_indices.append(",".join(str(i) for i in samples.index))
            prompts.append(prompt)

        block = evaluation_df.copy()
        block["training sample indices"] = sample_indices
        block["prompt"] = prompts
        block["num_exemplars"] = num_exemplar
        out.append(block)

    out_df = pd.concat(out, axis=0).reset_index(drop=True)
    return out_df

In [73]:
def sanity_check(datasets=("CCAT50", "enron", "reddit", "blog"), data_dir="dataset_followup"):
    '''Check that Setting 6 exemplar sets are nested across exemplar counts.

    For each dataset, prompts are generated with
    ``create_writing_prompts_setting6`` and, for each pair of consecutive
    exemplar counts, every evaluation row's smaller set of training indices
    must be contained in its larger set.

    Args:
        datasets: Dataset name prefixes; ``<name>_train.csv`` and
            ``<name>_test.csv`` are read from ``data_dir``.
        data_dir: Directory holding the CSV files.

    Raises:
        AssertionError: On the first violated check.
    '''
    for dataset in datasets:
        evaluation_df = create_writing_prompts_setting6(
            training_df_fp=f"{data_dir}/{dataset}_train.csv",
            evaluation_df_fp=f"{data_dir}/{dataset}_test.csv",
            genre="blog",
            author_col="author", 
            text_col="text", 
            summary_col="summary", 
            nums_exemplars=[2, 4, 6, 8, 10]
        )
        nums_exemplars = sorted(evaluation_df["num_exemplars"].unique())

        for n1, n2 in zip(nums_exemplars, nums_exemplars[1:]):
            # Rows of the two blocks are matched by position after the reset.
            sub_n1 = evaluation_df[evaluation_df["num_exemplars"]==n1].reset_index(drop=True)
            sub_n2 = evaluation_df[evaluation_df["num_exemplars"]==n2].reset_index(drop=True)
            assert len(sub_n1) == len(sub_n2), \
                f"Number of samples for num_exemplars {n1} and {n2} do not match. " \
                f"num_exemplars {n1}: {len(sub_n1)}, num_exemplars {n2}: {len(sub_n2)}"
            
            for ixes1, ixes2 in zip(sub_n1["training sample indices"], sub_n2["training sample indices"]):
                # Compare parsed index sets. A substring test on the joined
                # strings would accept e.g. "1,2" inside "31,2,5,6".
                ids1 = set(ixes1.split(","))
                ids2 = set(ixes2.split(","))

                assert len(ids1) == n1 and len(ids2) == n2, \
                    f"Unexpected number of distinct sample indices in dataset {dataset}. " \
                    f"num_exemplars {n1}: {ixes1}, num_exemplars {n2}: {ixes2}"

                assert ids1 <= ids2, \
                    f"Sample indices for num_exemplars {n1} not in num_exemplars {n2}. " \
                    f"Sample indices: {ixes1} not in {ixes2}"

In [74]:
# Run the Setting 6 sanity check (the most recent `sanity_check` definition);
# an AssertionError means the exemplar sets are not nested as expected.
sanity_check()

Generating prompts...


100%|██████████| 300/300 [00:00<00:00, 575.82it/s]
100%|██████████| 300/300 [00:00<00:00, 719.97it/s]
100%|██████████| 300/300 [00:00<00:00, 724.43it/s]
100%|██████████| 300/300 [00:00<00:00, 741.40it/s]
100%|██████████| 300/300 [00:00<00:00, 741.24it/s]


Generating prompts...


100%|██████████| 300/300 [00:00<00:00, 817.73it/s]
100%|██████████| 300/300 [00:00<00:00, 1198.14it/s]
100%|██████████| 300/300 [00:00<00:00, 1204.19it/s]
100%|██████████| 300/300 [00:00<00:00, 1221.04it/s]
100%|██████████| 300/300 [00:00<00:00, 1217.05it/s]


Generating prompts...


100%|██████████| 500/500 [00:00<00:00, 706.40it/s]
100%|██████████| 500/500 [00:00<00:00, 1182.29it/s]
100%|██████████| 500/500 [00:00<00:00, 1192.22it/s]
100%|██████████| 500/500 [00:00<00:00, 1200.58it/s]
100%|██████████| 500/500 [00:00<00:00, 1202.24it/s]


Generating prompts...


100%|██████████| 500/500 [00:00<00:00, 876.27it/s]
100%|██████████| 500/500 [00:00<00:00, 1163.89it/s]
100%|██████████| 500/500 [00:00<00:00, 1165.93it/s]
100%|██████████| 500/500 [00:00<00:00, 1178.35it/s]
100%|██████████| 500/500 [00:00<00:00, 1189.76it/s]
