In [1]:
import os 

# Switch the working directory to the parent folder — presumably so the relative
# data paths and the `scripts.*` imports used in later cells resolve from there.
# NOTE(review): this is relative, so re-running the cell moves up one more level
# each time; run it exactly once per kernel session.
os.chdir("..")

### Setting 2


For each sample in the evaluation set, provide the LLM with 5 writing samples by the same author, drawn at random from the training data, that belong to the same cluster as the evaluation sample.

In [3]:
import pandas as pd
from tqdm import tqdm
from scripts.utils import count_words, round_up_to_nearest_10, list_writing_samples
from scripts.prompt_templates import get_prompt_template_for_writing_setting1


def create_writing_prompts_setting2(training_df_fp, 
                                    evaluation_df_fp, 
                                    genre,
                                    author_col="author", 
                                    text_col="text", 
                                    summary_col="summary", 
                                    num_exemplars=5,
                                    random_state=None):
    '''Create Setting 2 writing prompts for every row of the evaluation set.

    For each evaluation sample, ``num_exemplars`` training texts are drawn at
    random (without replacement) from the rows written by the same author AND
    assigned to the same cluster as the evaluation sample. The exemplars, the
    genre, the target length (the evaluation text's word count rounded by
    ``round_up_to_nearest_10``) and the evaluation summary are substituted
    into the template returned by ``get_prompt_template_for_writing_setting1``.

    Args:
        training_df_fp: Path of the training CSV; must contain ``author_col``,
            ``text_col`` and a ``cluster`` column.
        evaluation_df_fp: Path of the evaluation CSV; must contain
            ``author_col``, ``text_col``, ``summary_col`` and ``cluster``.
        genre: Genre name substituted into the prompt template.
        author_col: Name of the author column in both CSVs.
        text_col: Name of the text column in both CSVs.
        summary_col: Name of the summary column in the evaluation CSV.
        num_exemplars: Number of training texts to include in each prompt.
        random_state: Optional seed (or ``numpy.random.Generator``) that makes
            the exemplar selection reproducible. ``None`` keeps the previous
            behaviour of drawing from NumPy's global random state.

    Returns:
        The evaluation DataFrame with two added columns:
        ``"training sample indices"`` (comma-separated training row labels of
        the chosen exemplars) and ``"prompt"`` (the filled-in prompt).

    Raises:
        AssertionError: If an author has fewer than ``num_exemplars`` training
            rows, or a required column is missing.
        ValueError: If the (author, cluster) pool of an evaluation row holds
            fewer than ``num_exemplars`` training rows.
    '''
    training_df = pd.read_csv(training_df_fp)
    evaluation_df = pd.read_csv(evaluation_df_fp)

    assert training_df[author_col].value_counts().min() >= num_exemplars, \
        f"Each author must have at least {num_exemplars} samples in the training set."

    assert summary_col in evaluation_df.columns, \
        f"Summary column '{summary_col}' not found in evaluation DataFrame."

    assert "cluster" in training_df.columns, \
        "Cluster column 'cluster' not found in training DataFrame."

    assert "cluster" in evaluation_df.columns, \
        "Cluster column 'cluster' not found in evaluation DataFrame."

    # One shared generator gives every row a different but reproducible draw.
    # With no seed we pass None through so pandas keeps using the global state.
    if random_state is None:
        rng = None
    else:
        import numpy as np
        rng = np.random.default_rng(random_state)

    prompt_tmp = get_prompt_template_for_writing_setting1()

    # Collect per-row results and attach them as columns once at the end.
    sample_indices = []
    prompts = []

    print("Generating prompts...")
    for _, row in tqdm(evaluation_df.iterrows(), total=len(evaluation_df)):

        author = row[author_col]
        summary = row[summary_col]
        cluster = row["cluster"]

        num_words = round_up_to_nearest_10(count_words(row[text_col]))

        in_pool = (training_df[author_col] == author) & (training_df["cluster"] == cluster)
        pool = training_df.loc[in_pool, text_col]
        # The per-author assertion above does not guarantee that every
        # (author, cluster) pool is large enough, so check it explicitly.
        if len(pool) < num_exemplars:
            raise ValueError(
                f"Author {author!r} has only {len(pool)} training sample(s) in "
                f"cluster {cluster!r}; {num_exemplars} are required."
            )
        samples = pool.sample(num_exemplars, random_state=rng)

        writing_samples = list_writing_samples(samples)
        prompts.append(prompt_tmp.substitute(writing_samples=writing_samples,
                                             genre=genre, num_words=num_words,
                                             summary=summary))
        sample_indices.append(",".join(str(train_ix) for train_ix in samples.index))

    evaluation_df["training sample indices"] = sample_indices
    evaluation_df["prompt"] = prompts

    return evaluation_df

In [32]:
def sanity_check():
    '''Verify that Setting 2 exemplars really come from the expected pool.

    For each dataset, prompts are generated with
    ``create_writing_prompts_setting2`` and the training rows referenced in
    the ``"training sample indices"`` column are looked up in the training
    CSV. Every referenced row must carry the same cluster and the same author
    as the evaluation row it was selected for.

    Raises:
        AssertionError: If any referenced training row has a different
            cluster or author than its evaluation row.
    '''
    datasets = ["CCAT50", "enron", "reddit", "blog"]

    for dataset in datasets:
        train_fp = f"dataset_followup/{dataset}_train.csv"
        # The genre only affects the prompt text, which this check does not inspect.
        evaluation_df = create_writing_prompts_setting2(
            training_df_fp=train_fp,
            evaluation_df_fp=f"dataset_followup/{dataset}_test.csv",
            genre="blog",
            author_col="author", 
            text_col="text", 
            summary_col="summary", 
            num_exemplars=5
        )
        train = pd.read_csv(train_fp)

        for _, row in evaluation_df.iterrows():
            ixes = [int(i) for i in row["training sample indices"].split(",")]
            cluster = row["cluster"]

            # Inspect the rows that were actually sampled, not all rows of the cluster.
            exemplars = train.loc[ixes]
            cluster_train = exemplars["cluster"].to_list()
            assert len(set(cluster_train)) == 1, \
                f"Cluster mismatch for author {row['author']} in dataset {dataset}. " \
                f"Expected cluster: {cluster}, Found clusters: {set(cluster_train)}"
            assert cluster_train[0] == cluster, \
                f"Cluster mismatch for author {row['author']} in dataset {dataset}. " \
                f"Expected cluster: {cluster}, Found clusters: {set(cluster_train)}"

            exemplar_authors = set(exemplars["author"])
            assert exemplar_authors == {row["author"]}, \
                f"Author mismatch in dataset {dataset}. " \
                f"Expected author: {row['author']}, Found authors: {exemplar_authors}"

In [33]:
sanity_check()

Generating prompts...


100%|██████████| 300/300 [00:00<00:00, 568.49it/s]


Generating prompts...


100%|██████████| 300/300 [00:00<00:00, 785.84it/s]


Generating prompts...


100%|██████████| 500/500 [00:00<00:00, 674.23it/s]


Generating prompts...


100%|██████████| 500/500 [00:00<00:00, 828.41it/s]


### Setting 3

In [None]:
import pandas as pd
from tqdm import tqdm
from scripts.utils import count_words, round_up_to_nearest_10, list_writing_samples
from scripts.prompt_templates import get_prompt_template_for_writing_setting1


def create_writing_prompts_setting3(training_df_fp, 
                                    evaluation_df_fp, 
                                    genre,
                                    author_col="author", 
                                    text_col="text", 
                                    summary_col="summary", 
                                    num_exemplars=5):
    '''Create Setting 3 writing prompts for every row of the evaluation set.

    For each evaluation sample, the ``num_exemplars`` training texts by the
    same author whose word count is closest to that of the evaluation text
    are selected (ties are broken by training-file order). The exemplars,
    the genre, the target length (the evaluation word count rounded by
    ``round_up_to_nearest_10``) and the evaluation summary are substituted
    into the template returned by ``get_prompt_template_for_writing_setting1``.

    Args:
        training_df_fp: Path of the training CSV; must contain ``author_col``
            and ``text_col``.
        evaluation_df_fp: Path of the evaluation CSV; must contain
            ``author_col``, ``text_col`` and ``summary_col``.
        genre: Genre name substituted into the prompt template.
        author_col: Name of the author column in both CSVs.
        text_col: Name of the text column in both CSVs.
        summary_col: Name of the summary column in the evaluation CSV.
        num_exemplars: Number of training texts to include in each prompt.

    Returns:
        The evaluation DataFrame with two added columns:
        ``"training sample indices"`` (comma-separated training row labels of
        the chosen exemplars, nearest first) and ``"prompt"``.

    Raises:
        AssertionError: If an author has fewer than ``num_exemplars`` training
            rows, or the summary column is missing.
    '''
    training_df = pd.read_csv(training_df_fp)
    evaluation_df = pd.read_csv(evaluation_df_fp)
    training_df["num_words"] = training_df[text_col].apply(count_words)

    assert training_df[author_col].value_counts().min() >= num_exemplars, \
        f"Each author must have at least {num_exemplars} samples in the training set."

    assert summary_col in evaluation_df.columns, \
        f"Summary column '{summary_col}' not found in evaluation DataFrame."

    prompt_tmp = get_prompt_template_for_writing_setting1()

    # Collect per-row results and attach them as columns once at the end.
    sample_indices = []
    prompts = []

    print("Generating prompts...")
    for _, row in tqdm(evaluation_df.iterrows(), total=len(evaluation_df)):

        author = row[author_col]
        summary = row[summary_col]
        num_words = count_words(row[text_col])

        # Filter first; no need to copy the whole training frame per row.
        author_rows = training_df[training_df[author_col] == author]
        wc_diff = (author_rows["num_words"] - num_words).abs()
        # A stable sort keeps training-file order among equally distant texts.
        nearest = wc_diff.sort_values(kind="stable").head(num_exemplars).index
        # Hand over only the texts, as Setting 2 does, not the full rows.
        samples = author_rows.loc[nearest, text_col]

        writing_samples = list_writing_samples(samples)
        prompts.append(prompt_tmp.substitute(writing_samples=writing_samples,
                                             genre=genre,
                                             num_words=round_up_to_nearest_10(num_words),
                                             summary=summary))
        sample_indices.append(",".join(str(train_ix) for train_ix in samples.index))

    evaluation_df["training sample indices"] = sample_indices
    evaluation_df["prompt"] = prompts

    return evaluation_df

In [36]:
# Load the saved Setting 2 prompts for the blog dataset and display the frame
# for a quick visual inspection (pandas truncates the rendered rows).
df = pd.read_csv("LLM_writing/Setting2/blog/prompts.csv")
df

Unnamed: 0,author,text,topic,gender,age,sign,date,AA-label,summary,cluster,training sample indices,prompt
0,1417798,"Good morning, ladies! How are...",indUnk,female,35,Scorpio,"17,September,2003",78,"The writer greets her friends from Panhandle, ...",0.0,20289156211766318419752,You will be given one or more writing samples ...
1,1417798,Rick Alert! Rick was just on...,indUnk,female,35,Scorpio,"17,October,2003",78,Rick appeared on Fox to discuss baseball and w...,0.0,7529335248032137114115,You will be given one or more writing samples ...
2,1417798,"Teri, I love how you transiti...",indUnk,female,35,Scorpio,"24,February,2004",78,The writer humorously comments on Teri's conve...,0.0,71971965787042052724567,You will be given one or more writing samples ...
3,1417798,Hey gals! Thanks so much for ...,indUnk,female,35,Scorpio,"20,August,2003",78,The writer thanks friends for their support af...,0.0,1240956767366629712940,You will be given one or more writing samples ...
4,1417798,"Hey gals, I've been busy with...",indUnk,female,35,Scorpio,"29,June,2003",78,The writer shares updates about a weekend spen...,0.0,2462159668614131043365,You will be given one or more writing samples ...
...,...,...,...,...,...,...,...,...,...,...,...,...
495,152151,happy new year everyone! so I'm ba...,Engineering,female,23,Libra,"02,January,2002",87,The writer is back in Berkeley and ready to st...,1.0,241238489194502400216309,You will be given one or more writing samples ...
496,152151,omg so much more reading to do.. i...,Engineering,female,23,Libra,"09,April,2003",87,The writer expresses stress over having a lot ...,1.0,33121304811568232117464,You will be given one or more writing samples ...
497,152151,okies quick quick recap. friday wi...,Engineering,female,23,Libra,"20,April,2003",87,"The author recounts their weekend: On Friday, ...",0.0,1973330761301317793574,You will be given one or more writing samples ...
498,152151,yay we're going boarding this weeke...,Engineering,female,23,Libra,"10,January,2003",87,The writer is excited about going boarding thi...,0.0,2297310593122862257720061,You will be given one or more writing samples ...
