In [1]:
import os 

# Move the working directory up one level; presumably this makes the relative paths used
# later in the notebook ("dataset_followup/...", "LLM_writing/...") resolve from the
# project root -- TODO confirm the notebook's location.
# NOTE: this is not idempotent. Re-running the cell moves up one more directory each time,
# so execute it exactly once per kernel session.
os.chdir("..")

### Setting 2


For each sample in the evaluation set, provide the LLM with 5 writing samples drawn at random from the same author's training data, restricted to the cluster that the evaluation sample belongs to.

In [3]:
import pandas as pd
from tqdm import tqdm
from scripts.utils import count_words, round_up_to_nearest_10, list_writing_samples
from scripts.prompt_templates import get_prompt_template_for_writing_setting1


def create_writing_prompts_setting2(training_df_fp, 
                                    evaluation_df_fp, 
                                    genre,
                                    author_col="author", 
                                    text_col="text", 
                                    summary_col="summary", 
                                    num_exemplars=5,
                                    random_state=None):
    """Build Setting-2 writing prompts.

    For every row of the evaluation CSV, `num_exemplars` training texts are drawn
    at random (without replacement) from rows that share both the evaluation row's
    author and its "cluster" value. The prompt is rendered with the template
    returned by `get_prompt_template_for_writing_setting1()`.

    Args:
        training_df_fp: Path of the training CSV (needs `author_col`, `text_col`, "cluster").
        evaluation_df_fp: Path of the evaluation CSV (needs `author_col`, `text_col`,
            `summary_col`, "cluster").
        genre: Value substituted for the template's `genre` field.
        author_col, text_col, summary_col: Column names.
        num_exemplars: Number of training texts to draw per evaluation row.
        random_state: Optional seed for reproducible draws. An integer seeds a single
            generator shared by all rows; a `numpy.random.RandomState`/`Generator` is
            used as-is; `None` (default) keeps the original behaviour of drawing from
            NumPy's global random state.

    Returns:
        The evaluation DataFrame with two added columns: "training sample indices"
        (comma-separated training-row index labels) and "prompt". Nothing is written
        to disk.

    Raises:
        AssertionError: If a required column is missing or some author has fewer than
            `num_exemplars` training rows overall.
        ValueError: If the (author, cluster) pool of an evaluation row holds fewer
            than `num_exemplars` training rows.
    """
    import numpy as np  # local import: numpy is not imported at the top of this cell

    training_df = pd.read_csv(training_df_fp)
    evaluation_df = pd.read_csv(evaluation_df_fp)

    assert training_df[author_col].value_counts().min() >= num_exemplars, \
        f"Each author must have at least {num_exemplars} samples in the training set."
    
    assert summary_col in evaluation_df.columns, \
        f"Summary column '{summary_col}' not found in evaluation DataFrame."
    
    assert "cluster" in training_df.columns, \
        f"Cluster column 'cluster' not found in training DataFrame."
    
    assert "cluster" in evaluation_df.columns, \
        f"Cluster column 'cluster' not found in evaluation DataFrame."

    # Turn an integer seed into ONE generator so successive rows get different draws;
    # passing the bare integer to every .sample() call would repeat the same positions.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    prompt_tmp = get_prompt_template_for_writing_setting1()

    print(f"Generating prompts...")
    sample_index_strings = []
    prompts = []
    for _, row in tqdm(evaluation_df.iterrows(), total=len(evaluation_df)):
        author = row[author_col]
        cluster = row["cluster"]

        same_author_and_cluster = (training_df[author_col] == author) & \
                                  (training_df["cluster"] == cluster)
        pool = training_df.loc[same_author_and_cluster, text_col]

        # The per-author assertion above does not guarantee enough rows per cluster,
        # so fail with an explicit message instead of pandas' generic sampling error.
        if len(pool) < num_exemplars:
            raise ValueError(
                f"Author {author!r} has only {len(pool)} training samples in cluster "
                f"{cluster!r}; {num_exemplars} are required."
            )

        samples = pool.sample(num_exemplars, random_state=rng)
        num_words = round_up_to_nearest_10(count_words(row[text_col]))

        prompts.append(prompt_tmp.substitute(writing_samples=list_writing_samples(samples),
                                             genre=genre, num_words=num_words,
                                             summary=row[summary_col]))
        sample_index_strings.append(",".join(str(i) for i in samples.index))

    evaluation_df["training sample indices"] = sample_index_strings
    evaluation_df["prompt"] = prompts

    return evaluation_df

In [32]:
def sanity_check():
    """Check that Setting-2 exemplars share the evaluation row's author and cluster.

    For each dataset, prompts are regenerated with `create_writing_prompts_setting2`
    and the recorded "training sample indices" are looked up in the training CSV.
    Every referenced training row must carry the same cluster and author as the
    evaluation row, and each row must reference `num_exemplars` distinct rows.

    Note: other cells of this notebook define functions with the same name
    `sanity_check`; whichever cell ran last wins.

    Raises:
        AssertionError: On the first evaluation row whose exemplars violate a check.
    """
    datasets = ["CCAT50", "enron", "reddit", "blog"]
    num_exemplars = 5

    for dataset in datasets:
        # `genre` only affects the prompt wording, which is not inspected here.
        evaluation_df = create_writing_prompts_setting2(
            training_df_fp=f"dataset_followup/{dataset}_train.csv",
            evaluation_df_fp=f"dataset_followup/{dataset}_test.csv",
            genre="blog",
            author_col="author", 
            text_col="text", 
            summary_col="summary", 
            num_exemplars=num_exemplars
        )
        train = pd.read_csv(f"dataset_followup/{dataset}_train.csv")

        for _, row in evaluation_df.iterrows():
            ixes = [int(i) for i in row["training sample indices"].split(",")]
            cluster = row["cluster"]

            assert len(set(ixes)) == num_exemplars, \
                f"Expected {num_exemplars} distinct training samples for author {row['author']} " \
                f"in dataset {dataset}, found indices: {ixes}"

            # Inspect the rows that were actually sampled, not the whole cluster.
            exemplars = train.loc[ixes]

            cluster_train = set(exemplars["cluster"])
            assert cluster_train == {cluster}, \
                f"Cluster mismatch for author {row['author']} in dataset {dataset}. " \
                f"Expected cluster: {cluster}, Found clusters: {cluster_train}"

            author_train = set(exemplars["author"])
            assert author_train == {row["author"]}, \
                f"Author mismatch for author {row['author']} in dataset {dataset}. " \
                f"Found authors: {author_train}"

In [33]:
sanity_check()

Generating prompts...


100%|██████████| 300/300 [00:00<00:00, 568.49it/s]


Generating prompts...


100%|██████████| 300/300 [00:00<00:00, 785.84it/s]


Generating prompts...


100%|██████████| 500/500 [00:00<00:00, 674.23it/s]


Generating prompts...


100%|██████████| 500/500 [00:00<00:00, 828.41it/s]


### Setting 3

In [None]:
import pandas as pd
from tqdm import tqdm
from scripts.utils import count_words, round_up_to_nearest_10, list_writing_samples
from scripts.prompt_templates import get_prompt_template_for_writing_setting1


def create_writing_prompts_setting3(training_df_fp, 
                                    evaluation_df_fp, 
                                    genre,
                                    author_col="author", 
                                    text_col="text", 
                                    summary_col="summary", 
                                    num_exemplars=5):
    '''Create writing prompts for the evaluation set based on the training set.

    For each sample in the evaluation set, pick the `num_exemplars` training samples
    by the same author whose word count (per `count_words`) is closest to that of the
    evaluation sample. The prompt includes those writing samples and the summary of
    the evaluation sample, and is rendered with the template returned by
    `get_prompt_template_for_writing_setting1()`.

    Args:
        training_df_fp: Path of the training CSV (needs `author_col`, `text_col`).
        evaluation_df_fp: Path of the evaluation CSV (needs `author_col`, `text_col`,
            `summary_col`).
        genre: Value substituted for the template's `genre` field.
        author_col, text_col, summary_col: Column names.
        num_exemplars: Number of nearest-length training texts to use per row.

    Returns:
        The evaluation DataFrame with two added columns: "training sample indices"
        (comma-separated training-row index labels, closest first) and "prompt".

    Raises:
        AssertionError: If `summary_col` is missing or some author has fewer than
            `num_exemplars` training rows.
    '''
    
    training_df = pd.read_csv(training_df_fp)
    evaluation_df = pd.read_csv(evaluation_df_fp)
    training_df["num_words"] = training_df[text_col].apply(count_words)

    assert training_df[author_col].value_counts().min() >= num_exemplars, \
        f"Each author must have at least {num_exemplars} samples in the training set."
    
    assert summary_col in evaluation_df.columns, \
        f"Summary column '{summary_col}' not found in evaluation DataFrame."

    prompt_tmp = get_prompt_template_for_writing_setting1()        
    
    print(f"Generating prompts...")
    sample_index_strings = []
    prompts = []
    for _, row in tqdm(evaluation_df.iterrows(), total=len(evaluation_df)):
        
        num_words = count_words(row[text_col])

        # Filter first so only this author's rows are handled (the original copied
        # the entire training frame for every evaluation row).
        author_rows = training_df[training_df[author_col] == row[author_col]]
        wc_diff = (author_rows["num_words"] - num_words).abs()

        # A stable sort keeps ties in training-file order, making the choice deterministic.
        nearest = wc_diff.sort_values(kind="mergesort").head(num_exemplars).index

        # Hand over the text Series, as the other settings do; iterating a whole
        # DataFrame would yield its column labels rather than the texts.
        samples = author_rows.loc[nearest, text_col]
        
        prompts.append(prompt_tmp.substitute(writing_samples=list_writing_samples(samples), 
                                             genre=genre, 
                                             num_words=round_up_to_nearest_10(num_words),
                                             summary=row[summary_col]))
        sample_index_strings.append(",".join(str(i) for i in samples.index))

    evaluation_df["training sample indices"] = sample_index_strings
    evaluation_df["prompt"] = prompts
    
    return evaluation_df

### Setting 5

In [88]:
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from scripts.utils import round_up_to_nearest_10, list_writing_samples
from scripts.prompt_templates import get_prompt_template_for_writing_setting5


def align_df1_to_df2(df1, df2, col1, col2, col3):
    """
    Aligns df1 to df2 based on matching values in columns `col1`, `col2` and `col3`,
    handling duplicates in both dataframes.

    For each row of df2 (in order), the first not-yet-used row of df1 whose three
    key columns are all equal is selected; each df1 row is used at most once.
    Equality is pandas' `==`, so NaN keys never match.

    Args:
        df1: DataFrame to select rows from. It is not modified.
        df2: DataFrame that dictates the number and order of output rows.
        col1, col2, col3: Key columns present in both frames.

    Returns:
        A new DataFrame with the same number of rows and order as df2, built from
        the matching rows of df1 (fresh 0..n-1 index), or `None` as soon as some
        row of df2 has no unused match in df1. An empty df2 yields an empty frame
        with df1's columns.
    """
    # A fresh 0..n-1 index guarantees unique labels, so `.loc`/`.at` below address
    # exactly one row even if df1 arrives with duplicate index labels.
    df1_temp = df1.reset_index(drop=True)

    # Without any matched rows there would be no '_used' column to drop later.
    if len(df2) == 0:
        return df1_temp.iloc[0:0]

    df1_temp['_used'] = False
    matched_rows = []

    for _, row in df2.iterrows():
        match = df1_temp[
            (df1_temp[col1] == row[col1]) &
            (df1_temp[col2] == row[col2]) &
            (df1_temp[col3] == row[col3]) &
            (~df1_temp['_used'])
        ]

        if match.empty:
            return None

        first_match_idx = match.index[0]
        matched_rows.append(df1_temp.loc[first_match_idx])
        # Mark the row as consumed so a duplicate key in df2 takes the next df1 row.
        df1_temp.at[first_match_idx, '_used'] = True

    result = pd.DataFrame(matched_rows).drop(columns=['_used']).reset_index(drop=True)
    return result


def create_writing_prompts_setting5(training_df_fp, 
                                    evaluation_df_fp, 
                                    genre,
                                    author_col="author", 
                                    text_col="text", 
                                    summary_col="summary", 
                                    num_exemplars=5,
                                    random_state=None):
    """Build Setting-5 (text continuation) prompts in two conditions.

    Every evaluation text is tokenized with `word_tokenize`; its opening snippet
    (20% of the tokens, capped at 50) is placed in the prompt and the remaining
    token count, passed through `round_up_to_nearest_10`, is the requested length.

    Two prompts are produced per evaluation row:
      * "summary-only": summary + snippet, no exemplars ("training sample indices" is "-").
      * "exemplars-plus-summary": additionally `num_exemplars` training texts by the
        same author. If "LLM_writing/Setting1/<dataset>/prompts.csv" exists and can be
        aligned to the evaluation rows, the exemplars recorded there are reused;
        any row whose recorded exemplars are unusable falls back to a random draw
        for that row only. Without a usable file all rows use random draws.

    Args:
        training_df_fp: Path of the training CSV.
        evaluation_df_fp: Path of the evaluation CSV; the text before the first "_"
            of its file name is taken as the dataset name.
        genre: Value substituted for the templates' `genre` field.
        author_col, text_col, summary_col: Column names.
        num_exemplars: Number of exemplars per row in the second condition.
        random_state: Optional seed for the random fallback draws. An integer seeds a
            single generator shared by all rows; a `numpy.random.RandomState`/`Generator`
            is used as-is; `None` (default) draws from NumPy's global random state.

    Returns:
        A DataFrame holding the summary-only rows followed by the
        exemplars-plus-summary rows, with the added columns
        "training sample indices", "prompt" and "Condition".

    Raises:
        AssertionError: If `summary_col` is missing or some author has fewer than
            `num_exemplars` training rows.
    """
    import numpy as np  # local import: numpy is not imported at the top of this cell

    def get_text_snippet(text, percentage=0.2):
        """Return (opening snippet, number of tokens left to write) for `text`."""
        words = word_tokenize(text)
        num_words = len(words)
        snippet_length = min(50, int(num_words * percentage))
        length_to_continue = num_words - snippet_length
        snippet = " ".join(words[:snippet_length])
        return snippet, length_to_continue

    training_df = pd.read_csv(training_df_fp)
    evaluation_df = pd.read_csv(evaluation_df_fp)

    assert training_df[author_col].value_counts().min() >= num_exemplars, \
        f"Each author must have at least {num_exemplars} samples in the training set."
    
    assert summary_col in evaluation_df.columns, \
        f"Summary column '{summary_col}' not found in evaluation DataFrame."

    # One shared generator for an integer seed, so rows do not all repeat the same draw.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    evaluation_df = evaluation_df.reset_index(drop=True)
    summary_only_prompt_tmp, exemplars_plus_summary_prompt_tmp = \
        get_prompt_template_for_writing_setting5()

    print(f"Generating summary-only prompts...")
    snippet_info = {}  # row label -> (snippet, rounded length); reused by the second pass
    summary_only_prompts = []
    for ix, row in tqdm(evaluation_df.iterrows(), total=len(evaluation_df)):
        snippet, length_to_continue = get_text_snippet(row[text_col])
        length_to_continue = round_up_to_nearest_10(length_to_continue)
        snippet_info[ix] = (snippet, length_to_continue)
        summary_only_prompts.append(
            summary_only_prompt_tmp.substitute(genre=genre, 
                                               num_words=length_to_continue, 
                                               summary=row[summary_col], 
                                               snippet=snippet))

    summary_only_df = evaluation_df.copy()
    summary_only_df["training sample indices"] = "-"
    summary_only_df["prompt"] = summary_only_prompts
    summary_only_df["Condition"] = "summary-only"

    print(f"Generating exemplars-plus-summary prompts...")
    
    dataset = os.path.basename(evaluation_df_fp).split("_")[0]
    earlier_setting1_prompts_df = None
    earlier_setting1_prompts_fp = f"LLM_writing/Setting1/{dataset}/prompts.csv"
    
    if os.path.exists(earlier_setting1_prompts_fp):
        # align_df1_to_df2 returns None when some evaluation row has no counterpart.
        earlier_setting1_prompts_df = align_df1_to_df2(pd.read_csv(earlier_setting1_prompts_fp), 
                                                       evaluation_df, 
                                                       text_col, author_col, summary_col)
        if earlier_setting1_prompts_df is not None and \
                "training sample indices" not in earlier_setting1_prompts_df.columns:
            earlier_setting1_prompts_df = None

    known_training_indices = set(training_df.index)

    def get_earlier_exemplars(ix, row):
        """Return the Setting-1 exemplar texts recorded for row `ix`, or None if unusable."""
        raw_indices = earlier_setting1_prompts_df.at[ix, "training sample indices"]
        try:
            indices = [int(i) for i in str(raw_indices).split(",")]
        except ValueError:
            print(f"Could not parse training sample indices {raw_indices!r} in earlier setting1 prompts.")
            return None

        if not known_training_indices.issuperset(indices):
            print(f"Training sample indices {indices} in earlier setting1 prompts are not all in the training set.")
            return None

        samples = training_df.loc[indices]

        if len(samples) != num_exemplars:
            print(f"Number of samples found for {row[text_col]} in earlier setting1 prompts is not {num_exemplars}. ")
            return None

        if len(samples[author_col].unique()) != 1:
            print(f"More than one author found for sample {row[text_col]} in earlier setting1 prompts. " \
                  f"Found authors: {samples[author_col].unique()}")
            return None

        if samples[author_col].values[0] != row[author_col]:
            print(f"Author mismatch for sample {row[text_col]} in earlier setting1 prompts.")
            return None

        return samples[text_col]

    exemplar_index_strings = []
    exemplar_prompts = []
    for ix, row in tqdm(evaluation_df.iterrows(), total=len(evaluation_df)):
        author = row[author_col]
        snippet, length_to_continue = snippet_info[ix]

        samples = None
        if earlier_setting1_prompts_df is not None:
            samples = get_earlier_exemplars(ix, row)

        # Fall back to a random draw for this row instead of aborting the loop, so
        # every row of this condition really receives an exemplar prompt.
        if samples is None:
            samples = training_df[training_df[author_col]==author][text_col].sample(
                num_exemplars, random_state=rng)

        exemplar_prompts.append(
            exemplars_plus_summary_prompt_tmp.substitute(writing_samples=list_writing_samples(samples), 
                                                         genre=genre, 
                                                         num_words=length_to_continue,
                                                         summary=row[summary_col], 
                                                         snippet=snippet))
        exemplar_index_strings.append(",".join(str(i) for i in samples.index))

    exemplars_df = evaluation_df.copy()
    exemplars_df["training sample indices"] = exemplar_index_strings
    exemplars_df["prompt"] = exemplar_prompts
    exemplars_df["Condition"] = "exemplars-plus-summary"
    
    out_df = pd.concat([summary_only_df, exemplars_df], ignore_index=True)
    return out_df

In [89]:
def sanity_check():
    """Check that Setting-5 exemplar prompts reuse the exemplars recorded for Setting 1.

    For each dataset, Setting-5 prompts are regenerated, the
    "exemplars-plus-summary" rows are aligned (on text, author and summary) with
    "LLM_writing/Setting1/<dataset>/prompts.csv", and the two
    "training sample indices" columns must be identical.

    Note: other cells of this notebook define functions with the same name
    `sanity_check`; whichever cell ran last wins.

    Raises:
        ValueError: If a dataset has no genre mapping.
        AssertionError: If alignment fails, the row counts differ, or the indices differ.
    """
    # Dataset-name prefix -> genre wording used in the prompt.
    genre_by_prefix = {
        "blog": "blog post",
        "enron": "email",
        "reddit": "reddit post",
        "CCAT50": "news article",
    }
    datasets = ["CCAT50", "enron", "reddit", "blog"]

    for dataset in datasets:
        genre = next((value for prefix, value in genre_by_prefix.items()
                      if dataset.startswith(prefix)), None)
        if genre is None:
            raise ValueError(f"Unknown dataset: {dataset}. Please specify a genre.")

        evaluation_df = create_writing_prompts_setting5(
            training_df_fp=f"dataset_followup/{dataset}_train.csv",
            evaluation_df_fp=f"dataset_followup/{dataset}_test.csv",
            genre=genre,
            author_col="author", 
            text_col="text", 
            summary_col="summary", 
            num_exemplars=5
        )
        evaluation_df = evaluation_df[evaluation_df.Condition == "exemplars-plus-summary"].reset_index(drop=True)
        
        earlier_eval_df = pd.read_csv(f"LLM_writing/Setting1/{dataset}/prompts.csv")
        earlier_eval_df = align_df1_to_df2(earlier_eval_df, evaluation_df, 
                                           "text", "author", "summary")

        # align_df1_to_df2 signals an unmatched row with None; report that explicitly
        # instead of failing on len(None) below.
        assert earlier_eval_df is not None, \
            f"Earlier setting1 prompts for dataset {dataset} could not be aligned with the new setting5 prompts."

        assert len(earlier_eval_df) == len(evaluation_df), \
            f"Number of samples in earlier setting1 prompts and new setting5 prompts do not match. " \
            f"Earlier: {len(earlier_eval_df)}, New: {len(evaluation_df)}"

        assert earlier_eval_df["training sample indices"].to_list() == evaluation_df["training sample indices"].to_list(), \
            f"Training sample indices do not match between earlier setting1 prompts and new setting5 prompts. " \
            f"Earlier: {earlier_eval_df['training sample indices'].to_list()}, New: {evaluation_df['training sample indices'].to_list()}"

In [90]:
sanity_check()

Generating summary-only prompts...


100%|██████████| 300/300 [00:00<00:00, 770.05it/s]


Generating exemplars-plus-summary prompts...


100%|██████████| 300/300 [00:00<00:00, 640.48it/s]


Generating summary-only prompts...


100%|██████████| 300/300 [00:00<00:00, 1249.00it/s]


Generating exemplars-plus-summary prompts...


100%|██████████| 300/300 [00:00<00:00, 972.27it/s] 


Generating summary-only prompts...


100%|██████████| 500/500 [00:00<00:00, 1206.97it/s]


Generating exemplars-plus-summary prompts...


100%|██████████| 500/500 [00:00<00:00, 946.89it/s]


Generating summary-only prompts...


100%|██████████| 500/500 [00:00<00:00, 1133.29it/s]


Generating exemplars-plus-summary prompts...


100%|██████████| 500/500 [00:00<00:00, 929.46it/s]


In [2]:
import pandas as pd

# Load the four evaluation (test) splits for ad-hoc inspection in the cells below.
# Paths are relative to the current working directory.
enron_test = pd.read_csv("dataset_followup/enron_test.csv")
blog_test = pd.read_csv("dataset_followup/blog_test.csv")
CCAT50_test = pd.read_csv("dataset_followup/CCAT50_test.csv")
reddit_test = pd.read_csv("dataset_followup/reddit_test.csv")

In [3]:
enron_test.columns, blog_test.columns, CCAT50_test.columns, reddit_test.columns

(Index(['author', 'text', 'subject', 'AA-label', 'summary', 'cluster'], dtype='object'),
 Index(['author', 'text', 'topic', 'gender', 'age', 'sign', 'date', 'AA-label',
        'summary', 'cluster'],
       dtype='object'),
 Index(['author', 'text', 'file_name', 'AA-label', 'summary', 'cluster'], dtype='object'),
 Index(['index', 'author', 'text', 'subreddit', 'AA-label', 'summary',
        'cluster'],
       dtype='object'))

In [None]:
from scripts.utils import count_words


enron_test["text"].apply(count_words).describe()

(count     300.000000
 mean      330.516667
 std       288.417507
 min       102.000000
 25%       148.000000
 50%       203.000000
 75%       397.000000
 max      1451.000000
 Name: text, dtype: float64,)

In [6]:
blog_test["text"].apply(count_words).describe()

count     500.000000
mean      330.428000
std       247.381292
min       101.000000
25%       156.000000
50%       252.500000
75%       416.000000
max      1402.000000
Name: text, dtype: float64

In [7]:
reddit_test["text"].apply(count_words).describe()

count     500.00000
mean      342.25800
std       251.85395
min       101.00000
25%       165.75000
50%       265.00000
75%       433.00000
max      1427.00000
Name: text, dtype: float64

In [8]:
CCAT50_test["text"].apply(count_words).describe()

count     300.000000
mean      577.880000
std       148.238167
min       107.000000
25%       490.000000
50%       588.500000
75%       662.500000
max      1228.000000
Name: text, dtype: float64

In [10]:
# Inspect the Setting-1 prompts saved for the blog dataset: `dd` is the prompts table and
# `vs` is the first row's "training sample indices" value (a comma-separated string).
dd = pd.read_csv("LLM_writing/Setting1/blog/prompts.csv")
vs = dd["training sample indices"].values[0]
vs

'16746,6286,12122,9723,15564'

[16746, 6286, 12122, 9723, 15564]

In [20]:
# Parse the comma-separated index string explicitly rather than eval()-ing text that was
# read from a CSV file; this also handles a string that holds a single index.
dd.loc[[int(i) for i in vs.split(",")]]

Unnamed: 0,author,text,topic,gender,age,sign,date,summary,training sample indices,prompt
16746,15365,Repair & Maintenance >> Wheels >> Tire ...,indUnk,female,34,Cancer,"20,April,2004","To fix a flat bike tire: remove the wheel, tak...",69422253113837156510317,You will be given one or more writing samples ...
6286,1000866,I would love people to just answer this...,Student,female,17,Libra,"06,June,2003",The speaker expresses feelings of loneliness a...,23563102190534823591,You will be given one or more writing samples ...
12122,3211137,"Wow, what a weekend. I planned a lot o...",indUnk,male,16,Sagittarius,"18,January,2004",The writer describes a busy and eventful weeke...,89201250023520608322325,You will be given one or more writing samples ...
9723,2123946,"Holy crap, urlLink this is ...",indUnk,female,24,Capricorn,"08,February,2004",The writer describes a beautiful day with suns...,49282521220297231545556,You will be given one or more writing samples ...
15564,180519,Well well well (deep subject) Inspired ...,indUnk,female,23,Cancer,"13,September,2003","The author revives their blog, Buddha Stew, af...",17096946424570245772874,You will be given one or more writing samples ...


### Setting 6

In [66]:
import pandas as pd
from tqdm import tqdm
from scripts.utils import count_words, round_up_to_nearest_10, list_writing_samples
from scripts.prompt_templates import get_prompt_template_for_writing_setting1


def create_writing_prompts_setting6(training_df_fp, 
                                    evaluation_df_fp, 
                                    genre,
                                    author_col="author", 
                                    text_col="text", 
                                    summary_col="summary", 
                                    nums_exemplars=[2, 4, 6, 8, 10],
                                    random_state=None):
    '''Similar to setting 1, but with several exemplar counts per evaluation sample.

    For each sample in the evaluation set, max(nums_exemplars) training texts by the
    same author are drawn at random once. The prompt for each value n in
    nums_exemplars uses the first n of those texts, so a larger exemplar set always
    subsumes (starts with) every smaller one.

    Args:
        training_df_fp: Path of the training CSV.
        evaluation_df_fp: Path of the evaluation CSV.
        genre: Value substituted for the template's `genre` field.
        author_col, text_col, summary_col: Column names.
        nums_exemplars: Exemplar counts to generate prompts for (positive integers).
            The default list is never mutated.
        random_state: Optional seed for reproducible draws. An integer seeds a single
            generator shared by all rows; a `numpy.random.RandomState`/`Generator` is
            used as-is; `None` (default) draws from NumPy's global random state.

    Returns:
        One DataFrame stacking a copy of the evaluation set per exemplar count
        (largest count first), with the added columns "training sample indices",
        "prompt" and "num_exemplars".

    Raises:
        ValueError: If `nums_exemplars` is empty or contains a value below 1.
        AssertionError: If `summary_col` is missing or some author has fewer than
            max(nums_exemplars) training rows.
    '''
    import numpy as np  # local import: numpy is not imported at the top of this cell

    nums_exemplars = sorted(nums_exemplars, reverse=True)
    if not nums_exemplars or nums_exemplars[-1] < 1:
        raise ValueError("nums_exemplars must contain at least one value, all of them >= 1.")
    max_exemplars = nums_exemplars[0]

    training_df = pd.read_csv(training_df_fp)
    evaluation_df = pd.read_csv(evaluation_df_fp)

    assert training_df[author_col].value_counts().min() >= max_exemplars, \
        f"Each author must have at least {max_exemplars} samples in the training set."
    
    assert summary_col in evaluation_df.columns, \
        f"Summary column '{summary_col}' not found in evaluation DataFrame."

    # One shared generator for an integer seed, so rows do not all repeat the same draw.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    prompt_tmp = get_prompt_template_for_writing_setting1()        
    
    print(f"Generating prompts...")
    out = []
    exemplars_map = {}   # evaluation row label -> the max_exemplars texts drawn for it
    num_words_map = {}   # evaluation row label -> rounded target word count
    for num_exemplar in nums_exemplars:
        sample_index_strings = []
        prompts = []
        for ix, row in tqdm(evaluation_df.iterrows(), total=len(evaluation_df)):

            # The first (largest) pass fills the caches; later passes only slice them.
            if ix not in exemplars_map:
                author_texts = training_df[training_df[author_col]==row[author_col]][text_col]
                exemplars_map[ix] = author_texts.sample(max_exemplars, random_state=rng)
                num_words_map[ix] = round_up_to_nearest_10(count_words(row[text_col]))

            # .iloc makes the slice positional regardless of the integer index labels.
            samples = exemplars_map[ix].iloc[:num_exemplar]
            
            prompts.append(prompt_tmp.substitute(writing_samples=list_writing_samples(samples), 
                                                 genre=genre, num_words=num_words_map[ix],
                                                 summary=row[summary_col]))
            sample_index_strings.append(",".join(str(i) for i in samples.index))

        batch = evaluation_df.copy()
        batch["training sample indices"] = sample_index_strings
        batch["prompt"] = prompts
        batch["num_exemplars"] = num_exemplar
        out.append(batch)
    
    out_df = pd.concat(out, axis=0).reset_index(drop=True)
    return out_df

In [73]:
def sanity_check():
    """Check that Setting-6 exemplar sets are nested across exemplar counts.

    For each dataset, prompts are regenerated with `create_writing_prompts_setting6`.
    For every pair of consecutive exemplar counts (n1 < n2) and every evaluation row,
    the n1 recorded training indices must be exactly the first n1 of the n2 indices.

    Note: other cells of this notebook define functions with the same name
    `sanity_check`; whichever cell ran last wins.

    Raises:
        AssertionError: If row counts differ between exemplar counts or the index
            lists are not nested.
    """
    datasets = ["CCAT50", "enron", "reddit", "blog"]

    for dataset in datasets:
        # `genre` only affects the prompt wording, which is not inspected here.
        evaluation_df = create_writing_prompts_setting6(
            training_df_fp=f"dataset_followup/{dataset}_train.csv",
            evaluation_df_fp=f"dataset_followup/{dataset}_test.csv",
            genre="blog",
            author_col="author", 
            text_col="text", 
            summary_col="summary", 
            nums_exemplars=[2, 4, 6, 8, 10]
        )
        nums_exemplars = sorted(evaluation_df["num_exemplars"].unique())

        for n1, n2 in zip(nums_exemplars, nums_exemplars[1:]):
            sub_n1 = evaluation_df[evaluation_df["num_exemplars"]==n1].reset_index(drop=True)
            sub_n2 = evaluation_df[evaluation_df["num_exemplars"]==n2].reset_index(drop=True)
            assert len(sub_n1) == len(sub_n2), \
                f"Number of samples for num_exemplars {n1} and {n2} do not match. " \
                f"num_exemplars {n1}: {len(sub_n1)}, num_exemplars {n2}: {len(sub_n2)}"
            
            for ixes1, ixes2 in zip(sub_n1["training sample indices"], sub_n2["training sample indices"]):
                smaller = ixes1.split(",")
                larger = ixes2.split(",")

                assert len(smaller) == n1 and len(larger) == n2, \
                    f"Unexpected number of sample indices for num_exemplars {n1}/{n2}: " \
                    f"{ixes1} / {ixes2}"

                # Compare parsed index lists: a plain substring test on the joined
                # strings would accept e.g. "1,2" inside "11,2,3".
                assert larger[:len(smaller)] == smaller, \
                    f"Sample indices for num_exemplars {n1} not in num_exemplars {n2}. " \
                    f"Sample indices: {ixes1} not in {ixes2}"

In [74]:
sanity_check()

Generating prompts...


100%|██████████| 300/300 [00:00<00:00, 575.82it/s]
100%|██████████| 300/300 [00:00<00:00, 719.97it/s]
100%|██████████| 300/300 [00:00<00:00, 724.43it/s]
100%|██████████| 300/300 [00:00<00:00, 741.40it/s]
100%|██████████| 300/300 [00:00<00:00, 741.24it/s]


Generating prompts...


100%|██████████| 300/300 [00:00<00:00, 817.73it/s]
100%|██████████| 300/300 [00:00<00:00, 1198.14it/s]
100%|██████████| 300/300 [00:00<00:00, 1204.19it/s]
100%|██████████| 300/300 [00:00<00:00, 1221.04it/s]
100%|██████████| 300/300 [00:00<00:00, 1217.05it/s]


Generating prompts...


100%|██████████| 500/500 [00:00<00:00, 706.40it/s]
100%|██████████| 500/500 [00:00<00:00, 1182.29it/s]
100%|██████████| 500/500 [00:00<00:00, 1192.22it/s]
100%|██████████| 500/500 [00:00<00:00, 1200.58it/s]
100%|██████████| 500/500 [00:00<00:00, 1202.24it/s]


Generating prompts...


100%|██████████| 500/500 [00:00<00:00, 876.27it/s]
100%|██████████| 500/500 [00:00<00:00, 1163.89it/s]
100%|██████████| 500/500 [00:00<00:00, 1165.93it/s]
100%|██████████| 500/500 [00:00<00:00, 1178.35it/s]
100%|██████████| 500/500 [00:00<00:00, 1189.76it/s]


### Test on Jack's Corpus