- Create prompts
- Run LLMs on the created prompts

### Generate Toy Examples

In [12]:
import pandas as pd
from random import sample

# df1 = pd.read_csv("../dataset_prepare/blog_train.csv")
# df2 = pd.read_csv("../dataset_prepare/CCAT50_train.csv")
# df3 = pd.read_csv("../dataset_prepare/enron_train.csv")
# df4 = pd.read_csv("../dataset_prepare/reddit_train.csv")
# df1_test = pd.read_csv("../dataset_prepare/blog_test.csv")
# df2_test = pd.read_csv("../dataset_prepare/CCAT50_test.csv")
# df3_test = pd.read_csv("../dataset_prepare/enron_test.csv")
# df4_test = pd.read_csv("../dataset_prepare/reddit_test.csv")


def get_toy_data(train_fp, author_col="author", 
                 num_authors=20, num_samples_per_author=5):
    df = pd.read_csv(train_fp)
    df_test = pd.read_csv(train_fp.replace("train", "test"))

    dataset = train_fp.split("/")[-1].split("_")[0]
    df.insert(0, "dataset", dataset)
    df_test.insert(0, "dataset", dataset)

    authors = sample(df[author_col].unique().tolist(), num_authors)
    df = df.copy()[df[author_col].isin(authors)]
    df_test = df_test.copy()[df_test[author_col].isin(authors)]
    df_train = df.groupby(author_col).apply(lambda x: x.sample(num_samples_per_author)).reset_index(drop=True)
    df_test = df_test.groupby(author_col).apply(lambda x: x.sample(num_samples_per_author)).reset_index(drop=True)
    return df_train, df_test

In [13]:
train, test = [], []
train_fps = [
    "../dataset_prepare/blog_train.csv",
    "../dataset_prepare/CCAT50_train.csv",
    "../dataset_prepare/enron_train.csv",
    "../dataset_prepare/reddit_train.csv"
]

for fp in train_fps:
    train_df, test_df = get_toy_data(fp)
    train.append(train_df)
    test.append(test_df)

train = pd.concat(train, ignore_index=True)
test = pd.concat(test, ignore_index=True)
train.to_csv("../dataset_prepare/toy_train.csv", index=False)
test.to_csv("../dataset_prepare/toy_test.csv", index=False)

  df_train = df.groupby(author_col).apply(lambda x: x.sample(num_samples_per_author)).reset_index(drop=True)
  df_test = df_test.groupby(author_col).apply(lambda x: x.sample(num_samples_per_author)).reset_index(drop=True)
  df_train = df.groupby(author_col).apply(lambda x: x.sample(num_samples_per_author)).reset_index(drop=True)
  df_test = df_test.groupby(author_col).apply(lambda x: x.sample(num_samples_per_author)).reset_index(drop=True)
  df_train = df.groupby(author_col).apply(lambda x: x.sample(num_samples_per_author)).reset_index(drop=True)
  df_test = df_test.groupby(author_col).apply(lambda x: x.sample(num_samples_per_author)).reset_index(drop=True)
  df_train = df.groupby(author_col).apply(lambda x: x.sample(num_samples_per_author)).reset_index(drop=True)
  df_test = df_test.groupby(author_col).apply(lambda x: x.sample(num_samples_per_author)).reset_index(drop=True)


In [14]:
train

Unnamed: 0,dataset,author,text,topic,gender,age,sign,date,file_name,subject,index,subreddit
0,blog,15365,urlLink Rejection Feeling rejected by...,indUnk,female,34.0,Cancer,"04,June,2002",,,,
1,blog,15365,urlLink Yahoo! News - Indian Villagers...,indUnk,female,34.0,Cancer,"10,August,2003",,,,
2,blog,15365,urlLink The medieval 'magician' Albert...,indUnk,female,34.0,Cancer,"11,October,2001",,,,
3,blog,15365,urlLink Where the stars purr : MOSCOW...,indUnk,female,34.0,Cancer,"25,December,2003",,,,
4,blog,15365,urlLink Osho Zen Tarot 28. Receptivity...,indUnk,female,34.0,Cancer,"04,April,2003",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
395,reddit,wonderfuldog,Anyone with a working brain realises that Crea...,,,,,,,,184093.0,atheism
396,reddit,wonderfuldog,reposting] \n On various occasions here at /r/...,,,,,,,,662585.0,atheism
397,reddit,wonderfuldog,What point are you trying to make to me? \n I'...,,,,,,,,1067406.0,Christianity
398,reddit,wonderfuldog,"Coelurosauria]( is a big group, so that's like...",,,,,,,,52965.0,askscience


In [15]:
test

Unnamed: 0,dataset,author,text,topic,gender,age,sign,date,file_name,subject,index,subreddit
0,blog,15365,"'Bathrooms, hallway corners, laundr...",indUnk,female,34.0,Cancer,"28,July,2004",,,,
1,blog,15365,urlLink June 2003 Outlook from Moonsur...,indUnk,female,34.0,Cancer,"07,June,2004",,,,
2,blog,15365,urlLink SAGITTARIUS LUNAR CYCLE by Cl...,indUnk,female,34.0,Cancer,"07,June,2004",,,,
3,blog,15365,"The Beatles Title: Let It Be (Lennon, M...",indUnk,female,34.0,Cancer,"12,October,2002",,,,
4,blog,15365,THE MOON WAS STILL UP Anger and pain I c...,indUnk,female,34.0,Cancer,"14,September,2003",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
395,reddit,wonderfuldog,Occam's Razor]( \n A guy is found dead of a g...,,,,,,,,446467.0,atheism
396,reddit,wonderfuldog,"If we lived side-by-side with dinosaurs, don't...",,,,,,,,849514.0,atheism
397,reddit,wonderfuldog,it means different things to different people ...,,,,,,,,1242640.0,atheism
398,reddit,wonderfuldog,Do you think there is any truth to people sayi...,,,,,,,,609221.0,atheism


### Summarize the test set

In [53]:
from litellm import completion


def get_completion(prompt, temperature=0, max_tries=5,
                   model="openai/gpt-4.1-mini-2025-04-14"):
    
    for _ in range(max_tries):
        try:
            # Call the completion function with the provided parameters
            response = completion(
                model=model, temperature=temperature,
                messages=[{"role": "user", "content": prompt}]
            )
            return response["choices"][0]["message"]["content"]
        
        except Exception as e:
            print(f"Error: {e}")
            continue
    
    return "SOMETHING_WRONG"

In [28]:
from string import Template


prompt_tmp = '''
You will be given a piece of text. Your task is to summarize the text in a concise and clear manner, \
capturing the main ideas and key points while maintaining the original meaning.

### Text to Summarize

$text

### Instructions

- Provide a summary that is brief yet comprehensive.
- Ensure that the summary accurately reflects the content of the original text.
- Avoid adding any personal opinions or interpretations.

Begin your response below:
    '''.strip()

prompt_tmp = Template(prompt_tmp)

In [35]:
from tqdm import tqdm


for ix, row in tqdm(test.iterrows(), total=len(test)):
    text = row["text"]
    prompt = prompt_tmp.substitute(text=text)
    
    # Get the completion
    summary = get_completion(prompt, temperature=0, max_tries=5)
    
    # Save the summary back to the DataFrame
    test.at[ix, "summary"] = summary
    
# Save the test DataFrame with summaries to a new CSV file
test.to_csv("../dataset_prepare/toy_test_with_summaries.csv", index=False)

100%|██████████| 400/400 [09:44<00:00,  1.46s/it]


### Create prompts for the test set, used to generate LLM generated samples

In [36]:
from string import Template


prompt_tmp = '''
You will be given one or more writing samples from a specific author. Your task is to analyze \
the author's style, tone, and voice, then craft a new piece that closely mimics their writing based on a provided summary.


### Author's Writing Sample(s)

$writing_samples

### Writing Task Summary

$summary

### Instructions

- Ensure your writing closely matches the provided samples in terms of tone, vocabulary, sentence structure, and overall style.
- Maintain consistency with the author's voice while accurately reflecting the details of the given summary.
- Strive to make your writing indistinguishable from the original author's work.

Begin your response below:
'''.strip()

prompt_tmp = Template(prompt_tmp)

In [40]:
import pandas as pd


def list_writing_samples(samples):
    return '\n\n'.join([f'Sample#{ix+1}\n\n{sample.strip()}' 
                      for ix, sample in enumerate(samples)])


def create_prompts(training_df, evaluation_df, 
                   author_col="author", text_col="text", 
                   summary_col="summary", 
                   num_exemplar=5):
    # few-shot prompting LLMs to generare a writing sample that minicks the 
    # writing styles of the few-shot examples sampled from the training data
    evaluation_df = evaluation_df.copy()
    
    assert training_df[author_col].value_counts().min() >= num_exemplar, \
        f"Each author must have at least {num_exemplar} samples in the training set."
    

    for ix, row in evaluation_df.iterrows():
        author = row[author_col]
        summary = row[summary_col]
        samples = training_df[training_df[author_col]==author][text_col].sample(num_exemplar)
        writing_samples = list_writing_samples(samples)
        prompt = prompt_tmp.substitute(writing_samples=writing_samples, summary=summary)
        evaluation_df.at[ix, "Sample Indices"] = ",".join([str(ix) for ix in samples.index])
        evaluation_df.at[ix, "Prompt"] = prompt

    return evaluation_df

In [41]:
training = pd.read_csv("../dataset_prepare/toy_train.csv")
evaluation = pd.read_csv("../dataset_prepare/toy_test_with_summaries.csv")

test_set_with_prompts = create_prompts(training, evaluation, num_exemplar=5)
test_set_with_prompts.to_csv("../dataset_prepare/toy_test_with_summaries_and_mimicked_writings.csv", index=False)

In [42]:
test_set_with_prompts

Unnamed: 0,dataset,author,text,topic,gender,age,sign,date,file_name,subject,index,subreddit,summary,Sample Indices,Prompt
0,blog,15365,"'Bathrooms, hallway corners, laundr...",indUnk,female,34.0,Cancer,"28,July,2004",,,,,"Choose accessible locations like bathrooms, ha...",41203,You will be given one or more writing samples ...
1,blog,15365,urlLink June 2003 Outlook from Moonsur...,indUnk,female,34.0,Cancer,"07,June,2004",,,,,"The Gemini New Moon on May 30th, 2003, marks a...",13024,You will be given one or more writing samples ...
2,blog,15365,urlLink SAGITTARIUS LUNAR CYCLE by Cl...,indUnk,female,34.0,Cancer,"07,June,2004",,,,,"The Sagittarius lunar cycle, marked by a total...",24031,You will be given one or more writing samples ...
3,blog,15365,"The Beatles Title: Let It Be (Lennon, M...",indUnk,female,34.0,Cancer,"12,October,2002",,,,,"""Let It Be"" by The Beatles is a song expressin...",02413,You will be given one or more writing samples ...
4,blog,15365,THE MOON WAS STILL UP Anger and pain I c...,indUnk,female,34.0,Cancer,"14,September,2003",,,,,The text explores deep emotions of anger and p...,42130,You will be given one or more writing samples ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,reddit,wonderfuldog,Occam's Razor]( \n A guy is found dead of a g...,,,,,,,,446467.0,atheism,The text explains Occam's Razor by comparing t...,397399396395398,You will be given one or more writing samples ...
396,reddit,wonderfuldog,"If we lived side-by-side with dinosaurs, don't...",,,,,,,,849514.0,atheism,"The text questions why, if humans lived alongs...",396395399397398,You will be given one or more writing samples ...
397,reddit,wonderfuldog,it means different things to different people ...,,,,,,,,1242640.0,atheism,The text emphasizes the importance of a shared...,398399396395397,You will be given one or more writing samples ...
398,reddit,wonderfuldog,Do you think there is any truth to people sayi...,,,,,,,,609221.0,atheism,The text addresses the criticism that r/atheis...,397399395398396,You will be given one or more writing samples ...


### Prompting

In [54]:
model="openai/gpt-4.1-mini-2025-04-14"

for ix in tqdm(test_set_with_prompts.index):
    # if f"{model}-sample" in test_set_with_prompts and test_set_with_prompts.at[ix, f"{model}-sample"] != "":
    #     continue

    prompt = test_set_with_prompts.at[ix, "Prompt"]
    response = get_completion(prompt, model=model)
    test_set_with_prompts.at[ix, f"{model}-sample"] = response

    if (ix + 1) % 10 == 0:
        test_set_with_prompts.to_csv("toy_test_with_summaries_and_mimicked_writings.csv", index=False)

100%|██████████| 400/400 [19:17<00:00,  2.89s/it]


In [55]:
test_set_with_prompts

Unnamed: 0,dataset,author,text,topic,gender,age,sign,date,file_name,subject,index,subreddit,summary,Sample Indices,Prompt,openai/gpt-4.1-mini-2025-04-14-sample
0,blog,15365,"'Bathrooms, hallway corners, laundr...",indUnk,female,34.0,Cancer,"28,July,2004",,,,,"Choose accessible locations like bathrooms, ha...",41203,You will be given one or more writing samples ...,Choosing the right spot for your kitten’s litt...
1,blog,15365,urlLink June 2003 Outlook from Moonsur...,indUnk,female,34.0,Cancer,"07,June,2004",,,,,"The Gemini New Moon on May 30th, 2003, marks a...",13024,You will be given one or more writing samples ...,urlLink Cosmic Currents - Gemini New Moon Ushe...
2,blog,15365,urlLink SAGITTARIUS LUNAR CYCLE by Cl...,indUnk,female,34.0,Cancer,"07,June,2004",,,,,"The Sagittarius lunar cycle, marked by a total...",24031,You will be given one or more writing samples ...,"The Sagittarius lunar cycle, illuminated by th..."
3,blog,15365,"The Beatles Title: Let It Be (Lennon, M...",indUnk,female,34.0,Cancer,"12,October,2002",,,,,"""Let It Be"" by The Beatles is a song expressin...",02413,You will be given one or more writing samples ...,urlLink Let It Be: Words of Comfort in Trouble...
4,blog,15365,THE MOON WAS STILL UP Anger and pain I c...,indUnk,female,34.0,Cancer,"14,September,2003",,,,,The text explores deep emotions of anger and p...,42130,You will be given one or more writing samples ...,urlLink Beneath the Surface: The Unyielding Fl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,reddit,wonderfuldog,Occam's Razor]( \n A guy is found dead of a g...,,,,,,,,446467.0,atheism,The text explains Occam's Razor by comparing t...,397399396395398,You will be given one or more writing samples ...,Occam's Razor is a principle that gets brought...
396,reddit,wonderfuldog,"If we lived side-by-side with dinosaurs, don't...",,,,,,,,849514.0,atheism,"The text questions why, if humans lived alongs...",396395399397398,You will be given one or more writing samples ...,reposting]\n\nOne question that seems to get o...
397,reddit,wonderfuldog,it means different things to different people ...,,,,,,,,1242640.0,atheism,The text emphasizes the importance of a shared...,398399396395397,You will be given one or more writing samples ...,Anyone who’s ever tried to have a serious conv...
398,reddit,wonderfuldog,Do you think there is any truth to people sayi...,,,,,,,,609221.0,atheism,The text addresses the criticism that r/atheis...,397399395398396,You will be given one or more writing samples ...,reposting] \nWe get this criticism here fairl...
