In [1]:
import os

os.chdir("..")

In [2]:
import pandas as pd

pd.read_csv("LLM_writing/Setting1/CCAT50/prompts.csv").columns

Index(['author', 'text', 'file_name', 'summary', 'training sample indices',
       'prompt'],
      dtype='object')

In [3]:
pd.read_csv("LLM_writing/Setting1/enron/prompts.csv").columns

Index(['author', 'text', 'subject', 'summary', 'prompt',
       'training sample indices'],
      dtype='object')

In [4]:
pd.read_csv("LLM_writing/Setting1/reddit/prompts.csv").columns

Index(['index', 'author', 'text', 'subreddit', 'summary',
       'training sample indices', 'prompt'],
      dtype='object')

### Sampling Criteria

- 1. Sample up to 50 authors per dataset
- 2. Each author should have 10 samples per evaluation dataset
- 3. Each author in the evaluation data should at least form 2 clusters
- 4. These 2 clusters must also appear in the train sets, with each cluster have at least 10 samples

In [148]:
import os
import pandas as pd
from tqdm import tqdm
from bertopic import BERTopic


def load_dataset(dataset):
    train = pd.read_csv(f"dataset_prepare/{dataset}_train.csv")
    test = pd.read_csv(f"dataset_prepare/{dataset}_test.csv")
    test = test[train.columns.to_list() + ["summary"]]
    train["split"] = "train"
    test["split"] = "test"
    return pd.concat([train, test], ignore_index=True)


def berttopic_clustering(docs):
    topic_model = BERTopic()
    topics, _ = topic_model.fit_transform(docs)
    return topics


def topic_model_a_dataset(dataset):
    df = load_dataset(dataset)

    for author in tqdm(df.author.unique()):
        sub = df[df.author == author]
        docs = sub.text.tolist()
        topics = berttopic_clustering(docs)

        for j, ix in enumerate(sub.index):
            df.loc[ix, "topic"] = topics[j]
    
    test = df[df.split == "test"]
    test.drop(columns=["split"], inplace=True)
    train = df[df.split == "train"]
    train.drop(columns=["split"], inplace=True)

    original_train = pd.read_csv(f"dataset_prepare/{dataset}_train.csv")
    assert train.shape[0] == original_train.shape[0], "Train set size mismatch after topic modeling."
    assert train[["author", "text"]].equals(original_train[["author", "text"]]), "Train set content mismatch after topic modeling."

    test = test[test.topic != -1]
    authors_to_keep = []
    for author in test.author.unique():
        sub = test[test.author == author]

        # Check if the author has at least 10 samples in the test set with more than 1 topic
        if len(sub) >= 10 and len(sub.topic.unique()) > 1:
            test_topics = sub.topic.unique()
            train_sub = train[train.author == author]

            # Check if the author has at least 10 samples in the train set for each topic
            to_add = True

            for topic in test_topics:
                train_sub_sub = train_sub[train_sub.topic == topic]
                if len(train_sub_sub) < 5:
                    to_add = False

            if to_add:
                authors_to_keep.append(author)

    # test = test[test.author.isin(authors_to_keep)]
    test_new = []
    for author in authors_to_keep:
        sub = test[test.author == author].sample(10)
        test_new.append(sub)

    test = pd.concat(test_new, ignore_index=True)
    
    save_dir = "dataset_followup"
    os.makedirs(save_dir, exist_ok=True)
    train.to_csv(f"{save_dir}/{dataset}_train.csv", index=False)
    test.to_csv(f"{save_dir}/{dataset}_test.csv", index=False)
    print(f"Saved {dataset} dataset with topics to {save_dir} folder.")
    return train, test

In [149]:
train, test = topic_model_a_dataset("enron")
print("Number of unique authors in test set:", len(test.author.unique()))

KeyError: "['summary'] not in index"

In [140]:
train, test = topic_model_a_dataset("blog")
print("Number of unique authors in test set:", len(test.author.unique()))

100%|██████████| 100/100 [02:09<00:00,  1.29s/it]


Saved blog dataset with topics to dataset_followup folder.
Number of unique authors in test set: 78


In [141]:
train, test = topic_model_a_dataset("CCAT50")
print("Number of unique authors in test set:", len(test.author.unique()))

100%|██████████| 50/50 [00:17<00:00,  2.91it/s]

Saved CCAT50 dataset with topics to dataset_followup folder.
Number of unique authors in test set: 35





In [142]:
train, test = topic_model_a_dataset("reddit")
print("Number of unique authors in test set:", len(test.author.unique()))

100%|██████████| 100/100 [00:41<00:00,  2.41it/s]


Saved reddit dataset with topics to dataset_followup folder.
Number of unique authors in test set: 95


In [145]:
test

Unnamed: 0,index,author,text,subreddit,AA-label,topic
0,1142426,CaspianX2,"I'm not sure how you can call the BLS stats ""j...",politics,39,0.0
1,566030,CaspianX2,Even if Fluke was funded by a lobbying group t...,politics,39,0.0
2,1224678,CaspianX2,"Okay, I'll try to condense it down as best I c...",AskMen,39,1.0
3,379826,CaspianX2,"Actually, in a way, the rest of the world does...",explainlikeimfive,39,0.0
4,607184,CaspianX2,"Umm... Super Mario RPG is on Virtual Console, ...",gaming,39,2.0
...,...,...,...,...,...,...
945,1236798,ZeNuGerman,"I do machine learning, and thus routinely have...",AskReddit,68,3.0
946,582704,ZeNuGerman,"The issue here, though, is historical context....",AskReddit,68,1.0
947,38225,ZeNuGerman,Add enough transistors and you could simply le...,AskReddit,68,3.0
948,557901,ZeNuGerman,Thank you for your thoughtful reply. I do agre...,atheism,68,1.0


In [152]:
blog_test = pd.read_csv("dataset_prepare/CCAT50_test.csv")
blog_test.columns

Index(['author', 'text', 'file_name', 'AA-label',
       'longformer-base-4096-AA-prediction',
       'longformer-base-4096-AA-probabilities',
       'ModernBERT-base-AA-prediction', 'ModernBERT-base-AA-probabilities',
       'longformer-base-4096-AA-top_k-probabilities',
       'longformer-base-4096-AA-top_k-predictions',
       'ModernBERT-base-AA-top_k-probabilities',
       'ModernBERT-base-AA-top_k-predictions'],
      dtype='object')

In [153]:
blog_test = pd.read_csv("dataset_prepare/reddit_test.csv")
blog_test.columns

Index(['index', 'author', 'text', 'subreddit', 'AA-label',
       'longformer-base-4096-AA-prediction',
       'longformer-base-4096-AA-probabilities',
       'ModernBERT-base-AA-prediction', 'ModernBERT-base-AA-probabilities',
       'longformer-base-4096-AA-top_k-probabilities',
       'longformer-base-4096-AA-top_k-predictions',
       'ModernBERT-base-AA-top_k-probabilities',
       'ModernBERT-base-AA-top_k-predictions'],
      dtype='object')

In [154]:
blog_test = pd.read_csv("dataset_prepare/enron_test.csv")
blog_test.columns

Index(['author', 'text', 'subject', 'AA-label',
       'bert-base-uncased-AA-prediction', 'bert-base-uncased-AA-probabilities',
       'longformer-base-4096-AA-prediction',
       'longformer-base-4096-AA-probabilities',
       'ModernBERT-base-AA-prediction', 'ModernBERT-base-AA-probabilities',
       'longformer-base-4096-AA-top_k-probabilities',
       'longformer-base-4096-AA-top_k-predictions',
       'ModernBERT-base-AA-top_k-probabilities',
       'ModernBERT-base-AA-top_k-predictions'],
      dtype='object')

In [155]:
blog_test = pd.read_csv("dataset_prepare/blog_test.csv")
blog_test.columns

Index(['author', 'text', 'topic', 'gender', 'age', 'sign', 'date', 'summary',
       'training sample indices', 'prompt', 'label',
       'bert-base-uncased-prediction', 'bert-base-uncased-probabilities',
       'AA-label', 'longformer-base-4096-AA-prediction',
       'longformer-base-4096-AA-probabilities',
       'ModernBERT-base-AA-prediction', 'ModernBERT-base-AA-probabilities',
       'longformer-base-4096-AA-top_k-probabilities',
       'longformer-base-4096-AA-top_k-predictions',
       'ModernBERT-base-AA-top_k-probabilities',
       'ModernBERT-base-AA-top_k-predictions'],
      dtype='object')