In [1]:
import os

# Move the working directory one level up so the relative paths used below
# ("LLM_writing", "dataset_prepare", "AA_models", ...) resolve.
# NOTE: this is relative, so re-running the cell moves up one more level each time.
os.chdir("../")

#### Helper Funcs

In [2]:
import pandas as pd
from scripts.utils import get_filepathes_from_dir


def find_all_LLM_generated_writing_fps(LLM_writing_dir):
    """
    Collect the CSV files under ``LLM_writing_dir`` (sub-directories included)
    that are not prompt files.

    A path counts as a prompt file when it ends with "prompts.csv". At least
    one such file must be present; otherwise the directory is assumed to be
    the wrong one and the assertion fails.

    Returns:
        list: every discovered CSV path that does not end with "prompts.csv".
    """
    prompts_suffix = "prompts.csv"

    csv_paths = get_filepathes_from_dir(
        LLM_writing_dir,
        include_sub_dir=True,
        file_format=".csv",
    )

    # Sanity check on the directory layout before filtering.
    found_prompts = any(path.endswith(prompts_suffix) for path in csv_paths)
    assert found_prompts, \
        "No prompts.csv files found in the directory. Not the right directory?"

    # Everything that is not a prompts file is treated as LLM-generated writing.
    return [path for path in csv_paths if not path.endswith(prompts_suffix)]


def load_the_corresponding_prompts(llm_fp):
    """
    Read the "prompts.csv" that lives in the same directory as ``llm_fp``.

    Returns:
        pandas.DataFrame: contents of the sibling prompts file.

    Raises:
        FileNotFoundError: if no "prompts.csv" exists next to ``llm_fp``.
    """
    prompts_fp = os.path.join(os.path.dirname(llm_fp), "prompts.csv")

    if os.path.exists(prompts_fp):
        return pd.read_csv(prompts_fp)

    raise FileNotFoundError(f"Prompts file not found: {prompts_fp}")

In [3]:
# Collect every non-prompt CSV found under the "LLM_writing" directory.
llm_fps = find_all_LLM_generated_writing_fps("LLM_writing")

### AA Models

In [4]:
# Load the four test CSVs (blog, CCAT50, enron, reddit) from dataset_prepare/.
df1_test = pd.read_csv("dataset_prepare/blog_test.csv")
df2_test = pd.read_csv("dataset_prepare/CCAT50_test.csv")
df3_test = pd.read_csv("dataset_prepare/enron_test.csv")
df4_test = pd.read_csv("dataset_prepare/reddit_test.csv")

# Display the column index of each frame to see which prediction columns they carry.
df1_test.columns, df2_test.columns, df3_test.columns, df4_test.columns

(Index(['author', 'text', 'topic', 'gender', 'age', 'sign', 'date', 'summary',
        'training sample indices', 'prompt', 'label',
        'bert-base-uncased-prediction', 'bert-base-uncased-probabilities',
        'AA-label', 'longformer-base-4096-AA-prediction',
        'longformer-base-4096-AA-probabilities',
        'ModernBERT-base-AA-prediction', 'ModernBERT-base-AA-probabilities'],
       dtype='object'),
 Index(['author', 'text', 'file_name', 'AA-label',
        'longformer-base-4096-AA-prediction',
        'longformer-base-4096-AA-probabilities',
        'ModernBERT-base-AA-prediction', 'ModernBERT-base-AA-probabilities'],
       dtype='object'),
 Index(['author', 'text', 'subject', 'AA-label',
        'bert-base-uncased-AA-prediction', 'bert-base-uncased-AA-probabilities',
        'longformer-base-4096-AA-prediction',
        'longformer-base-4096-AA-probabilities',
        'ModernBERT-base-AA-prediction', 'ModernBERT-base-AA-probabilities'],
       dtype='object'),
 Index([

In [None]:
# from sklearn.metrics import accuracy_score


# def get_report_from_df(df, dataset):
#     out = []
#     cols = ["dataset", "# classes", "min_class_num", "avg_class_num", "model", "accuracy"]
#     models_pred = [col for col in df.columns if col.endswith("-AA-prediction")]
#     labels = df["AA-label"].tolist()
#     number_of_classes = len(set(labels))
#     min_class_num = df["AA-label"].value_counts().min()
#     avg_class_num = df["AA-label"].value_counts().mean()
#     for model_pred in models_pred:
#         preds = df[model_pred].tolist()
#         accu = accuracy_score(labels, preds)
#         model = model_pred.split("-")[0]
#         out.append([dataset, number_of_classes, min_class_num, avg_class_num, model, accu])

#     return pd.DataFrame(out, columns=cols)
        

In [35]:
# o1 = get_report_from_df(df1_test, "blog_test")
# o2 = get_report_from_df(df2_test, "CCAT50_test")
# o3 = get_report_from_df(df3_test, "enron_test")
# o4 = get_report_from_df(df4_test, "reddit_test")
# o = pd.concat([o1, o2, o3, o4], axis=0)
# o = o.reset_index(drop=True)
# o

#### deployment

In [5]:
import os
import json
import pandas as pd

import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from sklearn.metrics import classification_report
from transformers import Trainer, AutoModelForSequenceClassification


def get_text_encodings(model_name, texts, max_length):
    """
    Tokenize ``texts`` with the tokenizer loaded for ``model_name``.

    Every sequence is truncated and padded to exactly ``max_length`` tokens.
    The tokenizer is loaded anew on each call.
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    tokenizer_kwargs = dict(truncation=True,
                            padding="max_length",
                            max_length=max_length)
    return tok(texts, **tokenizer_kwargs)


class CustomDataset(Dataset):
    """
    Map-style dataset that serves tokenizer encodings together with a label.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is indexed the same way and also defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label container decides how many examples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: a tensor per encoding field plus the label tensor.
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


def get_dataset(model_name, texts,
                max_length, labels):
    """
    Tokenize ``texts`` for ``model_name`` and wrap the result, together with
    ``labels``, in a ``CustomDataset``.
    """
    return CustomDataset(
        get_text_encodings(model_name, texts, max_length),
        labels,
    )


def get_model_and_trainer(ckpt_dir):
    """
    Load the sequence-classification model stored in ``ckpt_dir`` and return a
    ``Trainer`` wrapping it.

    Only the trainer is returned (not the model), despite the function name.
    """
    return Trainer(
        model=AutoModelForSequenceClassification.from_pretrained(ckpt_dir)
    )


def print_classification_report(y_test, y_pred):
    """Print sklearn's classification report for ``y_test`` vs ``y_pred`` (zero_division=0)."""
    report = classification_report(y_test, y_pred, zero_division=0)
    print(report)


def sanity_check_av_ckpt_dir(ckpt_dir, test_set_fp=None, 
                             test_on_samples=False, samples_size=1000):
    """
    Re-run a saved checkpoint on a test set and print how often the fresh
    predictions agree with the predictions already stored in that test set.

    Despite the "av" in the name, this check reads the "AA-label" column and
    the "<model>-AA-prediction" column of the test set.

    Args:
        ckpt_dir: Path to a checkpoint directory. ``args.json`` (providing
            "model_name", "max_length" and "test_df_fp") is read from its
            parent directory.
        test_set_fp: CSV to evaluate on. Defaults to ``args["test_df_fp"]``.
        test_on_samples: If True, evaluate on a random subset only.
        samples_size: Maximum number of rows drawn when ``test_on_samples`` is
            True. The draw is seeded (42) and capped at the size of the test set.

    Raises:
        KeyError: If the test set lacks the "text", "AA-label" or stored
            prediction column.
    """
    # normpath drops a trailing separator so dirname() really yields the parent.
    ckpt_dir_parent = os.path.dirname(os.path.normpath(ckpt_dir))

    with open(os.path.join(ckpt_dir_parent, "args.json"), "r") as f:
        args = json.load(f)

    model_name = args["model_name"]
    max_length = args["max_length"]
    if test_set_fp is None:
        test_set_fp = args["test_df_fp"]

    print(f"Test set file path: {test_set_fp}")
    df = pd.read_csv(test_set_fp)

    # Column names use the last path component of the model name.
    short_model_name = model_name.split('/')[-1]
    prev_pred_col = f"{short_model_name}-AA-prediction"

    # Fail early, before the expensive prediction pass, if a column is missing.
    missing_cols = [col for col in ("text", "AA-label", prev_pred_col)
                    if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Columns {missing_cols} not found in {test_set_fp}")

    if test_on_samples:
        # DataFrame.sample raises when asked for more rows than exist.
        n_rows = min(samples_size, len(df))
        df = df.sample(n_rows, random_state=42).reset_index(drop=True)

    labels = df["AA-label"].tolist()
    dataset = get_dataset(model_name, df["text"].tolist(),
                          max_length, labels)

    trainer = get_model_and_trainer(ckpt_dir)
    predictions = trainer.predict(dataset)
    y_pred = predictions.predictions.argmax(-1)

    # Fraction of rows where the re-computed prediction equals the stored one.
    prev_y_pred = df[prev_pred_col]
    overlap = (y_pred == prev_y_pred).mean()
    print(f"Overlap: {overlap:.2f}")

    # print_classification_report(labels, y_pred)

  from .autonotebook import tqdm as notebook_tqdm
2025-05-01 20:37:52.092367: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746146272.103514 2470291 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746146272.107141 2470291 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746146272.117583 2470291 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746146272.117594 2470291 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746146272.117596 2470291

In [50]:
sanity_check_av_ckpt_dir("AA_models/longformer-base-4096/blog/checkpoint-6300", 
                         test_on_samples=True, samples_size=1000)

Test set file path: dataset_prepare/blog_test.csv


Overlap: 1.00


In [51]:
sanity_check_av_ckpt_dir("AA_models/longformer-base-4096/CCAT50/checkpoint-620", 
                         test_on_samples=True, samples_size=1000)

Test set file path: dataset_prepare/CCAT50_test.csv


Overlap: 1.00


In [52]:
sanity_check_av_ckpt_dir("AA_models/longformer-base-4096/enron/checkpoint-970", 
                         test_on_samples=True, samples_size=1000)

Test set file path: dataset_prepare/enron_test.csv


Overlap: 1.00


In [53]:
sanity_check_av_ckpt_dir("AA_models/longformer-base-4096/reddit/checkpoint-2110", 
                         test_on_samples=True, samples_size=1000)

Test set file path: dataset_prepare/reddit_test.csv


Overlap: 1.00


In [54]:
sanity_check_av_ckpt_dir("AA_models/ModernBERT-base/blog/checkpoint-2524", 
                         test_on_samples=True, samples_size=1000)

Test set file path: dataset_prepare/blog_test.csv




Overlap: 1.00


In [55]:
sanity_check_av_ckpt_dir("AA_models/ModernBERT-base/CCAT50/checkpoint-620", 
                         test_on_samples=True, samples_size=1000)

Test set file path: dataset_prepare/CCAT50_test.csv


Overlap: 1.00


In [56]:
sanity_check_av_ckpt_dir("AA_models/ModernBERT-base/enron/checkpoint-970", 
                         test_on_samples=True, samples_size=1000)

Test set file path: dataset_prepare/enron_test.csv


Overlap: 1.00


In [57]:
sanity_check_av_ckpt_dir("AA_models/ModernBERT-base/reddit/checkpoint-2110", 
                         test_on_samples=True, samples_size=1000)

Test set file path: dataset_prepare/reddit_test.csv


Overlap: 0.99


In [58]:
sanity_check_av_ckpt_dir("AA_models/ModernBERT-base/reddit/checkpoint-2110",)

Test set file path: dataset_prepare/reddit_test.csv


Overlap: 1.00


In [59]:
# Inline re-run of the sanity check for the ModernBERT reddit checkpoint, kept
# at notebook scope so that `df`, `labels`, `predictions` and `y_pred` remain
# available to the cells below.
ckpt_dir = "AA_models/ModernBERT-base/reddit/checkpoint-2110"
ckpt_dir_parent = os.path.dirname(ckpt_dir)
# args.json in the checkpoint's parent directory provides model_name,
# max_length and test_df_fp.
with open(os.path.join(ckpt_dir_parent, "args.json"), "r") as f:
    args = json.load(f)
model_name = args["model_name"]
max_length = args["max_length"]
df = pd.read_csv(args["test_df_fp"])
# Same sample size (1000) and seed (42) as the sampled sanity-check calls above.
df = df.sample(1000, random_state=42).reset_index(drop=True)
labels = df["AA-label"].tolist()
dataset = get_dataset(model_name, df["text"].tolist(), 
                      max_length, labels)
trainer = get_model_and_trainer(ckpt_dir)
predictions = trainer.predict(dataset)
# Predicted class = index of the largest logit per row.
y_pred = predictions.predictions.argmax(-1)

NameError: name 'prev_y_pred' is not defined

In [61]:
# Keep only the last path component of the model name; the stored prediction
# column is named "<that component>-AA-prediction".
model_name = model_name.split('/')[-1]
prev_y_pred = df[f"{model_name}-AA-prediction"]
# Fraction of sampled rows where the fresh prediction equals the stored one.
overlap = (y_pred == prev_y_pred).mean()
print(f"Overlap: {overlap:.2f}")

Overlap: 0.99


In [62]:
df[y_pred != prev_y_pred]

Unnamed: 0,index,author,text,subreddit,AA-label,longformer-base-4096-AA-prediction,longformer-base-4096-AA-probabilities,ModernBERT-base-AA-prediction,ModernBERT-base-AA-probabilities
157,252998,TooManyInLitter,"Go, try to enjoy yourself. Look at all the lit...",TrueAtheism,15,15,0.019077,86,0.001097
215,1252363,adrianmonk,"the more scales you know, the easier this is g...",piano,36,63,0.001499,2,0.000527
320,295797,Philo_T_Farnsworth,If ever there was a single movie that defined ...,Physics,81,60,0.001728,4,0.001876
338,1779037,avapoet,"Good advice generally, but not true. Cancellin...",AskUK,5,5,0.00214,90,0.00085
478,1012050,tubcat,Here's my honest opinion here. Find a jumping ...,comicbooks,98,98,0.003665,98,0.014395


In [65]:
from torch.nn.functional import softmax

# Raw logits produced by trainer.predict() in the cell above.
logits = predictions.predictions  # This contains the raw logits output
# Convert logits to probabilities using softmax (over dim=1)
probabilities = softmax(torch.tensor(logits), dim=1)

# Keep the 10 most probable classes per row; k=10 is hard-coded here.
topk_values, topk_indices = torch.topk(probabilities, k=10, dim=1)

# Convert to Python lists for further use
top10_probs = topk_values.tolist()
top10_labels = topk_indices.tolist()

In [72]:
df["top10_probs"] = top10_probs
df["top10_labels"] = top10_labels

In [73]:
df[y_pred != prev_y_pred]

Unnamed: 0,index,author,text,subreddit,AA-label,longformer-base-4096-AA-prediction,longformer-base-4096-AA-probabilities,ModernBERT-base-AA-prediction,ModernBERT-base-AA-probabilities,top10_probs,top10_labels
157,252998,TooManyInLitter,"Go, try to enjoy yourself. Look at all the lit...",TrueAtheism,15,15,0.019077,86,0.001097,"[0.18032802641391754, 0.17674608528614044, 0.1...","[30, 86, 79, 42, 64, 40, 10, 15, 52, 43]"
215,1252363,adrianmonk,"the more scales you know, the easier this is g...",piano,36,63,0.001499,2,0.000527,"[0.1878989189863205, 0.18598251044750214, 0.16...","[26, 2, 84, 59, 39, 29, 25, 64, 36, 77]"
320,295797,Philo_T_Farnsworth,If ever there was a single movie that defined ...,Physics,81,60,0.001728,4,0.001876,"[0.23631545901298523, 0.23211009800434113, 0.1...","[35, 4, 81, 80, 28, 94, 87, 34, 20, 31]"
338,1779037,avapoet,"Good advice generally, but not true. Cancellin...",AskUK,5,5,0.00214,90,0.00085,"[0.2719070017337799, 0.2707090377807617, 0.112...","[33, 90, 49, 5, 25, 47, 42, 53, 3, 89]"
478,1012050,tubcat,Here's my honest opinion here. Find a jumping ...,comicbooks,98,98,0.003665,98,0.014395,"[0.44210806488990784, 0.4293091893196106, 0.04...","[11, 98, 7, 56, 92, 1, 91, 71, 17, 15]"


In [74]:
def get_top_k_accuracy(y_true, y_pred, k=10):
    """
    Calculate the top-k accuracy.

    Args:
        y_true: Sequence of true labels.
        y_pred: Sequence of ranked candidate lists, one per true label
            (``y_pred[i]`` holds the candidates for ``y_true[i]``).
        k: Number of leading candidates of each list to consider.

    Returns:
        float: Fraction of positions whose true label appears among the first
        ``k`` candidates. Returns 0.0 for empty input.

    Raises:
        ValueError: If ``k`` is negative (a negative slice bound would drop
            candidates from the end instead of keeping the first ``k``).
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    total = len(y_true)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    top_k_correct = sum(1 for i in range(total) if y_true[i] in y_pred[i][:k])
    return top_k_correct / total

In [77]:
# Report top-k accuracy for k = 1..10 on the sampled rows, using the ranked
# top-10 label lists computed above.
for k in range(1, 11):
    top_k_accu = get_top_k_accuracy(labels, top10_labels, k=k)
    print(f"Top-{k} accuracy: {top_k_accu:.2f}")

Top-1 accuracy: 0.68
Top-2 accuracy: 0.79
Top-3 accuracy: 0.82
Top-4 accuracy: 0.85
Top-5 accuracy: 0.87
Top-6 accuracy: 0.89
Top-7 accuracy: 0.91
Top-8 accuracy: 0.92
Top-9 accuracy: 0.93
Top-10 accuracy: 0.94


### Real deployment

In [None]:
import os
import json
import pandas as pd

import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from torch.nn.functional import softmax
from sklearn.metrics import classification_report
from transformers import Trainer, AutoModelForSequenceClassification


def get_text_encodings(model_name, texts, max_length):
    """
    Encode ``texts`` with the tokenizer belonging to ``model_name``.

    Sequences are truncated and padded so each has ``max_length`` tokens.
    """
    encode = AutoTokenizer.from_pretrained(model_name)
    encodings = encode(
        texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    return encodings


class CustomDataset(Dataset):
    """
    Dataset over tokenizer encodings and a parallel label sequence.

    Indexing returns a dict with one tensor per encoding field and a
    'labels' tensor; the length is the number of labels.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        row = dict(
            (field, torch.tensor(column[idx]))
            for field, column in self.encodings.items()
        )
        # The label is appended last, under the 'labels' key.
        row['labels'] = torch.tensor(self.labels[idx])
        return row


def get_dataset(model_name, texts,
                max_length, labels):
    """
    Build a ``CustomDataset`` from raw ``texts`` and their ``labels``.

    The texts are tokenized via ``get_text_encodings`` for ``model_name`` at
    ``max_length`` tokens.
    """
    tokenized = get_text_encodings(model_name, texts, max_length)
    return CustomDataset(tokenized, labels)


def get_model_and_trainer(ckpt_dir):
    """
    Return a ``Trainer`` wrapping the sequence-classification model loaded
    from ``ckpt_dir``. The model itself is not returned separately.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(ckpt_dir)
    return Trainer(model=classifier)


def print_classification_report(y_test, y_pred):
    """Print the classification report of ``y_pred`` against ``y_test``, with zero_division=0."""
    text_report = classification_report(y_test, y_pred, zero_division=0)
    print(text_report)


def deploy_an_AA_model(ckpt_dir, deploy_fp, 
                       text_col="writing", 
                       top_k=10, overwrite=False):
    """
    Run an AA checkpoint over the texts in a CSV and store the top-k predicted
    labels and their probabilities back into that same CSV.

    Two columns are added, "<model>-AA-top_k-probabilities" and
    "<model>-AA-top_k-predictions", where <model> is the last path component
    of the model name in ``args.json``. ``deploy_fp`` is overwritten in place.

    Args:
        ckpt_dir: Path to a checkpoint directory. ``args.json`` (providing
            "model_name" and "max_length") is read from its parent directory.
        deploy_fp: CSV file to read the texts from and write the results to.
        text_col: Name of the column holding the texts. Missing values are
            replaced by the placeholder "SOMETHING_WRONG" (also in the saved file).
        top_k: Number of top predictions to keep per row; capped at the number
            of classes the model outputs.
        overwrite: If False and the predictions column already exists, print a
            message and return without doing anything.

    Raises:
        ValueError: If ``text_col`` is not a column of the CSV.
    """
    # normpath drops a trailing separator so dirname() really yields the parent.
    ckpt_dir_parent = os.path.dirname(os.path.normpath(ckpt_dir))

    with open(os.path.join(ckpt_dir_parent, "args.json"), "r") as f:
        args = json.load(f)

    model_name = args["model_name"]
    max_length = args["max_length"]
    model_name__ = model_name.split('/')[-1]

    df = pd.read_csv(deploy_fp)

    if text_col not in df.columns:
        raise ValueError(f"Column '{text_col}' not found in the DataFrame.")

    df[text_col] = df[text_col].fillna("SOMETHING_WRONG")

    if f"{model_name__}-AA-top_k-predictions" in df.columns and not overwrite:
        print(f"Column '{model_name__}-AA-top_k-predictions' already exists in the DataFrame. "
              f"Set 'overwrite=True' to overwrite it.")
        return

    labels = [0] * len(df)  # Dummy labels, not used in prediction
    dataset = get_dataset(model_name,
                          df[text_col].tolist(),
                          max_length, labels)

    trainer = get_model_and_trainer(ckpt_dir)
    predictions = trainer.predict(dataset)
    logits = predictions.predictions  # This contains the raw logits output
    # Convert logits to probabilities using softmax
    probabilities = softmax(torch.tensor(logits), dim=1)

    # torch.topk raises if k exceeds the size of the class dimension.
    effective_k = min(top_k, probabilities.shape[1])
    topk_values, topk_indices = torch.topk(probabilities, k=effective_k, dim=1)

    # Convert to Python lists for further use
    top_k_probs = topk_values.tolist()
    top_k_preds = topk_indices.tolist()
    df[f"{model_name__}-AA-top_k-probabilities"] = top_k_probs
    df[f"{model_name__}-AA-top_k-predictions"] = top_k_preds
    df.to_csv(deploy_fp, index=False)
    print(f"Deployment completed. Results saved to {deploy_fp}")


def main():
    """
    Deploy the latest AA checkpoint of every model/dataset pair on every
    LLM-writing CSV found under ``LLM_writing/Setting{1..5}/<dataset>``.

    Missing directories, missing ``prompts.csv`` files and CSVs whose row
    count differs from their ``prompts.csv`` are reported and skipped.
    """
    models = ["AA_models/longformer-base-4096", 
              "AA_models/ModernBERT-base"]
    datasets = ["CCAT50", "enron", "reddit", "blog"]

    for model in models:
        for dataset in datasets:
            dir_path = os.path.join(model, dataset)
            # Skip (rather than crash in os.listdir) when the model was not trained.
            if not os.path.isdir(dir_path):
                print(f"Model directory {dir_path} does not exist.")
                continue

            # Only "checkpoint-<number>" names can be ordered by step.
            ckpt_dir_names = [dn for dn in os.listdir(dir_path)
                              if dn.startswith("checkpoint-")
                              and dn.split("-")[1].isdecimal()]

            if not ckpt_dir_names:
                print(f"No checkpoints found in {dir_path}.")
                continue

            # select the latest checkpoint (largest step number)
            latest_ckpt = max(ckpt_dir_names, key=lambda x: int(x.split("-")[1]))
            ckpt_dir = os.path.join(dir_path, latest_ckpt)

            for setting in [1, 2, 3, 4, 5]:
                dataset_dir = os.path.join("LLM_writing", f"Setting{setting}", dataset)
                if not os.path.exists(dataset_dir):
                    print(f"Directory for Setting {setting} and dataset {dataset} does not exist.")
                    continue

                prompt_fp = os.path.join(dataset_dir, "prompts.csv")
                if not os.path.exists(prompt_fp):
                    print(f"Prompts file not found in {dataset_dir}.")
                    continue

                df_prompts = pd.read_csv(prompt_fp)

                # Sorted so the processing order is the same on every run.
                llm_fps = [os.path.join(dataset_dir, f)
                           for f in sorted(os.listdir(dataset_dir))
                           if f.endswith(".csv") and f != "prompts.csv"]

                for llm_fp in llm_fps:
                    llm_df = pd.read_csv(llm_fp)

                    # Each writing file is expected to have one row per prompt.
                    if len(df_prompts) != len(llm_df):
                        print(f"Length mismatch between prompts and LLM-generated writing for {llm_fp}.")
                        continue

                    deploy_an_AA_model(ckpt_dir, llm_fp,
                                       text_col="writing",
                                       top_k=10, overwrite=False)


# In a notebook kernel __name__ is "__main__", so executing this cell starts
# the full AA deployment run defined by main() above.
if __name__ == "__main__":
    main()

In [None]:
# One-off deployment of a single checkpoint on a single LLM-writing file,
# using the default text column, top_k and overwrite settings.
ckpt_dir = "AA_models/longformer-base-4096/blog/checkpoint-6300"
fp = "LLM_writing/Setting1/blog/gemini-2.0-flash.csv"
deploy_an_AA_model(ckpt_dir, fp)

In [47]:
sorted(["checkpoint-630", "checkpoint-3300"], key =lambda x: int(x.split("-")[1]))

['checkpoint-630', 'checkpoint-3300']

### AV models

In [None]:
import os
import json
import pandas as pd

import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from sklearn.metrics import classification_report
from transformers import Trainer, AutoModelForSequenceClassification


def get_text_encodings(model_name, texts1, texts2, max_length):
    """
    Tokenize text pairs with the tokenizer loaded for ``model_name``.

    ``texts1`` and ``texts2`` are passed to the tokenizer as its first and
    second positional arguments; each encoded pair is truncated and padded
    to ``max_length`` tokens.
    """
    pair_tokenizer = AutoTokenizer.from_pretrained(model_name)
    options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }
    return pair_tokenizer(texts1, texts2, **options)


class CustomDataset(Dataset):
    """
    Wraps tokenizer encodings and labels so they can be indexed per example.

    Each item is a dict of tensors: one entry per encoding field plus a
    'labels' entry. The number of items equals ``len(labels)``.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        n_examples = len(self.labels)
        return n_examples

    def __getitem__(self, idx):
        features = {}
        for field, per_example in self.encodings.items():
            features[field] = torch.tensor(per_example[idx])
        label_tensor = torch.tensor(self.labels[idx])
        features['labels'] = label_tensor
        return features


def get_dataset(model_name, texts1, texts2,
                max_length, labels):
    """
    Tokenize the (``texts1``, ``texts2``) pairs for ``model_name`` and wrap
    the encodings together with ``labels`` in a ``CustomDataset``.
    """
    pair_encodings = get_text_encodings(model_name, texts1, texts2, max_length)
    return CustomDataset(pair_encodings, labels)


def get_model_and_trainer(model_load_file):
    """
    Load a sequence-classification model from ``model_load_file`` and return
    a ``Trainer`` built around it (only the trainer is returned).
    """
    return Trainer(
        model=AutoModelForSequenceClassification.from_pretrained(model_load_file)
    )


def deploy_an_AV_model(ckpt_dir, 
                       deploy_fp1, 
                       deploy_fp2,
                       text_col1,
                       text_col2,
                       overwrite=False):
    """
    Run an AV checkpoint over row-aligned text pairs taken from two CSVs and
    store the prediction and class probabilities in the second CSV.

    Row i of ``deploy_fp1[text_col1]`` is paired with row i of
    ``deploy_fp2[text_col2]``. Two columns, "<model>-prediction" and
    "<model>-probabilities", are added to the second frame, which is then
    written back to ``deploy_fp2`` (overwriting it in place). <model> is the
    last path component of the model name in ``args.json``.

    All rows of both files are processed; the files are no longer cut to
    their first 1000 rows, because saving a truncated frame back to
    ``deploy_fp2`` discarded every later row of that file.

    Args:
        ckpt_dir: Path to a checkpoint directory. ``args.json`` (providing
            "model_name" and "max_length") is read from its parent directory.
        deploy_fp1: CSV holding the first text of each pair.
        deploy_fp2: CSV holding the second text of each pair; receives the results.
        text_col1: Text column name in ``deploy_fp1``.
        text_col2: Text column name in ``deploy_fp2``.
        overwrite: If False and the prediction column already exists in
            ``deploy_fp2``, print a message and return without doing anything.

    Raises:
        AssertionError: If the two CSVs have a different number of rows.
        ValueError: If a text column is missing from its CSV.
    """
    # Imported locally: the import block of this cell does not provide softmax.
    from torch.nn.functional import softmax

    # normpath drops a trailing separator so dirname() really yields the parent.
    ckpt_dir_parent = os.path.dirname(os.path.normpath(ckpt_dir))

    with open(os.path.join(ckpt_dir_parent, "args.json"), "r") as f:
        args = json.load(f)

    model_name = args["model_name"]
    max_length = args["max_length"]

    df1 = pd.read_csv(deploy_fp1)
    df2 = pd.read_csv(deploy_fp2)

    assert df1.shape[0] == df2.shape[0], \
        f"DataFrames must have the same number of rows. " \
        f"Got {df1.shape[0]} and {df2.shape[0]} rows."

    model_name__ = model_name.split('/')[-1]
    if f"{model_name__}-prediction" in df2.columns and not overwrite:
        print(f"Column '{model_name__}-prediction' already exists in the DataFrame. "
              f"Set 'overwrite=True' to overwrite it.")
        return

    if text_col1 not in df1.columns:
        raise ValueError(f"Column '{text_col1}' not found in the DataFrame.")
    if text_col2 not in df2.columns:
        raise ValueError(f"Column '{text_col2}' not found in the DataFrame.")

    # Missing texts are replaced by a placeholder so the tokenizer gets strings.
    df1[text_col1] = df1[text_col1].fillna("SOMETHING_WRONG")
    df2[text_col2] = df2[text_col2].fillna("SOMETHING_WRONG")

    labels = [0] * len(df1)  # Dummy labels, not used in prediction
    dataset = get_dataset(model_name, df1[text_col1].tolist(),
                          df2[text_col2].tolist(), max_length, labels)

    trainer = get_model_and_trainer(ckpt_dir)
    predictions = trainer.predict(dataset)
    logits = predictions.predictions  # This contains the raw logits output
    y_pred = logits.argmax(-1)
    # Convert logits to probabilities using softmax
    probabilities = softmax(torch.tensor(logits), dim=1).tolist()

    df2[f"{model_name__}-prediction"] = y_pred
    df2[f"{model_name__}-probabilities"] = probabilities
    df2.to_csv(deploy_fp2, index=False)
    print(f"Deployment completed. Results saved to {deploy_fp2}")


def main():
    """
    Deploy the latest AV checkpoint of every model/dataset pair on every
    LLM-writing CSV found under ``LLM_writing/Setting{1..5}/<dataset>``,
    pairing each row of ``prompts.csv`` ("text" column) with the same row of
    the LLM-writing file ("writing" column).

    Missing directories, missing ``prompts.csv`` files and CSVs whose row
    count differs from their ``prompts.csv`` are reported and skipped.
    """
    models = ["AV_models/longformer-base-4096", 
              "AV_models/ModernBERT-base"]
    datasets = ["CCAT50", "enron", "reddit", "blog"]

    for model in models:
        for dataset in datasets:
            dir_path = os.path.join(model, dataset)
            # Skip (rather than crash in os.listdir) when the model was not trained.
            if not os.path.isdir(dir_path):
                print(f"Model directory {dir_path} does not exist.")
                continue

            # Only "checkpoint-<number>" names can be ordered by step.
            ckpt_dir_names = [dn for dn in os.listdir(dir_path)
                              if dn.startswith("checkpoint-")
                              and dn.split("-")[1].isdecimal()]

            if not ckpt_dir_names:
                print(f"No checkpoints found in {dir_path}.")
                continue

            # select the latest checkpoint (largest step number)
            latest_ckpt = max(ckpt_dir_names, key=lambda x: int(x.split("-")[1]))
            ckpt_dir = os.path.join(dir_path, latest_ckpt)

            for setting in [1, 2, 3, 4, 5]:
                dataset_dir = os.path.join("LLM_writing", f"Setting{setting}", dataset)
                if not os.path.exists(dataset_dir):
                    print(f"Directory for Setting {setting} and dataset {dataset} does not exist.")
                    continue

                prompt_fp = os.path.join(dataset_dir, "prompts.csv")
                if not os.path.exists(prompt_fp):
                    print(f"Prompts file not found in {dataset_dir}.")
                    continue
                df_prompts = pd.read_csv(prompt_fp)

                # Sorted so the processing order is the same on every run.
                llm_fps = [os.path.join(dataset_dir, f)
                           for f in sorted(os.listdir(dataset_dir))
                           if f.endswith(".csv") and f != "prompts.csv"]

                for llm_fp in llm_fps:
                    llm_df = pd.read_csv(llm_fp)

                    # Each writing file is expected to have one row per prompt.
                    if len(df_prompts) != len(llm_df):
                        print(f"Length mismatch between prompts and LLM-generated writing for {llm_fp}.")
                        continue

                    print(f"===> Deploying model {model} on {llm_fp}")
                    # deploy_an_AV_model reads and writes CSV files, so it must be
                    # given the path (llm_fp), not the already-loaded DataFrame.
                    deploy_an_AV_model(ckpt_dir, prompt_fp,
                                       llm_fp, text_col1="text",
                                       text_col2="writing",
                                       overwrite=False)


# In a notebook kernel __name__ is "__main__", so executing this cell starts
# the full AV deployment run defined by main() above.
if __name__ == "__main__":
    main()


### Style Models

In [13]:
!ls

AA_models			      LLM_writing
AV_models			      notebooks
create_stylometry_features.py	      README.md
create_summaries_for_eval_samples.py  requirements.txt
create_summaries.sh		      scripts
dataset_prepare			      Style_features_LLM
fileStructure.png		      train_AA_classifiers.sh
generate_llm_writing.py		      train_and_eval_an_AA_model.py
generate_llm_writing.sh		      train_and_eval_an_AV_model.py
LIWC2007_English100131.dic	      train_AV_classifiers.sh


In [15]:
df = pd.read_csv("Style_features_LLM/Setting1/blog/gemini-2.0-flash_features.csv")
df

Unnamed: 0,writing,liwc_features,writeprint_features
0,"so, alltel. :) monthly visits to see the hotti...","{'liwc_funct_frac': 0.3575, 'liwc_pronoun_frac...","{'letter_a': 0.0584, 'letter_b': 0.0173, 'lett..."
1,IT Jobs Moving Overseas\n\nAccording to a Gart...,"{'liwc_funct_frac': 0.4196, 'liwc_pronoun_frac...","{'letter_a': 0.0378, 'letter_b': 0.0198, 'lett..."
2,"So, this nightclub fire thing. Awful, right? J...","{'liwc_funct_frac': 0.409, 'liwc_pronoun_frac'...","{'letter_a': 0.0644, 'letter_b': 0.0145, 'lett..."
3,"Ugh, conservatives. Seriously, what IS that ab...","{'liwc_funct_frac': 0.3946, 'liwc_pronoun_frac...","{'letter_a': 0.054, 'letter_b': 0.0154, 'lette..."
4,"beach trip! yay! the weather was so nice, and ...","{'liwc_funct_frac': 0.4516, 'liwc_pronoun_frac...","{'letter_a': 0.062, 'letter_b': 0.0113, 'lette..."
...,...,...,...
25220,"So, Rolling Stone just dropped their top 500 a...","{'liwc_funct_frac': 0.3984, 'liwc_pronoun_frac...","{'letter_a': 0.0583, 'letter_b': 0.0168, 'lett..."
25221,Reflections on Doing All to the Glory of God\n...,"{'liwc_funct_frac': 0.5145, 'liwc_pronoun_frac...","{'letter_a': 0.0499, 'letter_b': 0.0138, 'lett..."
25222,Hey Gals! Things have been so crazy busy here ...,"{'liwc_funct_frac': 0.465, 'liwc_pronoun_frac'...","{'letter_a': 0.0547, 'letter_b': 0.0118, 'lett..."
25223,"Eh, whatever about the World Series. I used to...","{'liwc_funct_frac': 0.4655, 'liwc_pronoun_frac...","{'letter_a': 0.0638, 'letter_b': 0.0213, 'lett..."


In [1]:
import os

# Move the working directory one level up so relative paths such as
# "LLM_writing/..." resolve. Relative, so re-running moves up one more level.
os.chdir("../")

In [3]:
import pandas as pd

df = pd.read_csv("LLM_writing/Setting1/blog/gpt-4o-2024-08-06.csv")
df.columns

Index(['writing', 'longformer-base-4096-AA-top_k-probabilities',
       'longformer-base-4096-AA-top_k-predictions',
       'ModernBERT-base-AA-top_k-probabilities',
       'ModernBERT-base-AA-top_k-predictions', 'ModernBERT-base-prediction',
       'ModernBERT-base-probabilities', 'ModernBERT-base-AV-prediction',
       'ModernBERT-base-AV-probabilities'],
      dtype='object')