In [None]:
# https://github.com/baixianghuang/authorship-llm/tree/main

In [None]:
!pip install openai

In [None]:
!pip install tiktoken

In [None]:
import os
import csv
import json
import time
import torch
import openai
import pickle
import random
import tiktoken
# import py3langid
import numpy as np
import pandas as pd
import torch.nn.functional as F

from random import shuffle
from sklearn import metrics
from ast import literal_eval
from openai import AzureOpenAI
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

In [None]:
# Default number of query texts evaluated per sampled author set
# (used as the default `n_eval` of run_aa and compare_baseline_mod).
N_EVAL = 10

In [None]:
def num_tokens_from_string(string, encoding_name):
    """Count the tokens `string` is split into by a model's tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Forwarded to `tiktoken.encoding_for_model`, so despite its
            name this is a model name (the notebook passes "gpt-3.5-turbo").

    Returns:
        The number of tokens as an int.
    """
    tokenizer = tiktoken.encoding_for_model(encoding_name)
    return len(tokenizer.encode(string))


def eval_fn(y_test, y_pred):
    """Score predictions against labels with scikit-learn classification metrics.

    Every score is expressed as a percentage rounded to two decimals. Macro
    precision and macro recall are only printed; they are not returned.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        Tuple `(accuracy, weighted F1, micro F1, macro F1)`.
    """
    def _pct(score):
        # Fraction -> percentage with two decimals.
        return round(score * 100, 2)

    acc = _pct(metrics.accuracy_score(y_test, y_pred))
    f1_w, f1_micro, f1_macro = (
        _pct(metrics.f1_score(y_test, y_pred, average=avg))
        for avg in ('weighted', 'micro', 'macro')
    )

    macro_precision = _pct(metrics.precision_score(y_test, y_pred, average='macro'))
    macro_recall = _pct(metrics.recall_score(y_test, y_pred, average='macro'))
    print(f"Macro Precision: {macro_precision}%")
    print(f"Macro Recall: {macro_recall}%")

    return acc, f1_w, f1_micro, f1_macro


def embed_fn(model_name, texts, baseline_type):
    """Embed a batch of texts with one of the supported baselines.

    Args:
        model_name: Hugging Face model id; only read when `baseline_type == 'bert'`.
        texts: List of strings to embed.
        baseline_type: One of 'bert' (mean-pooled transformer hidden states),
            'tf-idf' (character 4-gram TF-IDF fitted on `texts` themselves) or
            'ada' (Azure OpenAI embeddings endpoint).

    Returns:
        A 2-D torch tensor with one row per input text.

    Raises:
        ValueError: If `baseline_type` is not one of the supported values.
    """
    if baseline_type == 'bert':
        model = AutoModel.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenized_texts = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        input_ids = tokenized_texts.input_ids.to(model.device)
        attention_mask = tokenized_texts.attention_mask.to(model.device)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            hidden = model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        # Average over real tokens only. A plain mean over dim=1 would also average
        # the padding positions, making a text's embedding depend on the longest
        # text that happens to be in the same batch.
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        embedding = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    elif baseline_type == 'tf-idf':
        # The vectorizer is fitted on `texts`, so the feature columns are only
        # comparable between texts embedded in the same call.
        vectorizer = TfidfVectorizer(max_features=3000, analyzer='char', ngram_range=(4, 4))
        embedding = torch.from_numpy(vectorizer.fit_transform(texts).toarray())
    elif baseline_type == 'ada':
        ada_client = AzureOpenAI(api_key = "replace_this", api_version = "2023-05-15", azure_endpoint = "replace_this")
        ada_response = ada_client.embeddings.create(input = texts, model = "replace_this")
        embedding = torch.Tensor([e.embedding for e in ada_response.data])
    else:
        # Previously fell through to `return embedding` and raised UnboundLocalError.
        raise ValueError(
            f"Unknown baseline_type {baseline_type!r}; expected 'bert', 'tf-idf' or 'ada'."
        )
    return embedding


def run_aa_baseline(df_sub, model_name, baseline_type='bert'):
    """Evaluate an embedding-similarity baseline on every sampled author set.

    For each row of `df_sub`, query and candidate texts are embedded, L2-normalised
    and compared by dot product; each query is assigned the candidate with the
    highest similarity. The correct candidate for the query at position k is the
    candidate at position k.

    Args:
        df_sub: DataFrame with list-valued `query_text` and `potential_text` columns.
        model_name: Forwarded to `embed_fn`.
        baseline_type: Forwarded to `embed_fn` ('bert', 'tf-idf' or 'ada').

    Returns:
        `(muti_avg, muti_std)`: two 4-tuples holding the mean and the standard
        deviation over rows of (accuracy, weighted F1, micro F1, macro F1),
        each rounded to two decimals.
    """
    ls_acc, ls_f1_w, ls_f1_micro, ls_f1_macro = [], [], [], []

    for i in df_sub.index:
        ls_query_text = list(df_sub.loc[i, 'query_text'])
        ls_potential_text = list(df_sub.loc[i, 'potential_text'])
        n_query = len(ls_query_text)

        # Embed both sets in ONE call so they live in the same feature space.
        # embed_fn's TF-IDF branch fits a fresh vectorizer per call; embedding the
        # two lists separately gave them different vocabularies, so the dot
        # product compared unrelated (or differently sized) feature columns.
        embed_all = F.normalize(embed_fn(model_name, ls_query_text + ls_potential_text, baseline_type))
        embed_query_texts, embed_potential_texts = embed_all[:n_query], embed_all[n_query:]

        preds = embed_query_texts @ embed_potential_texts.T
        preds = F.softmax(preds, dim=-1)
        labels = np.arange(0, n_query)

        acc, f1_w, f1_micro, f1_macro = eval_fn(labels, preds.argmax(-1).numpy())
        ls_acc.append(acc)
        ls_f1_w.append(f1_w)
        ls_f1_micro.append(f1_micro)
        ls_f1_macro.append(f1_macro)

    muti_avg = (round(np.mean(ls_acc), 2), round(np.mean(ls_f1_w), 2), round(np.mean(ls_f1_micro), 2), round(np.mean(ls_f1_macro), 2))
    muti_std = (round(np.std(ls_acc), 2), round(np.std(ls_f1_w), 2), round(np.std(ls_f1_micro), 2), round(np.std(ls_f1_macro), 2))
    return muti_avg, muti_std

# Data preparation

In [None]:
# Data-preparation pipeline for the blog corpus: drop metadata columns, remove
# duplicate texts, keep English texts, filter by token count, and keep authors
# with at least two texts.
# NOTE(review): `%%time` is a cell magic and appears several times in the middle
# of this cell — presumably each section was its own cell originally; confirm
# and split before running.
df = pd.read_csv("blogtext.csv")
# Keep only the columns used downstream (the remaining ones include `id` and `text`).
df.drop(['gender', 'age', 'topic', 'sign', 'date'], axis=1, inplace=True)
# Bare expression: only rendered when it is the last statement of a cell.
df.shape


# Finding and removing duplicate rows
# Shows every row whose text occurs more than once, grouped by sorting on the text.
df[df[['text']].duplicated(keep=False)].sort_values('text')


print('Before removing duplicates, df.shape:', df.shape)
# Keep the first occurrence of each text and renumber the index.
df = df.drop_duplicates(subset=['text'], keep='first').reset_index(drop=True)
print('New df.shape:', df.shape)


%%time
print(f"{df.shape[0]:,}")
# NOTE(review): `import py3langid` is commented out in the import cell, so this
# line raises NameError unless that import is restored.
df['lang'] = df['text'].apply(lambda x: py3langid.classify(x)[0])
# Prints the fraction of English rows (a ratio, not multiplied by 100).
print('% of English text:', f"{df[df.lang=='en'].shape[0] / df.shape[0]}")

# Keep English rows only, then drop the helper column.
df = df[df.lang=='en']
df.drop('lang', axis=1, inplace=True)
print(f"{df.shape[0]:,}")


# Spot-check the combined token count of 10 random pairs of texts.
for i in range(10):
    text1, text2 = df.sample(2).text.values
    print(num_tokens_from_string(text1 + text2, "gpt-3.5-turbo"))


%%time
# Keep texts shorter than 512 tokens.
df = df[df["text"].apply(lambda x: num_tokens_from_string(x, "gpt-3.5-turbo") < 512)]
print(f"{df.shape[0]:,}")


%%time
# Keep texts longer than 56 tokens (each text is tokenized a second time here).
df = df[df["text"].apply(lambda x: num_tokens_from_string(x, "gpt-3.5-turbo") > 56)]
print(f"{df.shape[0]:,}")


# Keep only authors that still have at least two texts.
v = df.id.value_counts()
df = df[df.id.isin(v[v >= 2].index)]
print('# unique authors:', df.id.nunique())
print('New df.shape:', df.shape)


# NOTE(review): this function is defined a second time in the following cell;
# the later definition shadows this one. Keep a single copy.
def sampler_aa_fn_pro(df, n, reps):
    """
    Build `reps` disjoint authorship-attribution sets of `n` authors each.

    For every set, `n` authors are drawn without replacement from the authors not
    used by an earlier set, and two different rows are sampled per author: one
    becomes the query text, the other the candidate ("potential") text. Texts at
    the same list position therefore come from the same author, so the number of
    labels per set equals `n`. Metrics are meant to be computed per set and then
    averaged over the repetitions.

    df: DataFrame with an `id` (author) column and a `text` column.
    n: number of candidate authors per set.
    reps: number of repetitions (sets).

    Returns a DataFrame with one row per set and two list-valued columns,
    `query_text` and `potential_text`.

    Raises ValueError when fewer than `n * reps` authors have at least two texts.
    """
    # Only authors with >= 2 texts can supply both a query and a candidate text;
    # sampling two rows from a single-text author raises inside pandas.
    text_counts = df.id.value_counts()
    ls_unique_author = [a for a in df.id.unique().tolist() if text_counts.get(a, 0) >= 2]
    if n * reps > len(ls_unique_author):
        raise ValueError(
            f"Need {n * reps} authors with at least two texts "
            f"({n} per set x {reps} sets) but only {len(ls_unique_author)} are available."
        )

    dict_to_df = []
    for _ in range(reps):
        candidate_authors = random.sample(ls_unique_author, n)
        # Remove the drawn authors so later sets never reuse them (set -> O(1) lookups).
        drawn = set(candidate_authors)
        ls_unique_author = [e for e in ls_unique_author if e not in drawn]
        ls_queries, ls_potential_texts = [], []

        for author_id in candidate_authors:
            # each text in these 2 lists is from a unique author, texts at the same index are from the same author
            text, text_same_author = df.loc[author_id == df.id].text.sample(2)
            ls_queries.append(text)
            ls_potential_texts.append(text_same_author)

        dict_to_df.append({"query_text": ls_queries, "potential_text": ls_potential_texts})

    df_sub = pd.DataFrame(dict_to_df)
    return df_sub




In [None]:

# NOTE(review): an identical definition of this function also appears in the
# preceding cell; this later one is the definition in effect. Keep a single copy.
def sampler_aa_fn_pro(df, n, reps):
    """
    Build `reps` disjoint authorship-attribution sets of `n` authors each.

    For every set, `n` authors are drawn without replacement from the authors not
    used by an earlier set, and two different rows are sampled per author: one
    becomes the query text, the other the candidate ("potential") text. Texts at
    the same list position therefore come from the same author, so the number of
    labels per set equals `n`. Metrics are meant to be computed per set and then
    averaged over the repetitions.

    df: DataFrame with an `id` (author) column and a `text` column.
    n: number of candidate authors per set.
    reps: number of repetitions (sets).

    Returns a DataFrame with one row per set and two list-valued columns,
    `query_text` and `potential_text`.

    Raises ValueError when fewer than `n * reps` authors have at least two texts.
    """
    # Only authors with >= 2 texts can supply both a query and a candidate text;
    # sampling two rows from a single-text author raises inside pandas.
    text_counts = df.id.value_counts()
    ls_unique_author = [a for a in df.id.unique().tolist() if text_counts.get(a, 0) >= 2]
    if n * reps > len(ls_unique_author):
        raise ValueError(
            f"Need {n * reps} authors with at least two texts "
            f"({n} per set x {reps} sets) but only {len(ls_unique_author)} are available."
        )

    dict_to_df = []
    for _ in range(reps):
        candidate_authors = random.sample(ls_unique_author, n)
        # Remove the drawn authors so later sets never reuse them (set -> O(1) lookups).
        drawn = set(candidate_authors)
        ls_unique_author = [e for e in ls_unique_author if e not in drawn]
        ls_queries, ls_potential_texts = [], []

        for author_id in candidate_authors:
            # each text in these 2 lists is from a unique author, texts at the same index are from the same author
            text, text_same_author = df.loc[author_id == df.id].text.sample(2)
            ls_queries.append(text)
            ls_potential_texts.append(text_same_author)

        dict_to_df.append({"query_text": ls_queries, "potential_text": ls_potential_texts})

    df_sub = pd.DataFrame(dict_to_df)
    return df_sub

# Load the test split and expose it under the column names the sampler expects
# (`text` for the document, `id` for the author).
# nlp_test = pd.read_csv('/content/drive/MyDrive/msc_project/data/imdb62/processed/imdb62_AA_test.csv')
# nlp_test.columns = ['id', 'text']
nlp_test = (
    pd.read_csv('/content/drive/MyDrive/msc_project/data/diffusiondb/paraphrased/test_random100_label_1.csv')
    .loc[:, ['prompt', 'user_name']]
    .rename(columns={'prompt': 'text', 'user_name': 'id'})
)
print(nlp_test)
# 3 disjoint sets of 10 candidate authors each.
df_sub = sampler_aa_fn_pro(nlp_test, 10, 3)

In [None]:
# Display the sampled author sets (one row per repetition).
df_sub

# Experiments

In [None]:
def run_aa(df, method, model_name, prompt_input, system_msg, ls_df, ls_model, ls_method, n_eval=N_EVAL):
    """Ask a chat model to attribute sampled query texts to candidate authors.

    For every row of `df`, `n_eval` of the row's query texts are sampled and each
    one is sent to `model_name` together with all of that row's candidate texts
    (a JSON object keyed by list position). The model is asked for a JSON object;
    its `answer` is compared later against the query's list position (`label`).

    Args:
        df: DataFrame with list-valued `query_text` and `potential_text` columns;
            the query at position k belongs to the candidate at position k.
        method: Name of the prompt variant; appended to `ls_method`.
        model_name: Chat-completion model name; appended to `ls_model`.
        prompt_input: Instruction text placed before the delimited input texts.
        system_msg: System message describing the expected JSON reply.
        ls_df, ls_model, ls_method: Lists mutated in place; the result frame,
            `model_name` and `method` are appended so runs can be compared later.
        n_eval: Number of query texts evaluated per row of `df`.

    Returns:
        DataFrame with one row per evaluated query: the keys of the model's JSON
        reply plus `query_text`, `example_texts`, `tokens` and `label`. `answer`
        is numeric, with -1 standing for a missing or unparseable answer.

    Raises:
        ValueError: If `n_eval` exceeds the number of query texts in a row
            (raised by the sampling step).
    """
    start_time = time.time()
    ls_frames = []
    print("\n++++++++++ ", method, model_name, n_eval, " ++++++++++")

    for i in df.index:
        ls_reps = []
        ls_query_text, ls_potential_text = df.loc[i, 'query_text'], df.loc[i, 'potential_text']

        # A private generator seeded with 0 draws exactly what
        # `random.seed(0); random.sample(...)` would, without resetting the
        # global `random` state as a side effect on every row.
        rng = random.Random(0)
        # Keep (position, text) pairs: looking the label up by text afterwards
        # would assign the wrong label whenever two query texts are identical.
        sampled_queries = rng.sample(list(enumerate(ls_query_text)), n_eval)

        # Identical for every query of this row, so serialise it once.
        example_texts = json.dumps(dict(enumerate(ls_potential_text)))

        for label, query_text in sampled_queries:
            prompt = prompt_input+f"""The input texts are delimited with triple backticks. ```\n\nQuery text: {query_text} \n\nTexts from potential authors: {example_texts}\n\n```"""
            # List of potential author IDs: {list(dict(enumerate(ls_potential_text)).keys())}

            raw_response = openai.chat.completions.create(
                model=model_name,
                response_format={"type": "json_object"},
                messages=[
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": prompt}
                ],
                temperature=0
            )

            response_str = raw_response.choices[0].message.content
            print(prompt)
            print('\nRaw response content:\n', response_str, '\nLabel:', label)
            try:
                response = json.loads(response_str, strict=False)
            except (json.JSONDecodeError, TypeError):
                # TypeError covers a reply whose content is None.
                response = None
            if not isinstance(response, dict):
                print("++++++++++ JSONDecodeError ++++++++++")
                response = {'analysis': response_str, 'answer': -1}
            # A well-formed reply may still omit the key; count it as "no answer".
            response.setdefault('answer', -1)

            response["query_text"], response["example_texts"] = query_text, example_texts
            response["tokens"] = raw_response.usage.total_tokens
            response["label"] = label
            ls_reps.append(response)

        if not ls_reps:
            # n_eval == 0: nothing was evaluated for this row.
            continue
        df_reps = pd.DataFrame(ls_reps)
        # Non-numeric answers become NaN and then -1, i.e. they count as wrong.
        df_reps['answer'] = pd.to_numeric(df_reps['answer'], errors='coerce').fillna(-1)
        ls_frames.append(df_reps)

    # Concatenate once instead of growing a DataFrame inside the loop.
    df_res_all = pd.concat(ls_frames).reset_index(drop=True) if ls_frames else pd.DataFrame()

    ls_df.append(df_res_all)
    ls_method.append(method)
    ls_model.append(model_name)
    print("--- Execution Time: %s seconds ---" % round(time.time() - start_time, 2))
    return df_res_all

In [None]:
# Display name -> model identifier of each embedding baseline.
dict_baseline = {
    'TF-IDF': 'TF-IDF',
    'BERT': 'bert-base-uncased',
    'RoBERTa': 'roberta-base',
    'ELECTRA': 'google/electra-base-discriminator',
    'DeBERTa': 'microsoft/deberta-base',
}
# Display name -> embedding type string; every transformer baseline maps to 'bert'.
dict_embed_type = {
    'TF-IDF': 'tf-idf',
    **{name: 'bert' for name in ('BERT', 'RoBERTa', 'ELECTRA', 'DeBERTa')},
}

def compare_baseline_mod(df_sub, ls_df, ls_model, ls_method, n_eval=N_EVAL, std_flag=False, baseline_idx=len(dict_baseline)):
    """Summarise the collected LLM runs in one table (one row per run).

    Each frame in `ls_df` is scored with `eval_all_fn`; the row for a run holds
    its method and model name, the averaged metrics and the number of answers
    equal to -1 ("Unsure"). Column names come from the module-level `ls_col`.

    Args:
        df_sub: Only referenced by the disabled baseline loop below.
        ls_df: Result frames, as collected by `run_aa`.
        ls_model: Model name of each run, aligned with `ls_df`.
        ls_method: Prompt-variant name of each run, aligned with `ls_df`.
        n_eval: Rows per repetition; must match the value used in `run_aa`.
        std_flag: When True, also return the table of standard deviations.
        baseline_idx: Only referenced by the disabled baseline loop below.

    Returns:
        `res_avg`, or `(res_avg, res_std)` when `std_flag` is True. The 'Unsure'
        column of `res_std` is None.
    """
    rows_avg, rows_std = [], []

    # Embedding baselines (currently disabled):
    # for key, val in list(dict_baseline.items())[:baseline_idx]:
    #     muti_avg, muti_std = run_aa_baseline(df_sub, val, dict_embed_type[key])
    #     ls_res_avg.append((key, val)+muti_avg+(0,))
    #     ls_res_std.append((key, val)+muti_std+(None,))

    for pos in range(len(ls_df)):
        df_tmp = ls_df[pos]
        avg_scores, std_scores = eval_all_fn(df_tmp, n_eval)

        run_key = (ls_method[pos], ls_model[pos])
        # Number of rows whose answer is the -1 placeholder.
        n_unsure = (df_tmp.answer == -1).sum()

        rows_avg.append(run_key + avg_scores + (n_unsure,))
        rows_std.append(run_key + std_scores + (None,))

    res_avg = pd.DataFrame(rows_avg, columns=ls_col)
    res_std = pd.DataFrame(rows_std, columns=ls_col)
    if not std_flag:
        return res_avg
    return res_avg, res_std


def eval_all_fn(df_res_all, n_eval):
    """Score a result frame repetition by repetition and aggregate the scores.

    The frame is cut into consecutive chunks of `n_eval` rows (one chunk per
    repetition); each chunk is scored with `eval_fn` on its `label` and `answer`
    columns. Null / -1 answers simply count as wrong predictions.
    `n_eval` must be the value that was used in run_aa().

    Returns:
        `(muti_avg, muti_std)`: the mean and the standard deviation over chunks
        of (accuracy, weighted F1, micro F1, macro F1), rounded to two decimals.
    """
    per_metric = {'acc': [], 'f1_w': [], 'f1_micro': [], 'f1_macro': []}

    for start in range(0, len(df_res_all.index), n_eval):
        df_reps = df_res_all[start: start+n_eval]
        acc, f1_w, f1_micro, f1_macro = eval_fn(df_reps["label"], df_reps["answer"])
        for bucket, score in zip(per_metric.values(), (acc, f1_w, f1_micro, f1_macro)):
            bucket.append(score)

    muti_avg = tuple(round(np.mean(scores), 2) for scores in per_metric.values())
    muti_std = tuple(round(np.std(scores), 2) for scores in per_metric.values())
    return muti_avg, muti_std

In [None]:
# api_version = "2023-12-01-preview"  # "2023-05-15"
# Column names of the summary tables built by compare_baseline_mod.
ls_col = ['Prompt', 'Model', 'Accuracy', 'Weighted F1', 'Micro F1', 'Macro F1', 'Unsure']

# Read the key from the environment so it is never saved inside the notebook.
# Falls back to the previous empty string when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

# n = 10 candidate authors

In [None]:
# m1, m2 = "gpt-35-turbo", "gpt-4-turbo"
# m1, m2 = 'gpt-3.5-turbo', 'gpt-4-turbo'
m1 = 'gpt-4-turbo'

# Names of the four prompt variants (v1..v4 pair with prompt1..prompt4).
v1 = 'no_guidance'
v2 = 'little_guidance'
v3 = 'grammar'
v4 = 'LIP'

# Every variant starts with the same task statement and appends its own guidance.
prompt1 = "Given a set of texts with known authors and a query text, determine the author of the query text. "
prompt2 = f"{prompt1}Do not consider topic differences. "
prompt3 = f"{prompt1}Focus on grammatical styles. "
prompt4 = f"{prompt1}Analyze the writing styles of the input texts, disregarding the differences in topic and content. Focus on linguistic features such as phrasal verbs, modal verbs, punctuation, rare words, affixes, quantities, humor, sarcasm, typographical errors, and misspellings. "

# System message describing the JSON reply (`analysis` and `answer` keys).
system_msg = """Respond with a JSON object including two key elements:
{
  "analysis": Reasoning behind your answer.
  "answer": The query text's author ID.
}"""

In [None]:
%%time

# imdb62
ls_df_10, ls_model_10, ls_method_10 = [], [], []

# df1_gpt35 = run_aa(df_10, v1, m1, prompt1, system_msg, ls_df_10, ls_model_10, ls_method_10)
# df2_gpt35 = run_aa(df_10, v2, m1, prompt2, system_msg, ls_df_10, ls_model_10, ls_method_10)
# df3_gpt35 = run_aa(df_10, v3, m1, prompt3, system_msg, ls_df_10, ls_model_10, ls_method_10)
# df4_gpt35 = run_aa(df_10, v4, m1, prompt4, system_msg, ls_df_10, ls_model_10, ls_method_10)

# df1_gpt4 = run_aa(df_10, v1, m2, prompt1, system_msg, ls_df_10, ls_model_10, ls_method_10)
# df2_gpt4 = run_aa(df_10, v2, m2, prompt2, system_msg, ls_df_10, ls_model_10, ls_method_10)
# df3_gpt4 = run_aa(df_10, v3, m2, prompt3, system_msg, ls_df_10, ls_model_10, ls_method_10)
df4_gpt4 = run_aa(df_sub, v4, m1, prompt4, system_msg, ls_df_10, ls_model_10, ls_method_10)

compare_baseline_mod(df_sub, ls_df_10, ls_model_10, ls_method_10)