# Preliminaries

In [1]:
import os
import random
from io import StringIO

import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import statsmodels.api as sm
from statsmodels.formula.api import ols

from scipy.spatial.distance import cdist

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LogNorm
from matplotlib.ticker import LogLocator

from tqdm import tqdm

In [2]:
# Fix the seeds of the stdlib and NumPy global random generators.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Load the function-word list: the first whitespace-separated token of every
# line is the word. Blank lines (e.g. a trailing newline at the end of the
# file) are skipped, because `"".split()[0]` would raise IndexError.
with open("function_words/oshea.txt", encoding="utf-8") as f:
    function_words = [line.split()[0] for line in f if line.strip()]

In [4]:
# Author identifiers. Each value is also used as the stem of a responses CSV
# file name and as the label stored in the "author" column.
HUMAN = "human"

GEMINI_10 = "gemini-1.0-pro"
GEMINI_15 = "gemini-1.5-pro-latest"

CLAUDE_SONNET = "claude-3-sonnet-20240229"
CLAUDE_OPUS = "claude-3-opus-20240229"

GPT_35 = "gpt-3.5-turbo-0125"
GPT_40 = "gpt-4-turbo-2024-04-09"

LLMS = [GEMINI_10, GEMINI_15, CLAUDE_SONNET, CLAUDE_OPUS, GPT_35, GPT_40]
AUTHORS = [HUMAN, *LLMS]

# Dataset identifiers (directory names of the two corpora).
REDDIT = "reddit"
HEWLETT = "hewlett"
DATASETS = [REDDIT, HEWLETT]

# Every unordered pair of distinct authors, enumerated in AUTHORS order.
PAIRS = [
    (first, second)
    for position, first in enumerate(AUTHORS)
    for second in AUTHORS[position + 1 :]
]

In [5]:
# Read one responses CSV per (dataset, author), tag every row with the dataset
# and author it came from, and stack everything into a single frame.
frames = [
    pd.read_csv(f"{dataset}/responses/{author}.csv").assign(
        dataset=dataset, author=author
    )
    for dataset in DATASETS
    for author in AUTHORS
]
df = pd.concat(frames, ignore_index=True)

In [6]:
class ZScoreTransformer(BaseEstimator, TransformerMixin):
    """Map raw texts to z-scored relative frequencies of function words.

    Texts are tokenized with ``word_tokenize`` and counted by a
    ``TfidfVectorizer`` configured as a plain term counter (``use_idf=False``,
    ``norm=None``). A word's relative frequency in a text is its count divided
    by the text's total count of in-vocabulary tokens. Each function-word
    column is then standardized by a ``StandardScaler`` fitted on the corpus
    passed to ``fit``.

    Parameters
    ----------
    function_words : list of str
        Candidate function words. Only those that occur in the fitted
        vocabulary are used; output columns follow the order given here.
    """

    def __init__(self, function_words):
        self.function_words = function_words
        self.vectorizer = TfidfVectorizer(
            use_idf=False, norm=None, tokenizer=word_tokenize, token_pattern=None
        )
        self.scaler = StandardScaler()

    def _function_word_relative_freqs(self, word_counts):
        """Return a dense (n_texts, n_used_words) array of relative frequencies.

        ``word_counts`` is the sparse count matrix produced by the vectorizer.
        Only the function-word columns are densified, so memory scales with
        the number of function words rather than with the vocabulary size.
        """
        totals = np.asarray(word_counts.sum(axis=1), dtype=float).reshape(-1, 1)
        # A text without any in-vocabulary token would give 0/0 = NaN; its
        # counts are all zero, so dividing by 1 yields all-zero frequencies.
        totals[totals == 0] = 1.0
        counts = word_counts[:, self.used_function_words_indices].toarray()
        return counts / totals

    def fit(self, X, y=None):
        """Learn the vocabulary and the per-word mean/std of relative frequencies.

        ``y`` is ignored; it is accepted for scikit-learn API compatibility.
        Returns ``self``.
        """
        word_counts = self.vectorizer.fit_transform(X)
        vocabulary = self.vectorizer.vocabulary_

        # Keep only the function words that actually occur in the corpus,
        # together with their column positions in the count matrix.
        self.used_function_words = [
            word for word in self.function_words if word in vocabulary
        ]
        self.used_function_words_indices = [
            vocabulary[word] for word in self.used_function_words
        ]

        # Standardization is per column, so fitting the scaler on the
        # function-word columns alone gives the same z-scores as fitting it
        # on the full vocabulary and selecting those columns afterwards.
        self.scaler.fit(self._function_word_relative_freqs(word_counts))
        return self

    def transform(self, X):
        """Return the (n_texts, n_used_words) z-score matrix for the texts in ``X``."""
        word_counts = self.vectorizer.transform(X)
        relative_freqs = self._function_word_relative_freqs(word_counts)
        return self.scaler.transform(relative_freqs)

    def get_used_function_words(self):
        """Return the function words found in the fitted vocabulary, in output-column order."""
        return self.used_function_words

# Visualization

In [7]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# For every author pair, project the function-word z-scores of a sample of
# responses to 2-D (once with PCA, once with t-SNE) and save a scatter plot.
for reducer, reducer_name in [(PCA, "PCA"), (TSNE, "t-SNE")]:
    for author1, author2 in PAIRS:
        filename = f"figures/pairwise/{reducer_name}/{author1}_{author2}.png"
        # Figures are cached on disk: an existing file is never regenerated.
        if os.path.exists(filename):
            continue
        # savefig does not create missing directories.
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        # Up to 10 responses per (dataset, author, prompt). The min() keeps a
        # group with fewer than 10 rows from raising inside sample().
        df_sampled = (
            df[df["author"].isin([author1, author2])]
            .groupby(["dataset", "author", "prompt_id"])
            .apply(lambda x: x.sample(min(len(x), 10)), include_groups=False)
            .reset_index(drop=False)
        )

        z_scores_transformer = ZScoreTransformer(function_words)
        z_scores = z_scores_transformer.fit_transform(df_sampled["text"])

        dim_reducer = reducer(n_components=2)
        z_scores_reduced = dim_reducer.fit_transform(z_scores)

        df_reduced = pd.DataFrame(
            z_scores_reduced, columns=[f"{reducer_name} 1", f"{reducer_name} 2"]
        )
        # Attach labels by position so the result does not depend on the two
        # frames happening to share the same index.
        df_reduced["author"] = df_sampled["author"].to_numpy()

        plt.figure(figsize=(8, 6))
        sns.scatterplot(
            data=df_reduced, x=f"{reducer_name} 1", y=f"{reducer_name} 2", hue="author"
        )
        plt.title(f"{reducer_name} over function word embeddings")
        plt.legend(title="Author")
        plt.savefig(filename)
        plt.close()

# Classification pipeline

In [23]:
def classify(
    df,
    authors,
    function_words,
    return_df_coefs=False,
    *,
    n_test_prompts=2,
    n_val_prompts=2,
):
    """Train and evaluate a logistic-regression author classifier on function words.

    Whole prompts are held out for testing (``n_test_prompts`` per dataset,
    drawn with ``np.random.choice``), so no prompt appears in both train and
    test. Hyper-parameters are chosen by ``GridSearchCV`` with prompt-grouped
    folds: each fold validates on ``n_val_prompts`` consecutive
    (dataset, prompt) groups and trains on the remaining ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "author", "dataset", "prompt_id" and "text".
    authors : list of str
        Authors to keep; rows by other authors are dropped.
    function_words : list of str
        Candidate function words passed to ``ZScoreTransformer``.
    return_df_coefs : bool, default False
        If true, also return the coefficients of the best model.
    n_test_prompts : int, default 2
        Number of prompts per dataset held out as the test set.
    n_val_prompts : int, default 2
        Number of (dataset, prompt) groups used for validation in each fold.

    Returns
    -------
    df_test : pandas.DataFrame
        Copy of the held-out rows with an added "author_pred" column.
    df_coefs : pandas.DataFrame
        Only when ``return_df_coefs`` is true. Columns "word" and "coef"; for
        a binary model "coef" is the signed coefficient, for a multiclass
        model it is the mean absolute coefficient over classes.

    Raises
    ------
    ValueError
        If too few prompt groups remain for training to build a fold.
    """
    df = df[df["author"].isin(authors)]

    # Train-test split by prompt. The candidate prompt ids are read from the
    # data instead of assuming ids 0..7; when the ids are exactly 0..7 this
    # draws the same prompts as np.random.choice(8, ...) for a given RNG state.
    test_mask = pd.Series(False, index=df.index)
    for dataset in DATASETS:
        in_dataset = df["dataset"] == dataset
        prompt_ids = np.sort(df.loc[in_dataset, "prompt_id"].unique())
        test_prompts = np.random.choice(prompt_ids, n_test_prompts, replace=False)
        test_mask |= in_dataset & df["prompt_id"].isin(test_prompts)

    df_test = df[test_mask].copy()
    df_train = df[~test_mask].copy()

    # Positional row indices of df_train, one array per (dataset, prompt_id).
    prompt_groups = list(df_train.groupby(["dataset", "prompt_id"]).indices.values())
    if len(prompt_groups) <= n_val_prompts:
        raise ValueError(
            "Not enough training prompts to build cross-validation folds: "
            f"{len(prompt_groups)} group(s), {n_val_prompts} needed for validation."
        )

    # Prompt-grouped cross-validation: slide a window of n_val_prompts groups
    # over the list; the window validates and every other group trains.
    cv_iterable = []
    for start in range(0, len(prompt_groups), n_val_prompts):
        stop = start + n_val_prompts
        val_indices = np.concatenate(prompt_groups[start:stop])
        train_indices = np.concatenate(prompt_groups[stop:] + prompt_groups[:start])
        cv_iterable.append((train_indices, val_indices))

    # Features: z-scored relative frequencies of the function words. The
    # transformer is fitted on the training rows only.
    z_scores_transformer = ZScoreTransformer(function_words)
    z_scores_train = z_scores_transformer.fit_transform(df_train["text"])
    z_scores_test = z_scores_transformer.transform(df_test["text"])

    param_grid = {
        "C": [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0],
        "solver": ["lbfgs", "liblinear"],
    }

    model = GridSearchCV(
        LogisticRegression(max_iter=1000),
        param_grid=param_grid,
        cv=cv_iterable,
        n_jobs=-1,
        refit=True,
    )

    # Model training and prediction
    model.fit(z_scores_train, df_train["author"])
    df_test["author_pred"] = model.predict(z_scores_test)

    if not return_df_coefs:
        return df_test

    # coef_ has one row for a binary model and one row per class otherwise.
    # Branch on the fitted shape rather than on len(authors), so an author
    # missing from the data cannot collapse the result to a single scalar.
    coef_matrix = model.best_estimator_.coef_
    if coef_matrix.shape[0] == 1:
        coefs = coef_matrix[0]
    else:
        coefs = np.abs(coef_matrix).mean(axis=0)

    used_function_words = z_scores_transformer.get_used_function_words()
    df_coefs = pd.DataFrame({"word": used_function_words, "coef": coefs})
    return df_test, df_coefs

# Pairwise

In [9]:
import os
import pandas as pd
from itertools import product
from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from json import dumps

# Number of repeated classification trials run per experiment.
N_TRIALS = 2


def process_pair(pair_trial):
    """Run one binary classification trial for a pair of authors.

    ``pair_trial`` is ``((author1, author2), trial)``. Returns a dict with the
    two authors, the trial number, the test accuracy, and the test frame and
    coefficient frame, each serialized as a JSON string.
    """
    pair, trial = pair_trial
    author1, author2 = pair

    df_test, df_coefs = classify(
        df=df,
        authors=[author1, author2],
        function_words=function_words,
        return_df_coefs=True,
    )

    is_correct = df_test["author"] == df_test["author_pred"]

    return {
        "author1": author1,
        "author2": author2,
        "trial": trial,
        "accuracy": sum(is_correct) / len(df_test),
        "df_test": dumps(df_test.to_json(orient="records")),
        "df_coefs": dumps(df_coefs.to_json(orient="records")),
    }


results_filename = "results/pairwise_classification.csv"
if os.path.exists(results_filename):
    # Cached results are reused; delete the file to recompute.
    df_results = pd.read_csv(results_filename)
else:
    # Create the output directory before the expensive run, so the final
    # to_csv cannot fail on a missing directory after all the work is done.
    os.makedirs(os.path.dirname(results_filename), exist_ok=True)

    # One task per (author pair, trial), run in parallel using joblib.
    pairs_trials = list(product(PAIRS, range(N_TRIALS)))
    results = Parallel(n_jobs=-1)(
        delayed(process_pair)(pair_trial) for pair_trial in tqdm(pairs_trials)
    )

    df_results = pd.DataFrame(results)
    df_results.to_csv(results_filename, index=False)

100%|██████████| 42/42 [01:01<00:00,  1.46s/it] 


## Heatmap

In [10]:
# Mean accuracy over trials for each (author1, author2) pair, laid out as an
# AUTHORS x AUTHORS grid; cells for pairs that were not evaluated stay NaN.
mean_accuracy = df_results.groupby(["author1", "author2"])["accuracy"].mean()
pivot = mean_accuracy.unstack("author2").reindex(index=AUTHORS, columns=AUTHORS)

classification_figname = "figures/pairwise/classification/heatmap.png"
if not os.path.exists(classification_figname):
    ax = sns.heatmap(pivot, annot=True, vmin=0.5, vmax=1.0)
    ax.set_title("Binary Logistic Regression Classification Accuracies")
    ax.set_ylabel("Author 1")
    ax.set_xlabel("Author 2")
    plt.savefig(classification_figname, bbox_inches="tight")
    plt.close()

## Confusion matrices

In [11]:
from json import loads
from io import StringIO
from sklearn.metrics import confusion_matrix

# Row-normalized 2x2 confusion matrix for every (pair, trial) result.
df_cms = []
for row in df_results.itertuples(index=False):
    author1, author2 = row.author1, row.author2
    df_test = pd.read_json(StringIO(loads(row.df_test)))

    # Pin the label order to [author1, author2]. By default confusion_matrix
    # sorts the labels, which would not match the axis labels used below for
    # pairs whose names are not in alphabetical order.
    cm = confusion_matrix(
        df_test["author"],
        df_test["author_pred"],
        labels=[author1, author2],
        normalize="true",
    )

    df_cms.append(
        {
            "author1": author1,
            "author2": author2,
            "zero_zero": cm[0, 0],
            "zero_one": cm[0, 1],
            "one_zero": cm[1, 0],
            "one_one": cm[1, 1],
        }
    )

# Average the matrices over trials. The result must be assigned back:
# otherwise the loop below plots each trial separately and every figure is
# overwritten by the last trial of its pair.
df_cms = (
    pd.DataFrame(df_cms)
    .groupby(["author1", "author2"], sort=False)
    .mean()
    .reset_index()
)

confusion_matrices_dir = "figures/pairwise/classification/confusion_matrices"
# savefig does not create missing directories.
os.makedirs(confusion_matrices_dir, exist_ok=True)

for _, (author1, author2, z_z, z_o, o_z, o_o) in df_cms.iterrows():
    authors = [author1, author2]
    cm = np.array([[z_z, z_o], [o_z, o_o]])
    cm = pd.DataFrame(cm, index=authors, columns=authors)

    classification_figname = f"{confusion_matrices_dir}/{author1}_{author2}.png"
    sns.heatmap(cm, annot=True, vmin=0, vmax=1)
    plt.title("Average Confusion Matrix for Binary Logistic Regression")
    plt.ylabel("True author")
    plt.xlabel("Predicted author")
    plt.savefig(classification_figname, bbox_inches="tight")
    plt.close()

## Average word coefficients

In [12]:
from json import loads
from io import StringIO
from sklearn.metrics import confusion_matrix
import numpy as np


# One wide row (one column per word) of coefficients for every stored result.
per_trial_coefs = []
for i, (author1, author2, trial, accuracy, df_test, df_coefs) in df_results.iterrows():
    wide = (
        pd.read_json(StringIO(loads(df_coefs)))
        .assign(i=i)
        .pivot(index="i", columns="word", values="coef")
    )
    wide["author1"] = author1
    wide["author2"] = author2
    per_trial_coefs.append(wide)

# Words absent from a trial get coefficient 0. Average over the trials of
# each author pair, take the magnitude, and drop the pair labels.
df_coefs_list = (
    pd.concat(per_trial_coefs)
    .replace(np.nan, 0)
    .groupby(["author1", "author2"])
    .mean()
    .abs()
    .reset_index(drop=True)
)

### Average coefficient magnitude per word


In [21]:
# Average each word's per-pair coefficient magnitude over all author pairs
# and show the 20 words with the largest average.
mean_coef_per_word = df_coefs_list.mean()
mean_coef_per_word.sort_values(ascending=False).head(20)

word
in             0.254228
here           0.238299
an             0.229257
and            0.225788
a              0.214595
of             0.214263
as             0.209407
to             0.204849
is             0.173815
moreover       0.158036
this           0.156179
that           0.155051
with           0.153098
not            0.150558
furthermore    0.150114
which          0.148979
despite        0.148544
how            0.142244
it             0.140667
when           0.139862
dtype: float64

### Average coefficient rank per word


In [22]:
# Rank the words within each author pair (rank 1 = largest coefficient), then
# average every word's rank over the pairs and show the 20 best-ranked words.
per_pair_ranks = df_coefs_list.rank(axis=1, ascending=False)
per_pair_ranks.mean(axis=0).sort_values().head(20)

word
and            39.333333
of             42.809524
a              42.857143
as             43.095238
in             49.476190
this           50.285714
with           51.238095
an             53.952381
is             55.809524
despite        56.952381
here           57.000000
to             60.333333
it             60.666667
that           64.523810
through        65.523810
not            65.619048
which          65.619048
moreover       65.952381
furthermore    69.761905
he             71.190476
dtype: float64

# Multi-class

In [24]:
import os
import pandas as pd
from itertools import product
from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from json import dumps


mc_results_filename = "results/multiclass_classification.csv"

if os.path.exists(mc_results_filename):
    # Cached results are reused; delete the file to recompute.
    # NOTE(review): a cache written by the earlier version of this cell has
    # "author1"/"author2" columns instead of "trial"/"accuracy".
    df_mc_results = pd.read_csv(mc_results_filename)
else:
    # Create the output directory before the expensive run, so the final
    # to_csv cannot fail on a missing directory after all the work is done.
    os.makedirs(os.path.dirname(mc_results_filename), exist_ok=True)

    mc_results = []
    for trial in tqdm(range(N_TRIALS)):
        df_test, df_coefs = classify(
            df=df,
            authors=AUTHORS,
            function_words=function_words,
            return_df_coefs=True,
        )

        accuracy = (df_test["author"] == df_test["author_pred"]).mean()

        # A multiclass run covers all authors at once, so there is no author
        # pair to record. The previous "author1"/"author2" entries were stale
        # globals left over from earlier loops in the notebook.
        mc_results.append(
            {
                "trial": trial,
                "accuracy": accuracy,
                "df_test": dumps(df_test.to_json(orient="records")),
                "df_coefs": dumps(df_coefs.to_json(orient="records")),
            }
        )

    df_mc_results = pd.DataFrame(mc_results)
    df_mc_results.to_csv(mc_results_filename, index=False)

100%|██████████| 2/2 [02:27<00:00, 73.69s/it]


# Word frequencies

In [None]:
# Heatmap of how often each LLM uses third-person pronouns relative to humans.
pronoun_figname = "figures/word_frequencies/pronouns_heatmap.png"
if not os.path.exists(pronoun_figname):
    pronouns = [
        "he",
        "him",
        "his",
        "himself",
        "she",
        "her",
        "herself",
        "they",
        "them",
        "their",
        "themselves",
    ]

    # The texts of each author and their whitespace token counts do not
    # depend on the pronoun, so compute them once instead of once per word.
    texts_by_author = {
        author: df.loc[df["author"] == author, "text"] for author in AUTHORS
    }
    n_tokens_by_author = {
        author: texts.str.split().str.len()
        for author, texts in texts_by_author.items()
    }

    all_word_frequencies = []
    for word in pronouns:
        # Match the pronoun as a whole word, case-insensitively. A plain
        # substring count would also count "he" inside "the"/"she"/"they",
        # "his" inside "this", "her" inside "there", and would miss
        # capitalized occurrences at the start of a sentence.
        pattern = rf"(?i)\b{word}\b"

        word_frequencies = {}
        for author in AUTHORS:
            word_counts = texts_by_author[author].str.count(pattern)
            word_freq = word_counts / n_tokens_by_author[author]
            word_frequencies[author] = word_freq.mean()
        all_word_frequencies.append(word_frequencies)

    # Express every author's mean frequency as a ratio to the human one.
    frequencies_df = pd.DataFrame(all_word_frequencies, index=pronouns)
    frequencies_df = frequencies_df.div(frequencies_df[HUMAN], axis=0)

    # savefig does not create missing directories.
    os.makedirs(os.path.dirname(pronoun_figname), exist_ok=True)

    sns.heatmap(
        frequencies_df.drop(columns=HUMAN),
        annot=True,
        fmt=".3f",
        # Log color scale centered on a ratio of 1 (0.25x to 4x).
        norm=LogNorm(vmin=0.25, vmax=4),
        cbar_kws={"format": "%.2g", "ticks": [0.25, 0.5, 1, 2, 4]},
        cmap=sns.color_palette("vlag_r", as_cmap=True),
    )
    plt.title("LLM/Human Pronoun Frequency Ratios")
    plt.xlabel("Author")
    plt.ylabel("Word")
    plt.savefig(pronoun_figname, bbox_inches="tight")
    plt.close()