# Preliminaries

In [1]:
import os
import random
from io import StringIO

import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import statsmodels.api as sm
from statsmodels.formula.api import ols

from scipy.spatial.distance import cdist

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LogNorm
from matplotlib.ticker import LogLocator

from tqdm import tqdm

In [2]:
# Seed the stdlib and NumPy global RNGs. NumPy's global RNG is what
# np.random.choice in classify() draws from further down.
random.seed(0)
np.random.seed(0)

In [3]:
# Load the function-word list: the first whitespace-separated token of each
# line is the word. Blank (or whitespace-only) lines are skipped, because
# "".split()[0] would raise an IndexError.
with open("function_words/oshea.txt") as f:
    lines = f.readlines()
    function_words = [line.split()[0] for line in lines if line.strip()]

In [4]:
# Author identifiers. They are also used as the CSV file stems when the
# responses are loaded.
HUMAN = "human"
GEMINI_10, GEMINI_15 = "gemini-1.0-pro", "gemini-1.5-pro-latest"
CLAUDE_SONNET, CLAUDE_OPUS = "claude-3-sonnet-20240229", "claude-3-opus-20240229"
GPT_35, GPT_40 = "gpt-3.5-turbo-0125", "gpt-4-turbo-2024-04-09"

LLMS = [GEMINI_10, GEMINI_15, CLAUDE_SONNET, CLAUDE_OPUS, GPT_35, GPT_40]
AUTHORS = [HUMAN, *LLMS]

# Dataset identifiers.
REDDIT, HEWLETT = "reddit", "hewlett"
DATASETS = [REDDIT, HEWLETT]

# Every unordered pair of distinct authors, in AUTHORS order (first < second).
PAIRS = [
    (first, second)
    for position, first in enumerate(AUTHORS)
    for second in AUTHORS[position + 1 :]
]

In [5]:
# Read one CSV per (dataset, author), tag each row with where it came from,
# and stack everything into a single frame with a fresh 0..n-1 index.
response_frames = [
    pd.read_csv(f"{dataset}/responses/{author}.csv").assign(
        dataset=dataset, author=author
    )
    for dataset in DATASETS
    for author in AUTHORS
]
df = pd.concat(response_frames, ignore_index=True)

In [6]:
class ZScoreTransformer(BaseEstimator, TransformerMixin):
    """Turn raw texts into per-document function-word features.

    Each document is tokenized with ``word_tokenize`` and counted, the counts
    are divided by the document's total token count to get relative
    frequencies, and (optionally) every column is standardized with the
    mean/variance learned in ``fit``. Only the columns belonging to the
    function words that occur in the fitted vocabulary are returned.

    Parameters
    ----------
    function_words : list of str
        Candidate function words; the ones found in the fitted vocabulary
        become the output features, in this order.
    """

    def __init__(self, function_words):
        self.function_words = function_words
        # use_idf=False and norm=None keep the term frequencies unweighted and
        # unnormalized, so the vectorizer output is used as plain word counts.
        self.vectorizer = TfidfVectorizer(
            use_idf=False, norm=None, tokenizer=word_tokenize, token_pattern=None
        )
        self.scaler = StandardScaler()

    @staticmethod
    def _relative_frequencies(word_counts):
        """Divide each row of counts by its total, mapping empty rows to zeros."""
        totals = word_counts.sum(axis=1, keepdims=True)
        # A document without any token has a total of 0; dividing by it would
        # yield a row of NaNs (0 / 0). Dividing by 1 keeps that row at all
        # zeros and leaves every non-empty document unchanged.
        totals[totals == 0] = 1
        return word_counts / totals

    def fit(self, X, y=None):
        """Learn the vocabulary, the usable function words and the scaler.

        Parameters
        ----------
        X : iterable of str
            Training documents.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # Fit the vectorizer to the corpus
        word_counts = self.vectorizer.fit_transform(X).toarray()

        # Save the function words and their indices if they are in the vocabulary
        self.used_function_words = [
            word for word in self.function_words if word in self.vectorizer.vocabulary_
        ]
        self.used_function_words_indices = [
            self.vectorizer.vocabulary_[word] for word in self.used_function_words
        ]

        # Calculate the relative frequencies
        relative_freqs = self._relative_frequencies(word_counts)

        # Fit the z-score scaler (on every vocabulary column; the function-word
        # columns are selected in transform)
        self.scaler.fit(relative_freqs)

        return self

    def transform(self, X, z_score=True):
        """Compute function-word features for ``X``.

        Parameters
        ----------
        X : iterable of str
            Documents to transform.
        z_score : bool, default True
            If True, return standardized relative frequencies; if False,
            return the raw relative frequencies.

        Returns
        -------
        numpy.ndarray
            One row per document, one column per entry of
            ``get_used_function_words()``.
        """
        # Transform the corpus into word counts
        word_counts = self.vectorizer.transform(X).toarray()

        # Calculate the relative frequencies
        relative_freqs = self._relative_frequencies(word_counts)

        if not z_score:
            return relative_freqs[:, self.used_function_words_indices]

        # Calculate the z-scores
        return self.scaler.transform(relative_freqs)[
            :, self.used_function_words_indices
        ]

    def get_used_function_words(self):
        """Return the function words found in the fitted vocabulary (column order)."""
        return self.used_function_words

# PCA and t-SNE

In [7]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

author_groups = PAIRS + [LLMS, AUTHORS]

for reducer, reducer_name in [(PCA, "PCA"), (TSNE, "t-SNE")]:
    for authors in author_groups:
        # Name the figure after the group itself rather than after its length,
        # so the naming stays correct if the number of LLMs ever changes.
        if authors is LLMS:
            figure_stem = "all_llms"
        elif authors is AUTHORS:
            figure_stem = "all_authors"
        else:
            figure_stem = f"{authors[0]}_{authors[1]}"
        filename = f"figures/{reducer_name}/{figure_stem}.png"

        # Existing figures act as a cache: skip work that is already done.
        if os.path.exists(filename):
            continue

        # savefig does not create missing directories.
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        # Draw 10 responses per (dataset, author, prompt); reset_index brings
        # the group keys back as regular columns.
        df_sampled = (
            df[(df["author"].isin(authors))]
            .groupby(["dataset", "author", "prompt_id"])
            .apply(lambda x: x.sample(10), include_groups=False)
            .reset_index(drop=False)
        )

        z_scores_transformer = ZScoreTransformer(function_words)
        z_scores = z_scores_transformer.fit_transform(df_sampled["text"])

        dim_reducer = reducer(n_components=2)
        z_scores_reduced = dim_reducer.fit_transform(z_scores)

        df_reduced = pd.DataFrame(
            z_scores_reduced, columns=[f"{reducer_name} 1", f"{reducer_name} 2"]
        )
        # Both frames carry a 0..n-1 index, so this assignment lines up row by row.
        df_reduced["author"] = df_sampled["author"]

        plt.figure(figsize=(8, 6))
        sns.scatterplot(
            data=df_reduced, x=f"{reducer_name} 1", y=f"{reducer_name} 2", hue="author"
        )
        plt.title(f"{reducer_name} over function word embeddings")
        plt.legend(title="Author")
        plt.savefig(filename)
        plt.close()

# Classification pipeline

In [8]:
def classify(df, authors, function_words, return_df_coefs=False):
    """Train a function-word logistic regression and predict held-out prompts.

    Two randomly chosen prompts per dataset form the test set; the remaining
    prompts are used for training, with the regularization strength and the
    solver selected by a prompt-grouped 6-fold cross-validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Responses with at least the columns ``author``, ``dataset``,
        ``prompt_id`` and ``text``.
    authors : sequence of str
        Authors to keep; two authors give a binary problem, more a multiclass one.
    function_words : list of str
        Function words passed to ``ZScoreTransformer``.
    return_df_coefs : bool, default False
        If True, also return the per-word model coefficients.

    Returns
    -------
    pandas.DataFrame or (pandas.DataFrame, pandas.DataFrame)
        The test rows with an added ``author_pred`` column and, when
        ``return_df_coefs`` is True, a frame with columns ``word`` and ``coef``
        (for more than two authors, ``coef`` is the mean absolute coefficient
        over the classes).
    """
    df = df[df["author"].isin(authors)]

    # Train-test split: hold out 2 prompts from each dataset (a 12/4 split
    # when every dataset has 8 prompts). The candidates are the prompt IDs
    # actually present, not a hard-coded range(8); for IDs 0..7 this draws
    # exactly the same prompts as np.random.choice(8, 2, replace=False).
    n_test_prompts_per_dataset = 2
    test_indices = pd.Series(False, index=df.index)
    for dataset in DATASETS:
        in_dataset = df["dataset"] == dataset
        prompt_ids = np.sort(df.loc[in_dataset, "prompt_id"].unique())
        if len(prompt_ids) == 0:
            # No rows of this dataset survive the author filter.
            continue
        test_prompts = np.random.choice(
            prompt_ids, n_test_prompts_per_dataset, replace=False
        )
        test_indices |= in_dataset & df["prompt_id"].isin(test_prompts)

    df_test = df[test_indices].copy()
    df_train = df[~test_indices].copy()

    # Set up 6-fold cross-validation. `.indices` holds positions within
    # df_train, which is what GridSearchCV needs for the feature matrix below.
    train_indices_by_prompt = list(
        df_train.groupby(["dataset", "prompt_id"]).indices.values()
    )

    n_folds = 6
    prompts_per_fold = 2
    cv_iterable = []
    for _ in range(n_folds):
        val_indices = np.concatenate(train_indices_by_prompt[:prompts_per_fold])
        train_indices = np.concatenate(train_indices_by_prompt[prompts_per_fold:])
        cv_iterable.append((train_indices, val_indices))
        # Cycle indices list so the next fold validates on the next prompts
        train_indices_by_prompt = (
            train_indices_by_prompt[prompts_per_fold:]
            + train_indices_by_prompt[:prompts_per_fold]
        )

    # Use the ZScoreTransformer to get the z-scores
    z_scores_transformer = ZScoreTransformer(function_words)
    z_scores_train = z_scores_transformer.fit_transform(df_train["text"])
    z_scores_test = z_scores_transformer.transform(df_test["text"])

    param_grid = {
        "C": [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0],
        "solver": ["lbfgs", "liblinear"],
    }

    model = GridSearchCV(
        LogisticRegression(max_iter=1000),
        param_grid=param_grid,
        cv=cv_iterable,
        n_jobs=-1,
        refit=True,
    )

    # Model training and prediction
    model.fit(z_scores_train, df_train["author"])
    df_test["author_pred"] = model.predict(z_scores_test)

    if not return_df_coefs:
        return df_test

    # Logistic regression model coefficients
    coefs = model.best_estimator_.coef_.squeeze()

    # For multiclass, return the average of the absolute values of the coefficients
    if len(authors) > 2:
        coefs = np.mean(np.abs(coefs), axis=0)

    used_function_words = z_scores_transformer.get_used_function_words()
    df_coefs = pd.DataFrame({"word": used_function_words, "coef": coefs})
    return df_test, df_coefs

# Pairwise

In [9]:
import os
import pandas as pd
from itertools import product
from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from json import dumps

# Number of times each classification experiment is repeated.
N_TRIALS = 10


def process_pair(pair_trial):
    """Run one binary classification trial for a pair of authors.

    Parameters
    ----------
    pair_trial : tuple
        ``((author1, author2), trial)`` as produced by
        ``product(PAIRS, range(N_TRIALS))``.

    Returns
    -------
    dict
        ``author1``, ``author2``, ``trial``, the test ``accuracy`` and the
        JSON-encoded ``df_test`` / ``df_coefs`` frames returned by ``classify``.
    """
    import zlib

    (author1, author2), trial = pair_trial

    # joblib may run this function in a separate worker process whose NumPy
    # global RNG is not the one seeded at the top of the notebook, which makes
    # the random train/test split differ between runs. Seed it from a stable
    # hash of the task (crc32 is the same in every process, unlike hash()),
    # and put the previous state back in case this runs in the main process.
    seed = zlib.crc32(f"{author1}|{author2}|{trial}".encode("utf-8"))
    previous_rng_state = np.random.get_state()
    np.random.seed(seed)
    try:
        df_test, df_coefs = classify(
            df=df,
            authors=[author1, author2],
            function_words=function_words,
            return_df_coefs=True,
        )
    finally:
        np.random.set_state(previous_rng_state)

    accuracy = sum(df_test["author"] == df_test["author_pred"]) / len(df_test)
    # The frames are stored as JSON strings so they fit into one CSV cell each.
    df_test_json = dumps(df_test.to_json(orient="records"))
    df_coefs_json = dumps(df_coefs.to_json(orient="records"))

    df_results = {
        "author1": author1,
        "author2": author2,
        "trial": trial,
        "accuracy": accuracy,
        "df_test": df_test_json,
        "df_coefs": df_coefs_json,
    }
    return df_results


# Cached results are reused; the experiments only run when the CSV is missing.
results_filename = "classification_results/pairwise_classification.csv"
if os.path.exists(results_filename):
    df_results = pd.read_csv(results_filename)
else:
    # Create the output directory before the long computation, so a missing
    # directory cannot make to_csv fail after all trials have finished.
    os.makedirs(os.path.dirname(results_filename), exist_ok=True)

    # Run in parallel using joblib
    pairs_trials = list(product(PAIRS, range(N_TRIALS)))
    results = Parallel(n_jobs=-1)(
        delayed(process_pair)(pair_trial) for pair_trial in tqdm(pairs_trials)
    )

    df_results = pd.DataFrame(results)
    df_results.to_csv(results_filename, index=False)

## Heatmap

In [10]:
# Mean accuracy over the trials of every pair, laid out as an
# author1 x author2 table in AUTHORS order (pairs that were not run stay NaN).
mean_pair_accuracy = df_results.groupby(["author1", "author2"])["accuracy"].mean()
bin_pivot = mean_pair_accuracy.unstack("author2").reindex(
    index=AUTHORS, columns=AUTHORS
)

classification_figname = "figures/pairwise/classification/heatmap.png"
if not os.path.exists(classification_figname):
    heatmap_ax = sns.heatmap(bin_pivot, annot=True, vmin=0.5, vmax=1.0)
    heatmap_ax.set_title("Binary Logistic Regression Classification Accuracies")
    heatmap_ax.set_ylabel("Author 1")
    heatmap_ax.set_xlabel("Author 2")
    plt.savefig(classification_figname, bbox_inches="tight")
    plt.close()

## Confusion matrices

In [11]:
from json import loads
from io import StringIO
from sklearn.metrics import confusion_matrix

# One record per (pair, trial): the 2x2 confusion matrix, normalized over the
# true class, with author1 as class 0 and author2 as class 1.
cm_records = []
for _, result in df_results.iterrows():
    author1, author2 = result["author1"], result["author2"]
    # The stored frame is a JSON string that was itself JSON-encoded.
    df_test = pd.read_json(StringIO(loads(result["df_test"])))

    cm = confusion_matrix(
        df_test["author"],
        df_test["author_pred"],
        normalize="true",
        labels=[author1, author2],
    )

    cm_records.append(
        {
            "author1": author1,
            "author2": author2,
            "zero_zero": cm[0, 0],
            "zero_one": cm[0, 1],
            "one_zero": cm[1, 0],
            "one_one": cm[1, 1],
        }
    )

# Average the four cells over the trials of each pair.
df_cms = (
    pd.DataFrame(cm_records).groupby(["author1", "author2"]).mean().reset_index()
)

for cm_row in df_cms.itertuples(index=False):
    author1, author2 = cm_row.author1, cm_row.author2
    classification_figname = (
        f"figures/pairwise/classification/confusion_matrices/{author1}_{author2}.png"
    )
    # Skip figures that were already rendered.
    if os.path.exists(classification_figname):
        continue

    authors = [author1, author2]
    cm = pd.DataFrame(
        [
            [cm_row.zero_zero, cm_row.zero_one],
            [cm_row.one_zero, cm_row.one_one],
        ],
        index=authors,
        columns=authors,
    )

    sns.heatmap(cm, annot=True, vmin=0, vmax=1)
    plt.title("Average Confusion Matrix for Binary Logistic Regression")
    plt.ylabel("True author")
    plt.xlabel("Predicted author")
    plt.savefig(classification_figname, bbox_inches="tight")
    plt.close()

### Humans and LLMs confused


In [12]:
# Keep the experiments whose first author is the human and index them by the
# LLM (author2); one_zero is the share of author2 texts predicted as author1.
is_human_pair = df_cms["author1"] == HUMAN
human_experiments = (
    df_cms.loc[is_human_pair].drop(columns="author1").groupby("author2").mean()
)

print("LLM experiments ranked by the frequency of confusing the LLM for the human:")
human_experiments["one_zero"].sort_values(ascending=False)

LLM experiments ranked by the frequency of confusing the LLM for the human:


author2
claude-3-sonnet-20240229    0.25250
claude-3-opus-20240229      0.21050
gpt-3.5-turbo-0125          0.14800
gemini-1.0-pro              0.08400
gpt-4-turbo-2024-04-09      0.07775
gemini-1.5-pro-latest       0.05475
Name: one_zero, dtype: float64

In [13]:
# zero_one is the confusion-matrix cell [0, 1]: true author1 (here the human)
# predicted as author2 (the LLM).
print("LLM experiments ranked by the frequency of confusing the human for the LLM:")
human_experiments["zero_one"].sort_values(ascending=False)

LLM experiments ranked by the frequency of confusing the human for the LLM:


author2
claude-3-opus-20240229      0.09900
claude-3-sonnet-20240229    0.08775
gpt-4-turbo-2024-04-09      0.06875
gemini-1.5-pro-latest       0.06475
gpt-3.5-turbo-0125          0.03375
gemini-1.0-pro              0.01250
Name: zero_one, dtype: float64

## Average word coefficients

In [14]:
from json import loads
from io import StringIO
from sklearn.metrics import confusion_matrix
import numpy as np


# Turn every trial's (word, coef) table into a single wide row, one column per
# word, tagged with the pair it belongs to.
wide_coef_rows = []
for i, result in df_results.iterrows():
    wide_row = pd.read_json(StringIO(loads(result["df_coefs"])))
    wide_row["i"] = i
    wide_row = wide_row.pivot(index="i", columns="word", values="coef")
    wide_row["author1"] = result["author1"]
    wide_row["author2"] = result["author2"]
    wide_coef_rows.append(wide_row)

# Words missing from a trial count as coefficient 0; then take the mean over
# the trials of each pair and its absolute value.
df_coefs_list = (
    pd.concat(wide_coef_rows)
    .replace(np.nan, 0)
    .groupby(["author1", "author2"])
    .mean()
    .abs()
    .reset_index(drop=True)
)

### Average coefficient per word


In [15]:
# Mean over all author pairs of the per-pair |mean coefficient|; show the 20 largest.
df_coefs_list.mean().sort_values(ascending=False).head(20)

word
in             0.256229
here           0.246231
and            0.237781
an             0.226464
of             0.214341
a              0.207266
is             0.204516
as             0.202178
to             0.192250
this           0.168080
despite        0.162987
which          0.162064
with           0.158224
that           0.149843
moreover       0.149747
was            0.148755
not            0.144959
may            0.141715
furthermore    0.141356
when           0.140737
dtype: float64

# Multi-class

In [16]:
import os
import pandas as pd
from itertools import product
from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from json import dumps


# Cached results are reused; the experiments only run when the CSV is missing.
mc_results_filename = "classification_results/multiclass_classification.csv"

if os.path.exists(mc_results_filename):
    df_mc_results = pd.read_csv(mc_results_filename)
else:
    # Create the output directory before the long computation, so a missing
    # directory cannot make to_csv fail after all trials have finished.
    os.makedirs(os.path.dirname(mc_results_filename), exist_ok=True)

    mc_results = []
    for _ in tqdm(range(N_TRIALS)):
        df_test, df_coefs = classify(
            df=df,
            authors=AUTHORS,
            function_words=function_words,
            return_df_coefs=True,
        )

        # The frames are stored as JSON strings so they fit into one CSV cell each.
        df_test_json = dumps(df_test.to_json(orient="records"))
        df_coefs_json = dumps(df_coefs.to_json(orient="records"))

        mc_results.append(
            {
                "df_test": df_test_json,
                "df_coefs": df_coefs_json,
            }
        )

    df_mc_results = pd.DataFrame(mc_results)
    df_mc_results.to_csv(mc_results_filename, index=False)

## Confusion matrix

In [17]:
# One row-normalized confusion matrix per trial, with rows and columns in
# AUTHORS order.
cms = []
for df_test_json in df_mc_results["df_test"]:
    trial_test = pd.read_json(StringIO(loads(df_test_json)))
    cms.append(
        confusion_matrix(
            trial_test["author"],
            trial_test["author_pred"],
            normalize="true",
            labels=AUTHORS,
        )
    )
# Element-wise mean over the trials.
multi_cm = np.mean(cms, axis=0)

classification_figname = "figures/multiclass/classification/confusion_matrix.png"
if not os.path.exists(classification_figname):
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        multi_cm,
        annot=True,
        xticklabels=AUTHORS,
        yticklabels=AUTHORS,
        fmt=".2f",
        ax=ax,
    )
    ax.set_title("Multiclass Classification Confusion Matrix")
    ax.set_ylabel("True author")
    ax.set_xlabel("Predicted author")
    fig.savefig(classification_figname, bbox_inches="tight")
    plt.close(fig)

## Average word coefficients

In [18]:
from json import loads
from io import StringIO
from sklearn.metrics import confusion_matrix
import numpy as np


# Turn every trial's (word, coef) table into a single wide row, one column per
# word. The row label is the trial's own position, not a leftover loop
# variable from an earlier cell.
df_coefs_list = []
for trial_index, df_coefs_json in enumerate(df_mc_results["df_coefs"]):
    df_coefs = pd.read_json(StringIO(loads(df_coefs_json)))
    df_coefs["i"] = trial_index
    df_coefs = df_coefs.pivot(index="i", columns="word", values="coef")
    df_coefs_list.append(df_coefs)

# Words missing from a trial count as coefficient 0.
df_coefs_list = pd.concat(df_coefs_list).replace(np.nan, 0)

### Average coefficient per word

In [19]:
# The 20 words with the largest coefficient averaged over the multiclass trials.
highest_coef_mc = df_coefs_list.mean().sort_values(ascending=False).head(20)
highest_coef_mc

word
here       0.268697
a          0.248612
in         0.244271
and        0.238381
an         0.227587
of         0.224629
is         0.209168
as         0.205005
to         0.201880
this       0.182059
which      0.181930
the        0.174612
that       0.170505
their      0.169423
it         0.164873
may        0.161354
how        0.160643
despite    0.159006
when       0.158213
not        0.154064
dtype: float64

# Word frequencies

In [20]:
# Word list 1: third-person pronouns, grouped by he / she / they forms.
pronoun_figname = "figures/word_frequencies/pronouns_heatmap.png"
pronouns = (
    ["he", "him", "his", "himself"]
    + ["she", "her", "herself"]
    + ["they", "them", "their", "themselves"]
)

# Word list 2: the words with the highest coefficients in the multiclass
# classification.
high_coef_figname = "figures/word_frequencies/high_coef_words_heatmap.png"
highest_coef_mc_words = highest_coef_mc.index.tolist()

In [21]:
import re

word_list_tuples = [
    ("Pronouns", pronouns, pronoun_figname),
    ("High LR Coefficient Words", highest_coef_mc_words, high_coef_figname),
]

for title, words, figname in word_list_tuples:
    # Skip figures that were already rendered.
    if os.path.exists(figname):
        continue

    # savefig does not create missing directories.
    os.makedirs(os.path.dirname(figname), exist_ok=True)

    # Per-author texts and whitespace-token totals do not depend on the word,
    # so compute them once instead of once per word.
    texts_by_author = {
        author: df.loc[df["author"] == author, "text"] for author in AUTHORS
    }
    token_totals_by_author = {
        author: texts.str.split().apply(len)
        for author, texts in texts_by_author.items()
    }

    all_word_frequencies = []
    for word in words:
        # Count whole words only, ignoring case. A bare substring count would
        # find "he" inside "the" / "she" / "they", and "a" inside almost every
        # word, and would miss capitalized occurrences such as "He".
        word_pattern = rf"\b{re.escape(word)}\b"

        word_frequencies = {}
        for author in AUTHORS:
            word_counts = texts_by_author[author].str.count(
                word_pattern, flags=re.IGNORECASE
            )
            word_freq = word_counts / token_totals_by_author[author]
            word_frequencies[author] = word_freq.mean()
        all_word_frequencies.append(word_frequencies)

    frequencies_df = pd.DataFrame(all_word_frequencies, index=words)
    # Express every author's frequency as a ratio to the human frequency.
    frequencies_df = frequencies_df.div(frequencies_df[HUMAN], axis=0)
    plt.figure(figsize=(6, 8))
    sns.heatmap(
        frequencies_df.drop(columns=HUMAN),
        annot=True,
        fmt=".3f",
        # Log color scale centered on 1, so x2 and x0.5 are equally far from it.
        norm=LogNorm(vmin=0.25, vmax=4),
        cbar_kws={"format": "%.2g", "ticks": [0.25, 0.5, 1, 2, 4]},
        cmap=sns.color_palette("vlag_r", as_cmap=True),
    )
    plt.title(f"LLM/Human Frequency Ratios over {title}")
    plt.xlabel("Author")
    plt.ylabel("Word")
    plt.savefig(figname, bbox_inches="tight")
    plt.close()

# Dendrograms

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

# 1. Binary classification
# The pairwise accuracy says how easily two authors are told apart, so it is
# used directly as the distance: easily separated authors are far apart and
# hard-to-separate ones are close. (1 - accuracy would do the opposite and put
# the most distinguishable authors closest together.)
# The matrix is made symmetric from the upper triangle and gets a zero
# diagonal. All of this happens on a copy, so bin_pivot itself is not modified.
binary_distance_values = bin_pivot.fillna(bin_pivot.T).to_numpy(copy=True)
np.fill_diagonal(binary_distance_values, 0)
binary_distance = pd.DataFrame(
    binary_distance_values, index=bin_pivot.index, columns=bin_pivot.columns
)

# 2. Multiclass classification
# By using the distribution of predictions in the multiclass LR as a feature vector for each author
multiclass_distance = 1 - cosine_similarity(multi_cm)

# 3. Average feature vector
# By using the average feature vector from the z-score transformer for each author
z_scores_transformer = ZScoreTransformer(function_words)
z_scores = z_scores_transformer.fit_transform(df["text"])
z_scores_distance = 1 - cosine_similarity(
    pd.DataFrame(z_scores, index=df["author"])
    .groupby("author")
    .mean()
    .reindex(AUTHORS)
    .values
)

In [23]:
from scipy.cluster.hierarchy import dendrogram


def plot_dendrogram(model, **kwargs):
    """Draw the dendrogram of a fitted hierarchical clustering model.

    Adapted from the scikit-learn example
    https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html#sphx-glr-auto-examples-cluster-plot-agglomerative-dendrogram-py

    Parameters
    ----------
    model : object
        Fitted model exposing ``children_``, ``distances_`` and ``labels_``.
    **kwargs
        Passed on unchanged to ``scipy.cluster.hierarchy.dendrogram``.
    """
    n_leaves = len(model.labels_)

    # Number of original samples below each merge. Child ids smaller than
    # n_leaves are leaves; larger ids refer to the merge at index id - n_leaves.
    subtree_sizes = np.zeros(model.children_.shape[0])
    for merge_index, children in enumerate(model.children_):
        subtree_sizes[merge_index] = sum(
            1 if child < n_leaves else subtree_sizes[child - n_leaves]
            for child in children
        )

    # SciPy linkage layout: [child a, child b, merge distance, subtree size].
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, subtree_sizes]
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)

In [24]:
from sklearn.cluster import AgglomerativeClustering

# The three precomputed author-by-author distance matrices to cluster.
distance_matrices = {
    "Binary Classification": binary_distance,
    "Multiclass Classification": multiclass_distance,
    "Average Feature Vector": z_scores_distance,
}
linkage_methods = ["single", "average", "complete"]

for title, distance_matrix in distance_matrices.items():
    title_slug = title.replace(" ", "-").lower()
    for linkage in linkage_methods:
        figname = f"figures/dendrograms/{title_slug}_{linkage}.png"
        if not os.path.exists(figname):
            # n_clusters=1 merges all the way up; compute_distances=True
            # stores the merge distances plot_dendrogram reads.
            agg = AgglomerativeClustering(
                n_clusters=1,
                metric="precomputed",
                linkage=linkage,
                compute_distances=True,
            ).fit(distance_matrix)

            plt.title("Hierarchical Clustering Dendrogram")
            plot_dendrogram(agg, labels=AUTHORS, leaf_font_size=10)
            plt.xticks(rotation=90)
            plt.savefig(figname, bbox_inches="tight")
            plt.close()

# Variance

In [26]:
# Relative function-word frequencies without z-scoring (z_score=False skips the scaler).
unnormalized = z_scores_transformer.fit(df["text"]).transform(df["text"], z_score=False)

In [42]:
# Ensure that the z-scores are the same as the StandardScaler on the unnormalized feature vectors.
# The absolute difference is compared; without abs(), an arbitrarily large
# negative difference would still pass the check.
assert np.max(np.abs(StandardScaler().fit_transform(unnormalized) - z_scores)) < 1e-10

In [69]:
unnormalized_df = pd.DataFrame(unnormalized, index=df["author"])
z_scores_df = pd.DataFrame(z_scores, index=df["author"])

df_dict = {"unnormalized": unnormalized_df, "z-scores": z_scores_df}

# For both feature sets, measure how far each author's mean feature vector
# lies from the mean over all documents, using three different distances.
for name, df_cur in df_dict.items():
    universal_centroid = df_cur.mean()
    grouped = df_cur.groupby("author").mean()
    authors = grouped.index
    centroid_offsets = grouped - universal_centroid

    # Mean absolute deviation per feature
    pw_distances = np.abs(centroid_offsets).mean(axis=1)
    print(
        f"The distance from the universal centroid of the average vector, average of element-wise ({name}):"
    )
    print(pw_distances.sort_values(ascending=False), "\n")

    # Euclidean norm of the offset
    euclidian_distances = pd.Series(
        np.linalg.norm(centroid_offsets, axis=1), index=authors
    )
    print(
        f"The Euclidean distance of the average from the universal centroid ({name}):"
    )
    print(euclidian_distances.sort_values(ascending=False), "\n")

    # Cosine distance, one author at a time
    uc = np.array(universal_centroid).reshape(1, -1)
    cosine_distances = pd.Series(
        [
            1 - cosine_similarity(np.array(row).reshape(1, -1), uc).item()
            for _, row in grouped.iterrows()
        ],
        index=authors,
    )
    print(
        f"The cosine distance between the average and the universal centroid ({name}):"
    )
    print(cosine_distances.sort_values(ascending=False), "\n")

The distance from the universal centroid of the average vector, average of element-wise (unnormalized):
author
human                       0.000714
gpt-3.5-turbo-0125          0.000363
gemini-1.5-pro-latest       0.000315
gemini-1.0-pro              0.000313
gpt-4-turbo-2024-04-09      0.000254
claude-3-sonnet-20240229    0.000199
claude-3-opus-20240229      0.000187
dtype: float64 

The Euclidean distance of the average from the universal centroid (unnormalized):
author
human                       0.028542
gpt-3.5-turbo-0125          0.015391
gemini-1.5-pro-latest       0.012507
gemini-1.0-pro              0.011647
gpt-4-turbo-2024-04-09      0.008786
claude-3-opus-20240229      0.007653
claude-3-sonnet-20240229    0.006135
dtype: float64 

The cosine distance between the average and the universal centroid (unnormalized):
author
human                       0.052774
gemini-1.5-pro-latest       0.008931
gpt-3.5-turbo-0125          0.006746
gemini-1.0-pro              0.005987
gpt-4-turb

In [61]:
# Scratch check of cosine_similarity on two hand-written 4-dimensional vectors.
cosine_similarity([[0, 1, 2, 10]], [[1, 25, 1, 2]])

array([[0.18259485]])