# Preliminaries

In [61]:
import os
import random
from io import StringIO

import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import statsmodels.api as sm
from statsmodels.formula.api import ols

from scipy.spatial.distance import cdist

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LogNorm
from matplotlib.ticker import LogLocator

from tqdm import tqdm

In [62]:
# Pin both global random generators (NumPy's and the standard library's) to a
# fixed seed so that the random sampling done later in the notebook is repeatable.
np.random.seed(0)
random.seed(0)
# pd.set_option("display.max_rows", 300)

In [63]:
# Load the function-word list: the first whitespace-separated token of each
# line in the O'Shea file is taken as the word.
with open("function_words/oshea.txt", encoding="utf-8") as f:
    lines = f.readlines()
    # Blank lines (e.g. a trailing one) have no first token, so they are
    # skipped instead of raising IndexError.
    fw = [line.split()[0] for line in lines if line.strip()]

In [64]:
# --- Author identifiers ------------------------------------------------------
# These strings are also used to build file names (responses CSVs, figures).
HUMAN = "human"

GEMINI_10 = "gemini-1.0-pro"
GEMINI_15 = "gemini-1.5-pro-latest"

CLAUDE_SONNET = "claude-3-sonnet-20240229"
CLAUDE_OPUS = "claude-3-opus-20240229"

GPT_35 = "gpt-3.5-turbo-0125"
GPT_40 = "gpt-4-turbo-2024-04-09"

LLMS = [GEMINI_10, GEMINI_15, CLAUDE_SONNET, CLAUDE_OPUS, GPT_35, GPT_40]
AUTHORS = [HUMAN, *LLMS]

# --- Dataset identifiers -----------------------------------------------------
REDDIT = "reddit"
HEWLETT = "hewlett"
DATASETS = [REDDIT, HEWLETT]

# Every unordered pair of distinct authors, listed in AUTHORS order: each author
# is paired only with the authors that come after it.
PAIRS = [
    (first, second)
    for position, first in enumerate(AUTHORS)
    for second in AUTHORS[position + 1 :]
]

In [65]:
# Read one responses CSV per (dataset, author), tag every row with the dataset
# and author it came from, and stack all of them into a single frame with a
# fresh 0..n-1 index.
frames = []
for dataset in DATASETS:
    for author in AUTHORS:
        responses = pd.read_csv(f"{dataset}/responses/{author}.csv")
        frames.append(responses.assign(dataset=dataset, author=author))
df = pd.concat(frames, ignore_index=True)

In [66]:
class ZScoreTransformer(BaseEstimator, TransformerMixin):
    """Turn raw texts into z-scored relative frequencies of function words.

    Texts are tokenized with ``word_tokenize`` and counted by the internal
    vectorizer. Each count is divided by the text's total count to get a
    relative frequency, the frequencies are standardized with a
    ``StandardScaler`` fitted on the training corpus, and only the columns
    belonging to function words found in the fitted vocabulary are returned.

    Parameters
    ----------
    fw : iterable of str
        Candidate function words. Words absent from the fitted vocabulary are
        dropped; the survivors are available from ``get_used_fw()``.
    """

    def __init__(self, fw):
        self.fw = fw
        self.vectorizer = TfidfVectorizer(
            use_idf=False, norm=None, tokenizer=word_tokenize, token_pattern=None
        )
        self.scaler = StandardScaler()

    @staticmethod
    def _relative_freqs(word_counts):
        """Divide each row of ``word_counts`` by its row total."""
        totals = word_counts.sum(axis=1, keepdims=True)
        # A text with no counted tokens has a zero total. Dividing by 1 instead
        # keeps its row at all zeros rather than producing NaN (0 / 0), which
        # would otherwise propagate into the scaler output.
        totals[totals == 0] = 1
        return word_counts / totals

    def fit(self, X, y=None):
        """Fit the vectorizer and the scaler on the corpus ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        # Fit the vectorizer to the corpus
        word_counts = self.vectorizer.fit_transform(X).toarray()

        # Save the function words and their indices if they are in the vocabulary
        vocabulary = self.vectorizer.vocabulary_
        self.used_fw = [word for word in self.fw if word in vocabulary]
        self.used_fw_indices = [vocabulary[word] for word in self.used_fw]

        # Fit z-score scaler to the relative frequencies
        self.scaler.fit(self._relative_freqs(word_counts))
        return self

    def transform(self, X):
        """Return the z-scores of the used function words for each text in ``X``.

        The result has one row per text and one column per entry of
        ``self.used_fw``, in that order.
        """
        # Transform the corpus into word counts
        word_counts = self.vectorizer.transform(X).toarray()

        # Calculate the relative frequencies
        relative_freqs = self._relative_freqs(word_counts)

        # Calculate the z-scores, keeping only the function-word columns
        z_scores = self.scaler.transform(relative_freqs)[:, self.used_fw_indices]

        return z_scores

    def get_used_fw(self):
        """Return the function words that were found in the fitted vocabulary."""
        return self.used_fw

# Visualization

In [67]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# For every dimensionality reducer and every pair of authors, project the
# function-word z-scores of a sample of their texts to 2-D and save a scatter
# plot coloured by author.
for reducer, reducer_name in [(PCA, "PCA"), (TSNE, "t-SNE")]:
    for author1, author2 in PAIRS:
        filename = f"figures/pairwise/{reducer_name}/{author1}_{author2}.png"
        # Figures already on disk are kept, so re-running the cell only fills
        # in the missing ones.
        if os.path.exists(filename):
            continue

        # savefig does not create missing directories, so make sure the
        # per-reducer output folder exists before doing any work.
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        # Sample up to 10 texts per (dataset, author, prompt). Capping at the
        # group size avoids the ValueError that sample(10) raises for a group
        # with fewer than 10 rows.
        df_sampled = (
            df[(df["author"] == author1) | (df["author"] == author2)]
            .groupby(["dataset", "author", "prompt_id"])
            .apply(lambda x: x.sample(min(len(x), 10)), include_groups=False)
            .reset_index(drop=False)
        )

        z_scores_transformer = ZScoreTransformer(fw)
        z_scores = z_scores_transformer.fit_transform(df_sampled["text"])

        dim_reducer = reducer(n_components=2)
        z_scores_reduced = dim_reducer.fit_transform(z_scores)

        df_reduced = pd.DataFrame(
            z_scores_reduced, columns=[f"{reducer_name} 1", f"{reducer_name} 2"]
        )
        # df_sampled was given a fresh 0..n-1 index by reset_index, which
        # matches df_reduced's default index, so the authors line up row by row.
        df_reduced["author"] = df_sampled["author"]

        plt.figure(figsize=(8, 6))
        sns.scatterplot(
            data=df_reduced, x=f"{reducer_name} 1", y=f"{reducer_name} 2", hue="author"
        )
        plt.title(f"{reducer_name} over function word embeddings")
        plt.legend(title="Author")
        plt.savefig(filename)
        # Close the figure so that figures do not accumulate across iterations.
        plt.close()

In [68]:
# Display the combined responses frame (one row per text, tagged with dataset and author).
df

Unnamed: 0,prompt_id,text,dataset,author
0,0,Doh'tlec reviewed his message . It did n't nee...,reddit,human
1,0,We spoke to them in images . It was the only w...,reddit,human
2,0,`` The Savage within… '' those three words see...,reddit,human
3,0,It was a last resort . <newline> <newline> <ne...,reddit,human
4,0,You ’ ll give us your ships . We ’ ll win your...,reddit,human
...,...,...,...,...
11195,7,**The Time Laughter Saved a Road Trip**\n\nA f...,hewlett,gpt-4-turbo-2024-04-09
11196,7,**Title: The Laughter-Filled Road Trip**\n\nDu...,hewlett,gpt-4-turbo-2024-04-09
11197,7,**The Laughter-Filled Road Trip**\n\nSeveral y...,hewlett,gpt-4-turbo-2024-04-09
11198,7,"As college roommates, my friend Maya and I oft...",hewlett,gpt-4-turbo-2024-04-09


# Classification

In [69]:
def classify(df, function_words, authors):
    df = df[df["author"].isin(authors)]

    # Train-test split: 12/4, 2 prompts from each dataset in test set
    df_train = (
        df.groupby(["dataset"])
        .apply(
            lambda x: x.groupby("prompt_id").sample(2).reset_index(),
            include_groups=False,
        )
        .reset_index()
    )

    return df_train


# Try the function on the human vs. Gemini 1.0 pair; the notebook displays the returned frame.
classify(df, fw, [HUMAN, GEMINI_10])

Unnamed: 0,dataset,level_1,index,prompt_id,text,author
0,hewlett,0,5618,0,"Dear Local Newspaper, more and more people are...",human
1,hewlett,1,6470,0,"Dear Editor,\n\nAs the digital landscape conti...",gemini-1.0-pro
2,hewlett,2,6513,1,"In the realm of libraries, the delicate balanc...",gemini-1.0-pro
3,hewlett,3,5750,1,I believe libraries should have all types of b...,human
4,hewlett,4,5849,2,"The cyclist in the essay, Do Not Exceed Posted...",human
5,hewlett,5,6636,2,"In ""Rough Road Ahead,"" the setting plays a piv...",gemini-1.0-pro
6,hewlett,6,6730,3,The author concludes the story with this parag...,gemini-1.0-pro
7,hewlett,7,6722,3,The author concludes the story with the last p...,gemini-1.0-pro
8,hewlett,8,6815,4,"In his memoir, fashion designer Narciso Rodrig...",gemini-1.0-pro
9,hewlett,9,6827,4,"The memoir creates a mood of warmth, love, and...",gemini-1.0-pro


In [70]:

    # NOTE(review): this cell is the interior of a function whose `def` line is
    # missing from this export (running it as-is ends in the NameError recorded
    # for the cell). `function_words`, `test_prompts`, `df_test_include_text`
    # and `return_df_coefs` are read without being assigned here, so they are
    # presumably parameters of that function -- confirm before reusing this.

    # Hold out two randomly chosen prompts as the test set unless the caller
    # supplied `test_prompts`.
    prompt_ids = df["prompt_id"].unique().tolist()
    if not test_prompts:
        test_prompts = random.sample(prompt_ids, 2)

    # Split by prompt. Both halves get a fresh 0..n-1 index; the folds built
    # below rely on df_train's index being row positions.
    df_test = df[df["prompt_id"].isin(test_prompts)].reset_index(drop=True)
    df_train = df[~df["prompt_id"].isin(test_prompts)].reset_index(drop=True)

    # Use the ZScoreTransformer to get the z-scores
    # (fitted on the training texts only, then applied to the test texts)
    z_scores_transformer = ZScoreTransformer(function_words)
    z_scores_train = z_scores_transformer.fit_transform(df_train["text"])
    z_scores_test = z_scores_transformer.transform(df_test["text"])

    # Set up cross-validation
    # One entry per training prompt, holding the row indices of that prompt.
    train_indices_by_prompt = (
        df_train.groupby("prompt_id")
        .apply(lambda x: x.index, include_groups=False)
        .tolist()
    )
    # Build three folds: each pass rotates the per-prompt list left by two,
    # validates on the two prompts now at the front and trains on the rest.
    # NOTE(review): the validation sets are disjoint only if there are at least
    # six training prompts -- presumably the case; confirm against the data.
    cv_iterable = []
    for _ in range(3):
        train_indices_by_prompt = (
            train_indices_by_prompt[2:] + train_indices_by_prompt[:2]
        )
        val_indices = np.concatenate(train_indices_by_prompt[:2])
        train_indices = np.concatenate(train_indices_by_prompt[2:])
        cv_iterable.append((train_indices, val_indices))

    # Train the model using grid search
    # (over regularization strength C and solver, scored on the folds above)
    model = GridSearchCV(
        LogisticRegression(max_iter=1000),
        param_grid={
            "C": [0.001, 0.003, 0.01, 0.03, 0.1],
            "solver": ["liblinear", "lbfgs"],
        },
        cv=cv_iterable,
        n_jobs=-1,
        refit=True,
    )

    model.fit(z_scores_train, df_train["author"])
    y_pred = model.predict(z_scores_test)

    # Attach the predictions to the test rows; the raw text is dropped unless
    # the caller asked to keep it.
    df_test["author_pred"] = y_pred
    if not df_test_include_text:
        df_test = df_test.drop(columns=["text"])

    # Optionally also return one coefficient per used function word from the
    # best model found by the grid search.
    # NOTE(review): squeeze() gives a 1-D vector only when coef_ has a single
    # row -- presumably the two-author case; confirm for more than two authors.
    if return_df_coefs:
        coefs = model.best_estimator_.coef_.squeeze()
        used_fw = z_scores_transformer.get_used_fw()
        df_coefs = pd.DataFrame({"word": used_fw, "coef": coefs})
        return df_test, df_coefs

    else:
        return df_test

NameError: name 'test_prompts' is not defined