# TODO

- remove 'your name'
- tokenize
- cross validation on clustering? 
- using longest responses?

In [1]:
HUMAN = "human"

GEMINI_10 = "gemini-1.0-pro"
GEMINI_15 = "gemini-1.5-pro-latest"

CLAUDE_SONNET = "claude-3-sonnet-20240229"
CLAUDE_OPUS = "claude-3-opus-20240229"

GPT_35 = "gpt-3.5-turbo-0125"
GPT_40 = "gpt-4-turbo-2024-04-09"


authors = [HUMAN, GEMINI_10, GEMINI_15, CLAUDE_SONNET, CLAUDE_OPUS, GPT_35, GPT_40]

# Function words

In [2]:
from nltk.tokenize import word_tokenize
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import pandas as pd


with open("function_words/oshea.txt") as f:
    lines = f.readlines()
    function_words = [line.split()[0] for line in lines]


def delta(df, function_words, variant):
    # Get word counts of all words, using the NLTK tokenizer for all responses
    vectorizer = TfidfVectorizer(
        use_idf=False, norm=None, tokenizer=word_tokenize, token_pattern=None
    )
    word_counts = vectorizer.fit_transform(df["text"]).toarray()

    # Convert to relative frequencies for each document
    relative_freqs = word_counts / word_counts.sum(axis=1, keepdims=True)

    # Keep the relative frquencies for only the function words
    function_word_indices = []
    oov = []
    for word in function_words:
        if word in vectorizer.vocabulary_:
            function_word_indices.append(vectorizer.vocabulary_[word])
        else:
            oov.append(word)

    fw_relative_freqs = relative_freqs[:, function_word_indices]
    print(f"{len(oov)} words not in corpus: {oov}")

    # Convert to z-scores
    scaler = StandardScaler()
    z_scores = scaler.fit_transform(fw_relative_freqs)

    # Optionally return the z-scores
    if variant == "z_scores":
        return z_scores

    # Return the distance matrix
    if variant == "burrows":
        return cdist(z_scores, z_scores, metric="cityblock")
    if variant == "cosine":
        return cdist(z_scores, z_scores, metric="cosine")

    raise ValueError(f"Unknown variant: {variant}")

# Classification

In [3]:
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


def classify(df, function_words):
    # Train-test split
    prompt_ids = df["prompt_id"].unique().tolist()
    test_prompts = random.sample(prompt_ids, 2)
    df_test = df[df["prompt_id"].isin(test_prompts)].reset_index(drop=True)
    df_train = df[~df["prompt_id"].isin(test_prompts)].reset_index(drop=True)

    # Get word counts of all words, using the NLTK tokenizer for all responses
    vectorizer = TfidfVectorizer(
        use_idf=False, norm=None, tokenizer=word_tokenize, token_pattern=None
    )
    word_counts_train = vectorizer.fit_transform(df_train["text"]).toarray()
    word_counts_test = vectorizer.transform(df_test["text"]).toarray()

    # Convert to relative frequencies for each document
    rel_freqs_train = word_counts_train / word_counts_train.sum(axis=1, keepdims=True)
    rel_freqs_test = word_counts_test / word_counts_test.sum(axis=1, keepdims=True)

    # Keep the relative frequencies for only the function words
    function_word_indices = []
    for word in function_words:
        if word in vectorizer.vocabulary_:
            function_word_indices.append(vectorizer.vocabulary_[word])

    fw_relative_freqs_train = rel_freqs_train[:, function_word_indices]
    fw_relative_freqs_test = rel_freqs_test[:, function_word_indices]

    # Convert to z-scores
    scaler = StandardScaler()
    z_scores_train = scaler.fit_transform(fw_relative_freqs_train)
    z_scores_test = scaler.transform(fw_relative_freqs_test)

    # Set up cross-validation
    train_indices_by_prompt = (
        df_train.groupby("prompt_id")
        .apply(lambda x: x.index, include_groups=False)
        .tolist()
    )
    cv_iterable = []
    for _ in range(3):
        train_indices_by_prompt = (
            train_indices_by_prompt[2:] + train_indices_by_prompt[:2]
        )
        val_indices = np.concatenate(train_indices_by_prompt[:2])
        train_indices = np.concatenate(train_indices_by_prompt[2:])
        cv_iterable.append((train_indices, val_indices))

    # Verify that no prompt in each validation set is the corresponding train set
    for train_indices, val_indices in cv_iterable:
        assert (
            len(
                set(df_train.loc[train_indices, "prompt_id"].unique()).intersection(
                    set(df_train.loc[val_indices, "prompt_id"].unique())
                )
            )
            == 0
        )

    # Train the model using grid search
    model = GridSearchCV(
        LogisticRegression(max_iter=1000),
        param_grid={
            "C": [0.001, 0.003, 0.01, 0.03, 0.1],
            "solver": ["liblinear", "lbfgs"],
        },
        cv=cv_iterable,
        n_jobs=-1,
        refit=True,
    )

    model.fit(z_scores_train, df_train["author"])
    y_pred = model.predict(z_scores_test)

    return df_test["author"], y_pred

In [4]:
from sklearn.metrics import classification_report


for dataset in ["reddit", "hewlett"]:
    dfs = []
    for author in authors:
        df = pd.read_csv(f"{dataset}_responses/{author}.csv")
        df["author"] = author
        dfs.append(df)
    df = pd.concat([df for df in dfs], ignore_index=True)

    y_true, y_pred = classify(df)
    report = classification_report(y_true, y_pred, zero_division=0)
    print(f"Dataset: {dataset}")
    print(report)

TypeError: classify() missing 1 required positional argument: 'function_words'

# Clustering

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

df_prompt_0 = df[df["prompt_id"] == 0]
z_scores_prompt_0 = delta(df_prompt_0, function_words, variant="z_scores")

tsne = TSNE(random_state=42)
X_tsne = tsne.fit_transform(z_scores_prompt_0)
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    hue=df_prompt_0["author"],
    palette="Set1",
)
plt.title("t-SNE Visualization of prompt 0")
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.show()


z_scores = delta(df, function_words, variant="z_scores")

tsne = TSNE(random_state=42)
X_tsne = tsne.fit_transform(z_scores)
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    hue=df["author"],
    palette="Set1",
)
plt.title("t-SNE Visualization of all prompts")
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.show()