# Data prep

In [35]:
import pandas as pd
import random

# Seed Python's global RNG. get_tvt_split() shuffles prompt groups with
# random.shuffle, so this seed only pins the splits when the notebook is run
# top to bottom from a fresh kernel.
random.seed(42)

# Load the dataset and show how many rows carry each "author" label.
df = pd.read_csv("data.csv")
print(df["author"].value_counts())

author
reddit    800
bard      800
gpt       800
0.8       200
1.4       200
1.3       200
1.2       200
1.1       200
1.0       200
0.9       200
0.7       200
0.6       200
0.5       200
0.4       200
0.3       200
0.2       200
0.1       200
1.5       200
Name: count, dtype: int64


# Classification

## Pipeline

In [36]:
import numpy as np
import ast
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler


def _prompt_groups(frame: pd.DataFrame) -> list:
    """Return one list of row labels per distinct ``prompt`` value.

    Groups come back in the order ``DataFrame.groupby("prompt")`` iterates
    them (sorted by prompt with the default ``sort=True``).
    """
    return [rows.index.tolist() for _, rows in frame.groupby("prompt")]


def get_tvt_split(
    df_cur: pd.DataFrame,
    n_test_groups: int = 2,
    n_val_groups: int = 2,
    n_folds: int = 4,
    rng=None,
):
    """Split ``df_cur`` into train/test by prompt and build grouped CV folds.

    All rows sharing a ``prompt`` value stay together: a prompt is either
    entirely in the test set, or entirely in one side of each CV fold.

    Parameters
    ----------
    df_cur : pd.DataFrame
        Frame with a ``prompt`` column. Its index labels are used to select
        the test rows.
    n_test_groups : int, default 2
        Number of prompt groups held out as the test set.
    n_val_groups : int, default 2
        Number of prompt groups used for validation in each fold.
    n_folds : int, default 4
        Number of (train, validation) folds to generate. Folds are produced by
        rotating the training groups by ``n_val_groups`` each time, so they
        only partition the training set when
        ``n_folds * n_val_groups`` equals the number of training groups;
        otherwise validation folds repeat or leave groups uncovered.
    rng : random.Random, optional
        Source of randomness for the shuffle. When ``None`` the global
        ``random`` module is used, so results depend on the global seed and
        on how many shuffles happened before this call.

    Returns
    -------
    df_cur_train : pd.DataFrame
        Training rows with the index reset to ``0..n-1``.
    df_cur_test : pd.DataFrame
        Held-out rows, keeping their original index labels.
    cv_iterable : list of (np.ndarray, np.ndarray)
        ``(train_indices, val_indices)`` pairs indexing into ``df_cur_train``.

    Raises
    ------
    ValueError
        If any count is < 1, or there are too few distinct prompts to leave at
        least one training group in every fold.
    """
    if min(n_test_groups, n_val_groups, n_folds) < 1:
        raise ValueError("n_test_groups, n_val_groups and n_folds must all be >= 1")

    grouped_indices = _prompt_groups(df_cur)

    # Each fold needs n_val_groups for validation plus at least one group left
    # to train on, after the test groups have been removed.
    min_groups = n_test_groups + n_val_groups + 1
    if len(grouped_indices) < min_groups:
        raise ValueError(
            f"need at least {min_groups} distinct prompts, got {len(grouped_indices)}"
        )

    shuffler = random if rng is None else rng
    shuffler.shuffle(grouped_indices)

    test_indices = np.concatenate(grouped_indices[:n_test_groups])

    df_cur_test = df_cur.loc[test_indices]
    # Resetting the index makes the labels equal to positions, which is what
    # the CV index arrays below rely on.
    df_cur_train = df_cur.drop(test_indices).reset_index(drop=True)

    # Re-group after the reset so the fold indices refer to the new labels.
    train_groups = _prompt_groups(df_cur_train)

    cv_iterable = []
    for _ in range(n_folds):
        # Rotate first, then take the leading groups as this fold's validation set.
        train_groups = train_groups[n_val_groups:] + train_groups[:n_val_groups]
        val_indices = np.concatenate(train_groups[:n_val_groups])
        train_indices = np.concatenate(train_groups[n_val_groups:])
        cv_iterable.append((train_indices, val_indices))

    return df_cur_train, df_cur_test, cv_iterable


def get_best_params(X_train, y_train, cv_iterable):
    """Grid-search logistic-regression hyperparameters over the given folds.

    Tries every combination of regularisation strength ``C`` and ``solver``
    on the supplied ``(train_indices, val_indices)`` folds and returns the
    winning combination as a dict.
    """
    search = GridSearchCV(
        estimator=LogisticRegression(max_iter=1000, random_state=42),
        param_grid={
            "C": [0.1, 0.3, 1, 3, 10],
            "solver": ["liblinear", "lbfgs"],
        },
        cv=cv_iterable,
        n_jobs=-1,  # use all available cores
    )
    search.fit(X_train, y_train)
    return search.best_params_


def expand_df(df, vector_names):
    """Expand vector-valued columns into one wide numeric frame.

    Each column named in ``vector_names`` holds one vector per row, either as
    a Python-literal string (e.g. ``"[0.1, 0.2]"``) or as an already-parsed
    sequence. Every vector is spread across columns ``0..k-1`` and the
    per-column blocks are concatenated side by side in the order given, so
    column labels repeat across blocks.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame; its index is preserved in the result.
    vector_names : list of str
        Names of the vector columns to expand.

    Returns
    -------
    pd.DataFrame
        One row per row of ``df``; rows with shorter vectors are padded with NaN.

    Raises
    ------
    ValueError
        If ``vector_names`` is empty (nothing to concatenate), or a string
        cell is not a valid Python literal.
    """

    def _parse(value):
        # Only strings need parsing; literal_eval rejects list/array inputs.
        return ast.literal_eval(value) if isinstance(value, str) else value

    blocks = []
    for column in vector_names:
        vectors = df[column].map(_parse)
        # Build the block in one call instead of one pd.Series per row.
        blocks.append(pd.DataFrame(vectors.tolist(), index=df.index))
    return pd.concat(blocks, axis=1)


def classify(authors, vector_names, n_trials=20):
    """Run repeated prompt-grouped train/test trials of a logistic-regression classifier.

    For each trial the rows of the module-level ``df`` whose ``author`` is in
    ``authors`` are split by ``get_tvt_split``, the requested vector columns
    are expanded into features, the features are standardised, the
    hyperparameters are chosen by ``get_best_params``, and a final model is
    fit on the training split and scored on the held-out split.

    Parameters
    ----------
    authors : list
        Values of the ``author`` column to keep (these are the class labels).
    vector_names : list of str
        Vector columns to use as features (see ``expand_df``).
    n_trials : int, default 20
        Number of independent split/fit/evaluate repetitions.

    Returns
    -------
    accuracies : list of float
        Test accuracy for each trial.
    f1s : list of float
        Weighted F1 score on the test split for each trial.

    Raises
    ------
    ValueError
        If no row of ``df`` has an ``author`` in ``authors``.

    Notes
    -----
    The train/test split is shuffled with the global ``random`` module inside
    ``get_tvt_split``; the loop's ``random_state`` only seeds the final
    ``LogisticRegression``. Results therefore depend on the global seed and on
    how many splits were drawn before this call.
    """
    # The author filter does not depend on the trial, so build it once.
    df_authors = df[df["author"].isin(authors)].reset_index(drop=True)
    if df_authors.empty:
        raise ValueError(f"no rows found for authors {authors!r}")

    accuracies = []
    f1s = []

    for random_state in range(n_trials):
        df_train, df_test, cv_iterable = get_tvt_split(df_authors)

        X_train = expand_df(df_train, vector_names)
        y_train = df_train["author"]
        X_test = expand_df(df_test, vector_names)
        y_test = df_test["author"]

        # Fit the scaler on the training split only, then reuse it for test.
        # NOTE(review): the scaler is fit before the grid search, so each CV
        # validation fold has already influenced the scaling statistics.
        standardizer = StandardScaler()
        X_train = standardizer.fit_transform(X_train)
        X_test = standardizer.transform(X_test)

        best_params = get_best_params(X_train, y_train, cv_iterable)
        model = LogisticRegression(
            **best_params,
            max_iter=1000,
            random_state=random_state,
        )
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        accuracies.append(accuracy_score(y_test, y_pred))
        f1s.append(f1_score(y_test, y_pred, average="weighted"))

    return accuracies, f1s

## Reddit, GPT

In [37]:
# Feature sets to compare. Each inner list names the vector columns that
# classify() passes to expand_df(), which concatenates them into one feature matrix.
list_of_vector_names = [
    ["common"],
    ["function"],
    ["common", "function"],
    ["bert"],
    ["common", "function", "bert"],
]

### GPT temp = Ben's chosen (default?)

In [38]:
# Run the Reddit-vs-GPT experiment once per feature set and report the mean and
# standard deviation (np.std default, i.e. population std) of accuracy and
# weighted F1 across the trials.
for vector_names in list_of_vector_names:
    accuracies, f1s = classify(["reddit", "gpt"], vector_names)
    print(
        f"{vector_names} accuracy: {np.mean(accuracies):.3f} ± {np.std(accuracies):.3f}, f1: {np.mean(f1s):.3f} ± {np.std(f1s):.3f}"
    )

['common'] accuracy: 0.798 ± 0.104, f1: 0.786 ± 0.117
['function'] accuracy: 0.797 ± 0.141, f1: 0.791 ± 0.146
['common', 'function'] accuracy: 0.773 ± 0.176, f1: 0.765 ± 0.183
['bert'] accuracy: 0.963 ± 0.026, f1: 0.963 ± 0.026
['common', 'function', 'bert'] accuracy: 0.979 ± 0.029, f1: 0.979 ± 0.030


### GPT temp $\in \{0.1, 0.2, \ldots, 1.5\}$

## Reddit, Bard

## Bard, GPT (temp = Ben's chosen)

# Trinary Classification (Reddit, GPT, Bard)