In [this paper](https://arxiv.org/pdf/2306.04723), Tulchinskii et al. explore using persistent homology in detection of AI-generated texts.

Specifically, they use a feature called the "persistent homology dimension" of the text.
I hope to reproduce their work in this notebook, and apply it to my dataset.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
tqdm.pandas()

import kagglehub
# Download latest version
path = kagglehub.dataset_download("thedrcat/daigt-v2-train-dataset")
print("Path to dataset files:", path)
filename = 'train_v2_drcat_02.csv'

from random import sample
from statistics import mean, median
from math import log, inf
from scipy.stats import linregress

import spacy
from spacy.language import Language
from spacy.lang.en import English
from spacy.tokens import Doc, DocBin

import configparser

import ripser
import persim

pipeline_filepath = 'pipeline.spacy'
essays_filepath = 'parsed_essays.spacy'

Path to dataset files: /Users/kaveh/.cache/kagglehub/datasets/thedrcat/daigt-v2-train-dataset/versions/2


In [None]:
#Reconstruct essay dataset from disk
doc_bin = DocBin().from_disk('parsed_essays.spacy')
data_filepath = os.path.join(path, filename)
essays = pd.read_csv(data_filepath)

nlp = spacy.load('en_core_web_md', exclude=['ner', 'parser']) # would like to use the saved pipeline, ideally. but this seems to work
essays['text_spacy'] = pd.Series(list(doc_bin.get_docs(nlp.vocab)))



Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven,text_spacy
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False,"(Phones, Modern, humans, today, phone, phone, ..."
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False,"(essay, explain, drivers, able, use, electroni..."
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False,"(Driving, use, cellular, devices, Today, socie..."
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False,"(Phones, Driving, Drivers, able, use, phones, ..."
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False,"(Cell, Phone, Operation, Driving, ability, sta..."


In [4]:
# PHD algorithm:
## Turn document into point cloud using a transformer encoder (specifically RoBERTa-base.)
## Choose k = 8 different subsample sizes, evenly spaced between n-hat = 40 and the size of the document. 
## Pick J = 7 samples of each size.
## For each sample S_

def PHD(doc, n_hat, k, J):
    # n_hat = minimum sample size, k = number of different sample sizes, and J = number of samples to take in each size
    n = len(doc)
    if n < n_hat:
        raise ValueError("Document too short")
    vec = [token.vector for token in doc]
    sample_sizes = [int(n_hat + (i)*(n - n_hat)/k) for i in range(k)] # k evenly spaced sample sizes, from n-hat to n
    kappa = []
    for trial in range(3):
        survival_times = []
        for n_i in sample_sizes:
            survival_times_same_sample = []
            for j in range(J):
                # Take a sample of size n_i, compute persistent homology, and then take the total survival time of all 0-dim classes.
                try:
                    s = np.array(sample(vec, n_i))
                except:
                    print(doc, n, n_i)
                pm = ripser.ripser(s, maxdim=0)
                total_surv_time = sum([H[1] - H[0] for H in pm['dgms'][0] if H[1] < np.inf])
                survival_times_same_sample.append(total_surv_time)
            # take the median of the sample survival times, add to list as 
            survival_times.append(median(survival_times_same_sample)) 
        log_sample_sizes = [log(size) for size in sample_sizes]
        log_survival_times = [log(surv) for surv in survival_times] 
        kappa.append(linregress(log_sample_sizes, log_survival_times).slope)
    return 1 / (1 - mean(kappa))

# Tulchinskii et al. mention a must faster way to compute PHD using spanning trees - would be interesting to look into


In [6]:
import warnings

n_hat = 40
k = 8
J = 7

essays_long = essays[essays['text_spacy'].apply(len) > (n_hat + 10)]
#with warnings.catch_warnings(action='ignore'):
    #essays_long['PHD'] = essays_long['text_spacy'].progress_apply(lambda x : PHD(x, n_hat, k, J))
#essays_long['PHD'].to_csv('PHD.csv')

In [8]:
phd = pd.read_csv('PHD.csv')
essays_long['PHD'] = phd['PHD']
essays_long.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  essays_long['PHD'] = phd['PHD']


Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven,text_spacy,PHD
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False,"(Phones, Modern, humans, today, phone, phone, ...",3.186242
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False,"(essay, explain, drivers, able, use, electroni...",2.622297
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False,"(Driving, use, cellular, devices, Today, socie...",2.94023
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False,"(Phones, Driving, Drivers, able, use, phones, ...",2.278353
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False,"(Cell, Phone, Operation, Driving, ability, sta...",3.937388


In [None]:
from scipy.stats import f_oneway

humans = essays_long['PHD'][essays_long.label == 0]
machine = essays_long['PHD'][essays_long.label == 1]
print("Human mean homological dimension:", humans.mean(), humans.std())
print("Machine mean homological dimension:", machine.mean(), machine.std())
f_oneway(humans, machine, nan_policy='omit')

# stated effect is not present. Beginning to suspect this dataset isn't very good...

Human mean homological dimension: 2.8906229143227793 0.5870816948631677
Machine mean homological dimension: 2.9234649612121766 0.580991028432426


F_onewayResult(statistic=33.32464456910325, pvalue=7.850426407628558e-09)