In [12]:
import sys
sys.path.append('..')
from run_fft import FFTProcessor
import numpy as np
import pandas as pd
import os

In [2]:
class SpectrumData:
    """
    Spectrum table loaded from a CSV file.

    The file is read with `pd.read_csv`; `get_dict` requires the columns
    'sid', 'freq' and 'power' to be present.
    """

    def __init__(self, filename):
        """
        Remember `filename` and eagerly load it into `self.spectrum_df`.

        Args:
            filename: Path (or anything `pd.read_csv` accepts) of the CSV file.
        """
        self.filename = filename
        self.spectrum_df = self.read_df()

    def read_df(self) -> pd.DataFrame:
        """Read `self.filename` as CSV and return the resulting DataFrame."""
        return pd.read_csv(self.filename)

    def get_dict(self) -> dict:
        """
        Split the table into one spectrum per 'sid'.

        Returns:
            dict mapping each sid to {'freq': ndarray, 'power': ndarray}.
            Keys appear in order of first occurrence in the table and the
            arrays keep the original row order. Rows whose 'sid' is missing
            (NaN) are skipped, which is the `groupby` default.

        Raises:
            KeyError: if the 'sid', 'freq' or 'power' column is absent.
        """
        # One grouped pass instead of re-masking the whole frame for every
        # sid; sort=False keeps first-appearance order of the sids.
        return {
            sid: {
                'freq': sid_df['freq'].values,
                'power': sid_df['power'].values,
            }
            for sid, sid_df in self.spectrum_df.groupby('sid', sort=False)
        }

In [37]:
def classify_pair(x_orig: dict, x_samp: dict, k_freq: int = 10, higher: str = 'model') -> float:
    """
    Pairwise accuracy of a low-frequency spectral-power comparison.

    For every sid, the first `k_freq` 'power' values of the original (human)
    spectrum and of the sampled (model) spectrum are summed and compared.
    A pair counts as correct when the side named by `higher` has the strictly
    larger sum; ties count as incorrect.

    Args:
        x_orig: sid -> {'power': array-like, ...} for the original texts.
        x_samp: sid -> {'power': array-like, ...} for the sampled texts;
            must have exactly the same keys as `x_orig`.
        k_freq: number of leading spectrum entries to sum.
        higher: 'model' if the sampled spectrum is hypothesized to carry
            more power, 'human' if the original one is.

    Returns:
        Fraction of pairs for which the hypothesis holds, or 0.0 when there
        are no pairs.

    Raises:
        AssertionError: if the two dicts do not share the same keys.
        ValueError: if `higher` is neither 'model' nor 'human'.
    """
    assert x_orig.keys() == x_samp.keys()
    # Reject typos such as 'Model' instead of silently scoring the opposite
    # hypothesis.
    if higher not in ('model', 'human'):
        raise ValueError(f"higher must be 'model' or 'human', got {higher!r}")
    if not x_orig:
        # No pairs to score; avoid dividing by zero.
        return 0.0

    correct = 0
    for sid, orig in x_orig.items():
        sum_orig = np.sum(orig['power'][:k_freq])
        sum_samp = np.sum(x_samp[sid]['power'][:k_freq])
        if higher == 'model':
            # Hypothesis: Model > Human over the first k_freq entries.
            correct += int(sum_samp > sum_orig)
        else:
            # Hypothesis: Human > Model over the first k_freq entries.
            correct += int(sum_samp < sum_orig)
    return correct / len(x_orig)

def select_k(human: dict, model: dict, higher: str, max_k: int = 50):
    """
    Pick the cutoff k that maximizes `classify_pair` accuracy.

    Every k in 1..max_k (inclusive) is tried; the smallest k reaching the
    highest accuracy wins.

    Note: the returned accuracy is measured on the same pairs that were used
    to choose k, so it is an optimistic estimate.

    Args:
        human: sid -> spectrum dict for the original texts.
        model: sid -> spectrum dict for the sampled texts.
        higher: forwarded to `classify_pair` ('model' or 'human').
        max_k: largest cutoff to try (defaults to 50).

    Returns:
        (best_k, best_acc). `best_k` is None only when `max_k` < 1.
    """
    best_k, best_acc = None, 0.0
    for k in range(1, max_k + 1):
        acc = classify_pair(human, model, k_freq=k, higher=higher)
        # `best_k is None` lets the first k win even when every accuracy is
        # 0.0, so a real cutoff is returned instead of None.
        if best_k is None or acc > best_acc:
            best_k, best_acc = k, acc
    return best_k, best_acc

In [41]:
# Single-configuration check: one genre and one estimator.
genre = 'pubmed'
est_name = 'mistral'

# Both paths are built from `genre` and `est_name`, so the data loaded always
# matches the label printed at the end of this cell.
spec_orig = SpectrumData(f'../data/gpt-4/{genre}_gpt-4.original.{est_name}.nllzs.fftnorm.txt')
x_orig = spec_orig.get_dict()

spec_samp = SpectrumData(f'../data/gpt-4/{genre}_gpt-4.sampled.{est_name}.nllzs.fftnorm.txt')
x_samp = spec_samp.get_dict()

# Accuracy at one hand-picked cutoff (classify_pair defaults to higher='model').
acc = classify_pair(x_orig, x_samp, k_freq=6)
print(acc)

# Sweep the cutoff and report the best one for this configuration.
best_k, best_acc = select_k(x_orig, x_samp, higher='model')
print(f'{genre}, {est_name}, best_k={best_k}, best_acc={best_acc}')

# Recorded results (presumably accuracies from earlier manual runs):
# pubmed, mistral, k=10, 0.867
# pubmed, mistral, k=3, 0.90
# pubmed, mistral, k=5, 0.887

0.8466666666666667
pubmed, mistral, best_k=3, best_acc=0.9


In [43]:
# Eval loop: sweep every (genre, estimator) pair and report its best cutoff.
for genre in ['pubmed', 'writing', 'xsum']:
    for est_name in ['mistral', 'llama', 'gpt2xl', 'gpt2lg', 'gpt2md', 'gpt2']:
        orig_filename = f'../data/gpt-4/{genre}_gpt-4.original.{est_name}.nllzs.fftnorm.txt'
        samp_filename = f'../data/gpt-4/{genre}_gpt-4.sampled.{est_name}.nllzs.fftnorm.txt'

        # A combination is evaluated only when both spectrum files exist.
        if not (os.path.exists(orig_filename) and os.path.exists(samp_filename)):
            continue

        # Load the original and the sampled spectra, keyed by sid.
        spec_orig = SpectrumData(orig_filename)
        spec_samp = SpectrumData(samp_filename)
        x_orig = spec_orig.get_dict()
        x_samp = spec_samp.get_dict()

        # 'pubmed' is scored with higher='model'; every other genre with
        # higher='human'.
        higher = 'model' if genre == 'pubmed' else 'human'

        best_k, best_acc = select_k(x_orig, x_samp, higher=higher)
        print(f'{genre}, {est_name}, best_k={best_k}, best_acc={best_acc * 100:.1f}')

pubmed, mistral, best_k=3, best_acc=90.0
pubmed, gpt2xl, best_k=3, best_acc=91.3
writing, mistral, best_k=1, best_acc=48.7
writing, gpt2xl, best_k=23, best_acc=84.7
xsum, mistral, best_k=48, best_acc=65.3
xsum, gpt2xl, best_k=29, best_acc=87.3
