In [1]:
import sys
sys.path.append('..')
from run_fft import FFTProcessor
import numpy as np
import pandas as pd

In [3]:
# Load the rpy2 IPython extension so that %%R cells can call ggplot2 for plotting
%load_ext rpy2.ipython

### Circular processing

In [2]:
def circular(input: list, n: int = None, include_self: bool = True):
    """Return circular (left) rotations of a sequence.

    Rotation ``k`` moves the first ``k`` elements to the end, e.g. rotating
    ``[1, 2, 3]`` by 1 gives ``[2, 3, 1]``.

    Args:
        input: Sequence to rotate. It must support ``len``, slicing and ``+``
            concatenation (e.g. a list).
        n: Number of rotations to generate (shifts 1, 2, ..., n). Defaults to
            ``len(input) - 1``, i.e. every rotation that differs from the
            identity. Shifts larger than the sequence length wrap around.
        include_self: When True, the unrotated sequence is placed first.

    Returns:
        A list of sequences: optionally a copy of ``input`` followed by the
        ``n`` rotations, in order of increasing shift.
    """
    size = len(input)
    if n is None:
        n = size - 1
    output = []
    if include_self:
        # Shallow copy so that mutating the result never changes the caller's data.
        output.append(input[:])
    for i in range(n):
        # Wrap the shift so that n > len(input) keeps rotating instead of
        # repeatedly yielding the unrotated sequence; guard the empty case
        # to avoid a modulo-by-zero.
        shift = (i + 1) % size if size else 0
        output.append(input[shift:] + input[:shift])
    return output

In [3]:
# Test
# Use a name other than `input` so the builtin input() is not shadowed for the
# rest of the kernel session.
sample = [1, 2, 3, 4, 5]
print(circular(sample, include_self=True))
print(circular(sample, include_self=False))
# Pin the expected rotations so a regression fails loudly rather than only
# changing the printed output.
assert circular(sample, include_self=False) == [
    [2, 3, 4, 5, 1], [3, 4, 5, 1, 2], [4, 5, 1, 2, 3], [5, 1, 2, 3, 4],
]

[[1, 2, 3, 4, 5], [2, 3, 4, 5, 1], [3, 4, 5, 1, 2], [4, 5, 1, 2, 3], [5, 1, 2, 3, 4]]
[[2, 3, 4, 5, 1], [3, 4, 5, 1, 2], [4, 5, 1, 2, 3], [5, 1, 2, 3, 4]]


In [5]:
# Corpus and NLL estimator to analyse; both names are interpolated into the
# input file paths below.
# Writing
genre, est_name = 'writing', 'mistral'

# Inputs used in earlier runs, kept for reference:
# Pubmed
# genre = 'pubmed'
# est_name = 'mistral'
# nll_orig = fft_processor._read_data(data_file='../data/gpt-4/pubmed_gpt-4.original.gpt2xl.nll.txt')
# nll_samp = fft_processor._read_data(data_file='../data/gpt-4/pubmed_gpt-4.sampled.gpt2xl.nll.txt')

# nll_orig = fft_processor._read_data(data_file='../data/gpt-4/pubmed_Ans_gpt-4.original.gpt2xl.nll.txt')
# nll_samp = fft_processor._read_data(data_file='../data/gpt-4/pubmed_Ans_gpt-4.sampled.gpt2xl.nll.txt')

# Shared processor reused by the cells below.
fft_processor = FFTProcessor(method='fft', preprocess='logzs', value='norm', require_sid=False)

# "original" sequences are labelled 'Human' and "sampled" ones 'Sampled' further down.
nll_orig = fft_processor._read_data(
    data_file=f'../data/gpt-4/{genre}_gpt-4.original.{est_name}.nll.txt'
)
nll_samp = fft_processor._read_data(
    data_file=f'../data/gpt-4/{genre}_gpt-4.sampled.{est_name}.nll.txt'
)

In [6]:
# Replace every sequence by all of its circular rotations, pooled into one flat
# list per source (the rotations of one sequence stay adjacent and in order).
nll_orig_circle = [rotation for nll in nll_orig for rotation in circular(nll)]
nll_samp_circle = [rotation for nll in nll_samp for rotation in circular(nll)]

# FFT norm
df_orig_norm = fft_processor.process(nll_orig_circle)
df_orig_norm['type'] = 'Human'
df_samp_norm = fft_processor.process(nll_samp_circle)
df_samp_norm['type'] = 'Sampled'

# Stack both sources; the 'type' column tells them apart in the plots.
frames_by_type = [df_orig_norm, df_samp_norm]
df_circle_norm = pd.concat(frames_by_type)

print(df_circle_norm.shape)

(7769099, 3)


In [43]:
# Get full circular data, with sequence id and circular index (1st, 2nd, 3rd, ..., nth)
fft_processor.preprocess = 'logzs'
fft_processor.value = 'norm'


def _circular_spectra_frames(nlls):
    """Build one spectrum DataFrame per sequence in ``nlls``.

    Each sequence is expanded into its circular rotations, preprocessed and
    passed through ``fft_processor._fft_batch(..., require_sid=True)``. The
    three returned collections are concatenated into the columns ``freq``,
    ``power`` and ``circular_index``; ``sid`` is the position of the sequence
    within ``nlls``.

    Uses the notebook-level ``fft_processor``, so its current ``preprocess`` /
    ``value`` settings apply.
    """
    frames = []
    for seq_id, nll in enumerate(nlls):
        rotations = fft_processor._preprocess(circular(nll))
        f, p, sids = fft_processor._fft_batch(rotations, require_sid=True)
        frame = pd.DataFrame({'freq': np.concatenate(f),
                              'power': np.concatenate(p),
                              'circular_index': np.concatenate(sids)})
        frame['sid'] = seq_id
        frames.append(frame)
    return frames


# Original
full_circle_orig = _circular_spectra_frames(nll_orig)
print(len(full_circle_orig))
total_orig = sum(len(frame) for frame in full_circle_orig)
print(total_orig)

# Sampled
full_circle_samp = _circular_spectra_frames(nll_samp)
print(len(full_circle_samp))
total_samp = sum(len(frame) for frame in full_circle_samp)
print(total_samp)

print('total:', total_orig + total_samp)

# Save data
df_orig = pd.concat(full_circle_orig)
df_orig['type'] = 'Human'
df_samp = pd.concat(full_circle_samp)
df_samp['type'] = 'Sampled'
df_circle = pd.concat([df_orig, df_samp])
df_circle.to_csv(f'../data/gpt-4/{genre}_gpt-4.{est_name}.nlllogzs.fftnorm.circlefull.txt', index=False)

150
475411
150
507953
total: 983364


In [8]:
def get_circular_full(input_file: str):
    """Compute the spectra of every circular rotation of every sequence in a file.

    The sequences are read from ``input_file`` with a freshly built
    ``FFTProcessor`` (fft / logzs / norm). Each sequence is expanded with
    ``circular``, preprocessed, and passed to ``_fft_batch`` with
    ``require_sid=True``.

    Returns:
        A DataFrame with columns ``freq``, ``power``, ``circular_index`` (the
        third value returned by ``_fft_batch``; presumably the rotation each
        row belongs to, to be confirmed against ``run_fft``) and ``sid`` (the
        position of the sequence in the file). The per-sequence frames are
        concatenated without resetting the index, so index labels repeat.
    """
    processor = FFTProcessor(method='fft', preprocess='logzs', value='norm', require_sid=False)
    per_sequence = []
    for seq_id, sequence in enumerate(processor._read_data(data_file=input_file)):
        rotations = processor._preprocess(circular(sequence))
        freqs, powers, rotation_ids = processor._fft_batch(rotations, require_sid=True)
        columns = {
            'freq': np.concatenate(freqs),
            'power': np.concatenate(powers),
            'circular_index': np.concatenate(rotation_ids),
        }
        frame = pd.DataFrame(columns)
        frame['sid'] = seq_id
        per_sequence.append(frame)
    return pd.concat(per_sequence)

In [27]:
# Run get_circular_full
# Only the `est_name` assignment is live in this cell. Every get_circular_full
# call below is commented out, one disabled block per corpus (writing, pubmed,
# xsum). Each block would build a combined frame whose 'type' column is
# 'Human' for the original file and 'Sampled' for the sampled file.
est_name = 'mistral'

# df_circle_writing_orig = get_circular_full('../data/gpt-4/writing_gpt-4.original.mistral.nll.txt')
# df_circle_writing_samp = get_circular_full('../data/gpt-4/writing_gpt-4.sampled.mistral.nll.txt')
# print(df_circle_writing_orig.shape[0] + df_circle_writing_samp.shape[0])
# df_circle_writing_orig['type'] = 'Human'
# df_circle_writing_samp['type'] = 'Sampled'
# df_circle_writing = pd.concat([df_circle_writing_orig, df_circle_writing_samp])

# df_circle_pubmed_orig = get_circular_full('../data/gpt-4/pubmed_gpt-4.original.mistral.nll.txt')
# df_circle_pubmed_samp = get_circular_full('../data/gpt-4/pubmed_gpt-4.sampled.mistral.nll.txt')
# print(df_circle_pubmed_orig.shape[0] + df_circle_pubmed_samp.shape[0])
# df_circle_pubmed_orig['type'] = 'Human'
# df_circle_pubmed_samp['type'] = 'Sampled'
# df_circle_pubmed = pd.concat([df_circle_pubmed_orig, df_circle_pubmed_samp])

# df_circle_xsum_orig = get_circular_full(f'../data/gpt-4/xsum_gpt-4.original.{est_name}.nll.txt')
# df_circle_xsum_samp = get_circular_full(f'../data/gpt-4/xsum_gpt-4.sampled.{est_name}.nll.txt')
# df_circle_xsum_orig['type'] = 'Human'
# df_circle_xsum_samp['type'] = 'Sampled'
# df_circle_xsum = pd.concat([df_circle_xsum_orig, df_circle_xsum_samp])


In [28]:
# Save circular full data
# All writes are currently disabled. Each pair below (CSV + HDF5) expects the
# matching df_circle_<corpus> frame, which is only created when the
# corresponding commented-out block in the previous cell is re-enabled.
# df_circle_pubmed.to_csv('../data/gpt-4/pubmed_gpt-4.mistral.nlllogzs.fftnorm.circlefull.txt', index=False)
# df_circle_pubmed.to_hdf('../data/gpt-4/pubmed_gpt-4.mistral.nlllogzs.fftnorm.circlefull.h5', key='df', mode='w')

# df_circle_writing.to_csv('../data/gpt-4/writing_gpt-4.mistral.nlllogzs.fftnorm.circlefull.txt', index=False)
# df_circle_writing.to_hdf('../data/gpt-4/writing_gpt-4.mistral.nlllogzs.fftnorm.circlefull.h5', key='df', mode='w')

# df_circle_xsum.to_csv(f'../data/gpt-4/xsum_gpt-4.{est_name}.nlllogzs.fftnorm.circlefull.txt', index=False)
# df_circle_xsum.to_hdf(f'../data/gpt-4/xsum_gpt-4.{est_name}.nlllogzs.fftnorm.circlefull.h5', key='df', mode='w')


In [None]:
%%R -i df_circle_norm
# -i copies df_circle_norm from the Python namespace into this R session.
# The plot that would consume it is commented out below, so running this cell
# only loads the packages and sets the two label variables.
require(ggplot2)
require(stringr)  # provides str_interp(), used by the commented-out plot code

# Labels interpolated into the plot title and output file name.
# genre <- "pubmed_QA"
genre <- "writing"

# est_name <- "gpt2xl"
est_name <- "mistral"

# p <- ggplot(df_circle_norm, aes(x=freq, y=power, color=type)) + geom_smooth(method='gam') + 
#     theme_bw() + theme(plot.title = element_text(hjust = 0.5, vjust=-12, size = 12)) +
#     ggtitle(str_interp("${genre}: Human vs. GPT-4 \nNLL logzs, FFT norm, est ${est_name} \n Circular")) +
#     labs(x = bquote(omega[k]), y = bquote(X(omega[k])))
# ggsave(str_interp("gpt4_human_${genre}_${est_name}_nlllogzs_fftnorm_circle.pdf"), plot=p, width=5, height=5)

In [10]:
# For each NLL sequence, use circular() to compute the spectra of its rotations, then calculate their mean

def get_circular_mean(nlls):
    """Average the spectra of all circular rotations of each sequence.

    For every sequence in ``nlls`` the rotations from ``circular`` are
    preprocessed and transformed with a freshly built ``FFTProcessor``
    (fft / logzs / norm). The resulting powers are averaged along axis 0, and
    the first rotation's frequencies are kept alongside that mean.

    Returns:
        A DataFrame with columns ``freq`` and ``power`` holding the
        concatenated per-sequence frequencies and mean powers.
    """
    processor = FFTProcessor(method='fft', preprocess='logzs', value='norm', require_sid=False)
    freq_parts = []
    power_parts = []
    for sequence in nlls:
        rotations = processor._preprocess(circular(sequence))
        freq, power, _ = processor._fft_batch(rotations, verbose=False)
        # One mean value per frequency, taken across the rotations of this
        # sequence (assumes _fft_batch returns one equal-length power vector
        # per rotation; confirm against run_fft).
        mean_power = np.mean(power, axis=0)
        freq_parts.append(freq[0])
        power_parts.append(mean_power)
    columns = {
        'freq': np.concatenate(freq_parts),
        'power': np.concatenate(power_parts),
    }
    return pd.DataFrame.from_dict(columns)

# FFT norm
# NOTE: get_circular_mean builds its own FFTProcessor, so this assignment only
# changes the shared `fft_processor`; it does not influence the two calls below.
fft_processor.value = 'norm'

df_norm_orig = get_circular_mean(nll_orig)
df_norm_samp = get_circular_mean(nll_samp)

# Tag each source before stacking so the plot can colour by 'type'.
df_norm_orig['type'] = 'Human'
df_norm_samp['type'] = 'Sampled'
df_circlemean_norm = pd.concat([df_norm_orig, df_norm_samp])

In [11]:
%%R -i df_circlemean_norm -i genre -i est_name
# Inputs copied from the Python namespace:
#   df_circlemean_norm - circular-mean spectra with columns freq, power, type
#   genre, est_name    - the corpus / estimator names that selected the input
#                        files, so the title and PDF name always match the data
# (The former -i df_circlemean_real / df_circlemean_imag inputs were dropped:
#  nothing in the notebook defines them and this cell never used them, so they
#  made the cell fail on a fresh kernel.)
require(ggplot2)
require(stringr)  # str_interp() below is a stringr function

p <- ggplot(df_circlemean_norm, aes(x=freq, y=power, color=type)) + geom_smooth(method='gam') + 
    theme_bw() + theme(plot.title = element_text(hjust = 0.5, vjust=-12, size = 12)) +
    ggtitle(str_interp("${genre}: Human vs. GPT-4 \nNLL logzs, FFT norm, est ${est_name} \nCircular Mean")) +
    labs(x = bquote(omega[k]), y = bquote(X(omega[k])))
ggsave(str_interp("gpt4_human_${genre}_${est_name}_nlllogzs_fftnorm_circlemean.pdf"), plot=p, width=5, height=5)

`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'




`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'


In [12]:
# Save circlemean data
# `genre` and `est_name` are deliberately not reassigned here. The frames being
# saved were computed from the files selected by those two names in the
# data-loading cell, so reusing them keeps the output file names in step with
# the data. Hard-coding a different genre here would write one corpus's results
# under another corpus's name.
df_norm_orig[['freq', 'power']].to_csv(f'../data/gpt-4/{genre}_gpt-4.original.{est_name}.nlllogzs.fftnorm.circlemean.txt', index=False)
df_norm_samp[['freq', 'power']].to_csv(f'../data/gpt-4/{genre}_gpt-4.sampled.{est_name}.nlllogzs.fftnorm.circlemean.txt', index=False)