In [1]:
import sys
sys.path.append('..')
from run_fft import FFTProcessor
import numpy as np
import pandas as pd
from collections import Counter
import torch

In [2]:
# Enable rpy2: load its IPython extension so the `%%R` cell magic used by the
# plotting cells in this notebook is available.
%load_ext rpy2.ipython

### Explore the lengths of Writing data

In [3]:
# Shared FFTProcessor instance. In the cells visible in this notebook it is only
# used for its `_read_data` loader; the constructor arguments are passed through
# unchanged to run_fft.FFTProcessor.
# NOTE(review): preprocess='logzs' here, while the spectrum cell further down
# invokes run_fft.py with `-p zscore` -- confirm whether these are meant to agree.
fft_processor = FFTProcessor(method='fft', 
                             preprocess='logzs', 
                             value='norm', 
                             require_sid=False)

In [4]:
# Load the NLL sequences of the Writing data (human "original" vs. model
# "sampled") and report the mean / standard deviation of sequence length.
# NOTE: despite the `xsum` in their names, these variables hold Writing data
# here; the names are kept because the chopping cell below reads them.
est_name = 'gpt2xl'

nll_xsum_orig = fft_processor._read_data(
    data_file=f'../data/gpt-4/writing_gpt-4.original.{est_name}.nll.txt')
nll_xsum_samp = fft_processor._read_data(
    data_file=f'../data/gpt-4/writing_gpt-4.sampled.{est_name}.nll.txt')

# Compute each length list once and reuse it for both statistics.
human_lens = [len(seq) for seq in nll_xsum_orig]
model_lens = [len(seq) for seq in nll_xsum_samp]

print('writing human lengths:', np.mean(human_lens), np.std(human_lens))
print('writing model lengths:', np.mean(model_lens), np.std(model_lens))

writing human lengths: 203.25333333333333 19.312581966744432
writing model lengths: 207.29333333333332 15.295989307295194


In [None]:
def write_nlls(nlls, output_file):
    """Write NLL sequences to a text file, one sequence per line.

    Each sequence becomes a single line of space-separated values formatted
    with four decimal places.

    Args:
        nlls: Iterable of sequences. Each item is either a ``torch.Tensor``
            (converted to a plain Python list first) or any iterable of
            numbers.
        output_file: Path of the file to create or overwrite.
    """
    import torch
    with open(output_file, 'w') as f:
        for res in nlls:
            if isinstance(res, torch.Tensor):
                # detach()/cpu() first: Tensor.numpy() raises for tensors that
                # require grad or live on a non-CPU device.
                res = res.detach().cpu().tolist()
            res_str = ' '.join(f'{num:.4f}' for num in res)
            f.write(f'{res_str}\n')

In [None]:
# Chop every sequence to its first k tokens, for each k in (50, 100, 150).
# All three truncations are written in one run: the plotting cell below reads
# chop50, chop100 and chop150 spectra together, and editing `chop_k` by hand
# between runs would not survive Restart & Run All.
# NOTE: `nll_xsum_orig` / `nll_xsum_samp` hold the Writing data at this point
# (they are loaded from the writing_gpt-4 files above).
est_name = 'gpt2xl'

for chop_k in (50, 100, 150):
    nll_xsum_orig_chop = [nll[:chop_k] for nll in nll_xsum_orig]
    nll_xsum_samp_chop = [nll[:chop_k] for nll in nll_xsum_samp]

    write_nlls(nll_xsum_orig_chop, f'../data/short/writing_gpt-4.original.{est_name}.chop{chop_k}.nll.txt')
    write_nlls(nll_xsum_samp_chop, f'../data/short/writing_gpt-4.sampled.{est_name}.chop{chop_k}.nll.txt')

In [9]:
%%R
# Plot smoothed human vs. model spectra for the Writing data, one facet per
# truncation length (first 50 / 100 / 150 tokens) plus the full-length data.
# library() is used instead of require() so that a missing package stops the
# cell immediately instead of failing later with an unrelated error.
library(data.table)
library(ggplot2)

# Read the sampled and original spectrum files for one facet and tag every
# row with its Group ("Model" / "Human") and ChopK label.
#   data_dir   - directory holding the two files
#   infix      - file-name part selecting the truncation ("" for full length)
#   chop_label - value stored in the ChopK column
read_panel <- function(data_dir, infix, chop_label) {
    read_group <- function(src, group) {
        d <- fread(sprintf("%s/writing_gpt-4.%s.gpt2xl%s.nllzs.fftnorm.txt", data_dir, src, infix))
        d$Group <- group
        d
    }
    # Model rows first, then human rows.
    d <- rbind(read_group("sampled", "Model"), read_group("original", "Human"))
    d$ChopK <- chop_label
    d
}

d_chop <- rbind(
    read_panel("../data/short", ".chop50",  "50"),
    read_panel("../data/short", ".chop100", "100"),
    read_panel("../data/short", ".chop150", "150"),
    read_panel("../data/gpt-4", "",         "Full")
)
# Explicit levels keep the facets in numeric order rather than alphabetical.
d_chop$ChopK <- factor(d_chop$ChopK, levels=c("50", "100", "150", "Full"))

p <- ggplot(d_chop, aes(x=freq, y=power)) + 
    geom_smooth(aes(fill=Group, colour=Group, linetype=Group)) + 
    theme_bw() + theme(legend.position=c(.9,.2)) +
    scale_color_brewer(palette="Set1") + scale_fill_brewer(palette="Set1") +
    labs(x = bquote(omega[k]), y = bquote(X(omega[k]))) + 
    facet_wrap(~ChopK, ncol=4)
# The figure is written to the notebook's working directory.
ggsave("writing_chop50_100_150.pdf", plot=p, width=9, height=3)
# plot(p)  # uncomment to also render the figure inline

`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'


### Chop xsum by lengths

In [None]:
# Load the NLL sequences of the XSum data (human "original" vs. model
# "sampled") and report the mean / standard deviation of sequence length.
est_name = 'gpt2xl'

nll_xsum_orig = fft_processor._read_data(
    data_file=f'../data/gpt-4/xsum_gpt-4.original.{est_name}.nll.txt')
nll_xsum_samp = fft_processor._read_data(
    data_file=f'../data/gpt-4/xsum_gpt-4.sampled.{est_name}.nll.txt')

# Compute each length list once and reuse it for both statistics.
human_lens = [len(seq) for seq in nll_xsum_orig]
model_lens = [len(seq) for seq in nll_xsum_samp]

print('xsum human lengths:', np.mean(human_lens), np.std(human_lens))
print('xsum model lengths:', np.mean(model_lens), np.std(model_lens))

xsum human lengths: 210.03333333333333 18.089929672487827
xsum model lengths: 205.53333333333333 14.614908674211945


In [None]:
# Chop every XSum sequence to its first k tokens, for each k in (50, 100, 150).
# All three truncations are written in one run: the spectrum cell below looks
# for the chop50, chop100 and chop150 files, and editing `chop_k` by hand
# between runs would not survive Restart & Run All.
est_name = 'gpt2xl'

for chop_k in (50, 100, 150):
    nll_xsum_orig_chop = [nll[:chop_k] for nll in nll_xsum_orig]
    nll_xsum_samp_chop = [nll[:chop_k] for nll in nll_xsum_samp]

    write_nlls(nll_xsum_orig_chop, f'../data/short/xsum_gpt-4.original.{est_name}.chop{chop_k}.nll.txt')
    write_nlls(nll_xsum_samp_chop, f'../data/short/xsum_gpt-4.sampled.{est_name}.chop{chop_k}.nll.txt')

In [6]:
# Get spectrum for chopped data: run run_fft.py once per
# (genre, source, chop_k) combination whose chopped NLL file exists.
import subprocess
import os
import sys

est_name = 'gpt2xl'
chop_k_values = [50,100,150]
genre_list = ['xsum']
source_list = ['original', 'sampled']

data_root = os.path.abspath('../data/short/')
script_root = os.path.abspath('../')
# Loop-invariant, so resolved once up front.
script_path = os.path.join(script_root, 'run_fft.py')
print(f'abs path for data: {data_root}')
print(f'abs path for script: {script_root}')

for genre in genre_list:
    for source in source_list:
        for chop_k in chop_k_values:
            input_filename = f'{genre}_gpt-4.{source}.{est_name}.chop{chop_k}.nll.txt'
            input_path = os.path.join(data_root, input_filename)
            output_filename = f'{genre}_gpt-4.{source}.{est_name}.chop{chop_k}.nllzs.fftnorm.txt'
            output_path = os.path.join(data_root, output_filename)

            if not os.path.exists(input_path):
                # Report the gap instead of skipping silently; the plotting
                # cell would otherwise fail later on a missing spectrum file.
                print(f'skipping, input not found: {input_path}')
                continue

            # sys.executable runs the script with the kernel's own interpreter
            # rather than whichever `python` happens to be first on PATH.
            cmd = [sys.executable, script_path, '-i', input_path, '-o', output_path, '-p', 'zscore', '--value', 'norm']
            print(' '.join(cmd))
            # check=True raises CalledProcessError on a non-zero exit status,
            # so a failed run cannot go unnoticed.
            subprocess.run(cmd, check=True)

abs path for data: /Users/xy/projects/FourierGPT/data/short
abs path for script: /Users/xy/projects/FourierGPT
python /Users/xy/projects/FourierGPT/run_fft.py -i /Users/xy/projects/FourierGPT/data/short/xsum_gpt-4.original.gpt2xl.chop50.nll.txt -o /Users/xy/projects/FourierGPT/data/short/xsum_gpt-4.original.gpt2xl.chop50.nllzs.fftnorm.txt -p zscore --value norm
python /Users/xy/projects/FourierGPT/run_fft.py -i /Users/xy/projects/FourierGPT/data/short/xsum_gpt-4.original.gpt2xl.chop100.nll.txt -o /Users/xy/projects/FourierGPT/data/short/xsum_gpt-4.original.gpt2xl.chop100.nllzs.fftnorm.txt -p zscore --value norm
python /Users/xy/projects/FourierGPT/run_fft.py -i /Users/xy/projects/FourierGPT/data/short/xsum_gpt-4.original.gpt2xl.chop150.nll.txt -o /Users/xy/projects/FourierGPT/data/short/xsum_gpt-4.original.gpt2xl.chop150.nllzs.fftnorm.txt -p zscore --value norm
python /Users/xy/projects/FourierGPT/run_fft.py -i /Users/xy/projects/FourierGPT/data/short/xsum_gpt-4.sampled.gpt2xl.chop50.n

In [8]:
%%R
# Plot smoothed human vs. model spectra for the XSum data, one facet per
# truncation length (first 50 / 100 / 150 tokens) plus the full-length data.
# library() is used instead of require() so that a missing package stops the
# cell immediately instead of failing later with an unrelated error.
library(data.table)
library(ggplot2)

# Read the sampled and original spectrum files for one facet and tag every
# row with its Group ("Model" / "Human") and ChopK label.
#   data_dir   - directory holding the two files
#   infix      - file-name part selecting the truncation ("" for full length)
#   chop_label - value stored in the ChopK column
read_panel <- function(data_dir, infix, chop_label) {
    read_group <- function(src, group) {
        d <- fread(sprintf("%s/xsum_gpt-4.%s.gpt2xl%s.nllzs.fftnorm.txt", data_dir, src, infix))
        d$Group <- group
        d
    }
    # Model rows first, then human rows.
    d <- rbind(read_group("sampled", "Model"), read_group("original", "Human"))
    d$ChopK <- chop_label
    d
}

d_chop <- rbind(
    read_panel("../data/short", ".chop50",  "50"),
    read_panel("../data/short", ".chop100", "100"),
    read_panel("../data/short", ".chop150", "150"),
    read_panel("../data/gpt-4", "",         "Full")
)
# Explicit levels keep the facets in numeric order rather than alphabetical.
d_chop$ChopK <- factor(d_chop$ChopK, levels=c("50", "100", "150", "Full"))

p <- ggplot(d_chop, aes(x=freq, y=power)) + 
    geom_smooth(aes(fill=Group, colour=Group, linetype=Group)) + 
    theme_bw() + theme(legend.position=c(.9,.2)) +
    scale_color_brewer(palette="Set1") + scale_fill_brewer(palette="Set1") +
    labs(x = bquote(omega[k]), y = bquote(X(omega[k]))) + 
    facet_wrap(~ChopK, ncol=4)
# The figure is written to the notebook's working directory.
ggsave("xsum_chop50_100_150.pdf", plot=p, width=9, height=3)
# plot(p)  # uncomment to also render the figure inline

`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
