In [167]:
import sys
sys.path.append('..')
from run_fft import FFTProcessor
import numpy as np
import pandas as pd
from collections import Counter
import torch

In [193]:
# Enable rpy2: load the IPython extension so that the `%%R` cells in this
# notebook (the ggplot2 / data.table plots) can run.
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [3]:
# Shared FFTProcessor instance. The cells in this notebook call its helper
# methods (_read_data, _preprocess, _create_input_df) directly.
fft_processor_options = dict(
    method='fft',
    preprocess='logzs',
    value='norm',
    require_sid=False,
)
fft_processor = FFTProcessor(**fft_processor_options)

In [163]:
# Read the pubmed NLL sequences (human = "original", model = "sampled"),
# apply the log z-score transform and build one combined frame labelled by Source.
# est_name = 'mistral'
est_name = 'gpt2xl'

nll_pubmed_orig = fft_processor._read_data(data_file=f'../data/gpt-4/pubmed_gpt-4.original.{est_name}.nll.txt')
nll_pubmed_samp = fft_processor._read_data(data_file=f'../data/gpt-4/pubmed_gpt-4.sampled.{est_name}.nll.txt')

# Log + Z-Score NLL
fft_processor.preprocess = 'logzs'
df_nlllogzs_orig = fft_processor._create_input_df(fft_processor._preprocess(nll_pubmed_orig))
df_nlllogzs_samp = fft_processor._create_input_df(fft_processor._preprocess(nll_pubmed_samp))
df_nlllogzs_orig['Source'] = 'Human'
df_nlllogzs_samp['Source'] = 'Model'

# ignore_index=True gives the combined frame a fresh, unique index. Without it
# the two inputs keep their own index labels, which presumably overlap; the
# frame is passed to R via `%%R -i`, and rpy2's pandas conversion warns about
# duplicated index entries.
df_nlllogzs = pd.concat([df_nlllogzs_orig, df_nlllogzs_samp], ignore_index=True)

In [164]:
# Explore the lengths of pubmed data: mean and standard deviation of the
# number of NLL values per sequence, for human and model text.
pubmed_orig_lengths = [len(nll) for nll in nll_pubmed_orig]
pubmed_samp_lengths = [len(nll) for nll in nll_pubmed_samp]

# The sequences come from the pubmed files, so the printed labels say "pubmed".
print('pubmed human lengths:',
      np.mean(pubmed_orig_lengths),
      np.std(pubmed_orig_lengths))
print('pubmed model lengths:',
      np.mean(pubmed_samp_lengths),
      np.std(pubmed_samp_lengths))

# TODO: Need to examine the lengths of `Answer:` part

writing human lengths: 67.15333333333334 15.562234915189897
writing model lengths: 69.04 15.420713342773738


In [None]:
%%R -i df_nlllogzs
# Density of `value` per Source (one facet each) with dashed vertical lines at
# hand-picked interval bounds, followed by the share of values per interval.
require("data.table")
require("ggplot2")

dt <- data.table(df_nlllogzs)

# Interval bounds drawn on the plot: one row per (Source, side).
vline.dat <- data.table(
    Source = c("Human", "Human", "Model", "Model"),
    pos    = c("left", "right", "left", "right"),
    val    = c(-1.667, -0.667, -1.5, -.5)
)

# Each subset keeps its Source column, so facet_wrap draws the line only in
# the matching facet.
human.left  <- vline.dat[Source=="Human" & pos=="left"]
human.right <- vline.dat[Source=="Human" & pos=="right"]
model.left  <- vline.dat[Source=="Model" & pos=="left"]
model.right <- vline.dat[Source=="Model" & pos=="right"]

p <- ggplot(dt, aes(x=value, fill=Source)) +
    geom_density(alpha=0.5) +
    theme_minimal() +
    facet_wrap(~Source) +
    geom_vline(data=human.left, mapping=aes(xintercept=val), colour="red", linetype="dashed") +
    geom_vline(data=human.right, mapping=aes(xintercept=val), colour="red", linetype="dashed") +
    geom_vline(data=model.left, mapping=aes(xintercept=val), colour="green", linetype="dashed") +
    geom_vline(data=model.right, mapping=aes(xintercept=val), colour="green", linetype="dashed")
plot(p)

# Mistral results
# print(nrow(dt[Source == "Human" & value > -1.5 & value <= -0.5]) / nrow(dt[Source == "Human"])) # 0.4091184, left peak
# print(nrow(dt[Source == "Human" & value > -0.5 & value <= 2]) / nrow(dt[Source == "Human"])) # 0.5674812, right plateau

# GPT2-xl results: fraction of each Source's values inside each interval.
n.human <- nrow(dt[Source == "Human"])
n.model <- nrow(dt[Source == "Model"])
print(nrow(dt[Source == "Human" & value > -1.667 & value <= -0.667]) / n.human)
print(nrow(dt[Source == "Human" & value > -0.667 & value <= 1.5]) / n.human)
print(nrow(dt[Source == "Model" & value > -1.5 & value <= -0.5]) / n.model)
print(nrow(dt[Source == "Model" & value > -0.5 & value <= 1.5]) / n.model)


summary(dt[Source == "Human"]$value)

In [69]:
# Sanity check to see if tokenids.txt and nll.txt are aligned: both files must
# hold the same number of sequences, and every token-id sequence must be
# exactly one element longer than its NLL sequence.
# NOTE(review): the token-id files are tagged `gpt2` independently of est_name;
# confirm they match the estimator before switching est_name.
tokenids_orig = fft_processor._read_data(data_file='../data/gpt-4/pubmed_gpt-4.original.gpt2_tokenids.txt')
tokenids_samp = fft_processor._read_data(data_file='../data/gpt-4/pubmed_gpt-4.sampled.gpt2_tokenids.txt')
# convert ids to int
tokenids_orig = [[int(x) for x in ids] for ids in tokenids_orig]
tokenids_samp = [[int(x) for x in ids] for ids in tokenids_samp]

for label, tokenids, nlls in (('original', tokenids_orig, nll_pubmed_orig),
                              ('sampled', tokenids_samp, nll_pubmed_samp)):
    # Compare the sequence counts first: a shorter token-id file would otherwise
    # pass the per-sequence loop without ever checking the trailing NLL rows.
    assert len(tokenids) == len(nlls), (
        f'{label}: {len(tokenids)} token-id sequences vs {len(nlls)} NLL sequences')
    for i, (ids, nll) in enumerate(zip(tokenids, nlls)):
        assert len(ids) == len(nll) + 1, (
            f'{label} sequence {i}: {len(ids)} token ids vs {len(nll)} NLL values')

In [70]:
# Log z-score transform of the pubmed NLL sequences (human first, then model).
fft_processor.preprocess = 'logzs'
nlllogzs_orig, nlllogzs_samp = [
    fft_processor._preprocess(nll_sequences)
    for nll_sequences in (nll_pubmed_orig, nll_pubmed_samp)
]

# Peek at the first ten values of the first human sequence in nll_pubmed_orig
# (the input of the transform, not its output).
first_human_nll = nll_pubmed_orig[0]
print(first_human_nll[:10])

[1.2931, 3.2406, 5.9268, 8.665, 7.573, 1.333, 6.0375, 4.6368, 1.2882, 6.3584]


In [72]:
# Interval bounds on the log z-scored NLL: (left, mid] is the left peak,
# (mid, right] the right plateau.
left, mid, right = -1.667, -0.667, 1.5

# Vocabulary of human, in left peak and right plateau, respectively
vocab_human_left = Counter()
vocab_human_right = Counter()
for i, scores in enumerate(nlllogzs_orig):
    token_ids = tokenids_orig[i]
    for j, val in enumerate(scores):
        # Each token-id sequence is one element longer than its NLL sequence,
        # so the token scored at position j sits at index j + 1.
        if left < val <= mid:
            vocab_human_left[token_ids[j + 1]] += 1
        elif mid < val <= right:
            vocab_human_right[token_ids[j + 1]] += 1

# Number of distinct token ids seen in each interval.
for vocab in (vocab_human_left, vocab_human_right):
    print(len(vocab))

print('left top 10 tokens: ', vocab_human_left.most_common(10))
print('right top 10 tokens: ', vocab_human_right.most_common(10))

988
2121
left top 10 tokens:  [(286, 205), (25, 142), (13, 119), (30, 118), (284, 111), (262, 75), (11, 59), (351, 56), (287, 52), (12, 46)]
right top 10 tokens:  [(25, 183), (262, 177), (287, 176), (23998, 149), (290, 137), (257, 113), (13, 112), (11, 76), (318, 72), (286, 69)]


In [74]:
# Decode the token ids
import os

from transformers import AutoTokenizer

# Location of the local GPT-2 XL checkpoint. The default is the original
# machine-specific path; set the GPT2_XL_PATH environment variable to point at
# the checkpoint on any other machine instead of editing this cell.
gpt2_model_path = os.environ.get('GPT2_XL_PATH', '/Users/xy/models/gpt2-xl')
gpt2_tokenizer = AutoTokenizer.from_pretrained(gpt2_model_path)

In [80]:
# Show the 20 most frequent tokens of each human vocabulary as decoded text.
# Every token list is printed as one line ending in a newline, so the
# following header starts on its own line instead of trailing the tokens.
print('left top tokens: ')
print(' '.join(gpt2_tokenizer.decode(token_id)
               for token_id, _ in vocab_human_left.most_common(20)))

print('right top tokens: ')
print(' '.join(gpt2_tokenizer.decode(token_id)
               for token_id, _ in vocab_human_right.most_common(20)))

left top tokens: 
 of : . ?  to  the ,  with  in -  that  and  be  for  a  is al  patients  by ) right top tokens: 
:  the  in  Answer  and  a . ,  is  of  for  to -  The  patients  may  with  Does  Is  are 

### Explore the lengths of Writing data

In [161]:
# Load the Writing NLL sequences (human = "original", model = "sampled") and
# report mean / standard deviation of their lengths.
# NOTE: the variables are named *_xsum_* although they hold the writing data
# in this cell; the names are kept because the chop cell reads them.
est_name = 'gpt2xl'

nll_xsum_orig = fft_processor._read_data(data_file=f'../data/gpt-4/writing_gpt-4.original.{est_name}.nll.txt')
nll_xsum_samp = fft_processor._read_data(data_file=f'../data/gpt-4/writing_gpt-4.sampled.{est_name}.nll.txt')

writing_orig_lengths = [len(nll) for nll in nll_xsum_orig]
writing_samp_lengths = [len(nll) for nll in nll_xsum_samp]

print('writing human lengths:', np.mean(writing_orig_lengths), np.std(writing_orig_lengths))
print('writing model lengths:', np.mean(writing_samp_lengths), np.std(writing_samp_lengths))

writing human lengths: 203.25333333333333 19.312581966744432
writing model lengths: 207.29333333333332 15.295989307295194


In [168]:
def write_nlls(nlls, output_file):
    """Write NLL sequences to a text file, one sequence per line.

    Every value is formatted with four decimal places; values on a line are
    separated by single spaces. An empty sequence produces an empty line.

    Args:
        nlls: Iterable of sequences. Each sequence is either a
            ``torch.Tensor`` or an iterable of numbers.
        output_file: Path of the file to write. An existing file is
            overwritten.
    """
    import torch
    with open(output_file, 'w') as f:
        for res in nlls:
            if isinstance(res, torch.Tensor):
                # detach() + cpu() make the conversion work for tensors that
                # require grad or live on an accelerator; calling .numpy()
                # directly raises for both.
                res = res.detach().cpu().tolist()
            res_str = ' '.join(f'{num:.4f}' for num in res)
            f.write(f'{res_str}\n')

In [171]:
# Chop the first k=50, 100, 150 tokens
# All three truncations are written in one run, so the cell no longer has to
# be edited and re-executed once per k to produce every file.
# NOTE: reads nll_xsum_orig / nll_xsum_samp, which must hold the *writing*
# data when this cell runs (the xsum cells reuse the same variable names).
est_name = 'gpt2xl'

for chop_k in (50, 100, 150):
    # Keep only the first chop_k NLL values of every sequence.
    nll_xsum_orig_chop = [nll[:chop_k] for nll in nll_xsum_orig]
    nll_xsum_samp_chop = [nll[:chop_k] for nll in nll_xsum_samp]

    write_nlls(nll_xsum_orig_chop, f'../data/short/writing_gpt-4.original.{est_name}.chop{chop_k}.nll.txt')
    write_nlls(nll_xsum_samp_chop, f'../data/short/writing_gpt-4.sampled.{est_name}.chop{chop_k}.nll.txt')

In [None]:
%%R
# Smoothed power-vs-frequency curves for human and model Writing texts,
# one facet per truncation length (50 / 100 / 150 tokens and full length).
require("data.table")
require("ggplot2")

# Read one result file and label all of its rows with `group`.
read_group <- function(path, group) {
    d <- fread(path)
    d$Group <- group
    d
}

# Stack model rows above human rows and tag them with the truncation length.
stack_pair <- function(samp, orig, chop_k) {
    d <- rbind(samp, orig)
    d$ChopK <- chop_k
    d
}

orig_chop50 <- read_group("../data/short/writing_gpt-4.original.gpt2xl.chop50.nllzs.fftnorm.txt", "Human")
samp_chop50 <- read_group("../data/short/writing_gpt-4.sampled.gpt2xl.chop50.nllzs.fftnorm.txt", "Model")

orig_chop100 <- read_group("../data/short/writing_gpt-4.original.gpt2xl.chop100.nllzs.fftnorm.txt", "Human")
samp_chop100 <- read_group("../data/short/writing_gpt-4.sampled.gpt2xl.chop100.nllzs.fftnorm.txt", "Model")

orig_chop150 <- read_group("../data/short/writing_gpt-4.original.gpt2xl.chop150.nllzs.fftnorm.txt", "Human")
samp_chop150 <- read_group("../data/short/writing_gpt-4.sampled.gpt2xl.chop150.nllzs.fftnorm.txt", "Model")

orig_full <- read_group("../data/gpt-4/writing_gpt-4.original.gpt2xl.nllzs.fftnorm.txt", "Human")
samp_full <- read_group("../data/gpt-4/writing_gpt-4.sampled.gpt2xl.nllzs.fftnorm.txt", "Model")

d_chop50 <- stack_pair(samp_chop50, orig_chop50, "50")
d_chop100 <- stack_pair(samp_chop100, orig_chop100, "100")
d_chop150 <- stack_pair(samp_chop150, orig_chop150, "150")
d_full <- stack_pair(samp_full, orig_full, "Full")

d_chop <- rbind(d_chop50, d_chop100, d_chop150, d_full)
# Explicit level order keeps the facets in 50, 100, 150, Full order.
d_chop$ChopK <- factor(d_chop$ChopK, levels=c("50", "100", "150", "Full"))

p <- ggplot(d_chop, aes(x=freq, y=power)) +
    geom_smooth(aes(fill=Group, colour=Group, linetype=Group)) +
    theme_bw() +
    theme(legend.position=c(.9,.2)) +
    scale_color_brewer(palette="Set1") +
    scale_fill_brewer(palette="Set1") +
    labs(x = bquote(omega[k]), y = bquote(X(omega[k]))) +
    facet_wrap(~ChopK, ncol=4)
ggsave("writing_chop50_100_150.pdf", plot=p, width=12, height=4)
# plot(p)

### Chop xsum by lengths

In [188]:
# Load the xsum NLL sequences (human = "original", model = "sampled") and
# report mean / standard deviation of their lengths.
est_name = 'gpt2xl'

nll_xsum_orig = fft_processor._read_data(data_file=f'../data/gpt-4/xsum_gpt-4.original.{est_name}.nll.txt')
nll_xsum_samp = fft_processor._read_data(data_file=f'../data/gpt-4/xsum_gpt-4.sampled.{est_name}.nll.txt')

xsum_orig_lengths = [len(nll) for nll in nll_xsum_orig]
xsum_samp_lengths = [len(nll) for nll in nll_xsum_samp]

print('xsum human lengths:', np.mean(xsum_orig_lengths), np.std(xsum_orig_lengths))
print('xsum model lengths:', np.mean(xsum_samp_lengths), np.std(xsum_samp_lengths))

xsum human lengths: 210.03333333333333 18.089929672487827
xsum model lengths: 205.53333333333333 14.614908674211945


In [191]:
# Chop the first k=50, 100, 150 tokens
# All three truncations are written in one run, so the cell no longer has to
# be edited and re-executed once per k to produce every file.
est_name = 'gpt2xl'

for chop_k in (50, 100, 150):
    # Keep only the first chop_k NLL values of every sequence.
    nll_xsum_orig_chop = [nll[:chop_k] for nll in nll_xsum_orig]
    nll_xsum_samp_chop = [nll[:chop_k] for nll in nll_xsum_samp]

    write_nlls(nll_xsum_orig_chop, f'../data/short/xsum_gpt-4.original.{est_name}.chop{chop_k}.nll.txt')
    write_nlls(nll_xsum_samp_chop, f'../data/short/xsum_gpt-4.sampled.{est_name}.chop{chop_k}.nll.txt')

In [None]:
%%R
# Smoothed power-vs-frequency curves for human and model xsum texts,
# one facet per truncation length (50 / 100 / 150 tokens and full length).

require("data.table")
require("ggplot2")

# Read one result file and label all of its rows with `group`.
read_group <- function(path, group) {
    d <- fread(path)
    d$Group <- group
    d
}

# Stack model rows above human rows and tag them with the truncation length.
stack_pair <- function(samp, orig, chop_k) {
    d <- rbind(samp, orig)
    d$ChopK <- chop_k
    d
}

orig_chop50 <- read_group("../data/short/xsum_gpt-4.original.gpt2xl.chop50.nllzs.fftnorm.txt", "Human")
samp_chop50 <- read_group("../data/short/xsum_gpt-4.sampled.gpt2xl.chop50.nllzs.fftnorm.txt", "Model")

orig_chop100 <- read_group("../data/short/xsum_gpt-4.original.gpt2xl.chop100.nllzs.fftnorm.txt", "Human")
samp_chop100 <- read_group("../data/short/xsum_gpt-4.sampled.gpt2xl.chop100.nllzs.fftnorm.txt", "Model")

orig_chop150 <- read_group("../data/short/xsum_gpt-4.original.gpt2xl.chop150.nllzs.fftnorm.txt", "Human")
samp_chop150 <- read_group("../data/short/xsum_gpt-4.sampled.gpt2xl.chop150.nllzs.fftnorm.txt", "Model")

orig_full <- read_group("../data/gpt-4/xsum_gpt-4.original.gpt2xl.nllzs.fftnorm.txt", "Human")
samp_full <- read_group("../data/gpt-4/xsum_gpt-4.sampled.gpt2xl.nllzs.fftnorm.txt", "Model")

d_chop50 <- stack_pair(samp_chop50, orig_chop50, "50")
d_chop100 <- stack_pair(samp_chop100, orig_chop100, "100")
d_chop150 <- stack_pair(samp_chop150, orig_chop150, "150")
d_full <- stack_pair(samp_full, orig_full, "Full")

d_chop <- rbind(d_chop50, d_chop100, d_chop150, d_full)
# Explicit level order keeps the facets in 50, 100, 150, Full order.
d_chop$ChopK <- factor(d_chop$ChopK, levels=c("50", "100", "150", "Full"))

p <- ggplot(d_chop, aes(x=freq, y=power)) +
    geom_smooth(aes(fill=Group, colour=Group, linetype=Group)) +
    theme_bw() +
    theme(legend.position=c(.9,.2)) +
    scale_color_brewer(palette="Set1") +
    scale_fill_brewer(palette="Set1") +
    labs(x = bquote(omega[k]), y = bquote(X(omega[k]))) +
    facet_wrap(~ChopK, ncol=4)
ggsave("xsum_chop50_100_150.pdf", plot=p, width=12, height=4)
# plot(p)
# plot(p)