In [43]:
from SpectrumTools import *
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [6]:
# Human-text spectrum files: five splits (0-4) for each of the news, story
# and wiki genres, listed genre by genre.
human_list = [
    f'../data/gs_james/gs_{genre}/webtext.train.model=.{genre}_{split}.fft.csv'
    for genre in ('news', 'story', 'wiki')
    for split in range(5)
]

In [14]:
# Single reference file from the older GPT-2 data directory (WebText test split).
human_list_old = ['../data/data_gpt2_old/webtext.test.model=gpt2.fft.csv']

In [17]:
# One spectrum file per GPT-2 checkpoint, ordered from smallest to largest.
gpt2_list = [
    f'../data/data_gpt2_old/{checkpoint}.test.model=gpt2.fft.csv'
    for checkpoint in ('small-117M', 'medium-345M', 'large-762M', 'xl-1542M')
]

# Compare GPT-2 (small, medium, large, xl) with the old WebText human baseline

In [57]:
from tqdm import tqdm

def alignPoints(filepath1: str, filepath2: str, num_points: int = 1000):
    """Resample paired spectra from two files onto one shared frequency grid.

    Both files are read with ``getInterval``, which is unpacked here into a
    sequence of frequency lists and a sequence of power lists. Spectra are
    paired by position; if the files hold different numbers of spectra, only
    the first ``min(len1, len2)`` pairs are used. For each pair, ``getF`` is
    called with the frequency and power lists and the object it returns is
    evaluated on the shared grid.
    # NOTE(review): getF presumably returns an interpolating function - confirm
    # in SpectrumTools.

    Args:
        filepath1: Path to the first spectrum file.
        filepath2: Path to the second spectrum file.
        num_points: Number of evenly spaced grid points on [0, 0.5].
            Defaults to 1000, the value that was previously hard-coded.

    Returns:
        A tuple ``(x, y1listlist, y2listlist)``: the shared grid, followed by
        the per-pair resampled values for file 1 and file 2 respectively.
    """
    freq_list_list_1, power_list_list_1 = getInterval(filepath1)
    freq_list_list_2, power_list_list_2 = getInterval(filepath2)

    # Build the grid once, before the loop. It does not depend on the pair,
    # and defining it here keeps the return statement valid when there are no
    # pairs at all (previously that case raised UnboundLocalError).
    x = np.linspace(0, 0.5, num_points)

    pair_count = min(len(freq_list_list_1), len(freq_list_list_2))
    y1listlist, y2listlist = [], []

    for i in tqdm(range(pair_count)):
        func1 = getF(freq_list_list_1[i], power_list_list_1[i])
        func2 = getF(freq_list_list_2[i], power_list_list_2[i])
        # Evaluate both spectra at identical x positions so they are comparable.
        y1listlist.append(func1(x))
        y2listlist.append(func2(x))

    return x, y1listlist, y2listlist

In [58]:
def getPSO(filepath1: str, filepath2: str):
    """Compute the overlap ratio between paired spectra of two files.

    For every aligned pair the absolute values of both curves are taken, the
    pointwise minimum ("floor") and pointwise maximum ("roof") are integrated
    over the shared grid with the trapezoidal rule, and the ratio
    floor / roof is rounded to four decimals.

    Args:
        filepath1: Path to the first spectrum file.
        filepath2: Path to the second spectrum file.

    Returns:
        A tuple ``(area_floor_list, area_roof_list, pso_list)`` with one entry
        per aligned pair.
    """
    grid, spectra_a, spectra_b = alignPoints(filepath1, filepath2)
    area_floor_list, area_roof_list, pso_list = [], [], []

    for spec_a, spec_b in zip(spectra_a, spectra_b):
        # Stack the two non-negative curves as rows of one 2-D array.
        magnitudes = np.abs([spec_a, spec_b])

        lower_envelope = magnitudes.min(axis=0)
        upper_envelope = magnitudes.max(axis=0)
        floor_area = np.trapz(lower_envelope, grid)
        roof_area = np.trapz(upper_envelope, grid)

        area_floor_list.append(floor_area)
        area_roof_list.append(roof_area)
        pso_list.append(round(floor_area / roof_area, 4))

    return area_floor_list, area_roof_list, pso_list

In [59]:
from scipy.stats import pearsonr

def getPearson(filepath1: str, filepath2: str):
    """Compute the Pearson correlation for each aligned pair of spectra.

    If ``pearsonr`` raises ``ValueError`` for a pair, both curves are passed
    through ``np.nan_to_num`` and the correlation is computed again on the
    cleaned values.

    Args:
        filepath1: Path to the first spectrum file.
        filepath2: Path to the second spectrum file.

    Returns:
        A list with one correlation coefficient per aligned pair.
    """
    _grid, spectra_a, spectra_b = alignPoints(filepath1, filepath2)
    corr_list = []
    for spec_a, spec_b in zip(spectra_a, spectra_b):
        try:
            coefficient, _pvalue = pearsonr(spec_a, spec_b)
        except ValueError:
            # Retry once with NaN/inf entries replaced by finite numbers.
            cleaned_a = np.nan_to_num(spec_a)
            cleaned_b = np.nan_to_num(spec_b)
            coefficient, _pvalue = pearsonr(cleaned_a, cleaned_b)
        corr_list.append(coefficient)
    return corr_list

In [61]:
from scipy.stats import spearmanr

def getSpearmanr(filepath1: str, filepath2: str):
    """Compute the Spearman rank correlation for each aligned pair of spectra.

    Args:
        filepath1: Path to the first spectrum file.
        filepath2: Path to the second spectrum file.

    Returns:
        A list with one rank-correlation coefficient per aligned pair.
    """
    _grid, spectra_a, spectra_b = alignPoints(filepath1, filepath2)

    def _rank_correlation(spec_a, spec_b):
        # spearmanr returns (statistic, p-value); only the statistic is kept.
        rho, _pvalue = spearmanr(spec_a, spec_b)
        return rho

    return [_rank_correlation(spec_a, spec_b)
            for spec_a, spec_b in zip(spectra_a, spectra_b)]

In [62]:
def getSAM(filepath1: str, filepath2: str):
    """Compute a normalised spectral angle for each aligned pair of spectra.

    Each curve is scaled to unit Euclidean length; the angle between the two
    unit vectors is ``arccos`` of their dot product, divided by pi so the
    result lies in [0, 1] (0 = same direction, 1 = opposite direction).

    Args:
        filepath1: Path to the first spectrum file.
        filepath2: Path to the second spectrum file.

    Returns:
        A list with one normalised angle per aligned pair. A pair containing
        NaN values, or a curve whose norm is zero, still yields NaN.
    """
    _grid, y1listlist, y2listlist = alignPoints(filepath1, filepath2)
    sam_list = []

    for y1list, y2list in zip(y1listlist, y2listlist):
        # Normalise into new arrays rather than in place, so the aligned
        # inputs are not modified and integer inputs do not break `/=`.
        unit1 = np.asarray(y1list) / np.linalg.norm(y1list)
        unit2 = np.asarray(y2list) / np.linalg.norm(y2list)

        # Rounding can push the cosine of two unit vectors marginally outside
        # [-1, 1], where arccos returns NaN. Clip so (near-)identical or
        # opposite spectra give 0 or 1 instead of being lost as NaN.
        cosine = np.clip(np.dot(unit1, unit2), -1.0, 1.0)

        sam_list.append(np.arccos(cosine) / np.pi)

    return sam_list

In [68]:
# Accumulator for every comparison: one list per metric plus a parallel
# 'model' list holding the model label for each score.
results_dict = {column: [] for column in ('PSO', 'CORR', 'SAM', 'SPEAR', 'model')}

In [69]:
# Pair the old WebText reference file with the GPT-2 small spectra.
filepath1, filepath2 = human_list_old[0], gpt2_list[0]  # gpt2-small

# Every metric function aligns the two files itself before scoring.
area_floor_list, area_roof_list, pso_list = getPSO(filepath1, filepath2)
corr_list = getPearson(filepath1, filepath2)
sam_list = getSAM(filepath1, filepath2)
spearmanr_list = getSpearmanr(filepath1, filepath2)

# (summary label, length label, results key, scores) for each metric.
_metric_table = (
    ('PSO:', 'PSO length:', 'PSO', pso_list),
    ('Corr:', 'Corr length:', 'CORR', corr_list),
    ('SAM:', 'SAM length:', 'SAM', sam_list),
    ('Spearmanr:', 'Spearmanr length:', 'SPEAR', spearmanr_list),
)

print('Webtext & gpt2-sm')
for _summary_label, _length_label, _result_key, _scores in _metric_table:
    # NaN-aware mean and standard deviation across all pairs.
    print(_summary_label, np.nanmean(_scores), np.nanstd(_scores))

for _summary_label, _length_label, _result_key, _scores in _metric_table:
    print(_length_label, len(_scores))

for _summary_label, _length_label, _result_key, _scores in _metric_table:
    results_dict[_result_key].append(_scores)
results_dict['model'].append(['gpt2-sm'] * len(pso_list))

100%|██████████| 4981/4981 [00:02<00:00, 1747.12it/s]
100%|██████████| 4981/4981 [00:02<00:00, 2045.09it/s]
100%|██████████| 4981/4981 [00:02<00:00, 2476.33it/s]
100%|██████████| 4981/4981 [00:02<00:00, 2402.74it/s]


Webtext & gpt2-sm
PSO: 0.3617340566606389 0.07221731235943037
Corr: 0.6438794869424143 0.17855396411140867
SAM: 0.2693253150087384 0.07215736359363699
Spearmanr: 0.01318690335447418 0.06656590784301393
PSO length: 4981
Corr length: 4981
SAM length: 4981
Spearmanr length: 4981


In [71]:
# print(results_dict['model'][0][:10])

['gpt2-sm', 'gpt2-sm', 'gpt2-sm', 'gpt2-sm', 'gpt2-sm', 'gpt2-sm', 'gpt2-sm', 'gpt2-sm', 'gpt2-sm', 'gpt2-sm']


In [72]:
# Pair the old WebText reference file with the GPT-2 medium spectra.
filepath1, filepath2 = human_list_old[0], gpt2_list[1]  # gpt2-md

# Every metric function aligns the two files itself before scoring.
area_floor_list, area_roof_list, pso_list = getPSO(filepath1, filepath2)
corr_list = getPearson(filepath1, filepath2)
sam_list = getSAM(filepath1, filepath2)
spearmanr_list = getSpearmanr(filepath1, filepath2)

# (summary label, length label, results key, scores) for each metric.
_metric_table = (
    ('PSO:', 'PSO length:', 'PSO', pso_list),
    ('Corr:', 'Corr length:', 'CORR', corr_list),
    ('SAM:', 'SAM length:', 'SAM', sam_list),
    ('Spearmanr:', 'Spearmanr length:', 'SPEAR', spearmanr_list),
)

print('Webtext & gpt2-md')
for _summary_label, _length_label, _result_key, _scores in _metric_table:
    # NaN-aware mean and standard deviation across all pairs.
    print(_summary_label, np.nanmean(_scores), np.nanstd(_scores))

for _summary_label, _length_label, _result_key, _scores in _metric_table:
    print(_length_label, len(_scores))

for _summary_label, _length_label, _result_key, _scores in _metric_table:
    results_dict[_result_key].append(_scores)
results_dict['model'].append(['gpt2-md'] * len(pso_list))

100%|██████████| 4994/4994 [00:02<00:00, 2261.44it/s]
100%|██████████| 4994/4994 [00:02<00:00, 2304.14it/s]
100%|██████████| 4994/4994 [00:02<00:00, 2126.15it/s]
100%|██████████| 4994/4994 [00:02<00:00, 2128.58it/s]


Webtext & gpt2-md
PSO: 0.35731927855711426 0.07198203901642747
Corr: 0.6519632934082432 0.17578011743625077
SAM: 0.26617845610364177 0.07231423620791073
Spearmanr: 0.011838503218062336 0.06708215787518512
PSO length: 4994
Corr length: 4994
SAM length: 4994
Spearmanr length: 4994


In [73]:
# Pair the old WebText reference file with the GPT-2 large spectra.
filepath1, filepath2 = human_list_old[0], gpt2_list[2]  # gpt2-lg

# Every metric function aligns the two files itself before scoring.
area_floor_list, area_roof_list, pso_list = getPSO(filepath1, filepath2)
corr_list = getPearson(filepath1, filepath2)
sam_list = getSAM(filepath1, filepath2)
spearmanr_list = getSpearmanr(filepath1, filepath2)

# (summary label, length label, results key, scores) for each metric.
_metric_table = (
    ('PSO:', 'PSO length:', 'PSO', pso_list),
    ('Corr:', 'Corr length:', 'CORR', corr_list),
    ('SAM:', 'SAM length:', 'SAM', sam_list),
    ('Spearmanr:', 'Spearmanr length:', 'SPEAR', spearmanr_list),
)

print('Webtext & gpt2-lg')
for _summary_label, _length_label, _result_key, _scores in _metric_table:
    # NaN-aware mean and standard deviation across all pairs.
    print(_summary_label, np.nanmean(_scores), np.nanstd(_scores))

for _summary_label, _length_label, _result_key, _scores in _metric_table:
    print(_length_label, len(_scores))

for _summary_label, _length_label, _result_key, _scores in _metric_table:
    results_dict[_result_key].append(_scores)
results_dict['model'].append(['gpt2-lg'] * len(pso_list))

100%|██████████| 4994/4994 [00:02<00:00, 2269.40it/s]
100%|██████████| 4994/4994 [00:01<00:00, 2579.79it/s]
100%|██████████| 4994/4994 [00:02<00:00, 2495.34it/s]
100%|██████████| 4994/4994 [00:02<00:00, 2418.85it/s]


Webtext & gpt2-lg
PSO: 0.3664797514033681 0.06969912186815849
Corr: 0.6389773676548709 0.1692137491025043
SAM: 0.27212628021182306 0.06795641011453786
Spearmanr: 0.014083957631672148 0.06632946210550243
PSO length: 4994
Corr length: 4994
SAM length: 4994
Spearmanr length: 4994


In [74]:
# Pair the old WebText reference file with the GPT-2 xl spectra.
filepath1, filepath2 = human_list_old[0], gpt2_list[3]  # gpt2-xl

# Every metric function aligns the two files itself before scoring.
area_floor_list, area_roof_list, pso_list = getPSO(filepath1, filepath2)
corr_list = getPearson(filepath1, filepath2)
sam_list = getSAM(filepath1, filepath2)
spearmanr_list = getSpearmanr(filepath1, filepath2)

# (summary label, length label, results key, scores) for each metric.
_metric_table = (
    ('PSO:', 'PSO length:', 'PSO', pso_list),
    ('Corr:', 'Corr length:', 'CORR', corr_list),
    ('SAM:', 'SAM length:', 'SAM', sam_list),
    ('Spearmanr:', 'Spearmanr length:', 'SPEAR', spearmanr_list),
)

print('Webtext & gpt2-xl')
for _summary_label, _length_label, _result_key, _scores in _metric_table:
    # NaN-aware mean and standard deviation across all pairs.
    print(_summary_label, np.nanmean(_scores), np.nanstd(_scores))

for _summary_label, _length_label, _result_key, _scores in _metric_table:
    print(_length_label, len(_scores))

for _summary_label, _length_label, _result_key, _scores in _metric_table:
    results_dict[_result_key].append(_scores)
results_dict['model'].append(['gpt2-xl'] * len(pso_list))

100%|██████████| 5000/5000 [00:02<00:00, 2414.64it/s]
100%|██████████| 5000/5000 [00:02<00:00, 2214.81it/s]
100%|██████████| 5000/5000 [00:02<00:00, 2414.98it/s]
100%|██████████| 5000/5000 [00:02<00:00, 2432.30it/s]


Webtext & gpt2-xl
PSO: 0.36732830000000005 0.06823007076876002
Corr: 0.6389014870888231 0.1608899713174084
SAM: 0.272738983726075 0.06495872443292001
Spearmanr: 0.014658888762888764 0.06793662180989077
PSO length: 5000
Corr length: 5000
SAM length: 5000
Spearmanr length: 5000


In [78]:
import pandas as pd
from itertools import chain

# Each entry of results_dict is a list of per-model lists; concatenate them
# so every column becomes one flat list. The key set is copied first so the
# dict can be updated while looping.
for column in list(results_dict):
    results_dict[column] = list(chain.from_iterable(results_dict[column]))

# One row per compared pair; written without the index column.
df = pd.DataFrame(results_dict)
df.to_csv('FACE_GPT2_old.csv', index=False)