In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy import interpolate
from scipy.stats import pearsonr

In [2]:
def getInterval(fre_power_filepath:str):
    """Split a frequency/power CSV into its ascending-frequency intervals.

    The CSV is read with ``pd.read_csv`` and must provide ``freq`` and ``power``
    columns. Rows are taken in file order; a new interval begins at every row
    whose ``freq`` is lower than the ``freq`` of the row before it, so inside an
    interval the frequencies never decrease.

    Args:
        fre_power_filepath: Path of the CSV file to read.

    Returns:
        A ``(freq_list, power_list)`` tuple of parallel lists: one list of
        ``freq`` values and one list of ``power`` values per interval.
    """
    table = pd.read_csv(fre_power_filepath)
    # A drop relative to the previous row marks an interval boundary; the running
    # sum of those boolean markers hands every row the id of its interval.
    starts_new_interval = table['freq'].shift(1) > table['freq']
    table['group'] = starts_new_interval.cumsum()
    intervals = [rows for _, rows in table.groupby('group')]
    freq_list = [rows['freq'].tolist() for rows in intervals]
    power_list = [rows['power'].tolist() for rows in intervals]
    return freq_list, power_list

In [None]:
# Scratch check of the shift()/cumsum() grouping trick on one real spectrum file.
entropy_file1 = '../data/large-762M.test.model=gpt2.nll'
entropy_file2 = '../data/webtext.test.model=gpt2.nll'
fp_file1 = '../plot/large-762M.test.model=gpt2.freq_power.csv'
fp_file2 = '../plot/webtext.test.model=gpt2.freq_power.csv'

# Tag every row with an interval id: the id goes up by one each time `freq`
# falls below the value in the previous row.
spectrum = pd.read_csv(fp_file1).assign(
    group=lambda frame: (frame['freq'].shift(1) > frame['freq']).cumsum()
)
spectrum

In [3]:
def getF(freq_list:list, power_list:list, kind:str = "linear"):
    """Build an interpolating function through the given (freq, power) points.

    Args:
        freq_list: x coordinates (frequencies) of the sample points.
        power_list: y coordinates (powers), same length as ``freq_list``.
        kind: Interpolation kind forwarded to ``scipy.interpolate.interp1d``,
            e.g. ``"linear"`` (the default, matching the previous hard-coded
            behaviour), ``"quadratic"`` or ``"cubic"``.

    Returns:
        A callable ``f(x)`` that evaluates the interpolant. Because
        ``fill_value="extrapolate"`` is used, it also returns values for ``x``
        outside the range covered by ``freq_list`` instead of raising.
    """
    return interpolate.interp1d(freq_list, power_list, kind=kind, fill_value="extrapolate")

In [13]:
def alignPoints(filepath1:str, filepath2:str, n_common:int = 200, verbose=False):
    """Resample the intervals of two spectrum files onto one shared frequency grid.

    Both files are split into intervals with ``getInterval``. Intervals are then
    paired by position, and each one is evaluated, through the interpolant built
    by ``getF``, at ``n_common`` evenly spaced frequencies covering [0, 0.5].

    If the two files hold a different number of intervals, only the leading
    intervals that exist in both files are paired and a warning is emitted.
    (Previously this raised ``IndexError`` when the second file was shorter and
    silently ignored the surplus when it was longer.)

    NOTE: grid points outside the frequency range an interval actually covers
    are extrapolated rather than truncated (e.g. a last sample at 0.48 is used
    to predict 0.5). Whether truncating would be preferable is still open.

    Args:
        filepath1: Path of the first frequency/power CSV.
        filepath2: Path of the second frequency/power CSV.
        n_common: Number of grid points in [0, 0.5].
        verbose: When true, print how many intervals each file contains.

    Returns:
        ``(x, y1listlist, y2listlist)`` where ``x`` is the shared grid and the
        other two are lists with one array of resampled powers per paired
        interval, for file 1 and file 2 respectively.
    """
    freq_list_list_1, power_list_list_1 = getInterval(filepath1)
    freq_list_list_2, power_list_list_2 = getInterval(filepath2)
    if verbose:
        print(f'There are {len(freq_list_list_1)},{len(freq_list_list_2)} intervals in file {filepath1},{filepath2} respectively')

    if len(freq_list_list_1) != len(freq_list_list_2):
        # Imported locally so this function does not rely on a file-level import.
        import warnings
        warnings.warn(
            f'Interval counts differ ({len(freq_list_list_1)} vs {len(freq_list_list_2)}); '
            'only the intervals present in both files are compared.',
            stacklevel=2,
        )

    x = np.linspace(0, 0.5, n_common)
    y1listlist, y2listlist = [], []
    # zip() stops at the shorter file, so every pair is guaranteed to exist.
    for freq_list1, power_list1, freq_list2, power_list2 in zip(
            freq_list_list_1, power_list_list_1, freq_list_list_2, power_list_list_2):
        func1 = getF(freq_list1, power_list1)
        func2 = getF(freq_list2, power_list2)
        # Evaluate both interpolants on the shared grid.
        y1listlist.append(func1(x))
        y2listlist.append(func2(x))

    return x, y1listlist, y2listlist

In [5]:
def getPSO(filepath1:str, filepath2:str):
    """Compute, per interval, the area-overlap ratio of two resampled spectra.

    The two files are aligned with ``alignPoints``. For every interval pair the
    pointwise minimum ("floor") and pointwise maximum ("roof") of the two curves
    are integrated over the shared grid with the trapezoidal rule; the reported
    score is floor area divided by roof area, rounded to 4 decimals.

    Args:
        filepath1: Path of the first frequency/power CSV.
        filepath2: Path of the second frequency/power CSV.

    Returns:
        ``(area_floor_list, area_roof_list, pso_list)``: three parallel lists
        with one entry per interval.
    """
    xlist, y1listlist, y2listlist = alignPoints(filepath1, filepath2)
    area_floor_list, area_roof_list, pso_list = [], [], []

    for y1list, y2list in zip(y1listlist, y2listlist):
        # Row 0 is spectrum 1, row 1 is spectrum 2; reduce across the rows.
        stacked = np.vstack((y1list, y2list))
        area_floor = np.trapz(stacked.min(axis=0), xlist)
        area_roof = np.trapz(stacked.max(axis=0), xlist)

        area_floor_list.append(area_floor)
        area_roof_list.append(area_roof)
        pso_list.append(round(area_floor / area_roof, 4))

    return area_floor_list, area_roof_list, pso_list

In [28]:
def getPearson(filepath1:str, filepath2:str):
    """Compute the Pearson correlation of two spectra for every interval.

    The two files are aligned with ``alignPoints``. Within each interval pair,
    grid points where either curve is not finite (NaN or +/-inf) are dropped
    before ``scipy.stats.pearsonr`` is applied to what remains.

    Args:
        filepath1: Path of the first frequency/power CSV.
        filepath2: Path of the second frequency/power CSV.

    Returns:
        A list with one correlation coefficient per interval.

    Raises:
        ValueError: Re-raised from ``pearsonr``; the lengths and shapes of the
            offending pair are printed first to help debugging.
    """
    _, y1listlist, y2listlist = alignPoints(filepath1, filepath2)
    corr_list = []
    for y1list, y2list in zip(y1listlist, y2listlist):
        y1 = np.asarray(y1list)
        y2 = np.asarray(y2list)
        # Keep only the grid points that are usable in both curves.
        usable = np.isfinite(y1) & np.isfinite(y2)
        y1, y2 = y1[usable], y2[usable]
        try:
            corr = pearsonr(y1, y2)[0]
        except ValueError:
            print(len(y1list), len(y2list))
            print(y1.shape, y2.shape)
            raise
        corr_list.append(corr)
    return corr_list

In [31]:
def getSAM(filepath1:str, filepath2:str):
    """Compare two spectra per interval with the Spectral Angle Mapper (SAM).

    The two files are aligned with ``alignPoints``. Each resampled curve is
    scaled to unit Euclidean length, and the angle between the two unit vectors
    is divided by pi, so a score of 0 means the curves point in the same
    direction and 1 means they point in opposite directions.

    Args:
        filepath1: Path of the first frequency/power CSV.
        filepath2: Path of the second frequency/power CSV.

    Returns:
        A list with one normalised spectral angle per interval. An entry is NaN
        when a curve contains NaN or has zero norm.
    """
    xlist, y1listlist, y2listlist = alignPoints(filepath1, filepath2)
    sam_list = []
    for y1list, y2list in zip(y1listlist, y2listlist):
        # Normalise into new arrays instead of dividing in place, so the arrays
        # handed back by alignPoints are left untouched.
        unit1 = y1list / np.linalg.norm(y1list)
        unit2 = y2list / np.linalg.norm(y2list)
        # Rounding can push the cosine a hair outside [-1, 1] (e.g. for two
        # identical curves), which would make arccos return NaN; clamp it first.
        cosine = np.clip(np.dot(unit1, unit2), -1.0, 1.0)
        sam_list.append(np.arccos(cosine) / np.pi)

    return sam_list

# Tests

In [14]:
# The two frequency/power spectrum files that the cells below compare.
fp_file1 = '../plot/large-762M.test.model=gpt2.freq_power.csv'
fp_file2 = '../plot/webtext.test.model=gpt2.freq_power.csv'

# Spectra of both files resampled onto the shared frequency grid.
x, y1listlist, y2listlist = alignPoints(filepath1=fp_file1, filepath2=fp_file2)

# Per-interval floor area, roof area and their ratio.
area_floor_list, area_roof_list, pso_list = getPSO(filepath1=fp_file1, filepath2=fp_file2)
# One entry per interval in each list, so the three counts should agree.
print(*map(len, (area_floor_list, area_roof_list, pso_list)))

  slope = (y_hi - y_lo) / (x_hi - x_lo)[:, None]
  y_new = slope*(x_new - x_lo)[:, None] + y_lo
  slope = (y_hi - y_lo) / (x_hi - x_lo)[:, None]
  y_new = slope*(x_new - x_lo)[:, None] + y_lo


4999 4999 4999


In [30]:
# Per-interval Pearson correlation between the two spectra.
corr_list = getPearson(filepath1=fp_file1, filepath2=fp_file2)
# Number of intervals that were scored.
print(f'{len(corr_list)}')

  slope = (y_hi - y_lo) / (x_hi - x_lo)[:, None]
  y_new = slope*(x_new - x_lo)[:, None] + y_lo


4999


In [33]:
# Per-interval spectral-angle score between the two spectra.
sam_list = getSAM(filepath1=fp_file1, filepath2=fp_file2)
# Number of intervals that were scored.
print(f'{len(sam_list)}')

  slope = (y_hi - y_lo) / (x_hi - x_lo)[:, None]
  y_new = slope*(x_new - x_lo)[:, None] + y_lo


4999


In [36]:
# Mean and standard deviation of the per-interval area ratios, NaN entries excluded.
pso_arr = np.array(pso_list)
pso_arr = pso_arr[np.logical_not(np.isnan(pso_arr))]
print(pso_arr.mean(), pso_arr.std(), sep='\n')

0.3869626450580232
0.04771419768822357


In [37]:
# Mean and standard deviation of the per-interval correlations, NaN entries excluded.
corr_arr = np.array(corr_list)
corr_arr = corr_arr[np.logical_not(np.isnan(corr_arr))]
print(corr_arr.mean(), corr_arr.std(), sep='\n')

0.040044331175459094
0.10829241054950708


In [38]:
# Mean and standard deviation of the per-interval spectral angles, NaN entries excluded.
sam_arr = np.array(sam_list)
sam_arr = sam_arr[np.logical_not(np.isnan(sam_arr))]
print(sam_arr.mean(), sam_arr.std(), sep='\n')

0.30358397329042336
0.028562383954158607
