In [20]:
import numpy as np
import pandas as pd
from scipy.fft import fft, fftfreq, fftshift
import tqdm

In [40]:
def _read_data(data_file, N=np.inf):
    data = []
    with open(data_file, 'r') as f:
        count = 0
        for line in f:
            line = line.strip()
            if line == '':
                continue
            num = list(map(float, line.split()))
            data.append(num)
            count += 1
            if count >= N:
                break
    return data


def compute_fft(data):
    """Compute the one-sided FFT (real part) of every sample in ``data``.

    Samples may have different lengths; each one is transformed on its own.

    Args:
        data: Sequence of samples. Each sample is an array-like of numbers
            (a plain list, as returned by ``_read_data``, or an ndarray).

    Returns:
        Tuple ``(freqs, powers)`` of two lists with one entry per sample:
        ``freqs[i]`` holds the non-negative normalized frequencies (cycles per
        sample) and ``powers[i]`` the matching values.
        NOTE: despite the name, ``powers`` holds the *real part* of the FFT,
        not a power/magnitude spectrum.

    Raises:
        Exception: Whatever the conversion or the FFT raises for a bad sample
            (e.g. an empty one); the offending sample is printed first.
    """
    freqs, powers = [], []
    for i in tqdm.tqdm(range(len(data))):
        x = data[i]
        try:
            # Coerce to ndarray so list samples work too (lists have no .shape).
            arr = np.asarray(x)
            N = arr.shape[-1]
            freq_x = fftshift(fftfreq(N))
            # Transform and shift along the sample axis only.
            sp_x = fftshift(fft(arr, axis=-1), axes=-1).real # take the real part
        except Exception:
            print(f'Error in sample {i}: {x}')
            raise
        # After fftshift the zero frequency sits at index N // 2; keep it and
        # everything above it (the non-negative half).
        freqs.append(freq_x[N // 2:])
        powers.append(sp_x[..., N // 2:])
    return freqs, powers

def compute_fft2(data: np.ndarray, norm=False):
    """Compute the one-sided FFT (real part) along the last axis of ``data``.

    Works on a single 1-D signal or on a batch of signals stacked along the
    leading axes; the transform, the shift and the half-spectrum slice are all
    applied to the last axis only.

    Args:
        data: Array-like of shape ``(..., N)``.
        norm: If True, z-score each signal (subtract its mean, divide by its
            standard deviation) before transforming. A constant signal has
            zero standard deviation and will therefore produce NaNs.

    Returns:
        Tuple ``(freqs, values)``: ``freqs`` is the 1-D array of non-negative
        normalized frequencies (cycles per sample) and ``values`` has shape
        ``(..., len(freqs))``.
        NOTE: ``values`` is the *real part* of the FFT, not a power/magnitude
        spectrum.
    """
    data = np.asarray(data)
    N = data.shape[-1]
    freq_x = fftshift(fftfreq(N))
    if norm:
        data = (data - np.mean(data, axis=-1, keepdims=True)) / np.std(data, axis=-1, keepdims=True)
    # Restrict fft/fftshift to the signal axis; fftshift's default would also
    # roll the batch axes and scramble the rows of a 2-D input.
    sp_x = fftshift(fft(data, axis=-1), axes=-1).real # take the real part
    # After fftshift the zero frequency sits at index N // 2.
    return freq_x[N // 2:], sp_x[..., N // 2:]


In [5]:
# Load the NLL sequences (one list of floats per non-blank line of the file)
# and peek at the first one.
# NOTE(review): presumably one sequence per generated text -- confirm with the data producer.
nll_data = _read_data(
    '../data/gpt-4/pubmed_gpt-4.sampled.mistral.nll.txt'
)
print(nll_data[0])

[5.2005, 0.0121, 3.291, 10.8107, 7.6796, 4.4353, 1.5909, 7.6406, 3.6414, 0.0006, 1.286, 4.4668, 3.7501, 0.0003, 0.0004, 0.0615, 0.6548, 8.5026, 0.044, 0.896, 1.5821, 4.0789, 0.1211, 0.0174, 0.0134, 0.2654, 0.6334, 0.0553, 0.0005, 0.1795, 0.0857, 0.0187, 0.0, 0.0007, 0.0086, 0.475, 4.734, 5.5062, 0.1136, 0.0002, 5.8736, 0.2558, 2.3565, 1.38, 2.6832, 0.9149, 1.5793, 6.7308, 2.2256, 0.4622, 0.206, 1.5434, 0.2091, 0.0028, 0.0056, 0.0031, 1.6539, 2.208, 3.474, 4.1771, 0.9314, 0.2126, 0.7668, 2.0944, 0.0144, 0.0158, 0.4435, 0.2749, 0.7321, 6.2295, 0.0016, 0.6643, 2.7135, 0.0001, 2.0594]


In [17]:
# Build shuffled copies of the first sequence. The fixed seed makes the
# permutations reproducible from a fresh kernel.
np.random.seed(0)
n_shuf_times = 100
shuf_data = []
for i in range(n_shuf_times):
    nll = nll_data[0].copy()  # work on a copy; the shuffle below is in place
    np.random.shuffle(nll)
    shuf_data.append(nll)

# One column per shuffle, one row per position in the sequence.
df_shuf = pd.DataFrame(dict(enumerate(shuf_data)))
print(df_shuf.shape)
df_shuf.to_csv('shuf_data0.csv', index=False, header=False)
df_shuf

(95, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.0002,2.8874,0.0071,0.0001,2.3192,9.1995,0.1654,14.0527,0.0002,1.7194,...,0.0909,1.7194,0.3976,0.0082,0.0227,16.9517,0.6847,0.0550,0.0385,0.0028
1,0.0085,0.0001,0.0859,0.1769,0.2831,0.0003,2.3192,0.0005,0.0227,0.0385,...,0.3229,2.3192,0.0071,0.2997,0.0040,1.0215,3.3275,2.2223,0.7450,8.1185
2,0.4295,0.0385,5.8012,0.0002,0.0002,0.3976,0.0000,0.0020,0.0385,0.0893,...,3.3275,0.0666,0.0550,1.0215,2.2223,1.7607,0.0020,0.8446,5.1958,0.1769
3,0.1665,0.2806,0.0000,0.0028,0.1654,2.8430,0.0859,0.1665,0.0645,0.0068,...,3.0520,3.0520,0.0553,0.0003,0.0893,0.1654,0.3229,4.2727,0.0550,0.1213
4,2.2223,1.7607,0.0003,0.0553,7.5842,0.0055,0.0909,2.3192,9.1995,6.5427,...,0.0071,3.1373,0.0028,1.8428,0.7039,0.0385,0.0001,0.0000,0.0071,0.0038
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,0.0055,4.7801,1.8428,0.0105,0.1213,0.0184,9.1995,0.8446,0.0155,0.0184,...,0.7039,1.8428,0.0645,0.4295,0.0550,0.1155,7.5842,0.3935,1.7194,0.0550
91,0.0028,1.0256,0.9354,0.0666,1.8428,1.7194,0.0010,0.0105,0.7039,0.0001,...,1.7194,0.8446,0.7587,0.3935,1.2927,0.0645,2.1147,0.0002,0.3976,0.0893
92,0.7039,0.0103,0.0385,1.5434,0.0030,0.0105,0.0666,0.0002,1.7607,0.0003,...,0.0103,5.8012,0.0068,5.8012,0.0085,3.3275,5.2548,0.0028,0.0666,0.0081
93,0.0001,0.0038,0.0011,0.0068,0.4295,0.0000,0.0005,1.6199,0.7450,0.0030,...,5.8012,0.0000,0.0227,0.1654,0.4295,0.0055,0.4295,0.0082,0.0387,7.5842


In [35]:
# Sanity check: run compute_fft2 on the first shuffled sequence (column 0)
# and confirm that the frequency and value arrays have matching lengths.
x = df_shuf.iloc[:, 0].to_numpy()
print(x.shape)

freqs, powers = compute_fft2(x)
print(len(freqs), len(powers))

(95,)
48 48


In [38]:
# Run the FFT on every shuffled sequence (one per column of df_shuf).
fft_list = []
for i in range(len(df_shuf.columns)):
    x = df_shuf.iloc[:, i].to_numpy()
    freqs, powers = compute_fft2(x)
    fft_list.append(powers)
# All columns share one length, hence one frequency axis; store it last.
fft_list.append(freqs)

# Columns 0..n-1 hold the per-shuffle spectra, 'freqs' the shared axis.
fft_dict = dict(enumerate(fft_list[:-1]))
fft_dict['freqs'] = fft_list[-1]

df_fft = pd.DataFrame(fft_dict)
df_fft.to_csv('shuf_data0_fft.csv', index=False, header=True)

In [41]:
# Same as the plain FFT pass, but each sequence is z-scored first (norm=True).
fftnorm_list = []
for i in range(len(df_shuf.columns)):
    x = df_shuf.iloc[:, i].to_numpy()
    freqs, powers = compute_fft2(x, norm=True)
    fftnorm_list.append(powers)
# All columns share one length, hence one frequency axis; store it last.
fftnorm_list.append(freqs)

# Columns 0..n-1 hold the per-shuffle spectra, 'freqs' the shared axis.
fftnorm_dict = dict(enumerate(fftnorm_list[:-1]))
fftnorm_dict['freqs'] = fftnorm_list[-1]

df_fftnorm = pd.DataFrame(fftnorm_dict)
df_fftnorm.to_csv('shuf_data0_fftnorm.csv', index=False, header=True)