In [None]:
# Input files, one per class; each holds the spectra of many samples.
path_class_0 = "../data/gpt-4/xsum_gpt-4.original.mistral.nllzs.fftnorm.txt"
path_class_1 = "../data/gpt-4/xsum_gpt-4.sampled.mistral.nllzs.fftnorm.txt"

# Plot labels are the bare file names (everything after the final "/").
label_0, label_1 = (
    path.rsplit("/", 1)[-1] for path in (path_class_0, path_class_1)
)

# read the data


def read_spectra(path):
    """Parse a comma-separated spectrum file into a list of per-sample spectra.

    The first line of the file is skipped as a header. On every other line the
    last two comma-separated fields are parsed as ``(freq, power)``. A row whose
    frequency is exactly 0.0 starts a new sample, so consecutive rows are
    grouped until the next zero-frequency row.

    Args:
        path: Location of the text file to read.

    Returns:
        A list with one entry per sample; each entry is a list of
        ``(freq, power)`` float tuples in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
        IndexError: If a non-blank line has fewer than two comma-separated fields.
    """
    spectra = []
    current = []
    with open(path, "r") as f:
        next(f, None)  # skip the header line
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            fields = line.split(",")
            freq = float(fields[-2])
            power = float(fields[-1])
            if freq == 0.0 and current:
                spectra.append(current)
                current = []
            current.append((freq, power))
    # Flush the final sample: no zero-frequency row follows it, so the loop
    # above never appends it on its own.
    if current:
        spectra.append(current)
    return spectra


class_0 = read_spectra(path_class_0)
class_1 = read_spectra(path_class_1)

In [None]:
# Number of spectra loaded per class: class 0 on the first line, class 1 on the second.
print(len(class_0), len(class_1), sep="\n")

In [None]:
# test plot
import matplotlib.pyplot as plt
import numpy as np

# Overlay the first spectrum of each class as a quick visual sanity check.
first_sample_0 = np.array(class_0[0])
first_sample_1 = np.array(class_1[0])

# Column 0 is frequency, column 1 is power.
plt.plot(first_sample_0[:, 0], first_sample_0[:, 1], label=label_0)
plt.plot(first_sample_1[:, 0], first_sample_1[:, 1], label=label_1)
plt.legend()
plt.show()

In [None]:
# mean plot

# interpolate the data
def interpolate(data, n):
    """Resample a spectrum onto ``n`` evenly spaced frequencies in [0, 0.5].

    Args:
        data: Sequence of ``(freq, power)`` pairs (or an equivalent 2-column
            array). ``np.interp`` expects the frequencies to be increasing.
        n: Number of points in the resampled spectrum.

    Returns:
        An ``(n, 2)`` array whose first column is the new frequency grid and
        whose second column is the linearly interpolated power.
    """
    spectrum = np.array(data)
    grid = np.linspace(0, 0.5, n)
    resampled = np.interp(grid, spectrum[:, 0], spectrum[:, 1])
    return np.column_stack((grid, resampled))

# Length of the longest spectrum across BOTH classes. Each class is scanned
# independently, so the two classes may hold different numbers of samples
# (indexing them in lockstep would raise IndexError or skip samples).
max_len = max(
    (len(spectrum) for samples in (class_0, class_1) for spectrum in samples),
    default=0,
)

# Resample every spectrum to the same length so they can be stacked and
# averaged point-wise.
class_0 = [interpolate(data, max_len) for data in class_0]
class_1 = [interpolate(data, max_len) for data in class_1]

# Point-wise average over all resampled spectra of each class; the result has
# one (freq, power) row per grid point.
mean_class_0 = np.asarray(class_0).mean(axis=0)
mean_class_1 = np.asarray(class_1).mean(axis=0)

# Draw both mean spectra on the same axes.
for mean_spectrum, curve_label in ((mean_class_0, label_0), (mean_class_1, label_1)):
    plt.plot(mean_spectrum[:, 0], mean_spectrum[:, 1], label=curve_label)
plt.legend()
plt.show()

In [None]:
# Earth Mover's Distance
def emd(data1, data2):
    """Return the Earth Mover's (1-D Wasserstein) distance between two samples.

    Both arguments are passed to ``scipy.stats.wasserstein_distance`` as the
    observed values of two empirical distributions.

    Args:
        data1: 1-D sequence of values of the first distribution.
        data2: 1-D sequence of values of the second distribution.

    Returns:
        The distance as a float.
    """
    # Imported lazily so SciPy is only required once the metric is used.
    import scipy.stats

    return scipy.stats.wasserstein_distance(u_values=data1, v_values=data2)

In [None]:
# raw AUC classification
# Area under each class-mean spectrum (trapezoidal rule over frequency).
mean_auc_0 = np.trapz(mean_class_0[:,1], mean_class_0[:,0])
mean_auc_1 = np.trapz(mean_class_1[:,1], mean_class_1[:,0])
print(mean_auc_0, mean_auc_1)

correct_auc = 0
correct_emd = 0
total = 0

# Nearest-mean classification. A sample counts as correct when it is strictly
# closer to its own class mean than to the other class mean, once by AUC and
# once by EMD between power values.
# Each class is scored independently so the two classes do not need the same
# number of samples (lockstep indexing would raise IndexError or skip samples).
# NOTE: the class means are computed from these same samples, so the reported
# accuracies are in-sample figures.
for samples, own_auc, other_auc, own_mean, other_mean in (
    (class_0, mean_auc_0, mean_auc_1, mean_class_0, mean_class_1),
    (class_1, mean_auc_1, mean_auc_0, mean_class_1, mean_class_0),
):
    for sample in samples:
        sample_auc = np.trapz(sample[:,1], sample[:,0])
        if np.abs(sample_auc - own_auc) < np.abs(sample_auc - other_auc):
            correct_auc += 1
        if emd(sample[:,1], own_mean[:,1]) < emd(sample[:,1], other_mean[:,1]):
            correct_emd += 1
        total += 1

# Fraction of correctly classified samples for each criterion.
print(correct_auc/total)
print(correct_emd/total)

In [None]:
# GAM fitted AUC classification
from pygam import LinearGAM

# Fit one GAM per class to the mean spectrum (frequency -> power).
gam_0 = LinearGAM().fit(mean_class_0[:, 0], mean_class_0[:, 1])
gam_1 = LinearGAM().fit(mean_class_1[:, 0], mean_class_1[:, 1])

# Partial dependence of term 0 evaluated on the same frequency grid, together
# with its 95%-width interval.
gam_curve_0, confi_0 = gam_0.partial_dependence(
    0, X=mean_class_0[:, 0], width=0.95
)
gam_curve_1, confi_1 = gam_1.partial_dependence(
    0, X=mean_class_1[:, 0], width=0.95
)

# test plot: raw mean spectra first, then the fitted GAM curves.
for plot_freqs, plot_values, curve_label in (
    (mean_class_0[:, 0], mean_class_0[:, 1], label_0),
    (mean_class_1[:, 0], mean_class_1[:, 1], label_1),
    (mean_class_0[:, 0], gam_curve_0, label_0 + " GAM"),
    (mean_class_1[:, 0], gam_curve_1, label_1 + " GAM"),
):
    plt.plot(plot_freqs, plot_values, label=curve_label)
plt.legend()
plt.show()

# Area under each fitted mean curve (trapezoidal rule over frequency).
mean_auc_0_gam = np.trapz(gam_curve_0, mean_class_0[:, 0])
mean_auc_1_gam = np.trapz(gam_curve_1, mean_class_1[:, 0])
print(mean_auc_0_gam, mean_auc_1_gam)

correct_auc = 0
correct_emd = 0
total = 0

# Nearest-mean classification on GAM-smoothed spectra. A GAM is fitted to every
# sample; the sample counts as correct when it is strictly closer to its own
# class reference than to the other class reference.
# Each class is scored independently so the two classes do not need the same
# number of samples (lockstep indexing would raise IndexError or skip samples).
for samples, own_auc, other_auc, own_mean, other_mean in (
    (class_0, mean_auc_0_gam, mean_auc_1_gam, mean_class_0, mean_class_1),
    (class_1, mean_auc_1_gam, mean_auc_0_gam, mean_class_1, mean_class_0),
):
    for sample in samples:
        sample_freqs = sample[:,0]
        sample_gam = LinearGAM().fit(sample_freqs, sample[:,1])
        # Only the curve is needed; the interval returned alongside it is discarded.
        sample_curve, _interval = sample_gam.partial_dependence(
            0, X=sample_freqs, width=0.95
        )
        sample_auc = np.trapz(sample_curve, sample_freqs)
        if np.abs(sample_auc - own_auc) < np.abs(sample_auc - other_auc):
            correct_auc += 1
        # NOTE(review): the EMD compares the sample's GAM curve against the RAW
        # class-mean power values, not against the class-mean GAM curve used
        # for the AUC criterion -- confirm this is intended.
        if emd(sample_curve, own_mean[:,1]) < emd(sample_curve, other_mean[:,1]):
            correct_emd += 1
        total += 1

# Fraction of correctly classified samples for each criterion.
print(correct_auc/total)
print(correct_emd/total)

In [None]:
# Visual check: GAM fit of the first few samples of each class, drawn together
# with the class-mean GAM curves. Capped at the number of available samples so
# the cell does not raise IndexError when a class has fewer than three.
n_preview = min(3, len(class_0), len(class_1))

for i in range(n_preview):
    temp_gam_0 = LinearGAM().fit(class_0[i][:, 0], class_0[i][:, 1])
    temp_gam_1 = LinearGAM().fit(class_1[i][:, 0], class_1[i][:, 1])
    # Only the curves are plotted; the intervals returned alongside are discarded.
    temp_gam_curve_0, _interval_0 = temp_gam_0.partial_dependence(
        0, X=class_0[i][:, 0], width=0.95
    )
    temp_gam_curve_1, _interval_1 = temp_gam_1.partial_dependence(
        0, X=class_1[i][:, 0], width=0.95
    )
    # plot GAMs: the i-th sample of each class, then the class-mean curves
    plt.plot(class_0[i][:, 0], temp_gam_curve_0, label="0 GAM")
    plt.plot(class_1[i][:, 0], temp_gam_curve_1, label="1 GAM")
    plt.plot(mean_class_0[:, 0], gam_curve_0, label=label_0 + " GAM")
    plt.plot(mean_class_1[:, 0], gam_curve_1, label=label_1 + " GAM")
    plt.legend()
    plt.show()