In [13]:
import os
import pandas as pd
import numpy as np
import json
import pickle
import argparse

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy.stats import ttest_1samp, ttest_ind, ttest_rel, permutation_test
from scipy.spatial.distance import pdist, squareform
from sklearn.decomposition import PCA
from statsmodels.stats import multitest
import statsmodels.api as sm
from scipy import stats

from tfsemb_class import tsne, save_pickle, add_speech
from tfsplt_encoding import get_cmap_smap, aggregate_data, organize_data
from tfsplt_utils import read_folder, load_pickle, get_con_color
from tfsplt_brainmap import get_sigelecs, Colorbar, make_brainmap
from tfsplt_brainmap_cat import make_brainmap_cat

def fdr(pvals):
    """Return Benjamini-Hochberg FDR-corrected p-values.

    Args:
        pvals: array-like of uncorrected p-values; they do not need to be
            sorted (``is_sorted=False`` is passed through).

    Returns:
        The corrected p-values, i.e. the second element of the tuple
        returned by ``multitest.multipletests(..., method="fdr_bh")``.
    """
    # multipletests returns (reject, pvals_corrected, alphacSidak, alphacBonf);
    # unpack all four so an unexpected result shape still fails loudly.
    _reject, corrected, _sidak, _bonf = multitest.multipletests(
        pvals,
        method="fdr_bh",
        is_sorted=False,
    )
    return corrected

In [14]:
# Models analysed in this run. Exactly one MODELS line should be active;
# the commented lines are the alternative model families.
# MODELS = ["distilgpt2", "gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"]
MODELS = ["gpt-neo-125M", "gpt-neo-1.3B", "gpt-neo-2.7B", "gpt-neox-20b"]
# MODELS = ["opt-125m", "opt-350m", "opt-1.3b", "opt-2.7b", "opt-6.7b", "opt-13b","opt-30b-q","opt-66b-q"]
# MODELS = ["opt-30b-q","opt-66b-q"]
# MODELS = ["Llama-2-7b-hf","Llama-2-13b-hf","Llama-2-70b-hf-q"]

# Per-model metadata, keyed by model name.
# Value layout: [first layer, middle layer, last layer, parameter count].
# Insertion order is significant: later cells iterate over this dict.
# NOTE(review): a few entries start at layer 1 instead of 0 — presumably
# layer 0 was not available for those runs; confirm.
MODEL_LAYERS = {
    # GPT-Neo / NeoX
    "gpt-neo-125M": [0, 6, 12, 125_000_000],
    "gpt-neo-1.3B": [0, 12, 24, 1_300_000_000],
    "gpt-neo-2.7B": [0, 16, 32, 2_700_000_000],
    "gpt-neox-20b": [0, 22, 44, 20_000_000_000],
    # GPT-2
    "distilgpt2": [0, 3, 6, 82_000_000],
    "gpt2": [0, 6, 12, 124_000_000],
    "gpt2-medium": [0, 12, 24, 355_000_000],
    "gpt2-large": [0, 16, 32, 774_000_000],
    "gpt2-xl": [0, 24, 48, 1_500_000_000],
    # Llama-2
    "Llama-2-7b-hf": [1, 16, 32, 7_000_000_000],
    "Llama-2-13b-hf": [0, 20, 40, 13_000_000_000],
    # "Llama-2-7b-hf-q":[1,16,32,7000000000],
    # "Llama-2-13b-hf-q":[0,20,40,13000000000],
    "Llama-2-70b-hf-q": [0, 40, 80, 70_000_000_000],
    # OPT
    "opt-125m": [0, 6, 12, 125_000_000],
    "opt-350m": [0, 12, 24, 350_000_000],
    "opt-1.3b": [0, 12, 24, 1_300_000_000],
    "opt-2.7b": [0, 16, 32, 2_700_000_000],
    "opt-6.7b": [1, 16, 32, 6_700_000_000],
    "opt-13b": [0, 20, 40, 13_000_000_000],
    # "opt-125m-q":[0,6,12,125000000],
    # "opt-350m-q":[0,12,24,350000000],
    # "opt-1.3b-q":[0,12,24,1300000000],
    # "opt-2.7b-q":[0,16,32,2700000000],
    # "opt-6.7b-q":[1,16,32,6700000000],
    # "opt-13b-q":[0,20,40,13000000000],
    "opt-30b-q": [1, 24, 48, 30_000_000_000],
    "opt-66b-q": [1, 32, 64, 66_000_000_000],
}

## Encoding Layers

Read data

In [None]:
# Subject id and condition key passed to read_folder for every file glob.
load_sid = 777
key = "comp"

# Glob template; the three %s slots are filled with (model, layer, key).
# Exactly one `paths` line should be active.
# paths = "../data/encoding/podcast/20231001-gpt-neo/kw-podcast-full-777-%s-lag2k-25-all-2048-%s/kw-200ms-all-777/*_%s.csv"
# paths = "../data/encoding/podcast/20231002-gpt-neo-n/kw-podcast-full-777-%s-lag2k-25-all-shift-emb-2048-%s/kw-200ms-all-777/*_%s.csv"
paths = "../data/encoding/podcast/20231209-gpt-neo-n-ridge-gpu/kw-podcast-full-777-%s-lag2k-25-all-shift-emb-2048-%s-ridge/kw-200ms-all-777/*_%s.csv"
# paths = "../data/encoding/podcast/20231214-llama-2-n/kw-podcast-full-777-%s-lag2k-25-all-shift-emb-4096-%s/kw-200ms-all-777/*_%s.csv"
# paths = "../data/encoding/podcast/20231215-llama-2-n-ridge/kw-podcast-full-777-%s-lag2k-25-all-shift-emb-4096-%s-ridge/kw-200ms-all-777/*_%s.csv"
# paths = "../data/encoding/podcast/20231221-gpt2-n-ridge/kw-podcast-full-661-%s-lag2k-25-all-shift-emb-2048-%s-ridge/kw-200ms-all-777/*_%s.csv"
# paths = "../data/encoding/podcast/20231222-opt-n-ridge/kw-podcast-full-777-%s-lag2k-25-all-shift-emb-2048-%s-ridge/kw-200ms-all-777/*_%s.csv"

# Accumulate one entry per (model, layer); read_folder returns the updated
# accumulator, which is concatenated into a single frame at the end.
data = []
for model in MODELS:
    layer_info = MODEL_LAYERS[model]
    # Walk every layer from the model's first to its last (inclusive).
    for layer in range(layer_info[0], layer_info[2] + 1):
        layer_tag = f"{layer:02}"
        fname = paths % (model, layer_tag, key)
        data = read_folder(
            data,
            fname,
            {},
            (load_sid, key),
            load_sid,
            key,
            model,      # stored as label1 (assumed from later use — confirm)
            layer_tag,  # stored as label2 (assumed from later use — confirm)
            True,
        )
df = pd.concat(data)

Process df

In [None]:
# Derive per-row summary columns, then attach the ROI label of each electrode.
# NOTE: this cell mutates/rebinds `df`; re-running it merges `area_df` a second
# time and produces suffixed duplicate `area` columns.

# Drop bookkeeping columns if present (errors="ignore" makes this a no-op otherwise).
df.drop(columns=["sid","key"], inplace=True, errors="ignore")
# Row-wise maximum over the 161 columns labelled with the integers 0..160
# (presumably one column per lag — confirm against read_folder).
df["max"] = df.loc[:,(np.arange(0,161))].max(axis=1)
# label2 is cast to int so it can be used arithmetically below.
df["label2"] = df.label2.astype(int)
# layer_ratio = label2 as a percentage of the largest label2 within the same label1 group.
df["layer_ratio"] = df.loc[:,("label1","label2")].groupby("label1").transform(lambda x: (x / max(x) * 100))

# Left-join the electrode -> area table; electrodes missing from the CSV keep NaN.
area_df = pd.read_csv("../data/plotting/paper-mia/area_electrodes2.csv")
df = df.merge(area_df, on="electrode",how="left")

In [None]:
# Attach the parameter count (4th entry of MODEL_LAYERS) to every row via a
# label1 -> size lookup; rows whose label1 is not in MODELS keep size 0.
size_by_model = {name: MODEL_LAYERS[name][3] for name in MODELS}
df["model_size"] = df.label1.map(size_by_model).fillna(0).astype("int64")
# Family tag used as the hue in the cross-family plots; set by hand per run.
df["model_family"] = "gpt-neo"

In [None]:
# Save the processed table under a name derived from the rows' model_family
# instead of a hand-edited literal: the previous hard-coded "opt-ridge-q.csv"
# did not match the "gpt-neo" family tagged on the data, so the gpt-neo
# results would have overwritten the OPT results file.
# Naming follows the files read by the "All models" section:
#   "<family>"   -> "<family>-ridge.csv"      (e.g. gpt-neo-ridge.csv)
#   "<family>-q" -> "<family>-ridge-q.csv"    (e.g. opt-ridge-q.csv)
# NOTE(review): the "-ridge" part is still fixed; rename by hand if `paths`
# points at a non-ridge encoding run.
_families = df["model_family"].unique()
if len(_families) != 1:
    raise ValueError(
        f"expected exactly one model_family in df, found {list(_families)}"
    )
_family = str(_families[0])
if _family.endswith("-q"):
    _out_name = f"{_family[:-2]}-ridge-q"
else:
    _out_name = f"{_family}-ridge"
df.to_csv(f"../data/plotting/paper-mia/{_out_name}.csv", index=False)

In [None]:
##### GET AREA CSVS #####

# area_file = "../data/plotting/paper-mia/area_electrodes.json"
# with open(area_file, "r") as j:
#     area_dict = json.loads(j.read())

# area_df = []
# for area in area_dict:
#     for elec in area_dict[area]:
#         area_df.append([elec, area])

# area_df = pd.DataFrame(area_df)
# area_df.columns = ["electrode","area"]
# area_df.to_csv("../data/plotting/paper-mia/area_electrodes.csv",index=False)

# area_ifg = area_df[area_df.area == "IFG"]
# subarea = pd.read_csv("../data/plotting/sig-elecs/podcast-old/elec_masterlist.csv")
# subarea["electrode"] = subarea.subject.astype(int).astype(str) + "_" + subarea.name
# subarea = subarea.loc[:,("electrode","NYU_class2")]
# subarea = area_ifg.merge(subarea,on="electrode",how="left")
# area_df = area_df.merge(subarea.loc[:,("electrode","NYU_class2")],on="electrode",how="left")
# area_df.loc[area_df.NYU_class2 == "parstriangularis","area"] = "BA45"
# area_df.loc[area_df.NYU_class2 == "parsopercularis","area"] = "BA44"
# area_df.loc[:,("electrode","area")].to_csv("../data/plotting/paper-mia/area_electrodes2.csv",index=False)

Layer max correlation, method 1: take each electrode's maximum across lags, then average those maxima across electrodes

In [None]:
# Keep only the columns needed for the per-layer summary.
df_max = df.loc[:, ["label1", "label2", "layer_ratio", "max"]]
# df_max = df.loc[df.area == "TP",("label1","label2","layer_ratio","max",)]

# Mean and standard error of the per-electrode maxima for each (model, layer).
per_layer = df_max.groupby(["label1", "label2"])
layer_means = per_layer.mean().reset_index()
layer_sems = per_layer.sem().reset_index()


Layer max correlation, method 2: average each lag across electrodes, then take the maximum across lags

In [None]:
# Restrict to one ROI (use the commented line for all electrodes).
# df2 = df
df2 = df.loc[df.area == "TP"]

# Average every remaining column of the first 166 columns across electrodes,
# per (model, layer).
layer_means = (
    df2.iloc[:, 0:166]
    .drop(columns=["electrode"], errors="ignore")
    .groupby(["label1", "label2"])
    .mean()
)

# Locate the 161 lag columns whether they are labelled with ints (frame built
# in this notebook) or with strings "0".."160" (frame re-read from CSV).
# The previous hard-coded string labels raised KeyError for the int case.
lag_cols = [
    col for col in layer_means.columns
    if str(col).isdigit() and int(col) <= 160
]
if not lag_cols:
    raise KeyError("no lag columns (0..160) found in layer_means")

# Peak of the electrode-averaged curve, and the SEM taken across the lag
# columns of that averaged curve.
# NOTE(review): this "sem" is computed across lags, not across electrodes.
layer_means["max"] = layer_means.loc[:, lag_cols].max(axis=1)
layer_means["sem"] = layer_means.loc[:, lag_cols].sem(axis=1)
layer_means.reset_index(inplace=True)

# Same shape as layer_means, but "max" carries the SEM so the plotting cell
# can treat both frames uniformly.
layer_sems = pd.DataFrame({
    "label1": layer_means.label1,
    "label2": layer_means.label2,
    "layer_ratio": layer_means.layer_ratio,
    "max": layer_means["sem"],
})
layer_means = layer_means.loc[:, ["label1", "label2", "layer_ratio", "max"]]

Get best layer

In [None]:
# For each model print: name, its highest layer-level "max", and the layer
# (label2) at which that value occurs.
for model_name, model_rows in layer_means.groupby("label1"):
    peak_value = model_rows["max"].max()
    peak_row = model_rows["max"].idxmax()
    print(model_name, peak_value, layer_means.loc[peak_row, "label2"])

Plot encoding max per layer

In [None]:
# Plot the per-layer peak correlation of every model twice:
#   axes[0]: x = layer position as a percentage of model depth (layer_ratio)
#   axes[1]: x = absolute layer index (label2)
# Shaded bands are mean +/- the "max" column of layer_sems.
lags = np.arange(-2000, 2001, 25)
fig, axes = plt.subplots(2, 1, figsize=(20, 20))
alpha = 0.2
lw = 3
markersize = 15
fontsize = 25
ticksize = 20
hls_palette = sns.color_palette("hls", 8)
colors = [hls_palette[3], hls_palette[6], hls_palette[4], hls_palette[7]]
sns.set_style('whitegrid')

# Iterate over the models actually loaded (MODELS). The loop previously zipped
# over the keys of MODEL_LAYERS, which only matched the data because the first
# four dict entries happen to be the gpt-neo models; any other MODELS selection
# produced empty plots. zip() stops at four models because there are four colors.
for model, color in zip(MODELS, colors):
    subdf1 = layer_means[layer_means.label1 == model]
    subdf2 = layer_sems[layer_sems.label1 == model]
    band_low = subdf1["max"] - subdf2["max"]
    band_high = subdf1["max"] + subdf2["max"]

    axes[0].fill_between(
        subdf1["layer_ratio"], band_low, band_high, alpha=alpha, color=color
    )
    axes[0].plot(
        subdf1["layer_ratio"],
        subdf1["max"],
        label=model,
        lw=lw,
        marker=".",
        markersize=markersize,
        color=color,
    )
    axes[1].fill_between(
        subdf1["label2"], band_low, band_high, alpha=alpha, color=color
    )
    axes[1].plot(
        subdf1["label2"],
        subdf1["max"],
        label=model,
        lw=lw,
        color=color,
    )

axes[0].set_xlabel("Layers Percentage %", fontsize=fontsize)
axes[1].set_xlabel("Layers", fontsize=fontsize)
for ax in axes:
    ax.set_ylabel("Max Correlation (r)", fontsize=fontsize)
    # Alternative y-ranges used for other runs:
    # ax.set_ylim(0.11, 0.225)
    # ax.set_ylim(0.15, 0.285)
    ax.set_ylim(0.10, 0.39)
    ax.tick_params(axis="both", which="major", labelsize=ticksize)
plt.savefig("../TP.svg")

## All models

Load results

In [None]:
# Result tables (one CSV per model family) to combine; commented entries are
# alternative versions of the same family.
FILES = [
    "gpt-neo-ridge",
    "gpt2-ridge",
    # "llama2-ridge",
    "llama2-ridge-q",
    # "opt-ridge",
    "opt-ridge-q"
]

# NOTE: this loop rebinds the notebook-global `df` to the last file read.
df_plot_all = pd.DataFrame()
for file in FILES:
    # Read the file and keep, for each (model, electrode), the row with the
    # highest "max" (sorted descending, then first row of each group).
    df = pd.read_csv(f"../data/plotting/paper-mia/{file}.csv")
    df_plot = df.sort_values(["max"],ascending=False).groupby(["label1","electrode"]).first()
    df_plot.reset_index(inplace=True)
    # Percent change of "max" relative to the first row of each electrode after
    # sorting by model_size ascending, i.e. relative to the smallest model in the file.
    # df_plot = df_plot[df_plot.label1 != "distilgpt2"]
    # new_df: (electrode, label1) pairs in the order produced by the groupby-apply.
    new_df = df_plot.sort_values(by="model_size",ascending=True).loc[:,("label1")].groupby(df_plot["electrode"]).apply(lambda x : x).reset_index()
    # .tolist() assigns positionally, so this relies on both groupby-apply calls
    # yielding rows in the same order — NOTE(review): verify after pandas upgrades.
    new_df["percent"] = df_plot.sort_values(by="model_size",ascending=True).loc[:,"max"].groupby(df_plot["electrode"]).apply(lambda x : x / x.iloc[0] * 100 - 100).tolist()
    df_plot = df_plot.merge(new_df.loc[:,("label1","electrode","percent")],on=["label1","electrode"])
    # Append this family to the combined frame.
    df_plot_all = pd.concat((df_plot_all,df_plot))
# layer_num = index of the model's last layer (3rd entry of MODEL_LAYERS); 0 for unknown models.
df_plot_all["layer_num"] = 0
for model_idx, model in enumerate(MODEL_LAYERS):
    df_plot_all.loc[df_plot_all.label1 == model, "layer_num"] = MODEL_LAYERS[model][2]

Plot scatter with standard error

In [None]:
# Mean best-layer position (layer_ratio) per model size, one colour per
# family, with standard-error bars and no connecting lines.
sns.set_style('whitegrid')
scatter_opts = dict(
    data=df_plot_all,
    x="model_size",
    y="layer_ratio",
    hue="model_family",
    marker="o",
    markersize=10,
    # alpha = 0.5,
    linestyle='',
    err_style='bars',
    errorbar="se",
    # hue_order=["gpt2","llama2","opt","gpt-neo","llama2-q","opt-q"]
    hue_order=["gpt2", "llama2-q", "opt-q", "gpt-neo"],
)
snsplt = sns.lineplot(**scatter_opts)
snsplt.set(xscale='log')
# NOTE: the regression-line cell writes to the same "../layer.svg" path.
plt.savefig("../layer.svg")

Plot scatter with reg line

In [None]:
# Same scatter as above, plus one log-x regression line per model family.
xcol = "model_size"
ycol = "layer_ratio"
# Family -> line colour; key order doubles as the hue order of the scatter.
family_colors = {
    "gpt2": "#1f77b4",
    "llama2-q": "#ff7f0e",
    "opt-q": "#2ca02c",
    "gpt-neo": "#d62728",
}

sns.set_style('whitegrid')
fig, ax = plt.subplots(1, 1)
plt.xscale("log")
sns.lineplot(
    data=df_plot_all,
    x=xcol,
    y=ycol,
    hue="model_family",
    marker="o",
    markersize=10,
    # alpha = 0.5,
    linestyle='',
    err_style='bars',
    errorbar="se",
    # hue_order=["gpt2","llama2","opt","gpt-neo","llama2-q","opt-q"]
    hue_order=list(family_colors),
    ax=ax,
)
# One fit per family, drawn in the order gpt2, llama2-q, opt-q, gpt-neo.
for family, line_color in family_colors.items():
    sns.regplot(
        data=df_plot_all.loc[df_plot_all.model_family == family],
        x=xcol,
        y=ycol,
        scatter=False,
        logx=True,
        color=line_color,
        ax=ax,
    )

plt.savefig("../layer.svg")

In [None]:
# Linear regression of best-layer position on log10(model size), printed
# once per model family in the order gpt2, llama2-q, opt-q, gpt-neo.
for family in ("gpt2", "llama2-q", "opt-q", "gpt-neo"):
    family_rows = df_plot_all.loc[df_plot_all.model_family == family]
    fit = stats.linregress(np.log10(family_rows.model_size), family_rows.layer_ratio)
    print(fit)


Plot percent increase for ROIs

In [None]:
# Percent change in max correlation (relative to the smallest model of each
# family) against model size, one colour per ROI, with standard-error bars.
# Requires area_df to still have its "area" column.
df_plot_area = df_plot_all.merge(area_df, on="electrode", how="left")
sns.set_style('whitegrid')
roi_opts = dict(
    data=df_plot_area,
    x="model_size",
    y="percent",
    hue="area",
    marker="o",
    markersize=10,
    # alpha = 0.5,
    linestyle='',
    err_style='bars',
    errorbar="se",
    hue_order=["mSTG", "TP", "BA44", "aSTG", "BA45"],
)
snsplt = sns.lineplot(**roi_opts)
# Alternative: a single pooled line across all ROIs.
# snsplt = sns.lineplot(
#     data=df_plot_area,
#     x="model_size",
#     y="percent",
#     marker="o",
#     markersize=10,
#     # hue="model_family",
#     errorbar="se",
#     # palette=["#000000"]
# )
snsplt.set(xscale='log')
plt.savefig("../percent_rois.svg")

## Max Corr

In [None]:
class Args(argparse.Namespace):
    """Settings object handed to make_brainmap for the max-correlation map."""

    main_dir = "../data/plotting/brainplot/"  # coordinate and brain surface files
    project = "podcast"
    sid = [777]  # subjects
    keys = ["comp"]  # comprehension and/or production
    sig_elec_file = ["../data/plotting/sig-elecs/podcast_160.csv"]
    brain_type = "ave"  # average brain
    hemisphere = "both"  # value is "both" (an older comment said left-only)
    outfile = "../podcast_%s.svg"  # %s is filled with the model name
    final = True
    final2 = False
    shiny = False


args = Args()

args.color_split = [
    Colorbar(title="best layer", colorscale="viridis", bar_max=0.45, bar_min=0)
]

# Older variant kept for reference:
# df_plot = max_cor_1.sort_values(["max_corr"], ascending=False).groupby("elec_name").first()
# df_plot.reset_index(inplace=True)
# df_plot["effect"] = df_plot.max_corr
# df_plot["electrode"] =  df_plot.elec_name.str.replace("_comp","")
# df_plot["subject"] = df_plot.electrode.str.split("_", n=1, expand=True)[0]
# df_plot = df_plot.loc[:, ("subject","electrode", "effect")]
# fig = make_brainmap(args, df_plot, args.outfile) # plot png

# Best row (highest "max") per (model, electrode); its "max" is the plotted effect.
df_plot = (
    df.sort_values(["max"], ascending=False)
    .groupby(["label1", "electrode"])
    .first()
    .reset_index()
)
df_plot["effect"] = df_plot["max"]
# Subject id is the part of the electrode name before the first underscore.
df_plot["subject"] = df_plot.electrode.str.split("_", n=1, expand=True)[0]
for model, subdf in df_plot.groupby("label1"):
    # Only the smallest gpt-neo model is mapped here.
    if model == "gpt-neo-125M":
        subdf_plot = subdf.loc[:, ["subject", "electrode", "effect"]]
        print(model, subdf_plot.effect.mean())
        fig = make_brainmap(args, subdf_plot, args.outfile % model)  # plot png

## Max Corr Diff

In [None]:
class Args(argparse.Namespace):
    """Settings object for the percent-difference brain map."""

    main_dir = "../data/plotting/brainplot/"  # coordinate and brain surface files
    project = "podcast"
    sid = [777]  # subjects
    keys = ["comp"]  # comprehension and/or production
    sig_elec_file = ["../data/plotting/sig-elecs/podcast_160.csv"]
    brain_type = "ave"  # average brain
    hemisphere = "both"  # value is "both" (an older comment said left-only)
    outfile = "../podcast-diff-%s.svg"  # %s is filled with the model name
    final = True
    final2 = False
    shiny = False


args = Args()

# Two-sided colour split: blue scale for decreases, red scale for increases.
pos_bar = Colorbar(
    title=f"Δ % corr pos",
    colorscale=[[0, "rgb(255,248,240)"], [1, "rgb(255,0,0)"]],
    bar_min=0,
    bar_max=30,
)
neg_bar = Colorbar(
    title=f"Δ % corr neg",
    colorscale=[[0, "rgb(0,0,255)"], [1, "rgb(240,248,255)"]],
    bar_min=-30,
    bar_max=0,
)
args.color_split = [neg_bar, 0, pos_bar]

# Best row per (model, electrode), reduced to the columns needed here.
df_plot = (
    df.sort_values(["max"], ascending=False)
    .groupby(["label1", "electrode"])
    .first()
    .reset_index()
)
df_plot = df_plot.loc[:, ["label1", "label2", "electrode", "max"]]

# Join every row with the gpt-neo-125M value of the same electrode:
# max_x = this model, max_y = baseline.
df_base = df_plot.loc[df_plot["label1"] == "gpt-neo-125M", ["electrode", "max"]].copy()
df_plot = df_plot.merge(df_base, on="electrode", how="outer")
# df_plot["effect"] = (df_plot["max_x"] - df_plot["max_y"]) / df_plot[["max_x", "max_y"]].max(axis=1)
df_plot["effect"] = df_plot["max_x"] / df_plot["max_y"] * 100 - 100
df_plot["subject"] = df_plot.electrode.str.split("_", n=1, expand=True)[0]
for model, subdf in df_plot.groupby("label1"):
    # Skip the baseline itself (its difference is zero by construction).
    if model != "gpt-neo-125M":
        subdf_plot = subdf.loc[:, ["subject", "electrode", "effect"]]
        print(model, subdf_plot.effect.mean())
        # fig = make_brainmap(args, subdf_plot, args.outfile % model) # plot png


## Max Corr Diff with Sig Test

In [None]:
# Settings object handed to make_brainmap for the significance-masked
# percent-difference map (final2 is enabled here instead of final).
class Args(argparse.Namespace):
  main_dir = "../data/plotting/brainplot/" # loads coordinate and brain surface files
  project = "podcast"
  sid = [777] # subjects
  keys = ["comp"] # comprehension and/or production
  sig_elec_file = ["../data/plotting/sig-elecs/podcast_160.csv"]
  brain_type = "ave" # average brain
  hemisphere = "both" # value is "both" (an older comment said left-only)
  outfile = "../podcast-diff-%s.svg"
  final = False
  final2 = True
  shiny = False

args = Args()

# Colorsplit: blue scale for negative changes, red scale for positive changes.
pos_bar = Colorbar(title=f"Δ % corr pos",colorscale=[[0, "rgb(255,248,240)"], [1, "rgb(255,0,0)"]],bar_min=0,bar_max=30)
neg_bar = Colorbar(title=f"Δ % corr neg",colorscale=[[0, "rgb(0,0,255)"], [1, "rgb(240,248,255)"]],bar_min=-30,bar_max=0)
args.color_split = [neg_bar,0,pos_bar]

# Best row (highest "max") per (model, electrode). groupby sorts its keys, so
# after reset_index the rows are ordered by label1, then electrode.
df_plot = df.sort_values(["max"],ascending=False).groupby(["label1","electrode"]).first()
df_plot.reset_index(inplace=True)
# 100 is a placeholder p-value for rows that are never tested (the reference block).
df_plot["p_val"] = 100
# Paired t-tests across the 161 columns 0..160, electrode by electrode.
# The row arithmetic assumes exactly 4 models x 160 electrodes, with identical
# electrode order in each block of 160 rows, and uses rows 160..319 as the
# reference model. With the gpt-neo MODELS list that block is expected to be
# "gpt-neo-125M" (labels sort lexicographically) — NOTE(review): verify before
# reusing with other model families.
# The names medium/large/xl are leftovers; they denote row blocks 0, 2 and 3.
for i in np.arange(0,160):
    medium = ttest_rel(df_plot.loc[i + 160,(np.arange(0,161))],df_plot.loc[i,(np.arange(0,161))],alternative="two-sided")[1]
    large = ttest_rel(df_plot.loc[i + 160,(np.arange(0,161))],df_plot.loc[i + 320,(np.arange(0,161))],alternative="two-sided")[1]
    xl = ttest_rel(df_plot.loc[i + 160,(np.arange(0,161))],df_plot.loc[i + 480,(np.arange(0,161))],alternative="two-sided")[1]
    df_plot.loc[i, "p_val"] = medium
    df_plot.loc[i + 320, "p_val"] = large
    df_plot.loc[i + 480, "p_val"] = xl
df_plot = df_plot.loc[:,("label1","label2","electrode","max","p_val")]

# Join each row with the gpt-neo-125M value of the same electrode:
# max_x = this model, max_y = baseline.
df_base = df_plot.loc[df_plot["label1"] == "gpt-neo-125M",("electrode","max")].copy()
df_plot = df_plot.merge(df_base,left_on="electrode",right_on="electrode",how="outer")
# df_plot["effect"] = (df_plot["max_x"] - df_plot["max_y"]) / df_plot[["max_x", "max_y"]].max(axis=1)
# Percent change of max correlation relative to the baseline model.
df_plot["effect"] = df_plot["max_x"] / df_plot["max_y"] * 100 - 100
df_plot["subject"] = df_plot.electrode.str.split("_", n=1, expand=True)[0]
for model, subdf in df_plot.groupby("label1", axis=0):
  # The baseline model has no test against itself.
  if model == "gpt-neo-125M":
    continue
  # FDR-correct within each model, then keep electrodes with corrected p <= 0.001.
  subdf.p_val = fdr(subdf.p_val)
  subdf_plot = subdf.loc[subdf.p_val <= 0.001, ("subject", "electrode", "effect")]
  print(model, len(subdf_plot), subdf_plot.effect.mean())
  fig = make_brainmap(args, subdf_plot, args.outfile % model) # plot png

## Max Corr Layer

In [None]:
class Args(argparse.Namespace):
    """Settings object for the best-layer brain map."""

    main_dir = "../data/plotting/brainplot/"  # coordinate and brain surface files
    project = "podcast"
    sid = [777]  # subjects
    keys = ["comp"]  # comprehension and/or production
    sig_elec_file = ["../data/plotting/sig-elecs/podcast_160.csv"]
    brain_type = "ave"  # average brain
    hemisphere = "both"  # value is "both" (an older comment said left-only)
    outfile = "../podcastl_%s.svg"  # %s is filled with the model name
    final = True
    final2 = False
    shiny = False


args = Args()

args.color_split = [
    Colorbar(
        title="Layer with best correlation (relative %)",
        colorscale="inferno",
        bar_max=100,
        bar_min=0,
    )
]
# args.color_split = [Colorbar(title="Layer with best correlation (relative %)",colorscale="inferno",bar_max=22, bar_min=0)]

# Best row per (model, electrode); the plotted effect is that row's layer_ratio.
df_plot = (
    df.sort_values(["max"], ascending=False)
    .groupby(["label1", "electrode"])
    .first()
    .reset_index()
)
df_plot["effect"] = df_plot.layer_ratio
# df_plot["effect"] = df_plot.label2
df_plot["subject"] = df_plot.electrode.str.split("_", n=1, expand=True)[0]
# One brain map per model.
for model, subdf in df_plot.groupby("label1"):
    # if model != "gpt-neox-20b":
    #   continue
    subdf_plot = subdf.loc[:, ["subject", "electrode", "effect"]]
    # subdf_plot = subdf[subdf.effect <= 22]
    print(model, subdf_plot["effect"].mean())
    fig = make_brainmap(args, subdf_plot, args.outfile % model)  # plot png

## Max Corr Lag

In [None]:
class Args(argparse.Namespace):
    """Settings object for the best-lag brain map."""

    main_dir = "../data/plotting/brainplot/"  # coordinate and brain surface files
    project = "podcast"
    sid = [777]  # subjects
    keys = ["comp"]  # comprehension and/or production
    sig_elec_file = ["../data/plotting/sig-elecs/podcast_160.csv"]
    brain_type = "ave"  # average brain
    hemisphere = "both"  # value is "both" (an older comment said left-only)
    outfile = "../podcastlag_%s.svg"  # %s is filled with the model name
    final = True
    final2 = False
    shiny = False


args = Args()
# Lag value for each of the 161 lag columns: -2000..2000 in steps of 25.
lags = np.arange(-2000, 2001, 25)
# Lags outside +/- LAG_LIMIT are excluded from the map and the printed mean.
LAG_LIMIT = 600

# The plotted values are lags, not percentages, so the colour bar is titled
# in ms (it was previously copy-pasted as "relative %").
args.color_split = [
    Colorbar(
        title="Lag with best correlation (ms)",
        colorscale="delta_r",
        bar_max=LAG_LIMIT,
        bar_min=-LAG_LIMIT,
    )
]

# Best row per (model, electrode); effect = lag at which that row peaks.
df_plot = df.sort_values(["max"], ascending=False).groupby(["label1", "electrode"]).first()
df_plot.reset_index(inplace=True)
# idxmax returns the integer column label (0..160), which indexes into `lags`.
df_plot["maxlag"] = lags[df_plot.loc[:, np.arange(0, 161)].idxmax(axis=1)]
df_plot["effect"] = df_plot.maxlag
df_plot["subject"] = df_plot.electrode.str.split("_", n=1, expand=True)[0]
for model, subdf in df_plot.groupby("label1"):
    # if model != "gpt-neox-20b":
    #   continue
    # subdf_plot = subdf.loc[subdf["max"] >= 0.2, ("subject","electrode", "effect")]
    # Apply both bounds and the column selection in one step. Previously each
    # statement re-filtered `subdf`, so only the `<= 600` bound took effect
    # and the column selection was discarded.
    within_limit = subdf.effect.between(-LAG_LIMIT, LAG_LIMIT)
    subdf_plot = subdf.loc[within_limit, ["subject", "electrode", "effect"]]
    print(model, subdf_plot["effect"].mean())
    fig = make_brainmap(args, subdf_plot, args.outfile % model)  # plot png

## Area brainmap

In [None]:
class Args(argparse.Namespace):
    """Settings object for the categorical ROI brain map."""

    main_dir = "../data/plotting/brainplot/"  # coordinate and brain surface files
    project = "podcast"
    sid = [777]  # subjects
    keys = ["comp"]  # comprehension and/or production
    sig_elec_file = ["../data/plotting/sig-elecs/podcast_160.csv"]
    brain_type = "ave"  # average brain
    hemisphere = "both"  # value is "both" (an older comment said left-only)
    outfile = "../podcast_roi.svg"
    final = True
    final2 = False
    shiny = False


args = Args()
# One colour per ROI category.
# color_list = ["rgb(55,126,184)","rgb(255,127,0)","rgb(77,175,74)","rgb(152,78,163)"] # old ROI colors
# color_list = ["#2ca02c", "#ff7f0e", "#d62728", "#1f77b4", "#9467bd"] # ROI colors
color_list = ["#2ca02c", "#9467bd", "#ff7f0e", "#d62728", "#1f77b4"]  # ROI colors
args.colors = color_list

# Reload the electrode -> area table and rename its two columns so the area
# label becomes the "effect" expected by make_brainmap_cat.
# NOTE: this rebinds the notebook-global area_df, which then no longer has an
# "area" column.
area_df = pd.read_csv("../data/plotting/paper-mia/area_electrodes2.csv")
area_df = area_df.set_axis(["electrode", "effect"], axis=1)
area_df["subject"] = area_df.electrode.str.split("_", n=1, expand=True)[0]
fig = make_brainmap_cat(args, area_df, args.outfile)  # plot png


## Subject Brainmap

In [None]:
class Args(argparse.Namespace):
    """Settings object for the per-subject categorical brain map."""

    main_dir = "../data/plotting/brainplot/"  # coordinate and brain surface files
    project = "podcast"
    sid = [777]  # subjects
    keys = ["comp"]  # comprehension and/or production
    sig_elec_file = ["../data/plotting/sig-elecs/podcast_160.csv"]
    brain_type = "ave"  # average brain
    hemisphere = "both"  # value is "both" (an older comment said left-only)
    outfile = "../podcast_subject.svg"
    final = True
    final2 = False
    shiny = False


args = Args()
# Subject colours. The px.colors.qualitative.T10 assignment that used to
# precede this list was immediately overwritten, so only the explicit list
# is kept.
color_list = [
    "#4C78A8",
    "#F58518",
    "#E45756",
    "#FF9DA6",
    "#54A24B",
    "#EECA3B",
    "#B279A2",
    "#72B7B2",
    "#9D755D",
    "#BAB0AC",
]
args.colors = color_list

# Build "<subject>_<electrode>" ids from the significant-electrode list and
# colour each electrode by its subject.
df_plot = pd.read_csv(args.sig_elec_file[0])
subject_prefix = df_plot["subject"].astype(str) + "_"
df_plot["electrode"] = subject_prefix + df_plot.electrode
df_plot["effect"] = df_plot["subject"]
fig = make_brainmap_cat(args, df_plot, args.outfile)  # plot png

## Box / Violin Plots

In [None]:
# Box + strip plots, for one ROI, of four per-electrode quantities across
# models: max correlation, % change vs gpt-neo-125M, best-layer position,
# and best lag. Uses `lags` defined by an earlier cell.
alpha = 0.2
lw = 3
markersize = 15
fontsize = 25
ticksize = 20

# ROI -> colour, in the order mSTG, TP, IFG/BA44, aSTG, BA45. Looking the
# colour up from plt_area keeps it in sync with the selected ROI (it used to
# be edited by hand alongside plt_area).
area_colors = {
    "mSTG": '#1f77b4',
    "TP": '#ff7f0e',
    "BA44": '#2ca02c',
    "aSTG": '#d62728',
    "BA45": '#9467bd',
}
plt_area = "BA45"
colors = [area_colors[plt_area]]

# Best row (highest "max") per (model, electrode).
df_plot = df.sort_values(["max"], ascending=False).groupby(["label1", "electrode"]).first()
df_plot.reset_index(inplace=True)

# Lag at which each best row peaks (integer column label -> lag value).
df_plot["maxlag"] = lags[df_plot.loc[:, np.arange(0, 161)].idxmax(axis=1)]
# Join with the gpt-neo-125M value per electrode: max_x = model, max_y = baseline.
df_base = df_plot.loc[df_plot["label1"] == "gpt-neo-125M", ["electrode", "max"]].copy()
df_plot = df_plot.merge(df_base, on="electrode", how="outer")
df_plot["improve"] = df_plot["max_x"] / df_plot["max_y"] * 100 - 100
# df_plot_area = df_plot.merge(area_df, on="electrode",how="left")
df_plot_area = df_plot[df_plot.area == plt_area]

sns.set_style("whitegrid")
fig, axes = plt.subplots(4, 1, figsize=(10, 40))

# (column, y-axis label, y-limits) for each of the four stacked panels.
# The lag panel previously had no axis labels or tick sizing; it is now
# formatted like the other three. Lags are in ms (-2000..2000, step 25).
panels = [
    ("max_x", "Max Correlation (r)", (0.05, 0.6)),
    ("improve", "Δ % Correlation", (-50, 50)),
    ("layer_ratio", "Layer with best correlation (%)", (0, 110)),  # alt: (-10, 120)
    ("maxlag", "Lag with best correlation (ms)", (-600, 600)),
]
for ax, (ycol, ylabel, ylim) in zip(axes, panels):
    sns.boxplot(data=df_plot_area, x="label1", y=ycol, ax=ax, order=MODELS,
                palette=colors, showfliers=False)
    sns.stripplot(data=df_plot_area, x="label1", y=ycol, ax=ax, order=MODELS,
                  size=6, color=".3")
    ax.set_xlabel("Models", fontsize=fontsize)
    ax.set_ylabel(ylabel, fontsize=fontsize)
    ax.tick_params(axis="both", which="major", labelsize=ticksize)
    ax.set_ylim(*ylim)

plt.savefig(f"../{plt_area}_box.svg")


In [None]:
# Histogram (with KDE) of the best layer per electrode for one model,
# using one bin per layer (last layer index + 1).
model = "gpt-neox-20b"
n_bins = MODEL_LAYERS[model][2] + 1
model_rows = df_plot[df_plot["label1"] == model]
sns.histplot(data=model_rows, x="label2", kde=True, ec=None, bins=n_bins)

## Fig 3 Sig Test

In [None]:
# Per ROI: paired t-test of gpt-neo-125M (max_y) against gpt-neox-20b (max_x)
# on each electrode's best max correlation.
plt_areas = ["mSTG", "aSTG", "BA44", "BA45", "TP"]
df_plot = (
    df.sort_values(["max"], ascending=False)
    .groupby(["label1", "electrode"])
    .first()
    .reset_index()
)

df_plot["maxlag"] = lags[df_plot.loc[:, np.arange(0, 161)].idxmax(axis=1)]
df_base = df_plot.loc[df_plot["label1"] == "gpt-neo-125M", ["electrode", "max"]].copy()
df_plot = df_plot.merge(df_base, on="electrode", how="outer")
df_plot["improve"] = df_plot["max_x"] / df_plot["max_y"] * 100 - 100

is_largest = df_plot.label1 == "gpt-neox-20b"
for plt_area in plt_areas:
    print(plt_area)
    df_plot_area = df_plot[(df_plot.area == plt_area) & is_largest]
    print(ttest_rel(df_plot_area.max_y, df_plot_area.max_x, alternative="two-sided"))

## Fig 4 Sig Test

In [None]:
# Build one frame per ROI for gpt-neox-20b, used by the t-tests in the next cell.
plt_areas = ["mSTG", "aSTG", "BA44", "BA45", "TP"]
df_plot = (
    df.sort_values(["max"], ascending=False)
    .groupby(["label1", "electrode"])
    .first()
    .reset_index()
)

df_plot["maxlag"] = lags[df_plot.loc[:, np.arange(0, 161)].idxmax(axis=1)]
df_base = df_plot.loc[df_plot["label1"] == "gpt-neo-125M", ["electrode", "max"]].copy()
df_plot = df_plot.merge(df_base, on="electrode", how="outer")
df_plot["improve"] = df_plot["max_x"] / df_plot["max_y"] * 100 - 100
# df_plot = df_plot[df_plot.label1 == "gpt-neo-125M"]
df_plot = df_plot[df_plot.label1 == "gpt-neox-20b"]

# Split by ROI (same order as the names on the left-hand side).
df_mstg, df_astg, df_ba44, df_ba45, df_tp = (
    df_plot[df_plot.area == roi]
    for roi in ("mSTG", "aSTG", "BA44", "BA45", "TP")
)

In [None]:
# One-sided independent t-tests: is the best-layer position in mSTG lower
# than in each other ROI (aSTG, BA44, BA45, TP — printed in that order)?
for other_roi in (df_astg, df_ba44, df_ba45, df_tp):
    result = ttest_ind(df_mstg.layer_ratio, other_roi.layer_ratio, alternative="less")
    print(result)

## Heatmaps for lags

In [None]:
def plot_heatmap(data, title, cmap="crest", vmin=0, vmax=0.3):
    """Draw `data` as a heatmap and save it to ``../<title>.svg``.

    Args:
        data: 2-D array-like passed straight to ``sns.heatmap``.
        title: used only to build the output file name.
        cmap: colormap name forwarded to ``sns.heatmap``.
        vmin: lower colour limit.
        vmax: upper colour limit.

    Returns:
        None. The figure is written to disk and left open.

    The x ticks are fixed for an 81-column input: positions 0..80 in steps
    of 20 are labelled -1..1 in steps of 0.5.
    """
    # Tick layout for a full 161-column input would be:
    # positions [0, 20, 40, 60, 80, 100, 120, 140, 160]
    # labels    [-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]
    tick_positions = [0, 20, 40, 60, 80]
    tick_labels = [-1, -0.5, 0, 0.5, 1]

    fig, ax = plt.subplots(figsize=(18, 10))
    sns.heatmap(
        data,
        cmap=cmap,
        vmin=vmin,
        vmax=vmax,
        linewidths=0,
        rasterized=True,
        ax=ax,
    )
    # Put row 0 at the bottom.
    ax.invert_yaxis()
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels)
    # ax.set_xlabel("Lag (s)")
    # ax.set_title(f"{title}")
    plt.xticks(rotation=0)
    plt.yticks(rotation=0)

    fig.savefig(f"../{title}.svg")

In [None]:
# Layer x lag heatmap of the electrode-averaged encoding for one ROI,
# one figure per model.
area = "mSTG"
df2 = df.loc[df.area == area]
layer_means = (
    df2.iloc[:, 0:166]
    .drop(columns=["electrode"], errors="ignore")
    .groupby(["label1", "label2"])
    .mean()
    .reset_index()
    .sort_values(by=["label1", "label2"])
)
# Models whose name contains one of these tags are not plotted.
skipped_tags = ("1.3B", "2.7B")
for model in MODELS:
    if any(tag in model for tag in skipped_tags):
        continue
    layer_plots = layer_means[layer_means.label1 == model]
    # Full lag range: layer_plots.loc[:,(np.arange(0,161))]
    # Columns 40..120 are the middle 81 lag columns, matching the fixed
    # tick layout in plot_heatmap.
    plot_data = layer_plots.loc[:, np.arange(40, 121)].to_numpy()
    plot_heatmap(plot_data, f"{model}-{area}", cmap="Blues", vmin=0, vmax=0.27)
    # print(plot_data.shape)

## Compare with PCA

In [None]:

# Load the ridge and PCA result tables and keep, for each (model, electrode),
# the row with the highest "max". The following cell repeats these steps and
# additionally attaches the PCA maxima.
df = pd.read_csv("../data/plotting/paper-mia/gpt-neo-ridge.csv")
df2 = pd.read_csv("../data/plotting/paper-mia/gpt-neo-pca.csv")
best_keys = ["label1", "electrode"]
df_plot = (
    df.sort_values(["max"], ascending=False)
    .groupby(best_keys)
    .first()
    .reset_index()
    .sort_values(by=best_keys)
)
df_plot2 = (
    df2.sort_values(["max"], ascending=False)
    .groupby(best_keys)
    .first()
    .reset_index()
)

In [None]:

# Load the ridge and PCA result tables, keep the best row (highest "max")
# per (model, electrode) in each, and attach the PCA maximum to the ridge
# frame as "max2".
df = pd.read_csv("../data/plotting/paper-mia/gpt-neo-ridge.csv")
df2 = pd.read_csv("../data/plotting/paper-mia/gpt-neo-pca.csv")
pair_keys = ["label1", "electrode"]
df_plot = (
    df.sort_values(["max"], ascending=False)
    .groupby(pair_keys)
    .first()
    .reset_index()
    .sort_values(by=pair_keys)
)
df_plot2 = (
    df2.sort_values(["max"], ascending=False)
    .groupby(pair_keys)
    .first()
    .reset_index()
    .sort_values(by=pair_keys)
)
# Pair the two analyses by (model, electrode) rather than by row position.
# The previous positional assignment silently mispaired electrodes whenever
# the two CSVs did not contain exactly the same rows. A left merge keeps
# every ridge row; pairs missing from the PCA table get NaN.
df_plot = df_plot.merge(
    df_plot2.loc[:, pair_keys + ["max"]].rename(columns={"max": "max2"}),
    on=pair_keys,
    how="left",
    validate="one_to_one",
)

In [None]:
# Scatter of ridge ("max") against PCA ("max2") max correlation for the TP
# electrodes of gpt-neox-20b, with the identity line for reference.
# NOTE(review): the style sheet is loaded from an absolute, user-specific path.
plt.style.use('/scratch/gpfs/ln1144/247-plotting/scripts/paper.mlpstyle')
# plt.style.use('../data/plotting/paper-prob-improb/paper.mlpstyle')
ls = '-'
lw = 1
hls_palette = sns.color_palette("hls", 8)
colors = [hls_palette[3], hls_palette[6], hls_palette[4], hls_palette[7]]

# NOTE: df_plot2 is rebound here to the plotted subset of df_plot.
is_20b = df_plot["label1"] == "gpt-neox-20b"
df_plot2 = df_plot[is_20b]
df_plot2 = df_plot2[df_plot2["area"] == "TP"]

fig, ax = plt.subplots()
# Diagonal drawn in axes coordinates, i.e. corner to corner of the panel.
ax.plot([0, 1], [0, 1], transform=ax.transAxes, color=colors[3])
ax.scatter(df_plot2["max"], df_plot2["max2"], s=1, color=colors[3])
ax.set_ylim(0, 0.55)
ax.set_xlim(0, 0.55)
plt.savefig("../TP-20b.svg")

## Embedding Correlation Matrices

In [None]:
def load_pickle(file):
    """Unpickle `file` and return its contents unchanged.

    This definition replaces the ``load_pickle`` imported from
    ``tfsplt_utils`` for the rest of the notebook.

    Args:
        file (string): path to a pickle file, e.g. one written by
            247-decoding/tfs_pickling.py.

    Returns:
        The unpickled object (no conversion to a DataFrame happens here).

    Note:
        Unpickling can execute arbitrary code; only load trusted files.
    """
    print(f"Loading {file}")
    with open(file, "rb") as handle:
        return pickle.load(handle)


def load_datum(file_name):
    """Load a pickled datum and wrap it in a DataFrame.

    Args:
        file_name: full path of the datum pickle.

    Returns:
        DataFrame: built with ``pd.DataFrame.from_dict`` from the unpickled
        object.
    """
    return pd.DataFrame.from_dict(load_pickle(file_name))


def pca(df, dim=50):
    """Replace the "embeddings" column of `df` with its whitened PCA projection.

    Args:
        df: DataFrame with an "embeddings" column holding one vector per row.
        dim: number of principal components to keep.

    Returns:
        The same DataFrame object; its "embeddings" column is overwritten in
        place with lists of length `dim`.
    """
    reducer = PCA(n_components=dim, svd_solver="auto", whiten=True)
    # Stack the per-row vectors into an (n_rows, n_features) matrix.
    emb_matrix = np.vstack(df["embeddings"].values)
    print(f"PCA from {emb_matrix.shape[1]} to {dim}")
    reduced = reducer.fit_transform(emb_matrix)
    # print(f"PCA explained variance: {sum(reducer.explained_variance_)}")
    # print(f"PCA explained variance ratio: {sum(reducer.explained_variance_ratio_)}")
    df["embeddings"] = reduced.tolist()
    return df

In [None]:
# Collect the embeddings of every gpt2-xl layer into df_all (one column per
# layer, named "00", "01", ...). Nothing happens unless "gpt2-xl" is in MODELS.
# NOTE: this rebinds the notebook-global `df`.
for model in MODELS:
    if model == "gpt2-xl":
        df_all = pd.DataFrame()
        last_layer = MODEL_LAYERS[model][2]
        for model_layer in range(last_layer + 1):
        # for model_layer in range(5): # for testing
            # emb_path = f"../data/pickling/podcast/777/pickles/embeddings/{model}/full/cnxt_2048/layer_{model_layer:02d}.pkl"
            emb_path = f"../data/pickling/podcast/777/pickles/embeddings/{model}/full/cnxt_1024/layer_{model_layer:02d}.pkl"
            df = load_datum(emb_path)
            # Drop rows whose embedding is entirely NaN.
            all_nan = df["embeddings"].apply(lambda x: np.isnan(x).all())
            df = df[~all_nan]
            # df = pca(df,768)
            df_all[f"{model_layer:02d}"] = df.embeddings
        # Extra column after the last layer: the layer-00 embedding of the
        # following row; the final row has no successor and is dropped.
        next_col = f"{(model_layer+1):02d}"
        df_all[next_col] = df_all["00"].shift(-1)
        df_all = df_all.dropna(subset=[next_col])

In [None]:
# Collect the embeddings of every Llama-2-7b layer into df_all (one column
# per layer, named "00", "01", ...).
# NOTE: this rebinds the notebook-global `df`.
model = "Llama-2-7b-hf"
model_layers = 32
cnxt_len = 4096
df_all = pd.DataFrame()
for model_layer in range(model_layers + 1):
# for model_layer in range(5): # for testing
    emb_path = f"../data/pickling/podcast/777/pickles/embeddings/{model}/full/cnxt_{cnxt_len:04d}/layer_{model_layer:02d}.pkl"
    df = load_datum(emb_path)
    # Drop rows whose embedding is entirely NaN.
    all_nan = df["embeddings"].apply(lambda x: np.isnan(x).all())
    df = df[~all_nan]
    # df = pca(df,768)
    df_all[f"{model_layer:02d}"] = df.embeddings
# Extra column after the last layer: the layer-00 embedding of the following
# row; the final row has no successor and is dropped.
next_col = f"{(model_layer+1):02d}"
df_all[next_col] = df_all["00"].shift(-1)
df_all = df_all.dropna(subset=[next_col])

In [None]:
def corr(row):
    """Return the correlation matrix between the vectors stored in `row`.

    Args:
        row: Series whose entries are equal-length vectors (one per layer).

    Returns:
        Square ndarray from ``np.corrcoef``: entry (i, j) is the Pearson
        correlation between vector i and vector j.
    """
    stacked = np.array(row.tolist())
    return np.corrcoef(stacked)

def corr2(row):
    """Return condensed pairwise cosine distances between the vectors in `row`.

    Args:
        row: Series whose entries are equal-length vectors (one per layer).

    Returns:
        1-D ndarray from ``pdist(..., "cosine")`` in condensed form
        (pairs ordered (0,1), (0,2), ..., (1,2), ...).
    """
    stacked = np.array(row.tolist())
    # Alternative metric: pdist(stacked, "correlation")
    return pdist(stacked, "cosine")

# Condensed cosine-distance vector per row of df_all, then the element-wise
# mean over all rows.
row_distances = df_all.apply(corr2, axis=1)
cors = np.array(row_distances.tolist())
final_cor = cors.mean(axis=0)

In [None]:
# Variant for the full correlation matrix produced by corr():
# f, ax = plt.subplots(figsize=(10, 8))
# sns.heatmap(final_cor,
#     cmap=sns.color_palette("Spectral", as_cmap=True),
#     vmin=-1.0, vmax=1.0,
#     square=True, ax=ax)

# Variant for condensed distances produced by corr2(): expand to a square
# matrix and plot 1 - distance, i.e. cosine similarity.
similarity = 1 - squareform(final_cor)
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    similarity,
    # cmap=sns.color_palette("Spectral", as_cmap=True),
    # vmin=-1.0, vmax=1.0,
    cmap=sns.color_palette("viridis_r", as_cmap=True),
    vmin=0,
    vmax=1.0,
    square=True,
    ax=ax,
)