## Data Loading

In [1]:
import os
import pandas as pd
import numpy as np
import argparse

import matplotlib.pyplot as plt
import matplotlib.colors as col
import seaborn as sns
import plotly.express as px
from scipy.stats import permutation_test
from scipy.stats import ttest_1samp
from scipy.stats import ttest_ind, ttest_rel, permutation_test
from statsmodels.stats import multitest

from tfsemb_class import tsne, save_pickle, add_speech
from tfsplt_encoding import get_cmap_smap, aggregate_data, organize_data
from tfsplt_utils import read_folder, load_pickle, get_con_color
from tfsplt_brainmap import get_sigelecs, Colorbar, make_brainmap
from tfsplt_brainmap_cat import make_brainmap_cat

# Style sheets are applied in order, so the repository-relative sheet below
# wins on any rcParams both sheets define. The first path is an absolute
# cluster location: plt.style.use raises OSError for a missing style file, so
# it is applied only where it actually exists.
_CLUSTER_STYLE = '/scratch/gpfs/ln1144/247-plotting/scripts/paper.mlpstyle'
if os.path.exists(_CLUSTER_STYLE):
    plt.style.use(_CLUSTER_STYLE)
plt.style.use('../data/plotting/paper-prob-improb/paper.mlpstyle')

# Default line style and line width
ls = '-'
lw = 1

In [2]:
# Sig tests
def fdr(pvals):
    """Benjamini-Hochberg FDR correction.

    Runs `multitest.multipletests` with ``method="fdr_bh"`` on `pvals` and
    returns only the corrected p-values (the second element of its result).
    """
    correction = multitest.multipletests(pvals, method="fdr_bh", is_sorted=False)
    return correction[1]

# One 1000-entry colour list per palette name (indexed later to pick gradient
# end points for the brainmap colour bars).
blu_list, red_list, ora_list, pur_list, gre_list = (
    get_con_color(palette_name, 1000)
    for palette_name in ("Blues", "Reds", "Oranges", "Purples", "Greens")
)


In [3]:
# Significant-electrode tables (also provide the ROI assignment per electrode)
sig_df_whisper = pd.read_csv("../data/plotting/paper-whisper/data/base_df.csv")

# GPT-2 significance file: drop the first column, harmonise the column names
# with the whisper table, then keep only the rows of the "glove" model.
sig_df_gpt2 = (
    pd.read_csv("../data/plotting/sig-elecs/20230723-tfs-sig-file/tfs-sig-file.csv")
    .iloc[:, 1:]
    .rename(
        columns={
            "patient": "sid",
            "electrode": "elec_1",
            "prod_significant": "gpt2-prod",
            "comp_significant": "gpt2-comp",
        }
    )
)
sig_df_gpt2 = sig_df_gpt2.loc[
    sig_df_gpt2["model"] == "glove",
    ["sid", "elec_1", "gpt2-prod", "gpt2-comp"],
]

# Left join: electrodes missing from the GPT-2 file count as not significant
sig_df = sig_df_whisper.merge(sig_df_gpt2, on=["sid", "elec_1"], how="left")
sig_df = sig_df.fillna({"gpt2-comp": False, "gpt2-prod": False})
sig_df["electrode"] = sig_df["sid"].astype(str) + "_" + sig_df["elec_1"]

# Split IFG into BA44 / BA45 according to the NYU parcellation label
for nyu_label, ba_label in (("parsopercularis", "BA44"), ("parstriangularis", "BA45")):
    in_subarea = (sig_df["roi_1"] == "IFG") & (sig_df["NYU_roi"] == nyu_label)
    sig_df.loc[in_subarea, "roi_1"] = ba_label


In [None]:
# Make both flags the union of the two: an electrode significant for either
# production or comprehension is marked significant for both.
prod_mask = sig_df["gpt2-prod"]
sig_df.loc[prod_mask, "gpt2-comp"] = True
comp_mask = sig_df["gpt2-comp"]  # read after the update above
sig_df.loc[comp_mask, "gpt2-prod"] = True

n_prod = len(sig_df[sig_df["gpt2-prod"]])
n_comp = len(sig_df[sig_df["gpt2-comp"]])
print(n_prod, n_comp)

Prob-improb

In [None]:
class Args(argparse.Namespace):
    # Settings for the prob vs. improb encoding comparison.
    sid = [625, 676, 7170, 798]  # subjects
    project = "tfs"
    # Encoding folders, listed in the same order as `labels`; the commented
    # entries are alternative runs.
    formats = [
        # "../data/encoding/tfs/20230227-gpt2-preds/kw-tfs-full-%s-glove50-lag10k-25-gpt2-xl-prob/*/*%s.csv",
        # "../data/encoding/tfs/20230227-gpt2-preds/kw-tfs-full-%s-glove50-lag10k-25-gpt2-xl-improb/*/*%s.csv",
        # "../data/encoding/tfs/20230809-gpt2-preds/kw-tfs-full-%s-gpt2-xl-glove50-lag10k-25-prob/*/*%s.csv",
        # "../data/encoding/tfs/20230809-gpt2-preds/kw-tfs-full-%s-gpt2-xl-glove50-lag10k-25-improb/*/*%s.csv",
        "../data/encoding/tfs/20230809-gpt2-preds/kw-tfs-full-%s-gpt2-xl-glove50-lag10k-25-aligned-prob/*/*%s.csv",
        "../data/encoding/tfs/20230809-gpt2-preds/kw-tfs-full-%s-gpt2-xl-glove50-lag10k-25-aligned-improb/*/*%s.csv",
        # "../data/encoding/tfs/20230809-gpt2-preds/kw-tfs-full-%s-gpt2-xl-glove50-lag10k-25-alignednum2-prob/*/*%s.csv",
        # "../data/encoding/tfs/20230809-gpt2-preds/kw-tfs-full-%s-gpt2-xl-glove50-lag10k-25-alignednum2-improb/*/*%s.csv",
        # "../data/encoding/tfs/20230820-gpt2-preds-static/kw-tfs-full-%s-gpt2-xl-shift-emb-lag10k-25-static-prob/*/*%s.csv",
        # "../data/encoding/tfs/20230820-gpt2-preds-static/kw-tfs-full-%s-gpt2-xl-shift-emb-lag10k-25-static-improb/*/*%s.csv",
        # "../data/encoding/tfs/20230820-gpt2-preds-static/kw-tfs-full-%s-gpt2-xl-shift-emb-lag10k-25-static-aligned-prob/*/*%s.csv",
        # "../data/encoding/tfs/20230820-gpt2-preds-static/kw-tfs-full-%s-gpt2-xl-shift-emb-lag10k-25-static-aligned-improb/*/*%s.csv",
        # "../data/encoding/tfs/20230820-gpt2-preds-static/kw-tfs-full-%s-gpt2-xl-shift-emb-lag10k-25-static-alignednum-prob/*/*%s.csv",
        # "../data/encoding/tfs/20230820-gpt2-preds-static/kw-tfs-full-%s-gpt2-xl-shift-emb-lag10k-25-static-alignednum-improb/*/*%s.csv",
        # "../data/encoding/tfs/20230817-glove-concats/kw-tfs-full-%s-gpt2-xl-glove50-concat-emb4-lag10k-25-prob2/*/*%s.csv",
        # "../data/encoding/tfs/20230817-glove-concats/kw-tfs-full-%s-gpt2-xl-glove50-concat-emb4-lag10k-25-improb2/*/*%s.csv",
    ]
    labels = ["prob", "improb"]
    sig_elec_file = [
        "../data/plotting/sig-elecs/20230723-tfs-sig-file/tfs-sig-file-glove-%(sid)s-%(key)s.csv"
    ]
    keys = ["comp", "prod"]  # comprehension and/or production
    layers = np.arange(0, 25)
    lags_plot = np.arange(-10000, 10001, 25)  # encoding lags (ms)
    lags_show = np.arange(-2000, 2001, 25)  # lags for the effect (ms)
    lc_by = "labels"
    ls_by = "keys"


# Build the run configuration
args = Args()
# De-duplicate while keeping first-seen order
args.unique_keys = list(dict.fromkeys(args.keys))
args.unique_labels = list(dict.fromkeys(args.labels))
# Lags are declared in ms on the class; the instance carries them in seconds
args.lags_plot = args.lags_plot / 1000
args.lags_show = args.lags_show / 1000
args = get_sigelecs(args)  # get significant electrodes

# for item in args.sigelecs: # align prod/comp elecs
#     comp_set = set(args.sigelecs[(item[0], "comp")])
#     prod_set = set(args.sigelecs[(item[0], "prod")])
#     args.sigelecs[item] = comp_set.union(prod_set)

# Aggregate the encoding results, then trim them if necessary
df = organize_data(args, aggregate_data(args))


Actual vs Pred

In [None]:
class Args(argparse.Namespace):
    # Settings for the actual vs. predicted word encoding comparison.
    sid = [625, 676, 7170, 798]  # subjects
    project = "tfs"
    # Encoding folders, listed in the same order as `labels`; the commented
    # entries are alternative runs.
    formats = [
        # "../data/encoding/tfs/20230817-glove-concats/kw-tfs-full-%s-glove50-concat-emb4-lag10k-25-all/*/*%s.csv",
        # "../data/encoding/tfs/20230824-gpt2-predict/kw-tfs-full-%s-gpt2-xl-glove50-predict5-lag10k-25-all/*/*%s.csv",
        # "../data/encoding/tfs/20230817-glove-concats/kw-tfs-full-%s-gpt2-xl-glove50-concat-emb4-lag10k-25-improb2/*/*%s.csv",
        # "../data/encoding/tfs/20230824-gpt2-predict/kw-tfs-full-%s-gpt2-xl-glove50-predict5-lag10k-25-improb/*/*%s.csv",
        "../data/encoding/tfs/20230809-gpt2-preds/kw-tfs-full-%s-gpt2-xl-glove50-lag10k-25-improb/*/*%s.csv",
        "../data/encoding/tfs/20230824-gpt2-predict/kw-tfs-full-%s-gpt2-xl-glove50-predict-lag10k-25-improb/*/*%s.csv",
        # "../data/encoding/tfs/20230809-gpt2-preds/kw-tfs-full-%s-gpt2-xl-glove50-lag10k-25-aligned-improb/*/*%s.csv",
        # "../data/encoding/tfs/20230824-gpt2-predict/kw-tfs-full-%s-gpt2-xl-glove50-predict-lag10k-25-aligned-improb/*/*%s.csv",
    ]
    labels = ["actual", "pred"]
    sig_elec_file = [
        "../data/plotting/sig-elecs/20230723-tfs-sig-file/tfs-sig-file-glove-%(sid)s-%(key)s.csv"
    ]
    keys = ["comp", "prod"]  # comprehension and/or production
    layers = np.arange(0, 25)
    lags_plot = np.arange(-10000, 10001, 25)  # encoding lags (ms)
    lags_show = np.arange(-2000, 2001, 25)  # lags for the effect (ms)
    lc_by = "labels"
    ls_by = "keys"


# Build the run configuration
args = Args()
# De-duplicate while keeping first-seen order
args.unique_keys = list(dict.fromkeys(args.keys))
args.unique_labels = list(dict.fromkeys(args.labels))
# Lags are declared in ms on the class; the instance carries them in seconds
args.lags_plot = args.lags_plot / 1000
args.lags_show = args.lags_show / 1000
args = get_sigelecs(args)  # get significant electrodes

# for item in args.sigelecs: # align prod/comp elecs
#     comp_set = set(args.sigelecs[(item[0], "comp")])
#     prod_set = set(args.sigelecs[(item[0], "prod")])
#     args.sigelecs[item] = comp_set.union(prod_set)

# Aggregate the encoding results, then trim them if necessary
df = organize_data(args, aggregate_data(args))

Glove Concats

In [4]:
class Args(argparse.Namespace):
    # Settings for the GloVe concatenation comparison (n ... n+4).
    sid = [625, 676, 7170, 798]  # subjects
    project = "tfs"
    # Encoding folders, listed in the same order as `labels`
    formats = [
        "../data/encoding/tfs/20230817-glove-concats/kw-tfs-full-%s-glove50--lag10k-25-all/*/*%s.csv",
        "../data/encoding/tfs/20230817-glove-concats/kw-tfs-full-%s-glove50-concat-emb1-lag10k-25-all/*/*%s.csv",
        "../data/encoding/tfs/20230817-glove-concats/kw-tfs-full-%s-glove50-concat-emb2-lag10k-25-all/*/*%s.csv",
        "../data/encoding/tfs/20230817-glove-concats/kw-tfs-full-%s-glove50-concat-emb3-lag10k-25-all/*/*%s.csv",
        "../data/encoding/tfs/20230817-glove-concats/kw-tfs-full-%s-glove50-concat-emb4-lag10k-25-all/*/*%s.csv",
    ]
    labels = ["n", "n+1", "n+2", "n+3", "n+4"]
    sig_elec_file = [
        "../data/plotting/sig-elecs/20230723-tfs-sig-file/tfs-sig-file-glove-%(sid)s-%(key)s.csv"
    ]
    keys = ["comp", "prod"]  # comprehension and/or production
    layers = np.arange(0, 25)
    lags_plot = np.arange(-10000, 10001, 25)  # encoding lags (ms)
    lags_show = np.arange(-2000, 2001, 25)  # lags for the effect (ms)
    lc_by = "labels"
    ls_by = "keys"


# Build the run configuration
args = Args()
# De-duplicate while keeping first-seen order
args.unique_keys = list(dict.fromkeys(args.keys))
args.unique_labels = list(dict.fromkeys(args.labels))
# Lags are declared in ms on the class; the instance carries them in seconds
args.lags_plot = args.lags_plot / 1000
args.lags_show = args.lags_show / 1000
args = get_sigelecs(args)  # get significant electrodes

# for item in args.sigelecs: # align prod/comp elecs
#     comp_set = set(args.sigelecs[(item[0], "comp")])
#     prod_set = set(args.sigelecs[(item[0], "prod")])
#     args.sigelecs[item] = comp_set.union(prod_set)

# Aggregate the encoding results, then trim them if necessary
df = organize_data(args, aggregate_data(args))


Aggregating data
Trimming Data


## Encoding Plots

Significance Test Functions

In [5]:
def get_sig_lags(args, df, label1, label2, threshold):
    """Find, per key, the lags at which two labels differ significantly.

    For every key in `args.keys`, each column of `df` is tested with a
    two-sided independent t-test between the rows of `label1` and the rows of
    `label2`. The p-values of one key are FDR-corrected together via `fdr`.

    Args:
        args: needs `keys` and `lags_show` (indexed by column position).
        df: frame whose index has "key" and "label" levels.
        label1, label2: the two labels to compare.
        threshold: cut-off applied to the corrected p-values.

    Returns:
        dict with ``f"{key}_{label1}"`` -> lags where `label1` is
        significantly higher (t > 0) and ``f"{key}_{label2}"`` -> lags where
        `label2` is significantly higher (t < 0).
    """
    key_values = df.index.get_level_values("key")
    label_values = df.index.get_level_values("label")

    sig_lags = {}
    for key in args.keys:
        in_key = key_values == key
        first = df[in_key & (label_values == label1)]
        second = df[in_key & (label_values == label2)]

        # One test per column
        tests = [
            ttest_ind(first.iloc[:, col], second.iloc[:, col], alternative="two-sided")
            for col in range(first.shape[1])
        ]
        t_stats = [test[0] for test in tests]
        p_corrected = fdr([test[1] for test in tests])

        first_higher, second_higher = [], []
        for col, (t_stat, p_val) in enumerate(zip(t_stats, p_corrected)):
            if t_stat > 0 and p_val < threshold:
                first_higher.append(args.lags_show[col])
            elif t_stat < 0 and p_val < threshold:
                second_higher.append(args.lags_show[col])

        sig_lags[f"{key}_{label1}"] = first_higher
        sig_lags[f"{key}_{label2}"] = second_higher
    return sig_lags


def statistic(x, y, axis):
    """Test statistic for the permutation test: mean(x) - mean(y) along `axis`."""
    mean_x = np.mean(x, axis=axis)
    mean_y = np.mean(y, axis=axis)
    return mean_x - mean_y


def get_sig(args, df1, df2, lags, threshold, color1, color2):
    """Paired t-test of two mean encoding curves inside one lag window.

    Args:
        args: needs `lags_plot` (lags in seconds); the position of a lag in
            it is used as the column label in `df1` / `df2`.
        df1, df2: frames to compare (rows are averaged per column first).
        lags: window lags in milliseconds.
        threshold: p-value cut-off.
        color1, color2: value returned when `df1` / `df2` is higher.

    Returns:
        `color1` if significant with t > 0, `color2` if significant with
        t < 0, otherwise None.
    """
    window = [lag / 1000 for lag in lags]  # ms -> s
    columns = [pos for pos, lag in enumerate(args.lags_plot) if lag in window]

    mean_1 = df1.loc[:, columns].mean()
    mean_2 = df2.loc[:, columns].mean()
    result = ttest_rel(mean_1, mean_2, alternative="two-sided")
    t_stat, p_val = result[0], result[1]

    if not p_val < threshold:
        return None
    if t_stat > 0:
        return color1
    if t_stat < 0:
        return color2
    return None


def get_sigs(args, df, label1, label2, threshold, color1, color2):
    """Run `get_sig` for a pre-onset and a post-onset window, for every key.

    Args:
        args: needs `keys`, plus whatever `get_sig` reads.
        df: frame whose index has "electrode", "key" and "label" levels.
        label1, label2: the two labels to compare.
        threshold: p-value cut-off forwarded to `get_sig`.
        color1, color2: colour reported when `label1` / `label2` is higher.

    Returns:
        dict mapping ``(key, x)`` to a colour for every significant window,
        where `x` is -0.4 for the pre-onset window and 0.4 for the post-onset
        window (the x position at which the caller draws the marker).
    """
    # (marker x position, lag window in ms). The post-onset window is
    # 200-600 ms; an earlier 100-500 ms variant was assigned and immediately
    # overwritten in the previous version, so it never took effect.
    windows = (
        (-0.4, np.arange(-500, -99, 25)),
        (0.4, np.arange(200, 601, 25)),
    )

    sig_lags = {}
    df = df.sort_values([("electrode")], ascending=True)
    for key in args.keys:
        df_key = df[df.index.get_level_values("key") == key]
        df_first = df_key[df_key.index.get_level_values("label") == label1]
        df_second = df_key[df_key.index.get_level_values("label") == label2]

        for marker_x, window in windows:
            sig_result = get_sig(
                args, df_first, df_second, window, threshold, color1, color2
            )
            if sig_result:
                sig_lags[(key), marker_x] = sig_result

    return sig_lags


def get_sig_perm(args, df1, df2, lags, threshold, color1, color2, random_state=None):
    """Paired permutation test of two encodings inside one lag window.

    All values of `df1` and `df2` inside the window are flattened and compared
    with a two-sided `permutation_test` (``permutation_type="samples"``, 10000
    resamples) on the difference of means. Observations are paired by
    position, so both frames must have the same shape and row order.

    Args:
        args: needs `lags_plot` (lags in seconds); the position of a lag in
            it is used as the column label in `df1` / `df2`.
        df1, df2: frames to compare.
        lags: window lags in milliseconds.
        threshold: p-value cut-off.
        color1, color2: value returned when `df1` / `df2` is higher.
        random_state: optional seed / generator forwarded to
            `permutation_test` so the result can be reproduced. With the
            default None the call is identical to the unseeded original.

    Returns:
        `color1` if significant with a positive statistic, `color2` if
        significant with a negative statistic, otherwise None.
    """
    lags = [lag / 1000 for lag in lags]  # ms -> s
    lags_idx = [lag_idx for lag_idx, lag in enumerate(args.lags_plot) if lag in lags]
    samples_1 = df1.loc[:, lags_idx].to_numpy().ravel()
    samples_2 = df2.loc[:, lags_idx].to_numpy().ravel()

    # Forward the seed only when one is given, leaving the default call untouched
    seed_kwargs = {} if random_state is None else {"random_state": random_state}
    r = permutation_test(
        (samples_1, samples_2),
        statistic=statistic,
        vectorized=True,
        n_resamples=10000,
        alternative="two-sided",
        permutation_type="samples",
        **seed_kwargs,
    )

    if r.pvalue < threshold:
        if r.statistic > 0:
            return color1
        elif r.statistic < 0:
            return color2
    return None


def get_sigs_perm(args, df, label1, label2, threshold, color1, color2):
    """Run `get_sig_perm` on the pre-onset window (-500 to -100 ms) per key.

    Args:
        args: needs `keys`, plus whatever `get_sig_perm` reads.
        df: frame whose index has "electrode", "key" and "label" levels.
        label1, label2: the two labels to compare.
        threshold: p-value cut-off forwarded to `get_sig_perm`.
        color1, color2: colour reported when `label1` / `label2` is higher.

    Returns:
        dict mapping ``(key, -0.3)`` to a colour for every key whose
        pre-onset window is significant (-0.3 is the x position at which the
        caller draws the marker).
    """
    sig_lags = {}
    # The permutation test pairs the two labels row by row, so put both in
    # the same electrode order first (same sort as in get_sigs).
    df = df.sort_values([("electrode")], ascending=True)
    for key in args.keys:
        df_key = df[df.index.get_level_values("key") == key]
        df_first = df_key[df_key.index.get_level_values("label") == label1]
        df_second = df_key[df_key.index.get_level_values("label") == label2]

        lags_show = np.arange(-500, -99, 25)  # lags for the effect
        sig_result = get_sig_perm(
            args, df_first, df_second, lags_show, threshold, color1, color2
        )
        if sig_result:
            sig_lags[(key), -0.3] = sig_result

        # Post-onset window (disabled):
        # lags_show = np.arange(200, 601, 25) # lags for the effect
        # sig_result = get_sig_perm(args, df_first, df_second, lags_show, threshold, color1, color2)
        # if sig_result:
        #     sig_lags[(key), 0.3] = sig_result

    return sig_lags

Encoding Plot Prob-improb (Whole Brain and ROIs)

In [None]:
# Plot average encoding plots
def average_encoding(args, df, tag, color1, color2):
    """Save one mean-encoding figure per key for the prob vs. improb comparison.

    For every "key" group of `df`, draws the across-row mean of each "label"
    group with a +/- SEM band over `args.lags_show`, adds a "*" at the x
    positions returned by `get_sigs` (threshold 0.001) and writes the figure
    to ``../prob-improb-{tag}-{key}.jpeg``.

    Args:
        args: needs `lags_show`, `cmap` and `smap` (both keyed by
            ``(label, key)``), plus whatever `get_sigs` reads.
        df: frame whose index has "key" and "label" levels, with one column
            per entry of `args.lags_show`.
        tag: text inserted into the output file name.
        color1, color2: star colour when "prob" / "improb" is higher.
    """
    sig_lags = get_sigs(args, df, "prob", "improb", 0.001, color1, color2)
    # sig_lags2 = get_sig_lags(args, df, "prob", "improb", 0.01)
    panel_names = ["comp", "prod"]
    # zip() caps the number of figures at len(panel_names); the key that is
    # plotted is the group name yielded by groupby. (`axis=0` was the default
    # and is deprecated in recent pandas, so it is no longer passed.)
    for _, (plot, subdf) in zip(panel_names, df.groupby("key")):
        fig, ax = plt.subplots()
        for line, subsubdf in subdf.groupby("label"):
            vals = subsubdf.mean(axis=0)
            err = subsubdf.sem(axis=0)
            map_key = (line, plot)
            ax.fill_between(
                args.lags_show,
                vals - err,
                vals + err,
                alpha=0.2,
                color=args.cmap[map_key],
            )
            ax.plot(
                args.lags_show,
                vals,
                label=f"{line} ({len(subsubdf)})",  # legend shows the row count
                color=args.cmap[map_key],
                ls=args.smap[map_key],
                lw=1,
            )
        # Significance stars for this key only
        for (sig_key, sig_x), sig_color in sig_lags.items():
            if sig_key == plot:
                ax.text(sig_x, 0, s="*", color=sig_color, fontsize=25)
        # ax.axhline(0, ls="dashed", alpha=0.3, c="k")
        ax.axvline(0, ls="dashed", alpha=0.3, c="k")  # word onset
        # Bounds of the pre-onset test window (-500 to -100 ms)
        ax.axvline(-0.1, ls="-", alpha=0.5, c="k")
        ax.axvline(-0.5, ls="-", alpha=0.5, c="k")
        # ax.set_ylim(0,0.155)
        ax.set_ylim(0, 0.18)
        ax.set_xticks([-2, -1, 0, 1, 2])
        # ax.set_yticks([0,0.05,0.1,0.15,0.2])
        ax.set_title(f"{plot}s global average")
        ax.legend(loc="upper right", frameon=False)
        ax.set(xlabel="Lag (s)", ylabel="Correlation (r)")
        # Save and close this figure explicitly rather than the implicit current one
        fig.savefig(f"../prob-improb-{tag}-{plot}.jpeg")
        plt.close(fig)

colors = ["red", "blue"]
styles = ["-", "-", "-.", ":"]

# (label, key) -> line colour / line style; labels are paired with `colors`
# and keys with `styles` by position.
cmap = {
    (label, key): color
    for label, color in zip(args.unique_labels, colors)
    for key, _ in zip(args.unique_keys, styles)
}
smap = {
    (label, key): style
    for label, _ in zip(args.unique_labels, colors)
    for key, style in zip(args.unique_keys, styles)
}
args.cmap = cmap
args.smap = smap

# Whole-brain average
average_encoding(args, df, "all", colors[0], colors[1])

# Per-ROI averages: output tag -> roi_1 values that make up the ROI
ROIS = {
    # "preCG": ["preCG"],
    # "postCG": ["postCG"],
    # "SM": ["preCG","postCG"],
    # "TP": ["TP"],
    # "STG": ["STG"],
    "IFG": ["BA44","BA45"],
    # "IFG": ["IFG"],
    # "BA44": ["BA44"],
    # "BA45": ["BA45"],
    # "pMTG": ["pMTG"],
    # "mMTG": ["mMTG"],
    # "AG": ["AG"],
}

for area_name, area in ROIS.items():
    # Keep only the rows whose electrode (index level 1) lies in the ROI
    area_elecs = sig_df.loc[sig_df.roi_1.isin(area), "electrode"].tolist()
    roi_df = df[df.index.isin(area_elecs, level=1)]
    average_encoding(args, roi_df, area_name, colors[0], colors[1])

Encoding Plot Actual-Pred (Whole Brain and ROIs)

In [None]:
# Plot average encoding plots
def average_encoding(args, df, tag, color1, color2):
    """Save one mean-encoding figure per key for the actual vs. pred comparison.

    For every "key" group of `df`, draws the across-row mean of each "label"
    group with a +/- SEM band over `args.lags_show`, adds a "*" at the x
    positions returned by `get_sigs_perm` (threshold 0.001) and writes the
    figure to ``../actual-pred-{tag}-{key}.jpeg``.

    Args:
        args: needs `lags_show`, `cmap` and `smap` (both keyed by
            ``(label, key)``), plus whatever `get_sigs_perm` reads.
        df: frame whose index has "key" and "label" levels, with one column
            per entry of `args.lags_show`.
        tag: text inserted into the output file name.
        color1, color2: star colour when "actual" / "pred" is higher.
    """
    sig_lags = get_sigs_perm(args, df, "actual", "pred", 0.001, color1, color2)
    # sig_lags2 = get_sig_lags(args, df, "actual", "pred", 0.05)
    panel_names = ["comp", "prod"]
    # zip() caps the number of figures at len(panel_names); the key that is
    # plotted is the group name yielded by groupby. (`axis=0` was the default
    # and is deprecated in recent pandas, so it is no longer passed.)
    for _, (plot, subdf) in zip(panel_names, df.groupby("key")):
        fig, ax = plt.subplots()
        for line, subsubdf in subdf.groupby("label"):
            vals = subsubdf.mean(axis=0)
            err = subsubdf.sem(axis=0)
            map_key = (line, plot)
            ax.fill_between(
                args.lags_show,
                vals - err,
                vals + err,
                alpha=0.2,
                color=args.cmap[map_key],
            )
            ax.plot(
                args.lags_show,
                vals,
                label=f"{line} ({len(subsubdf)})",  # legend shows the row count
                color=args.cmap[map_key],
                ls=args.smap[map_key],
                lw=1,
            )
        # Significance stars for this key only
        for (sig_key, sig_x), sig_color in sig_lags.items():
            if sig_key == plot:
                ax.text(sig_x, 0, s="*", color=sig_color, fontsize=25)
        # ax.set_xticks(args.lag_ticks)
        # ax.set_xticklabels(args.lag_tick_labels)
        # ax.axhline(0, ls="dashed", alpha=0.3, c="k")
        ax.axvline(0, ls="dashed", alpha=0.3, c="k")  # word onset
        # Bounds of the pre-onset test window (-500 to -100 ms)
        ax.axvline(-0.1, ls="-", alpha=0.5, c="k")
        ax.axvline(-0.5, ls="-", alpha=0.5, c="k")
        # ax.set_ylim(0,0.155)
        ax.set_ylim(0, 0.18)
        ax.set_xticks([-2, -1, 0, 1, 2])
        # ax.set_yticks([0,0.05,0.1,0.15,0.2])
        ax.set_title(f"{plot}s global average")
        ax.legend(loc="upper right", frameon=False)
        ax.set(xlabel="Lag (s)", ylabel="Correlation (r)")
        # Save and close this figure explicitly rather than the implicit current one
        fig.savefig(f"../actual-pred-{tag}-{plot}.jpeg")
        plt.close(fig)

colors = ["blue", "darkorange"]
styles = ["-", "-", "-.", ":"]

# (label, key) -> line colour / line style; labels are paired with `colors`
# and keys with `styles` by position.
cmap = {
    (label, key): color
    for label, color in zip(args.unique_labels, colors)
    for key, _ in zip(args.unique_keys, styles)
}
smap = {
    (label, key): style
    for label, _ in zip(args.unique_labels, colors)
    for key, style in zip(args.unique_keys, styles)
}
args.cmap = cmap
args.smap = smap

# Whole-brain average
average_encoding(args, df, "all", colors[0], colors[1])

# Per-ROI averages: output tag -> roi_1 values that make up the ROI
# (every entry is currently disabled, so the loop below does nothing)
ROIS = {
    # "preCG": ["preCG"],
    # "postCG": ["postCG"],
    # "SM": ["preCG","postCG"],
    # "TP": ["TP"],
    # "STG": ["STG"],
    # "IFG": ["BA44","BA45"],
    # "BA44": ["BA44"],
    # "BA45": ["BA45"],
    # "pMTG": ["pMTG"],
    # "mMTG": ["mMTG"],
    # "AG": ["AG"],
}

for area_name, area in ROIS.items():
    # Keep only the rows whose electrode (index level 1) lies in the ROI
    area_elecs = sig_df.loc[sig_df.roi_1.isin(area), "electrode"].tolist()
    roi_df = df[df.index.isin(area_elecs, level=1)]
    average_encoding(args, roi_df, area_name, colors[0], colors[1])

Encoding Plot Glove-concats (Whole Brain and ROIs)

In [8]:
# Plot average encoding plots
def average_encoding(args, df, tag, color1, color2):
    """Save one mean-encoding figure per key for the "n" vs. "n+4" comparison.

    For every "key" group of `df`, draws the across-row mean of the "n" and
    "n+4" label groups (other labels are skipped) with a +/- SEM band over
    `args.lags_show`. Two kinds of significance marker are added:

    * a "*" at the x positions returned by `get_sigs` (threshold 0.001);
    * for each plotted label, dots just above the x axis at the lags where
      that label is significantly higher according to `get_sig_lags`
      (threshold 0.01), in the label's own colour.

    The figure is written to ``../concats-{tag}-{key}.jpeg``.

    Args:
        args: needs `lags_show`, `cmap` and `smap` (both keyed by
            ``(label, key)``), plus whatever the two test helpers read.
        df: frame whose index has "key" and "label" levels, with one column
            per entry of `args.lags_show`.
        tag: text inserted into the output file name.
        color1, color2: star colour when "n" / "n+4" is higher.
    """
    compared = ("n", "n+4")
    sig_lags = get_sigs(args, df, compared[0], compared[1], 0.001, color1, color2)
    sig_lags2 = get_sig_lags(args, df, compared[0], compared[1], 0.01)
    panel_names = ["comp", "prod"]
    # zip() caps the number of figures at len(panel_names); the key that is
    # plotted is the group name yielded by groupby. (`axis=0` was the default
    # and is deprecated in recent pandas, so it is no longer passed.)
    for _, (plot, subdf) in zip(panel_names, df.groupby("key")):
        fig, ax = plt.subplots()
        for line, subsubdf in subdf.groupby("label"):
            if line not in compared:
                continue
            vals = subsubdf.mean(axis=0)
            err = subsubdf.sem(axis=0)
            map_key = (line, plot)
            ax.fill_between(
                args.lags_show,
                vals - err,
                vals + err,
                alpha=0.2,
                color=args.cmap[map_key],
            )
            ax.plot(
                args.lags_show,
                vals,
                label=f"{line} ({len(subsubdf)})",  # legend shows the row count
                color=args.cmap[map_key],
                ls=args.smap[map_key],
                lw=1,
            )
            # Per-lag markers for this label. This used to sit after the loop
            # and reuse the leaked `line` / `map_key`, so only the last label
            # was ever marked and an empty group raised NameError.
            label_lags = sig_lags2[f"{plot}_{line}"]
            ax.scatter(
                label_lags,
                np.full(len(label_lags), 0.001),
                color=args.cmap[map_key],
            )
        # Window-level significance stars for this key only
        for (sig_key, sig_x), sig_color in sig_lags.items():
            if sig_key == plot:
                ax.text(sig_x, 0, s="*", color=sig_color, fontsize=25)
        # ax.set_xticks(args.lag_ticks)
        # ax.set_xticklabels(args.lag_tick_labels)
        ax.set_ylim(0, 0.18)
        ax.axhline(0, ls="dashed", alpha=0.3, c="k")
        ax.axvline(0, ls="dashed", alpha=0.3, c="k")  # word onset
        # Bounds of the pre-onset test window (-500 to -100 ms)
        ax.axvline(-0.1, ls="-", alpha=0.5, c="k")
        ax.axvline(-0.5, ls="-", alpha=0.5, c="k")
        ax.set_title(f"{plot}s global average")
        ax.legend(loc="upper right", frameon=False)
        ax.set(xlabel="Lag (s)", ylabel="Correlation (r)")
        # Save and close this figure explicitly rather than the implicit current one
        fig.savefig(f"../concats-{tag}-{plot}.jpeg")
        plt.close(fig)

colors = get_con_color("viridis",5)
styles = ["-", "-", "-", "-","-"]

# (label, key) -> line colour / line style; labels are paired with `colors`
# and keys with `styles` by position.
cmap = {
    (label, key): color
    for label, color in zip(args.unique_labels, colors)
    for key, _ in zip(args.unique_keys, styles)
}
smap = {
    (label, key): style
    for label, _ in zip(args.unique_labels, colors)
    for key, style in zip(args.unique_keys, styles)
}
args.cmap = cmap
args.smap = smap

# Whole-brain average (disabled)
# average_encoding(args, df, "all", colors[0], colors[4])

# Per-ROI averages: output tag -> roi_1 values that make up the ROI
# (every entry is currently disabled, so the loop below does nothing)
ROIS = {
    # "preCG": ["preCG"],
    # "postCG": ["postCG"],
    # "SM": ["preCG","postCG"],
    # "TP": ["TP"],
    # "STG": ["STG"],
    # "IFG": ["BA44","BA45"],
    # "BA44": ["BA44"],
    # "BA45": ["BA45"],
    # "pMTG": ["pMTG"],
    # "mMTG": ["mMTG"],
    # "AG": ["AG"],
    # "language": ["BA44", "BA45", "AG"]
}

for area_name, area in ROIS.items():
    area_elecs = sig_df.loc[sig_df.roi_1.isin(area), "electrode"].tolist()
    # NOTE(review): the mask is negated here, so this keeps the electrodes
    # OUTSIDE the ROI, unlike the other two comparisons — confirm it is intended.
    roi_df = df[~df.index.isin(area_elecs, level=1)]
    average_encoding(args, roi_df, area_name, colors[0], colors[4])

## Brainmap Plots

Brainmap subject plots

In [None]:
# Subjects
class Args(argparse.Namespace):
    # Settings for the per-subject brainmap.
    main_dir = "../data/plotting/brainplot/"  # loads coordinate and brain surface files
    project = "tfs"
    sid = [625, 676, 7170, 798]  # subjects
    keys = ["comp", "prod"]  # comprehension and/or production
    brain_type = "ave"  # average brain
    hemisphere = "left"  # only plot left hemisphere
    outfile = "../tfs_sids_%s.svg"  # %s is filled with the key


args = Args()

# Colour the categories with matplotlib's current default colour cycle
prop_cycle = plt.rcParams["axes.prop_cycle"]
color_list = prop_cycle.by_key()["color"]
args.colors = color_list

for key in args.keys:
    # Electrodes significant for this key, categorised by subject id
    sig_df_plot = sig_df[sig_df[f"gpt2-{key}"]]
    sig_plot = sig_df_plot[["electrode", "sid"]].rename(columns={"sid": "effect"})
    fig = make_brainmap_cat(args, sig_plot, args.outfile % key)

Brainmap ROI plots

In [None]:
# Restrict the map to a fixed set of ROIs.
# NOTE(review): rows relabelled from "IFG" to "BA44"/"BA45" in the loading cell
# no longer match "IFG" here, and six ROI names are listed while only four
# colours are defined below — confirm both are intended.
sig_df_areas = sig_df[sig_df.roi_1.isin(["IFG","STG","TP","preCG","postCG","AG"])]

# ROI brainmap settings
class Args(argparse.Namespace):
  main_dir = "../data/plotting/brainplot/" # loads coordinate and brain surface files
  project = "tfs"
  sid = [625, 676, 7170, 798] # subjects
  keys = ["comp","prod"] # comprehension and/or production
  brain_type = "ave" # average brain
  hemisphere = "left" # only plot left hemisphere
  outfile = "../tfs_areas_%s.svg" # %s is filled with the key

args = Args()

# Category colours. A plotly Prism palette used to be assigned here and was
# immediately overwritten by this list, so the dead assignment was dropped.
# color_list = px.colors.qualitative.Prism # 10 colors (alternative)
color_list = ["rgb(55,126,184)","rgb(255,127,0)","rgb(77,175,74)","rgb(152,78,163)"]
args.colors = color_list

for key in args.keys:
  # Electrodes significant for this key, categorised by ROI
  sig_df_plot = sig_df_areas[sig_df_areas[f"gpt2-{key}"]]
  sig_plot = pd.DataFrame({"electrode":sig_df_plot.electrode,"effect":sig_df_plot.roi_1})
  fig = make_brainmap_cat(args, sig_plot, args.outfile % key)


Getting effect Prob-improb (area difference)

In [None]:
def get_part_df(label):  # get partial df
    """Extract the rows of one label from the module-level `chosen_df`.

    `chosen_df` must still carry its four-level index whose first level is
    "label"; that level is dropped from the result.

    Returns:
        (frame, electrodes): a copy of the label's rows sorted by index, and
        the "electrode" level values in their order before sorting.
    """
    label_rows = chosen_df.loc[pd.IndexSlice[label, :, :, :], :].copy()
    label_rows.index = label_rows.index.droplevel("label")
    electrodes = label_rows.index.get_level_values("electrode").tolist()
    return label_rows.sort_index(), electrodes

chosen_lags = np.arange(-500,-99,25) # before onset
# chosen_lags = np.arange(200,601,25) # after onset
lags_show = np.arange(-10000,10001,25)
# Translate the lag values (ms) into their positions on the full lag grid,
# which are used as column labels of `df` below
chosen_lags = [idx for (idx, lag) in enumerate(lags_show) if lag in chosen_lags]
# .copy(): the "area" column added below must not be written into a slice of
# `df` (avoids pandas' SettingWithCopyWarning)
chosen_df = df.loc[:,chosen_lags].copy()
x_vals = [lags_show[lag] / 1000 for lag in chosen_lags]

# Get Effect: per row, sum of the correlations inside the window
# chosen_df["area"] = np.trapz(chosen_df, x=x_vals, axis=1) # get area
chosen_df["area"] = chosen_df.loc[:,chosen_lags].sum(axis=1) # get sum
df1, _ = get_part_df("prob") # get first encoding
df2, _ = get_part_df("improb") # get second encoding
df1["area2"] = df2["area"]  # aligned on the shared index
df1.loc[:, "effect"] = df1["area"] - df1["area2"] # diff
# df1.loc[:, "effect"] = (df1["area"] - df1["area2"]) / df1[["area", "area2"]].max(axis=1) # diff
# df1.loc[:, "effect"] = (df1["area"] - df1["area2"]) / (df1["area"] + df1["area2"]) # norm diff

# sig test: per row, independent t-test between the two labels across the window lags
threshold = 0.01
for row, _ in df1.iterrows():
    r = ttest_ind(df1.loc[row,chosen_lags], df2.loc[row,chosen_lags], alternative="two-sided")
    df1.loc[row, "sig"] = r[1]

# fdr, separately for comprehension and production
idx = pd.IndexSlice
df1.loc[idx[:, "comp", :], "sig"] = fdr(df1.loc[idx[:, "comp", :], "sig"])
df1.loc[idx[:, "prod", :], "sig"] = fdr(df1.loc[idx[:, "prod", :], "sig"])

# Number of non-significant rows; they get the sentinel effect -1000
print(sum(df1.sig > threshold))
df1.loc[df1.sig > threshold, "effect"] = -1000
# df1 = df1[df1.sig <= threshold]

chosen_df = df1
chosen_df.reset_index(inplace=True)

Getting effect actual - pred (area difference)

In [None]:
def get_part_df(label):  # get partial df
    """Extract the rows of one label from the module-level `chosen_df`.

    `chosen_df` must still carry its four-level index whose first level is
    "label"; that level is dropped from the result.

    Returns:
        (frame, electrodes): a copy of the label's rows sorted by index, and
        the "electrode" level values in their order before sorting.
    """
    label_rows = chosen_df.loc[pd.IndexSlice[label, :, :, :], :].copy()
    label_rows.index = label_rows.index.droplevel("label")
    electrodes = label_rows.index.get_level_values("electrode").tolist()
    return label_rows.sort_index(), electrodes

chosen_lags = np.arange(-500,-99,25) # before onset
# chosen_lags = np.arange(200,601,25) # after onset
lags_show = np.arange(-10000,10001,25)
# Translate the lag values (ms) into their positions on the full lag grid,
# which are used as column labels of `df` below
chosen_lags = [idx for (idx, lag) in enumerate(lags_show) if lag in chosen_lags]
# .copy(): the "area" column added below must not be written into a slice of
# `df` (avoids pandas' SettingWithCopyWarning)
chosen_df = df.loc[:,chosen_lags].copy()
x_vals = [lags_show[lag] / 1000 for lag in chosen_lags]

# Get Effect: per row, sum of the correlations inside the window
# chosen_df["area"] = np.trapz(chosen_df, x=x_vals, axis=1) # get area
chosen_df["area"] = chosen_df.loc[:,chosen_lags].sum(axis=1) # get sum
df1, _ = get_part_df("actual") # get first encoding
df2, _ = get_part_df("pred") # get second encoding
df1["area2"] = df2["area"]  # aligned on the shared index
df1.loc[:, "effect"] = df1["area"] - df1["area2"] # diff
# df1.loc[:, "effect"] = (df1["area"] - df1["area2"]) / df1[["area", "area2"]].max(axis=1) # diff
# df1.loc[:, "effect"] = (df1["area"] - df1["area2"]) / (df1["area"] + df1["area2"]) # norm diff

# sig test: per row, independent t-test between the two labels across the window lags
threshold = 0.01
for row, _ in df1.iterrows():
    r = ttest_ind(df1.loc[row,chosen_lags], df2.loc[row,chosen_lags], alternative="two-sided")
    df1.loc[row, "sig"] = r[1]

# fdr, separately for comprehension and production
idx = pd.IndexSlice
df1.loc[idx[:, "comp", :], "sig"] = fdr(df1.loc[idx[:, "comp", :], "sig"])
df1.loc[idx[:, "prod", :], "sig"] = fdr(df1.loc[idx[:, "prod", :], "sig"])

# Number of non-significant rows; they get the sentinel effect -1000
print(sum(df1.sig > threshold))
df1.loc[df1.sig > threshold, "effect"] = -1000
# df1 = df1[df1.sig <= threshold]

chosen_df = df1
chosen_df.reset_index(inplace=True)

Brainmap plots for area difference

In [None]:
# Settings for the effect (area difference) brainmap
class Args(argparse.Namespace):
  main_dir = "../data/plotting/brainplot/" # loads coordinate and brain surface files
  project = "tfs"
  sid = [625, 676, 7170, 798] # subjects
  keys = ["comp","prod"] # comprehension and/or production
  # keys = ["prod"] # comprehension and/or production
  brain_type = "ave" # average brain
  hemisphere = "left" # only plot left hemisphere
  outfile = "../tfs-diff_%s.svg" # %s is filled with the key

args = Args()

# Customize Your Color Split Here
# Indices into the 1000-entry colour lists used as gradient end points
grad_1 = 300
grad_2 = 800
# Colour of non-significant electrodes ("#ffffff" used to be assigned first and
# immediately overwritten, so that dead assignment was dropped)
# insig = "#ffffff"
insig = "#ebebeb"
# Bar for the sentinel effect value -1000 given to non-significant electrodes
zero_bar = Colorbar(title="insig",colorscale=[[0,insig],[1,insig]],bar_min=-1000,bar_max=-1000)

# Red & Blue whisper notebook
# pos_bar = Colorbar(title="Δ corr pos",colorscale=[[0, "rgb(255,248,240)"], [1, "rgb(255,0,0)"]],bar_min=0.1,bar_max=1)
# neg_bar = Colorbar(title="Δ corr neg",colorscale=[[0, "rgb(0,0,255)"], [1, "rgb(240,248,255)"]],bar_min=-1,bar_max=-0.1)

# Red & Blue whisper
# pos_bar = Colorbar(title="Δ corr pos",colorscale=[[0, "#ffa07a"], [1, "#ff0000"]],bar_min=0.1,bar_max=1)
# neg_bar = Colorbar(title="Δ corr neg",colorscale=[[0, "#0000ff"], [1, "#87ceff"]],bar_min=-1,bar_max=-0.1)

# Red & Blue gradient
# pos_bar = Colorbar(title="Δ corr pos",colorscale=[[0, col.to_hex(red_list[grad_1])], [1, col.to_hex(red_list[grad_2])]],bar_min=0.1,bar_max=1)
# neg_bar = Colorbar(title="Δ corr neg",colorscale=[[0, col.to_hex(blu_list[grad_2])], [1, col.to_hex(blu_list[grad_1])]],bar_min=-1,bar_max=-0.1)

# Blue & Orange whisper notebook
# pos_bar = Colorbar(title="Δ corr pos",colorscale=[[0, "rgb(240,248,255)"], [1, "rgb(0,0,255)"]],bar_min=0,bar_max=1)
# neg_bar = Colorbar(title="Δ corr neg",colorscale=[[0, "rgb(255, 127, 14)"], [1, "rgb(255,248,240)"]],bar_min=-1,bar_max=0)

# Blue & Orange gradient (active)
pos_bar = Colorbar(title="Δ corr pos",colorscale=[[0, col.to_hex(blu_list[grad_1])], [1, col.to_hex(blu_list[grad_2])]],bar_min=0.1,bar_max=1)
neg_bar = Colorbar(title="Δ corr neg",colorscale=[[0, col.to_hex(ora_list[grad_2])], [1, col.to_hex(ora_list[grad_1])]],bar_min=-1,bar_max=-0.1)

# Green & Purple
# pos_bar = Colorbar(title="Δ corr pos",colorscale=[[0, "rgb(135,206,1)"],[1, "rgb(0,102,0)"]],bar_min=0,bar_max=1)
# neg_bar = Colorbar(title="Δ corr neg",colorscale=[[0, "rgb(128,0,128)"],[1, "rgb(238,130,238)"]],bar_min=-1,bar_max=0)

# Colour bars interleaved with split values: insig | -100 | negative | 0 | positive
args.color_split = [zero_bar,-100,neg_bar,0,pos_bar]
# args.color_split = [neg_bar,0,pos_bar]

for key in args.keys: # comp/prod
    # Column labels are passed as a list (not a tuple) to select two columns
    df_plot = chosen_df.loc[chosen_df.key == key, ["electrode", "effect"]]
    fig = make_brainmap(args, df_plot, args.outfile % key) # plot png

Getting effect for Glove-concats (area difference)

In [None]:
def get_part_df(label):  # get partial df
    """Extract the rows of one label from the module-level `chosen_df`.

    `chosen_df` must still carry its four-level index whose first level is
    "label"; that level is dropped from the result.

    Returns:
        (frame, electrodes): a copy of the label's rows sorted by index, and
        the "electrode" level values in their order before sorting.
    """
    label_rows = chosen_df.loc[pd.IndexSlice[label, :, :, :], :].copy()
    label_rows.index = label_rows.index.droplevel("label")
    electrodes = label_rows.index.get_level_values("electrode").tolist()
    return label_rows.sort_index(), electrodes

chosen_lags = np.arange(-500,-99,25) # before onset
# chosen_lags = np.arange(100,501,25) # after onset
lags_show = np.arange(-10000,10001,25)
# Translate the lag values (ms) into their positions on the full lag grid,
# which are used as column labels of `df` below
chosen_lags = [idx for (idx, lag) in enumerate(lags_show) if lag in chosen_lags]
# .copy(): the "area" column added below must not be written into a slice of
# `df` (avoids pandas' SettingWithCopyWarning)
chosen_df = df.loc[:,chosen_lags].copy()
x_vals = [lags_show[lag] / 1000 for lag in chosen_lags]

# Get Effect: per row, sum of the correlations inside the window
# chosen_df["area"] = np.trapz(chosen_df, x=x_vals, axis=1) # get area
chosen_df["area"] = chosen_df.loc[:,chosen_lags].sum(axis=1) # get sum
df1, _ = get_part_df("n+4") # get first encoding
df2, _ = get_part_df("n+3") # get second encoding
df3, _ = get_part_df("n+2") # get third encoding
df4, _ = get_part_df("n+1") # get fourth encoding
df5, _ = get_part_df("n") # get fifth encoding
# Collect every label's area next to the "n+4" rows (aligned on the shared index)
df1["area2"] = df2["area"]
df1["area3"] = df3["area"]
df1["area4"] = df4["area"]
df1["area5"] = df5["area"]
# df1.loc[:, "effect1"] = (df1["area"] - df1["area2"]) / (df1["area"] + df1["area2"])
# df1.loc[:, "effect2"] = (df1["area2"] - df1["area3"]) / (df1["area2"] + df1["area3"])
# df1.loc[:, "effect3"] = (df1["area3"] - df1["area4"]) / (df1["area3"] + df1["area4"])
# df1.loc[:, "effect4"] = (df1["area4"] - df1["area5"]) / (df1["area4"] + df1["area5"])
# df1.loc[:, "effect"] = df1.effect1 + df1.effect2 + df1.effect3 + df1.effect4
# Effect: ("n+4" area - "n" area), normalised by the larger of the two
df1.loc[:, "effect"] = (df1["area"] - df1["area5"]) / df1[["area", "area5"]].max(axis=1) # diff
# df1.loc[:, "effect"] = df1["area"] - df1["area5"] # diff

# sig test: per row, independent t-test between "n+4" and "n" across the window lags
threshold = 0.01
for row, _ in df1.iterrows():
    r = ttest_ind(df1.loc[row,chosen_lags], df5.loc[row,chosen_lags], alternative="two-sided")
    df1.loc[row, "sig"] = r[1]

# fdr, separately for comprehension and production
idx = pd.IndexSlice
df1.loc[idx[:, "comp", :], "sig"] = fdr(df1.loc[idx[:, "comp", :], "sig"])
df1.loc[idx[:, "prod", :], "sig"] = fdr(df1.loc[idx[:, "prod", :], "sig"])

# Number of non-significant rows; here they are dropped rather than flagged
print(sum(df1.sig > threshold))
# df1.loc[df1.sig > threshold, "effect"] = -1000
# .copy(): the in-place reset_index below must not act on a filtered slice
df1 = df1[df1.sig <= threshold].copy()

chosen_df = df1
chosen_df.reset_index(inplace=True)

In [None]:
class Args(argparse.Namespace):
    """Fixed settings for the brain-map plots; an instance is passed to ``make_brainmap``."""

    # directory the coordinate and brain surface files are loaded from
    main_dir = "../data/plotting/brainplot/"
    project = "tfs"
    # subjects
    sid = [625, 676, 7170, 798]
    # comprehension and/or production
    keys = ["comp", "prod"]
    # average brain
    brain_type = "ave"
    # only plot left hemisphere
    hemisphere = "left"
    # output path template; "%s" is filled with the key
    outfile = "../tfs-diff_%s.svg"


args = Args()

# Customize Your Color Split Here
# grad_1 / grad_2 are positions inside the 1000-step gradients gre_list / pur_list.
grad_1 = 400
grad_2 = 900
# Colour for insignificant electrodes (light grey; "#ffffff" was the discarded
# alternative and used to be assigned right before being overwritten).
insig = "#ebebeb"
# Only referenced by the commented-out three-way color_split further down.
zero_bar = Colorbar(title="insig",colorscale=[[0,insig],[1,insig]],bar_min=-1000,bar_max=-1000)

# Green & Purple whisper notebook
# pos_bar = Colorbar(title="Δ corr pos",colorscale=[[0, "rgb(135,206,1)"],[1, "rgb(0,102,0)"]],bar_min=0.05,bar_max=0.5)
# neg_bar = Colorbar(title="Δ corr neg",colorscale=[[0, "rgb(128,0,128)"],[1, "rgb(238,130,238)"]],bar_min=-0.5,bar_max=-0.05)

# Green & Purple whisper
# pos_bar = Colorbar(title="Δ corr pos",colorscale=[[0, "#00fa9a"],[1, "#006400"]],bar_min=0.1,bar_max=1)
# neg_bar = Colorbar(title="Δ corr neg",colorscale=[[0, "#800080"],[1, "#ee82ee"]],bar_min=-1,bar_max=0.1)

# Green & Purple gradient
pos_bar = Colorbar(title="Δ corr pos",colorscale=[[0, col.to_hex(gre_list[grad_1])],[1, col.to_hex(gre_list[grad_2])]],bar_min=0.05,bar_max=0.5)
neg_bar = Colorbar(title="Δ corr neg",colorscale=[[0, col.to_hex(pur_list[grad_2])],[1, col.to_hex(pur_list[grad_1])]],bar_min=-0.5,bar_max=-0.05)
# args.color_split = [zero_bar,-100,neg_bar,0,pos_bar]
# Negative effects use neg_bar, positive effects use pos_bar, split at 0.
args.color_split = [neg_bar,0,pos_bar]

for key in args.keys: # comp/prod
    # A list (not a tuple) is the unambiguous way to select several columns with .loc.
    df_plot = chosen_df.loc[chosen_df.key == key, ["electrode", "effect"]]
    # Output path is args.outfile with the key substituted.
    fig = make_brainmap(args, df_plot, args.outfile % key)

## Word Gap

In [None]:
# Load the word-gap datums: the full set plus the top / bottom groups ("_a" files).
# Alternative inputs that can be swapped in for the top / bottom groups:
#   df_top_na.pkl / df_bot_na.pkl, or df_top.pkl / df_bot.pkl
# (a "mid 30%" group from df_mid.pkl is currently not loaded).
word_gap_df, word_gap_df_top, word_gap_df_bot = (
    load_pickle(pkl_path)
    for pkl_path in (
        "../data/plotting/paper-prob-improb/datums/df.pkl",
        "../data/plotting/paper-prob-improb/datums/df_top_a.pkl",
        "../data/plotting/paper-prob-improb/datums/df_bot_a.pkl",
    )
)

# Tag each group so the rows stay identifiable after any later concatenation.
word_gap_df_top["pred"] = "top 30%"
word_gap_df_bot["pred"] = "bot 30%"

In [None]:
# Restrict both groups to word gaps inside the closed interval [0, 300].
gap_top = word_gap_df_top[word_gap_df_top.word_gap.between(0, 300)]
gap_bot = word_gap_df_bot[word_gap_df_bot.word_gap.between(0, 300)]

# Subject id -> label used on the plot.
speaker_dict = {625: "Speaker 1", 676: "Speaker 2", 7170: "Speaker 3", 798: "Speaker 4"}

# Parallel lists: one entry per (speaker label, word type) combination.
means, sems, speakers, gap_type = [], [], [], []

for gap_df, gap_label in ((gap_top, "probable"), (gap_bot, "improbable")):
    # Words spoken by "Speaker1", one entry per subject ...
    for sid, speaker_name in speaker_dict.items():
        gap_plot = gap_df[(gap_df.sid == sid) & (gap_df.speaker == "Speaker1")]
        means.append(gap_plot.word_gap.mean())
        sems.append(gap_plot.word_gap.sem())
        speakers.append(speaker_name)
        gap_type.append(gap_label)
    # ... followed by a single pooled entry for every other speaker.
    gap_plot = gap_df[gap_df.speaker != "Speaker1"]
    means.append(gap_plot.word_gap.mean())
    sems.append(gap_plot.word_gap.sem())
    speakers.append("Other")
    gap_type.append(gap_label)


In [None]:
# Same summary as for the word gap, but on word_len; only non-negative lengths are kept.
gap_top = word_gap_df_top.loc[word_gap_df_top.word_len.ge(0)]
gap_bot = word_gap_df_bot.loc[word_gap_df_bot.word_len.ge(0)]

# Subject id -> label used on the plot.
speaker_dict = {625: "Speaker 1", 676: "Speaker 2", 7170: "Speaker 3", 798: "Speaker 4"}

# Parallel lists: one entry per (speaker label, word type) combination.
means, sems, speakers, gap_type = [], [], [], []

for gap_df, gap_label in ((gap_top, "probable"), (gap_bot, "improbable")):
    # Words spoken by "Speaker1", one entry per subject ...
    for sid, speaker_name in speaker_dict.items():
        gap_plot = gap_df[(gap_df.sid == sid) & (gap_df.speaker == "Speaker1")]
        means.append(gap_plot.word_len.mean())
        sems.append(gap_plot.word_len.sem())
        speakers.append(speaker_name)
        gap_type.append(gap_label)
    # ... followed by a single pooled entry for every other speaker.
    # The "z" prefix makes this label sort after "Speaker 1".."Speaker 4".
    gap_plot = gap_df[gap_df.speaker != "Speaker1"]
    means.append(gap_plot.word_len.mean())
    sems.append(gap_plot.word_len.sem())
    speakers.append("zOther")
    gap_type.append(gap_label)

In [None]:
# Long-format summary built from the parallel lists of the statistics cell.
results_df = pd.DataFrame(
    dict(mean=means, sem=sems, Speakers=speakers, word_type=gap_type)
)

# Wide format: one row per speaker label, one column per word type.
pivot_axes = dict(index="Speakers", columns="word_type")
dfp = results_df.pivot(values="mean", **pivot_axes)
yerr = results_df.pivot(values="sem", **pivot_axes)

# NOTE(review): colours are assigned in column order of dfp; confirm which
# word type ends up blue and which red.
colors = ["blue", "red"]
dfp.plot.bar(
    yerr=yerr,
    rot=0,
    color=colors,
    error_kw={"ecolor": "black", "elinewidth": 1, "capsize": 1},
)

# Save the grouped bar chart, then release the figure.
plt.savefig("../whisker.jpeg")
plt.close()

In [None]:
# gap = word_gap_df[(word_gap_df.word_gap >= 0) & (word_gap_df.word_gap <= 300)]
# gap_top = word_gap_df_top[(word_gap_df_top.word_gap >= 0) & (word_gap_df_top.word_gap <= 300)]
# gap_mid = word_gap_df_mid[(word_gap_df_mid.word_gap >= 0) & (word_gap_df_mid.word_gap <= 300)]
# gap_bot = word_gap_df_bot[(word_gap_df_bot.word_gap >= 0) & (word_gap_df_bot.word_gap <= 300)]

# gap_top_prod = gap_top[gap_top.speaker == "Speaker1"]
# gap_top_comp = gap_top[gap_top.speaker != "Speaker1"]
# gap_mid_prod = gap_mid[gap_mid.speaker == "Speaker1"]
# gap_mid_comp = gap_mid[gap_mid.speaker != "Speaker1"]
# gap_bot_prod = gap_bot[gap_bot.speaker == "Speaker1"]
# gap_bot_comp = gap_bot[gap_bot.speaker != "Speaker1"]

# gap_prod = pd.concat((gap_top_prod, gap_mid_prod, gap_bot_prod))
# gap_comp = pd.concat((gap_top_comp, gap_mid_comp, gap_bot_comp))
# gap_prod.reset_index(drop=True, inplace=True)
# gap_comp.reset_index(drop=True, inplace=True)

# fig, axes = plt.subplots(1, 2, figsize=(20, 10))
# cols = {"top 30%":"r", "bot 30%" : "b", "mid 30%": "orange"}
# sns.boxplot(x="pred",y="word_gap_log",data=gap_comp, ax=axes[0], showfliers = False, palette=cols, linewidth=2)
# sns.boxplot(x="pred",y="word_gap_log",data=gap_prod, ax=axes[1], showfliers = False, palette=cols, linewidth=2)
# plt.savefig(f"../whisker_word_gap_log.jpeg")

# cols = {"top 30%":"r", "bot 30%" : "b", "mid 30%": "orange"}
# sns.displot(data=gap_comp, x="word_gap", hue="pred",kind="kde", fill=True)
# # sns.displot(data=gap_prod, x="word_gap", hue="pred", ax=axes[1])
# plt.savefig(f"../density.jpeg")

In [None]:
# gap_top_prod is only assigned inside the commented-out cell above, so on a
# fresh kernel this cell raised NameError. Rebuild it here exactly as that cell
# defined it: "top" words with a word gap in [0, 300], spoken by "Speaker1".
gap_top_prod = word_gap_df_top[
    (word_gap_df_top.word_gap >= 0)
    & (word_gap_df_top.word_gap <= 300)
    & (word_gap_df_top.speaker == "Speaker1")
]
gap_top_prod.true_pred_prob.describe()

## Aligned Num-aligned exploration

In [None]:
# Load the top / bottom datums in their three variants ("_a", "_na", no suffix).
df_top_a = load_pickle("../data/plotting/paper-prob-improb/datums/df_top_a.pkl")
df_bot_a = load_pickle("../data/plotting/paper-prob-improb/datums/df_bot_a.pkl")
df_top_na = load_pickle("../data/plotting/paper-prob-improb/datums/df_top_na.pkl")
df_bot_na = load_pickle("../data/plotting/paper-prob-improb/datums/df_bot_na.pkl")
df_top = load_pickle("../data/plotting/paper-prob-improb/datums/df_top.pkl")
df_bot = load_pickle("../data/plotting/paper-prob-improb/datums/df_bot.pkl")

# Tag every frame with the variant it came from.
for variant_df, variant_name in (
    (df_top_a, "aligned"),
    (df_bot_a, "aligned"),
    (df_top_na, "num-aligned"),
    (df_bot_na, "num-aligned"),
    (df_top, "original"),
    (df_bot, "original"),
):
    variant_df["pred"] = variant_name

# Stack the variants and keep each word occurrence once. Because of
# keep="first", a duplicated occurrence keeps the tag of the earliest frame in
# the concat order: num-aligned, then aligned, then original.
word_occurrence = ["sid", "adjusted_onset", "adjusted_offset", "word"]
df_top_plot = pd.concat((df_top_na, df_top_a, df_top)).drop_duplicates(
    subset=word_occurrence, keep="first"
)
df_bot_plot = pd.concat((df_bot_na, df_bot_a, df_bot)).drop_duplicates(
    subset=word_occurrence, keep="first"
)

In [None]:
pred_types = ["original", "num-aligned", "aligned"]

# For every variant tag, write the word frequencies (most frequent first) of the
# top and bottom groups to one CSV each.
for pred_type in pred_types:
    df_top_pred = df_top_plot.loc[df_top_plot.pred == pred_type]
    top_word_counts = df_top_pred.groupby(df_top_pred["word"]).size()
    top_word_counts.sort_values(ascending=False).to_csv(
        f"../data/plotting/paper-prob-improb/datums/top-{pred_type}.csv"
    )

    df_bot_pred = df_bot_plot.loc[df_bot_plot.pred == pred_type]
    bot_word_counts = df_bot_pred.groupby(df_bot_pred["word"]).size()
    bot_word_counts.sort_values(ascending=False).to_csv(
        f"../data/plotting/paper-prob-improb/datums/bot-{pred_type}.csv"
    )


In [None]:
# One figure per subject and per group (top / bottom): true_pred_rank against
# adjusted_onset on a log y-axis, coloured by the "pred" tag. Left panel holds
# the words not spoken by "Speaker1" (comp), right panel the words spoken by
# "Speaker1" (prod).
for sid in df_top_plot.sid.unique():
    for group_name, group_df in (("top", df_top_plot), ("bot", df_bot_plot)):
        sid_df = group_df[group_df.sid == sid]
        panel_dfs = (
            sid_df[sid_df.speaker != "Speaker1"],  # comp
            sid_df[sid_df.speaker == "Speaker1"],  # prod
        )
        fig, axes = plt.subplots(1, 2, figsize=(20, 10))
        for ax, panel_df in zip(axes, panel_dfs):
            ax.set(yscale="log")
            sns.scatterplot(
                data=panel_df,
                x="adjusted_onset",
                y="true_pred_rank",
                hue="pred",
                linewidth=0,
                s=5,
                markers=["o"],
                ax=ax,
            )
        fig.savefig(f"../{sid}-{group_name}.jpeg")
        # Two figures are created per subject; close each one after saving so
        # they do not pile up in memory (same save-then-close pattern as the
        # whisker plot). The figure is therefore only available as a file.
        plt.close(fig)

## Predictions

In [None]:
# Load the prediction datums: the full set plus the mid / top / bottom groups.
df_pred, df_pred_mid, df_pred_top, df_pred_bot = (
    load_pickle(pkl_path)
    for pkl_path in (
        "../data/plotting/paper-prob-improb/datums/df_pred.pkl",
        "../data/plotting/paper-prob-improb/datums/df_pred_mid.pkl",
        "../data/plotting/paper-prob-improb/datums/df_pred_top.pkl",
        "../data/plotting/paper-prob-improb/datums/df_pred_bot.pkl",
    )
)

# Tag each group with its label.
df_pred_top["pred"] = "top 30%"
df_pred_mid["pred"] = "mid 30%"
df_pred_bot["pred"] = "bot 30%"

In [None]:
def separate_preds(df, n_preds=20):
    """Expand the per-row prediction sequences into one column per position.

    For every position ``i`` in ``range(n_preds)`` two columns are added to
    ``df`` in place: ``true_pred_prob_{i}`` and ``true_pred_rank_{i}``, holding
    element ``i`` of that row's ``true_pred_prob`` / ``true_pred_rank`` cell.
    A cell that is shorter than ``i + 1`` elements, or that is not a sequence
    at all (e.g. ``None`` or NaN), yields a missing value instead of raising.

    Args:
        df: DataFrame with ``true_pred_prob`` and ``true_pred_rank`` columns
            whose cells are indexable sequences.
        n_preds: Number of leading positions to expand. Defaults to 20, the
            value that used to be hard-coded.

    Returns:
        The same ``df`` object, with the new columns added.
    """

    def nth_or_none(seq, pos):
        # len() raises TypeError for None / float cells; treat those as "no
        # prediction at this position".
        try:
            return seq[pos] if len(seq) > pos else None
        except TypeError:
            return None

    for i in range(n_preds):
        df[f"true_pred_prob_{i}"] = df.true_pred_prob.apply(lambda seq: nth_or_none(seq, i))
        df[f"true_pred_rank_{i}"] = df.true_pred_rank.apply(lambda seq: nth_or_none(seq, i))
    return df


# Add the per-position probability / rank columns to every prediction frame
# (processed in the order: all, top, bot, mid).
df_pred, df_pred_top, df_pred_bot, df_pred_mid = map(
    separate_preds, (df_pred, df_pred_top, df_pred_bot, df_pred_mid)
)

In [None]:
# Mean true_pred_prob per prediction position (with a +/- SEM band) for each
# group. Left panel: words not spoken by "Speaker1" (comp); right panel: words
# spoken by "Speaker1" (prod).
fig, axes = plt.subplots(1, 2, figsize=(20, 10))

df_dict = {
    "all": df_pred,
    "top": df_pred_top,
    "mid": df_pred_mid,
    "bot": df_pred_bot,
}

# One colour per df_dict entry. The last entry used to be "9467bd" (no leading
# "#"), which is not a valid colour and would fail as soon as a fifth group is added.
colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd"]

for idx, df_name in enumerate(df_dict):
    df_plot = df_dict[df_name]
    panel_dfs = (
        df_plot[df_plot.speaker != "Speaker1"],  # comp
        df_plot[df_plot.speaker == "Speaker1"],  # prod
    )
    for ax, panel_df in zip(axes, panel_dfs):
        probs = panel_df.filter(like="true_pred_prob_", axis=1)
        prob_mean = probs.mean(skipna=True)
        prob_sem = probs.sem(skipna=True)
        # x positions follow the number of expanded columns instead of a fixed 20.
        positions = np.arange(len(prob_mean))
        ax.plot(
            positions,
            prob_mean,
            label=df_name,
            marker="o",
            color=colors[idx],
        )
        ax.fill_between(
            positions,
            prob_mean - prob_sem,
            prob_mean + prob_sem,
            alpha=0.2,
            color=colors[idx],
        )

for ax in axes:
    ax.legend(loc="upper right", frameon=False)
plt.savefig("../pred.jpeg")

In [None]:
# Median true_pred_rank per prediction position for each group. Left panel:
# words not spoken by "Speaker1" (comp); right panel: words spoken by
# "Speaker1" (prod). No error band is drawn for the medians, so the mean / SEM
# that used to be computed here (and never used) are no longer calculated.
fig, axes = plt.subplots(1, 2, figsize=(20, 10))

df_dict = {
    "all": df_pred,
    "top": df_pred_top,
    "mid": df_pred_mid,
    "bot": df_pred_bot,
}

# One colour per df_dict entry. The last entry used to be "9467bd" (no leading
# "#"), which is not a valid colour and would fail as soon as a fifth group is added.
colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd"]

for idx, df_name in enumerate(df_dict):
    df_plot = df_dict[df_name]
    panel_dfs = (
        df_plot[df_plot.speaker != "Speaker1"],  # comp
        df_plot[df_plot.speaker == "Speaker1"],  # prod
    )
    for ax, panel_df in zip(axes, panel_dfs):
        rank_median = panel_df.filter(like="true_pred_rank_", axis=1).median(skipna=True)
        ax.plot(
            # x positions follow the number of expanded columns instead of a fixed 20.
            np.arange(len(rank_median)),
            rank_median,
            label=df_name,
            marker="o",
            color=colors[idx],
        )

for ax in axes:
    ax.legend(loc="lower right", frameon=False)
plt.savefig("../rank.jpeg")

In [None]:
# Share of rows whose true_pred_rank is at most rank_filter, per prediction
# position and group. Left panel: words not spoken by "Speaker1" (comp); right
# panel: words spoken by "Speaker1" (prod).
fig, axes = plt.subplots(1, 2, figsize=(20, 10))

df_dict = {
    "all": df_pred,
    "top": df_pred_top,
    "mid": df_pred_mid,
    "bot": df_pred_bot,
}

# One colour per df_dict entry. The last entry used to be "9467bd" (no leading
# "#"), which is not a valid colour and would fail as soon as a fifth group is added.
colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd"]
rank_filter = 5

for idx, df_name in enumerate(df_dict):
    df_plot = df_dict[df_name]
    panel_dfs = (
        df_plot[df_plot.speaker != "Speaker1"],  # comp
        df_plot[df_plot.speaker == "Speaker1"],  # prod
    )
    for ax, panel_df in zip(axes, panel_dfs):
        ranks = panel_df.filter(like="true_pred_rank_", axis=1)
        # Masking leaves NaN wherever the rank is above rank_filter (or missing);
        # count() then counts the hits per position.
        # NOTE(review): the denominator is the total number of rows, so rows
        # without a value at a given position count as misses -- confirm this
        # is the intended definition.
        hit_rate = ranks[ranks <= rank_filter].count() / len(ranks)
        ax.plot(
            # x positions follow the number of expanded columns instead of a fixed 20.
            np.arange(len(hit_rate)),
            hit_rate,
            label=df_name,
            marker="o",
            color=colors[idx],
        )

for ax in axes:
    ax.legend(loc="upper right", frameon=False)
plt.savefig("../accuracy.jpeg")