In [None]:
from copy import copy
from functools import partial
import itertools
import json
from pathlib import Path
import re
import sys
sys.path.append("../src")

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as st
import statsmodels.formula.api as smf
from tqdm import tqdm, tqdm_notebook

# Render figures inline in the notebook output.
%matplotlib inline
# Global seaborn theme: white grid, "paper" context with enlarged fonts and thicker lines.
sns.set(style="whitegrid", context="paper", font_scale=3.5, rc={"lines.linewidth": 2.5})
# NOTE(review): recent IPython releases deprecate this import location in favour of
# matplotlib_inline.backend_inline -- confirm against the installed IPython version.
from IPython.display import set_matplotlib_formats
# Emit inline figures as PNG; the SVG alternative below is kept but disabled.
set_matplotlib_formats('png')
#set_matplotlib_formats('svg')

%load_ext autoreload
%autoreload 2
import util

## Data preparation

In [None]:
# Root of the pipeline outputs; decoders, encodings and BERT runs each get a subdirectory.
output_path = Path("..") / "output"
decoder_path, bert_encoding_path, model_path = (
    output_path / subdir for subdir in ("decoders", "encodings", "bert")
)

In [None]:
# One checkpoint identifier per entry of the encodings directory, as parsed by the
# project `util` helper (later cells unpack each identifier as model, run, step).
checkpoints = list(map(util.get_encoding_ckpt_id, bert_encoding_path.iterdir()))

In [None]:
# Unique model names in a stable (sorted) order. `checkpoints` holds one entry per
# encodings directory entry, i.e. one per (model, run, step); without de-duplication each
# model would appear once per checkpoint and every loop over `standard_models` /
# `custom_models` would repeat its work. This matches how `runs` and `checkpoint_steps`
# are built below.
models = sorted(set(model for model, _, _ in checkpoints))
# Models whose name starts with "LM_" are treated as custom; everything else is standard.
standard_models = [model for model in models if not model.startswith("LM_")]
custom_models = [model for model in models if model.startswith("LM_")]

runs = sorted(set(run for _, run, _ in checkpoints))
checkpoint_steps = sorted(set(step for _, _, step in checkpoints))

# Models which should appear in the final report figures
report_models = ["SQuAD", "QQP", "MNLI", "SST", "LM", "LM_scrambled", "LM_scrambled_para", "LM_pos", "glove"]

# Model subsets to render in different report figures, as (name, set of models) pairs.
report_model_sets = [
    ("all", set(report_models)),
    ("standard", set(report_models) & set(standard_models)),
    ("custom", set(report_models) & set(custom_models)),
]

In [None]:
# Gate for the cells that render report figures to disk.
RENDER_FINAL = True
figure_path = Path("..") / "reports" / "figures"
figure_path.mkdir(parents=True, exist_ok=True)

# Fixed color per report model: models in alphabetical order paired with the current
# seaborn palette, so a model keeps its color across figures.
report_hues = {model: hue for model, hue in zip(sorted(report_models), sns.color_palette())}

### Decoder performance metrics

In [None]:
# Load decoder performance data from the decoders output directory via the project
# `util` helper.
decoding_perfs = util.load_decoding_perfs(decoder_path)

In [None]:
# Save perf data as CSV (decoder_perfs.csv) under the output directory.
decoding_perfs.to_csv(output_path / "decoder_perfs.csv")

In [None]:
# Load base decoder data.
# TODO refit to current setup -- need to add this decoder learning to pipeline ..
# NOTE(review): `bert_base_model` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel unless it is defined elsewhere -- confirm.
# Maps (model, run, step, subject) -> performance frame; baseline entries are stored under
# the placeholder key ("_", 1, 0, subject).
base_perfs = {}
base_perf_paths = list(Path("../models/decoders").glob("encodings-CLS.%s*.csv" % bert_base_model))
for base_decoder_perf_path in tqdm_notebook(base_perf_paths):
    # The subject ID is the trailing "-<id>.csv" part of the file name; the one-element
    # unpacking raises if the pattern does not match exactly once.
    subject, = re.findall(r"-([\w\d]+)\.csv$", base_decoder_perf_path.name)
    # Only the listed metric columns are read from the CSV.
    perf = pd.read_csv(base_decoder_perf_path,
                       usecols=["mse", "r2", "rank_median", "rank_mean", "rank_min", "rank_max"])
    base_perfs["_", 1, 0, subject] = perf
    
# Later cells compute deltas against the baseline, so stop early if none was found.
if len(base_perfs) == 0:
    raise RuntimeError("No base model performance found. Stop.")

In [None]:
# Append the baseline rows (keyed by model / run / step / subject) to the main table.
if base_perfs:
    baseline_frame = pd.concat(base_perfs, names=["model", "run", "step", "subject"])
    decoding_perfs = pd.concat([decoding_perfs, baseline_frame])

In [None]:
# # Load comparison model data.
# for other_model in other_models:
#     other_perf_paths = list(Path("../models/decoders").glob("encodings.%s-*.csv" % other_model))
#     for other_perf_path in tqdm_notebook(other_perf_paths, desc=other_model):
#         subject, = re.findall(r"-([\w\d]+)\.csv$", other_perf_path.name)
#         perf = pd.read_csv(other_perf_path,
#                            usecols=["mse", "r2", "rank_median", "rank_mean", "rank_min", "rank_max"])
#         decoding_perfs.loc[other_model, 1, 250, subject] = perf.iloc[0]

### Model performance metrics

In [None]:
# For each model, load checkpoint data: global step, gradient norm information
model_metadata = {}
# SQuAD eval scores keyed by (run, step). They are written into the metadata frames only
# after the loop: the frame for a (model, run) pair is rebuilt on every checkpoint of that
# run, so a value written inside the loop would be discarded by the next iteration.
squad_eval_accuracy = {}
for model, run, step in tqdm_notebook(checkpoints):
    run_dir = model_path / ("%s-%i" % (model, run))

    # Fetch corresponding fine-tuning metadata.
    try:
        metadata = util.load_bert_finetune_metadata(run_dir, step)
    except Exception as e:
        # Still best-effort (a checkpoint without metadata is skipped), but report the
        # failure instead of hiding it.
        print("Failed to load fine-tuning metadata for %s-%i step %i: %r" % (model, run, step, e))
        continue

    if metadata["steps"]:
        model_metadata[model, run] = pd.DataFrame.from_dict(metadata["steps"], orient="index")

    # SQuAD eval results need to be loaded separately, since they run offline.
    if model == "SQuAD":
        pred_dir = output_path / "eval_squad" / ("SQuAD-%i-%i" % (run, step))
        try:
            with (pred_dir / "results.json").open("r") as results_f:
                results = json.load(results_f)
            # best_f1 is divided by 100 before being stored as eval_accuracy.
            squad_eval_accuracy[run, step] = results["best_f1"] / 100.
        except (OSError, ValueError, KeyError, TypeError):
            # Missing file, malformed JSON, or no usable "best_f1" entry.
            print("Failed to retrieve eval data for SQuAD-%i-%i" % (run, step))

# Write the SQuAD scores with a single .loc[row, column] assignment. The previous chained
# form (.loc[step]["eval_accuracy"] = ...) assigned into a temporary row object and is not
# guaranteed to modify the frame.
for (run, step), accuracy in squad_eval_accuracy.items():
    run_metadata = model_metadata.get(("SQuAD", run))
    if run_metadata is None or step not in run_metadata.index:
        print("Failed to retrieve eval data for SQuAD-%i-%i" % (run, step))
        continue
    run_metadata.loc[step, "eval_accuracy"] = accuracy

# Stack the per-run frames into one frame indexed by (model, run, step).
model_metadata = pd.concat(model_metadata, names=["model", "run", "step"], sort=True)

### Putting it all together

In [None]:
# Attach the per-(model, run, step) training metadata to every decoding row, then restore
# the original index levels of the decoding table.
old_index = decoding_perfs.index
df = (
    decoding_perfs
    .reset_index()
    .join(model_metadata, on=["model", "run", "step"])
    .set_index(old_index.names)
)
df.head()

-----------

In [None]:
# Distinct subject identifiers present in the joined frame.
all_subjects = df.index.unique(level="subject")
all_subjects

In [None]:
# Collect the subjects that have a baseline ("_") decoder evaluation.
try:
    subjects_with_baseline = set(decoding_perfs.loc["_", :, :].index.get_level_values("subject"))
except KeyError:
    # No baseline rows at all. Only the lookup failure is caught, so unrelated errors
    # (and KeyboardInterrupt) are no longer swallowed.
    subjects_with_baseline = set()

# The delta columns computed later need a baseline for every subject; fail early if not.
missing_baseline_subjects = set(all_subjects) - subjects_with_baseline
if subjects_with_baseline != set(all_subjects):
    raise ValueError("Cannot proceed. Missing base decoder evaluation for subjects: " + str(missing_baseline_subjects))

### Synthetic columns

In [None]:
# Per (model, run): accuracy relative to the first row of the group, and accuracy
# min-max scaled within the group.
run_groups = ["model", "run"]
df["eval_accuracy_delta"] = df.groupby(run_groups)["eval_accuracy"].transform(
    lambda xs: xs - xs.iloc[0])
df["eval_accuracy_norm"] = df.groupby(run_groups)["eval_accuracy"].transform(
    lambda accs: (accs - accs.min()) / (accs.max() - accs.min()))

In [None]:
def decoding_perf_delta(xs, metric="mse"):
    """Offset a metric series by the baseline value of the same subject.

    xs: one group's values; the subject is read from position 3 of the first index entry.
    metric: column of the baseline ("_", run 1, step 0) row to subtract.

    Returns `xs` minus that baseline value. Reads the notebook-level `df`.
    """
    first_key = xs.index[0]
    subject = first_key[3]
    baseline = df.loc["_", 1, 0, subject][metric]
    return xs - baseline.item()

# Baseline-relative versions of the decoding metrics, computed per (model, run, subject).
for delta_col, source_col in [
    ("decoding_mse_delta", "mse"),
    ("rank_mean_delta", "rank_mean"),
    ("rank_median_delta", "rank_median"),
]:
    df[delta_col] = df.groupby(["model", "run", "subject"])[source_col].transform(
        partial(decoding_perf_delta, metric=source_col))

In [None]:
NUM_BINS = 50
def bin(xs):
    """Assign each value of `xs` to one of NUM_BINS equal-width bins.

    xs: numeric pandas Series (one group when used with groupby().transform).

    Returns NaN if the series contains any missing value; otherwise the integer bin
    index (0 .. NUM_BINS - 1) of every element.

    Note: the name shadows the builtin `bin`; it is kept because other cells refer to it.
    """
    if xs.isnull().values.any(): return np.nan
    # NUM_BINS bins need NUM_BINS + 1 edges (passing NUM_BINS edges gave one bin too few).
    edges = np.linspace(xs.min(), xs.max() + 1e-5, NUM_BINS + 1)
    # Intervals are closed on the right, so without include_lowest the minimum value sits
    # on the excluded left edge of the first bin and would come back as NaN.
    return pd.cut(xs, edges, labels=False, include_lowest=True)
# Accuracy and gradient norms are binned within each model; the MSE delta is binned
# within each subject.
for target_col, group_key, source_col in [
    ("eval_accuracy_bin", "model", "eval_accuracy"),
    ("decoding_mse_bin", "subject", "decoding_mse_delta"),
    ("total_global_norms_bin", "model", "total_global_norms"),
]:
    df[target_col] = df.groupby([group_key])[source_col].transform(bin)

In [None]:
# Smooth selected metrics with a rolling mean inside every (model, run, subject) group.
ROLLING_WINDOW_SIZE = 5
grouped = df.groupby(["model", "run", "subject"])
rolling_source_cols = ["mse", "decoding_mse_delta", "eval_accuracy", "train_loss", "rank_mean", "rank_mean_delta"]
for col in rolling_source_cols:
    # min_periods=1 keeps the first rows of each group instead of turning them into NaN.
    smoothed = grouped[col].transform(
        lambda rows: rows.rolling(ROLLING_WINDOW_SIZE, min_periods=1).mean())
    df["%s_rolling" % col] = smoothed

In [None]:
# Peek at the last rows of the assembled frame.
df.tail()

In [None]:
# ...and at the first rows.
df.head()

In [None]:
# Flat view with the index levels as ordinary columns; the plotting cells below filter
# and plot on these columns (e.g. dfi.step, dfi.model).
dfi = df.reset_index()

## Model training analysis

Let's verify that each model is not overfitting; if it is overfitting, restrict our analysis to just the region before overfitting begins.

In [None]:
# g = sns.FacetGrid(df.reset_index().melt(id_vars=["model", "run", "step"],
#                                         value_vars=["train_loss_rolling", "eval_accuracy_rolling"]),
#                   row="variable", col="model", sharex=True, sharey=False, height=4)
# g.map(sns.lineplot, "step", "value", "run", ci=None)
# g.add_legend()

In [None]:
%matplotlib agg

if RENDER_FINAL:
    # models which appear on left edge of subfigs in paper
    LEFT_EDGE_MODELS = ["QQP", "LM"]
    
    training_fig_path = figure_path / "training"
    training_fig_path.mkdir(exist_ok=True)
    shared_kwargs = {"legend": False, "ci": None}

    for model in tqdm_notebook(report_models):
        f, (loss_fig, acc_fig) = plt.subplots(2, 1, figsize=(10,15), sharex=True)
        try:
            local_data = df.loc[model].reset_index()
        except KeyError:
            print(f"Missing training data for {model}")
            continue
            
        ax = sns.lineplot(data=local_data, x="step", y="train_loss_rolling", hue="run", ax=loss_fig, **shared_kwargs)
        ax.set_ylabel("Training loss\n(rolling window)" if model in LEFT_EDGE_MODELS else "")
        ax.set_xlabel("Training step")
        
        ax = sns.lineplot(data=local_data, x="step", y="eval_accuracy_rolling", hue="run", ax=acc_fig, **shared_kwargs)
        ax.set_ylabel("Validation set accuracy\n(rolling window)" if model in LEFT_EDGE_MODELS else "")
        ax.set_xlabel("Training step")
        
        sns.despine()
        
        plt.tight_layout()
        plt.savefig(training_fig_path / ("%s.pdf" % model))
        plt.close()
%matplotlib inline

## Decoding analyses

In [None]:
# Axis labels for the baseline-relative metrics. Raw strings keep the LaTeX backslash
# literal; in a normal string "\D" is an invalid escape sequence that newer Python
# versions warn about. The resulting text is unchanged.
MSE_DELTA_LABEL = r"$\Delta$(MSE)"
MAR_DELTA_LABEL = r"$\Delta$(MAR)"

### Final state analysis

In [None]:
%matplotlib agg

if RENDER_FINAL:
    final_state_fig_path = figure_path / "final_state"
    final_state_fig_path.mkdir(exist_ok=True)
    metrics = [("decoding_mse_delta", MSE_DELTA_LABEL, None, None),
               ("rank_mean_delta", MAR_DELTA_LABEL, None, None),
               ("mse", "Mean squared error", 0.00335, 0.00385),
               ("rank_mean", "Mean average rank", 20, 95)]
    
    for model_set_name, model_set in report_model_sets:
        final_df = dfi[(dfi.step == checkpoint_steps[-1]) & (dfi.model.isin(model_set))]
        if final_df.empty:
            continue

        for metric, label, ymin, ymax in tqdm_notebook(metrics, desc=model_set_name):
            fig, ax = plt.subplots(figsize=(15, 10))

            # Plot BERT baseline performance.
            if "delta" not in metric:
                # TODO error region instead -- plt.fill_between
                ax.axhline(dfi[dfi.model == "_"][metric].mean(), linestyle="--", color="gray")

            sns.barplot(data=final_df, x="model", y=metric,
                        order=final_df.groupby("model")[metric].mean().sort_values().index,
                        palette=report_hues, ax=ax)

            padding = final_df[metric].var() * 0.005
            plt.ylim((ymin or (final_df[metric].min() - padding), ymax or (final_df[metric].max() + padding)))
            plt.xlabel("Model")
            plt.ylabel(label)
            plt.xticks(rotation=45, ha="right")

            plt.tight_layout()
            plt.savefig(final_state_fig_path / (f"{metric}.{model_set_name}.pdf"))
            #plt.close(fig)
        
%matplotlib inline

In [None]:
%matplotlib agg

if RENDER_FINAL:
    final_state_fig_path = figure_path / "final_state_within_subject"
    final_state_fig_path.mkdir(exist_ok=True)
    metrics = [("decoding_mse_delta", MSE_DELTA_LABEL),
               ("rank_mean_delta", MAR_DELTA_LABEL),
               ("mse", "Mean squared error"),
               ("rank_mean", "Mean average rank")]
    
    for model_set_name, model_set in report_model_sets:
        final_df = dfi[(dfi.step == checkpoint_steps[-1]) & (dfi.model.isin(model_set))]

        for metric, label in tqdm_notebook(metrics, desc=model_set_name):
            fig = plt.figure(figsize=(25, 10))
            sns.barplot(data=final_df, x="model", y=metric, hue="subject",
                        order=final_df.groupby("model")[metric].mean().sort_values().index)
            plt.ylabel(label)
            plt.xticks(rotation=30, ha="right")
            plt.legend(loc="center left", bbox_to_anchor=(1,0.5))
            plt.tight_layout()
            plt.savefig(final_state_fig_path / f"{metric}.{model_set_name}.pdf")
            plt.close(fig)
    
%matplotlib inline

In [None]:
%matplotlib agg

if RENDER_FINAL:
    final_state_fig_path = figure_path / "final_state_within_model"
    final_state_fig_path.mkdir(exist_ok=True)
    metrics = [("decoding_mse_delta", MSE_DELTA_LABEL, None, None),
               ("rank_mean_delta", MAR_DELTA_LABEL, None, None),
               ("mse", "Mean squared error", None, None),
               ("rank_mean", "Mean average rank", None, None)]
    
    subj_order = dfi[(dfi.step == checkpoint_steps[-1]) & (dfi.model.isin(report_model_sets[0][1]))] \
        .groupby("subject")[metrics[0][0]].mean().sort_values().index
    
    for model_set_name, model_set in report_model_sets:
        final_df = dfi[(dfi.step == checkpoint_steps[-1]) & (dfi.model.isin(model_set))]

        for metric, label, ymin, ymax in tqdm_notebook(metrics, desc=model_set_name):
            fig = plt.figure(figsize=(25, 10))
            sns.barplot(data=final_df, x="subject", y=metric, hue="model",
                        order=subj_order)
            
            padding = final_df[metric].var() * 0.005
            plt.ylim((ymin or (final_df[metric].min() - padding), ymax or (final_df[metric].max() + padding)))
            plt.xlabel("Subject")
            plt.ylabel(label)
            
            plt.legend(loc="center left", bbox_to_anchor=(1,0.5))
            plt.tight_layout()
            plt.savefig(final_state_fig_path / f"{metric}.{model_set_name}.pdf")
            plt.close(fig)
    
%matplotlib inline

### Step analysis

In [None]:
# g = sns.FacetGrid(dfi, col="run", size=6)
# g.map(sns.lineplot, "step", "decoding_mse_delta", "model").add_legend()

# plt.xlabel("Fine-tuning step")
# plt.ylabel(MSE_DELTA_LABEL)

In [None]:
# g = sns.FacetGrid(dfi, col="run", size=6)
# g.map(sns.lineplot, "step", "rank_mean_delta", "model").add_legend()

# plt.xlabel("Fine-tuning step")
# plt.ylabel(MAR_DELTA_LABEL)

In [None]:
# Rolling-mean MSE delta over fine-tuning steps, one line per model (all models).
f, ax = plt.subplots(figsize=(15, 10))
sns.lineplot(data=dfi, x="step", y="decoding_mse_delta_rolling", hue="model", ax=ax)

plt.xlabel("Fine-tuning step")
plt.ylabel(MSE_DELTA_LABEL)
# Anchor the legend just outside the upper-right corner of the axes.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Same plot for the rolling-mean rank delta.
f, ax = plt.subplots(figsize=(15, 10))
sns.lineplot(data=dfi, x="step", y="rank_mean_delta_rolling", hue="model", ax=ax)

plt.xlabel("Fine-tuning step")
plt.ylabel(MAR_DELTA_LABEL)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# MSE delta restricted to models whose name starts with "LM" (this includes plain "LM").
f, ax = plt.subplots(figsize=(15, 10))
sns.lineplot(data=dfi[dfi.model.str.startswith("LM")], x="step", y="decoding_mse_delta_rolling", hue="model", ax=ax)

plt.xlabel("Fine-tuning step")
plt.ylabel(MSE_DELTA_LABEL)

In [None]:
# Rank delta for the same "LM"-prefixed models.
f, ax = plt.subplots(figsize=(15, 10))
sns.lineplot(data=dfi[dfi.model.str.startswith("LM")], x="step", y="rank_mean_delta_rolling", hue="model", ax=ax)

plt.xlabel("Fine-tuning step")
plt.ylabel(MAR_DELTA_LABEL)

In [None]:
%matplotlib agg

if RENDER_FINAL:
    trajectory_fig_dir = figure_path / "trajectories"
    trajectory_fig_dir.mkdir(exist_ok=True)
    metrics = [("decoding_mse_delta", MSE_DELTA_LABEL),
               ("rank_mean_delta", MAR_DELTA_LABEL),
               ("decoding_mse_delta_rolling", MSE_DELTA_LABEL),
               ("rank_mean_delta_rolling", MAR_DELTA_LABEL)]

    for model_set_name, model_set in report_model_sets:
        for metric, label in tqdm_notebook(metrics, desc=model_set_name):
            fig = plt.figure(figsize=(18, 10))
            sns.lineplot(data=dfi[dfi.model.isin(model_set) & (~dfi.model.isin(other_models))],
                         x="step", y=metric, hue="model", palette=report_hues)
            plt.xlim((0, checkpoint_steps[-1]))
            plt.xlabel("Fine-tuning step")
            plt.ylabel(label)
            plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
            plt.tight_layout()
            plt.savefig(trajectory_fig_dir / f"{metric}.{model_set_name}.pdf")
            plt.close(fig)
    
%matplotlib inline

In [None]:
# g = sns.FacetGrid(dfi[dfi.model != "_"], col="model", row="run", size=6)
# g.map(sns.lineplot, "step", "decoding_mse_delta", "subject", ci=None).add_legend()

In [None]:
# g = sns.FacetGrid(dfi, col="model", row="run", size=6)
# g.map(sns.lineplot, "step", "rank_median_delta", "subject", ci=None).add_legend()

### Gradient norm analysis

In [None]:
# f, ax = plt.subplots(figsize=(10, 8))
# sns.lineplot(data=dfi, y="decoding_mse_delta", x="total_global_norms_bin", hue="model", ax=ax)
# ax.set_title("Decoding performance delta vs. binned total global gradient norm")
# ax.set_xlabel("Cumulative global gradient norm bin")
# ax.set_ylabel(MSE_DELTA_LABEL)

In [None]:
#g = sns.FacetGrid(dfi, col="model", row="run", size=6, sharex=False, sharey=True)
#g.map(sns.lineplot, "total_global_norms", "decoding_mse_delta", "subject", ci=None).add_legend()

### Eval accuracy analysis

In [None]:
#g = sns.FacetGrid(dfi, col="model", row="run", sharex=False, sharey=True, size=7)
#g.map(sns.lineplot, "eval_accuracy", "decoding_mse_delta", "subject", ci=None).add_legend()

## Per-subject analysis

In [None]:
# Final-step MSE delta per subject: within each (model, run) the per-subject mean is taken,
# then reshaped to long form. The value column is named 0 after reset_index (hence y=0).
f, ax = plt.subplots(figsize=(14, 9))
dff = pd.DataFrame(dfi[dfi.step == checkpoint_steps[-1]].groupby(["model", "run"]).apply(lambda xs: xs.groupby("subject").decoding_mse_delta.mean()).stack()).reset_index()
sns.barplot(data=dff, x="model", hue="subject", y=0, ax=ax)
plt.title("subject final decoding mse delta, averaging across runs")

In [None]:
# Same as above for the mean-rank delta.
f, ax = plt.subplots(figsize=(14, 9))
dff = pd.DataFrame(dfi[dfi.step == checkpoint_steps[-1]].groupby(["model", "run"]).apply(lambda xs: xs.groupby("subject").rank_mean_delta.mean()).stack()).reset_index()
sns.barplot(data=dff, x="model", hue="subject", y=0, ax=ax)
plt.title("subject final rank mean delta, averaging across runs")

In [None]:
# Largest MSE delta each subject reaches over all steps, one value per (model, run).
f, ax = plt.subplots(figsize=(14, 9))
dff = pd.DataFrame(dfi.groupby(["model", "run"]).apply(lambda xs: xs.groupby("subject").decoding_mse_delta.max()).stack()).reset_index()
# NOTE(review): violinplot is not given ax=ax, unlike stripplot below; presumably it draws
# on the current axes, which is the one just created -- confirm.
sns.violinplot(data=dff, x="subject", y=0)
sns.stripplot(data=dff, x="subject", y=0, edgecolor="white", linewidth=1, alpha=0.7, ax=ax)
plt.title("subject max decoding mse delta, averaging across models and runs")

In [None]:
# Smallest MSE delta each subject reaches over all steps, one value per (model, run).
f, ax = plt.subplots(figsize=(14, 9))
dff = pd.DataFrame(dfi.groupby(["model", "run"]).apply(lambda xs: xs.groupby("subject").decoding_mse_delta.min()).stack()).reset_index()
# NOTE(review): as in the previous cell, violinplot is not given ax=ax -- confirm it
# targets the same axes as stripplot.
sns.violinplot(data=dff, x="subject", y=0)
sns.stripplot(data=dff, x="subject", y=0, edgecolor="white", linewidth=1, alpha=0.7, ax=ax)
plt.title("subject min decoding mse delta, averaging across models and runs")

## Statistical analyses

First, some data prep for comparing final vs. start states:

In [None]:
# Rows at the last checkpoint step, with the `step` level dropped from the index.
perf_comp = df.query("step == %i" % checkpoint_steps[-1]).reset_index(level="step", drop=True).sort_index()
# Join data from baseline: the ("_", run 1, step 0) rows, with every column prefixed "start_".
perf_comp = perf_comp.join(df.loc["_", 1, 0].rename(columns=lambda c: "start_%s" % c))
# Same for the glove rows stored under run 1, step 250, with columns prefixed "glove_".
perf_comp = perf_comp.join(df.loc["glove", 1, 250].rename(columns=lambda c: "glove_%s" % c))
perf_comp.head()

In [None]:
# Histogram of the MSE change from baseline to last checkpoint.
(perf_comp.mse - perf_comp.start_mse).plot.hist()

In [None]:
# Flat version (index levels as columns) used by the statistical tests below.
perf_compi = perf_comp.reset_index()

Quantitative tests:
 
1. for any GLUE task g, MSE(g after 250) > MSE(LM)
2. for any LM_scrambled_para task t, MSE(t after 250) < MSE(LM)
3. for any GLUE task g, MAR(g after 250) > MAR(LM)
4. for any LM_scrambled_para task t, MAR(t after 250) < MAR(LM)
5. MSE(LM after 250) =~ MSE(LM)
6. MAR(LM after 250) =~ MAR(LM)
7. for any LM_scrambled_para task t, MSE(t after 250) < MSE(glove)
8. for any LM_scrambled_para task t, MAR(t after 250) < MAR(glove)
9. for any LM_pos task t, MSE(t after 250) > MSE(LM)
10. for any LM_pos task t, MAR(t after 250) > MAR(LM)

### test 1

In [None]:
# Keep every model whose name does not start with "_", "LM" or "glove".
sample = perf_compi[~perf_compi.model.str.startswith(("_", "LM", "glove"))]

In [None]:
# Distribution of last-checkpoint MSE for the selected rows.
sample.mse.hist()

In [None]:
# Distribution of the matching baseline MSE (start_* columns come from the "_" rows).
sample.start_mse.hist()

In [None]:
# Paired t-test: last-checkpoint MSE vs. baseline MSE over the same rows.
st.ttest_rel(sample.mse, sample.start_mse)

### test 1 (split across models)

In [None]:
# Paired t-test of last-checkpoint vs. baseline MSE, one result row per model.
results = []
# `standard_models` is derived from the checkpoint list and can name the same model many
# times; iterate over the distinct names so each model is tested (and reported) once.
for model in sorted(set(standard_models)):
    if model in ["LM", "glove"]: continue
    sample = perf_compi[perf_compi.model == model]
    tval, pval = st.ttest_rel(sample.mse, sample.start_mse)
    results.append((model, tval, pval))

pd.DataFrame(results, columns=["model", "tval", "pval"])

### test 2

In [None]:
# Last-checkpoint rows of the LM_scrambled_para model.
sample = perf_compi[perf_compi.model == "LM_scrambled_para"]

In [None]:
# Distribution of last-checkpoint MSE.
sample.mse.hist()

In [None]:
# Distribution of the matching baseline MSE.
sample.start_mse.hist()

In [None]:
# Paired t-test: last-checkpoint MSE vs. baseline MSE.
st.ttest_rel(sample.mse, sample.start_mse)

### test 3

In [None]:
# Keep every model whose name does not start with "_", "LM" or "glove".
sample = perf_compi[~perf_compi.model.str.startswith(("_", "LM", "glove"))]

In [None]:
# Distribution of last-checkpoint mean rank for the selected rows.
sample.rank_mean.hist()

In [None]:
# Distribution of the matching baseline mean rank.
sample.start_rank_mean.hist()

In [None]:
# Paired t-test: last-checkpoint mean rank vs. baseline mean rank.
st.ttest_rel(sample.rank_mean, sample.start_rank_mean)

### test 3 (split across models)

In [None]:
# Paired t-test of last-checkpoint vs. baseline mean rank, one result row per model.
results = []
# `standard_models` is derived from the checkpoint list and can name the same model many
# times; iterate over the distinct names so each model is tested (and reported) once.
for model in sorted(set(standard_models)):
    if model in ["LM", "glove"]: continue
    sample = perf_compi[perf_compi.model == model]
    tval, pval = st.ttest_rel(sample.rank_mean, sample.start_rank_mean)
    results.append((model, tval, pval))

pd.DataFrame(results, columns=["model", "tval", "pval"])

### test 4

In [None]:
# Last-checkpoint rows of the LM_scrambled_para model.
sample = perf_compi[perf_compi.model == "LM_scrambled_para"]

In [None]:
# Distribution of last-checkpoint mean rank.
sample.rank_mean.hist()

In [None]:
# Distribution of the matching baseline mean rank.
sample.start_rank_mean.hist()

In [None]:
# Paired t-test: last-checkpoint mean rank vs. baseline mean rank.
st.ttest_rel(sample.rank_mean, sample.start_rank_mean)

### test 5

In [None]:
# Last-checkpoint rows of the LM model.
sample = perf_compi[perf_compi.model == "LM"]

In [None]:
# Distribution of last-checkpoint MSE.
sample.mse.hist()

In [None]:
# Distribution of the matching baseline MSE.
sample.start_mse.hist()

In [None]:
# Paired t-test: last-checkpoint MSE vs. baseline MSE.
st.ttest_rel(sample.mse, sample.start_mse)

### test 6

In [None]:
# Last-checkpoint rows of the LM model.
sample = perf_compi[perf_compi.model == "LM"]

In [None]:
# Distribution of last-checkpoint mean rank.
sample.rank_mean.hist()

In [None]:
# Distribution of the matching baseline mean rank.
sample.start_rank_mean.hist()

In [None]:
# Paired t-test: last-checkpoint mean rank vs. baseline mean rank.
st.ttest_rel(sample.rank_mean, sample.start_rank_mean)

### test 7

In [None]:
# Last-checkpoint rows of the LM_scrambled_para model.
sample = perf_compi[perf_compi.model == "LM_scrambled_para"]

In [None]:
# Distribution of last-checkpoint MSE.
sample.mse.hist()

In [None]:
# Distribution of the joined glove MSE (glove_* columns come from the glove rows).
sample.glove_mse.hist()

In [None]:
# Paired t-test: last-checkpoint MSE vs. glove MSE.
st.ttest_rel(sample.mse, sample.glove_mse)

### test 8

In [None]:
# Last-checkpoint rows of the LM_scrambled_para model.
sample = perf_compi[perf_compi.model == "LM_scrambled_para"]

In [None]:
# Distribution of last-checkpoint mean rank.
sample.rank_mean.hist()

In [None]:
# Distribution of the joined glove mean rank.
sample.glove_rank_mean.hist()

In [None]:
# Paired t-test: last-checkpoint mean rank vs. glove mean rank.
st.ttest_rel(sample.rank_mean, sample.glove_rank_mean)

### test 9

In [None]:
# Last-checkpoint rows of the LM_pos model.
sample = perf_compi[perf_compi.model == "LM_pos"]

In [None]:
# Distribution of last-checkpoint MSE.
sample.mse.hist()

In [None]:
# Distribution of the matching baseline MSE.
sample.start_mse.hist()

In [None]:
# Paired t-test: last-checkpoint MSE vs. baseline MSE.
st.ttest_rel(sample.mse, sample.start_mse)

In [None]:
# Per-subject bars of last-checkpoint vs. baseline MSE; melt puts both columns into one
# "value" column tagged by "variable" so they can be drawn side by side.
f = plt.figure(figsize=(20,20))
sns.barplot(data=pd.melt(sample, id_vars=["subject"], value_vars=["mse", "start_mse"]),
            x="subject", y="value", hue="variable")
# Hard-coded y range for this figure.
plt.ylim((0.0033, 0.0038))

### test 10

In [None]:
# Last-checkpoint rows of the LM_pos model.
sample = perf_compi[perf_compi.model == "LM_pos"]

In [None]:
# Distribution of last-checkpoint mean rank.
sample.rank_mean.hist()

In [None]:
# Distribution of the matching baseline mean rank.
sample.start_rank_mean.hist()

In [None]:
# Paired t-test: last-checkpoint mean rank vs. baseline mean rank.
st.ttest_rel(sample.rank_mean, sample.start_rank_mean)