# About

This notebook contains the main evaluations of the models.

Credit to Garg et al. for the original code, which we modified. The original source can be found [here](https://github.com/dtsip/in-context-learning).

In [5]:
from collections import OrderedDict
import re
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from tqdm.notebook import tqdm

from eval import get_run_metrics, read_run_dir, get_model_from_run
from plot_utils import basic_plot, collect_results, relevant_model_names

%matplotlib inline
%load_ext autoreload
%autoreload 2

sns.set_theme('notebook', 'darkgrid')
palette = sns.color_palette('colorblind')

run_dir = "../models"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Build the table of runs found under run_dir. `df` is reused by the figure
# cells below as the input to collect_results.
df = read_run_dir(run_dir)
df  # bare last expression so the notebook renders the table of runs in run_dir

In [None]:
# Task and run identifiers used when generating the figures.
task = "kernel_regression"
run_id_nanogpt, run_id_mamba = "nanogpt", "mamba"

# Each run directory is laid out as <run_dir>/<task>/<run_id>.
run_path_nanogpt = os.path.join(run_dir, task, run_id_nanogpt)
run_path_mamba = os.path.join(run_dir, task, run_id_mamba)

# Running Metrics

Recomputing metrics: we run nanoGPT and Mamba on the same dataset so that the comparison is fair, and we also add the distribution-shift evaluations.

In [None]:
# Both run directories are passed in a single call.
# NOTE(review): presumably this is what lets the two models be scored on the
# same data -- confirm against eval.get_run_metrics.
get_run_metrics([run_path_nanogpt, run_path_mamba])

# Figure 1: Context Window Size vs. Squared Error

In [None]:

# Figure 1 is drawn for the nanoGPT run. `run_id` has to be bound in this cell:
# both `valid_row` and the `relevant_model_names` lookup read it, and no earlier
# cell defines it, so a fresh "Restart & Run All" would otherwise raise NameError
# (or silently reuse whatever run_id a later cell left behind).
run_id = run_id_nanogpt

def valid_row(r):
    """Return True when `r` has the current `task` and `run_id`.

    Passed to collect_results as its row filter.
    """
    return r.task == task and r.run_id == run_id

metrics = collect_results(run_dir, df, valid_row=valid_row)
# Only the config is needed here, so the model itself is discarded.
_, conf = get_model_from_run(run_path_nanogpt, only_conf=True)
n_dims = conf.model.n_dims

# relevant_model_names is keyed by "<task>_<run_id>".
models = relevant_model_names[task + "_" + run_id]
basic_plot(metrics["standard"], models=models)
plt.show()

In [None]:
# Same figure for the Mamba run.
run_id = "mamba"  # if you train more models, replace with the run_id from the table above

# Derived from run_id so the path follows whichever run is selected above.
run_path = os.path.join(run_dir, task, run_id)

def valid_row(r):
    """Return True when `r` has the current `task` and `run_id`.

    Passed to collect_results as its row filter.
    """
    return r.task == task and r.run_id == run_id

metrics = collect_results(run_dir, df, valid_row=valid_row)
# Only the config is needed here, so the model itself is discarded.
_, conf = get_model_from_run(run_path, only_conf=True)
n_dims = conf.model.n_dims

# relevant_model_names is keyed by "<task>_<run_id>".
models = relevant_model_names[f"{task}_{run_id}"]
basic_plot(metrics["standard"], models=models)
plt.show()

# Figure 2: Context Window Size vs. Distribution Shifts

# Figure 3: ICL Regression Score