## ConcatNet Learning Curve analysis

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

import os
import json
import torch
import pickle
from datetime import datetime
import pytz
import dateutil
from dateutil.relativedelta import relativedelta
from tqdm import tqdm

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Logging configuration; execute this cell a single time per kernel session.
import logging

# When True, logging is configured later by cbrec's own set_up_logging() helper
# and the manual configuration below is skipped.
use_cbrec_logging = True
if not use_cbrec_logging:
    # Reference example of configuring the root logger by hand:
    # everything at DEBUG and above goes to a stream handler.
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)
    console_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    )

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    # Each execution would attach one more handler to the root logger.
    root_logger.addHandler(console_handler)

In [None]:
# Resolve the repository root by shelling out to git. The IPython `!` capture
# yields the command output as a list of lines.
from pathlib import Path
git_root_dir = !git rev-parse --show-toplevel
# Keep only the first output line, stripped of whitespace, as a Path.
git_root_dir = Path(git_root_dir[0].strip())
# Trailing expression so the notebook displays the resolved path.
git_root_dir

In [None]:
import sys
# Add <repo root>/src to the import path before importing cbrec
# (presumably where the cbrec package lives — confirm).
sys.path.append(os.path.join(git_root_dir, 'src'))
import cbrec.genconfig
config = cbrec.genconfig.Config()
import cbrec.evaluation
import cbrec.reccontext
import cbrec.featuredb
import cbrec.torchmodel
import cbrec.utils
import cbrec.logutils
import cbrec.feature_loader
# Configure logging through cbrec's helper (see use_cbrec_logging in the logging cell).
cbrec.logutils.set_up_logging()
# turn off matplotlib logging
# which can be quite verbose and usually is not useful
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

In [None]:
# Outputs directory of the concatnet_20220308033431 modeling run analyzed below.
output_dir = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/torch_experiments/modeling/concatnet_20220308033431/outputs"
# Stop here if the directory is not reachable from this machine.
assert os.path.exists(output_dir)

In [None]:
# Shell out to list the modeling directories whose names start with "adam".
!ls /home/lana/shared/caringbridge/data/projects/recsys-peer-match/torch_experiments/modeling/adam*

In [None]:
from glob import glob
def identify_model_filepaths(model_dir):
    """Return the paths of all ``.json`` files directly inside ``model_dir``.

    The paths are returned in sorted order so that repeated runs (and code
    that picks "the first model") see the files in the same sequence;
    ``glob`` by itself makes no ordering guarantee.

    Args:
        model_dir: Directory to search. Only its immediate entries are
            matched; subdirectories are not searched.

    Returns:
        A sorted, non-empty list of path strings, each being ``model_dir``
        joined with a matching file name.

    Raises:
        ValueError: If ``model_dir`` does not exist, or if it contains no
            ``.json`` files.
    """
    logger = logging.getLogger("cbrec.modeling.submitEvalFromDirectory.identify_model_filepaths")
    if not os.path.exists(model_dir):
        raise ValueError(f"Dir '{model_dir}' does not exist.")
    model_filepaths = sorted(glob(os.path.join(model_dir, '*.json')))
    if not model_filepaths:
        raise ValueError(f"No .json files in dir '{model_dir}'.")
    logger.info(f"Identified {len(model_filepaths)} model filepaths in dir {model_dir}.")
    return model_filepaths

In [None]:
# Collect the .json model files found in output_dir.
model_filepaths = identify_model_filepaths(output_dir)
# Trailing expression so the notebook displays how many were found.
len(model_filepaths)

In [None]:
from cbrec.modeling import scorer
from cbrec.modeling import manager

In [None]:
# Load just the first listed model as a sample. Iterating over a one-element
# slice leaves `mm` and `model_filepath` bound for the cells that follow.
for model_filepath in model_filepaths[:1]:
    mm = cbrec.modeling.manager.ModelManager.load_from_filepath(model_filepath)
    # Only the recorded training metrics are requested; the preprocessor and
    # the model state dict are not loaded.
    mm.load_model(
        load_preprocessor=False,
        load_model_state_dict=False,
        load_training_metrics=True,
    )

In [None]:
# Draw the train and validation loss curves of the model manager bound to `mm`.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))

trainer = mm.model_trainer
# Each metrics array is drawn as row 0 (x values) against row 1 (y values).
curve_specs = (
    (trainer.train_metrics, {'color': 'blue', 'label': 'Train Loss', 'alpha': 0.5}),
    (trainer.test_metrics, {'color': 'orange', 'label': 'Validation Loss'}),
)
for curve, line_style in curve_specs:
    ax.plot(curve[0, :], curve[1, :], **line_style)

ax.legend()
ax.set_title("Learning curve for a single model")

plt.show()

In [None]:
# Load every identified model, requesting only its recorded training metrics.
ModelManager = cbrec.modeling.manager.ModelManager
metrics_only_flags = {
    'load_preprocessor': False,
    'load_model_state_dict': False,
    'load_training_metrics': True,
}

mm_list = []
for model_filepath in tqdm(model_filepaths):
    mm = ModelManager.load_from_filepath(model_filepath)
    mm.load_model(**metrics_only_flags)
    mm_list.append(mm)

In [None]:
# Overlay the validation curve of every model in mm_list on one log-scaled axis.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))

# Translucent black lines, so areas where many curves overlap render darker.
for validation_metrics in (m.model_trainer.test_metrics for m in mm_list):
    # Row 0 goes on the x-axis, row 1 on the y-axis.
    x_values, loss_values = validation_metrics[0, :], validation_metrics[1, :]
    ax.plot(x_values, loss_values, color='black', alpha=0.2)

ax.set_yscale('log')
ax.set_xlabel("# of epochs trained")
ax.set_ylabel("Validation loss")
ax.set_title("Learning curve for all models")

plt.show()

In [None]:
# Shell out to list the contents of the adam_randomsearch_experiment_20220213194145 directory.
!ls /home/lana/shared/caringbridge/data/projects/recsys-peer-match/torch_experiments/modeling/adam_randomsearch_experiment_20220213194145/

In [None]:
# Point the analysis at the Adam random-search experiment outputs and reload
# mm_list from there, again requesting only the recorded training metrics.
output_dir = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/torch_experiments/modeling/adam_randomsearch_experiment_20220213194145/outputs"
model_filepaths = identify_model_filepaths(output_dir)

ModelManager = cbrec.modeling.manager.ModelManager
metrics_only_flags = {
    'load_preprocessor': False,
    'load_model_state_dict': False,
    'load_training_metrics': True,
}

mm_list = []
for model_filepath in tqdm(model_filepaths):
    mm = ModelManager.load_from_filepath(model_filepath)
    mm.load_model(**metrics_only_flags)
    mm_list.append(mm)

# Trailing expression so the notebook displays the number of loaded models.
len(mm_list)

In [None]:
# Overlay every model's validation curve on a log-scaled axis, leaving out the
# first two recorded points of each curve (presumably because the early values
# would dominate the y-range — confirm).
fig, ax = plt.subplots(1, 1, figsize=(8, 8))

for validation_metrics in (m.model_trainer.test_metrics for m in mm_list):
    # Row 0 goes on the x-axis, row 1 on the y-axis; columns 0 and 1 are skipped.
    x_values, loss_values = validation_metrics[0, 2:], validation_metrics[1, 2:]
    ax.plot(x_values, loss_values, color='black', alpha=0.2)

ax.set_yscale('log')
ax.set_xlabel("# of epochs trained")
ax.set_ylabel("Validation loss")
ax.set_title("Learning curve for all models")

plt.show()

In [None]:
# Report the lowest validation loss reached by any model, then plot only the
# models whose own best loss lies within a small margin of it.

# Absolute margin above the overall best loss for a model to be plotted.
loss_tolerance = 0.01

# Minimum of row 1 of test_metrics for each model (row 1 is what the plots
# label as validation loss). Computed once and reused for both steps below.
best_model_losses = [np.min(mm.model_trainer.test_metrics[1, :]) for mm in mm_list]

# Take the true minimum instead of starting from a sentinel of 1, which gave
# a wrong answer whenever every model's loss was above 1. NaN minima are
# ignored, as they were by the original `<` comparison.
best_loss = min(
    (loss for loss in best_model_losses if not np.isnan(loss)),
    default=np.inf,
)
print(f"Best loss: {best_loss:.4f}")

fig, ax = plt.subplots(1, 1, figsize=(8, 8))

for mm, best_model_loss in zip(mm_list, best_model_losses):
    if best_model_loss < best_loss + loss_tolerance:
        metrics = mm.model_trainer.test_metrics
        # The first two recorded points are left out of the drawn curve, but
        # they still count toward each model's minimum computed above.
        ax.plot(metrics[0, 2:], metrics[1, 2:], color='black', alpha=0.2)

# The title states the filter so this figure is not mistaken for the
# all-models plot. The y-axis is left linear here.
ax.set_title(f"Learning curves for models within {loss_tolerance} of the best validation loss")

ax.set_ylabel("Validation loss")
ax.set_xlabel("# of epochs trained")

plt.show()