Torch Results Analysis
===

Somewhat temporary notebook for investigating models trained via script.

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

In [None]:
import os
import re
import json
import sys
import pickle
from tqdm import tqdm

import sklearn
import sklearn.linear_model
import sklearn.preprocessing
from sklearn.pipeline import Pipeline

import dateutil.parser
from dateutil.relativedelta import relativedelta
from datetime import datetime, timedelta
import pytz

In [None]:
# HuggingFace packages
import transformers
import tokenizers
import torch

# more torch imports
import torchvision
import torchvision.transforms
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# evaluation
from scipy.stats import rankdata

In [None]:
from pathlib import Path
# IPython shell escape: captures the output lines of `git rev-parse --show-toplevel`
git_root_dir = !git rev-parse --show-toplevel
# Keep only the first output line, stripped, as the repository root path
git_root_dir = Path(git_root_dir[0].strip())
# Display the resolved path as the cell output
git_root_dir

In [None]:
import sys
# Add <git root>/src to the import path (presumably where the cbrec package lives)
sys.path.append(os.path.join(git_root_dir, 'src'))
import cbrec.genconfig

In [None]:
# Build the cbrec configuration object with its default arguments
config = cbrec.genconfig.Config()
# Uncomment the lines below to append "_old" to the metadata and feature DB file paths
#config.metadata_filepath += "_old"
#config.feature_db_filepath += "_old"

In [None]:
import cbrec.featuredb
import cbrec.utils
import cbrec.data
import cbrec.reccontext
import cbrec.evaluation
import cbrec.torchmodel
import cbrec.text.embeddingdb
import cbrec.text.journalid

In [None]:
import cbrec.logutils
# Configure logging through the project's helper
cbrec.logutils.set_up_logging()

In [None]:
# quiet matplotlib logging: only WARNING and above will be emitted by its logger
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

In [None]:
import os
import sys
# caringbridge_core is a separate checkout. Its location can be overridden with
# the CARINGBRIDGE_CORE_PATH environment variable; when unset, the original
# hard-coded path is used, so existing setups behave as before.
caringbridge_core_path = os.environ.get(
    "CARINGBRIDGE_CORE_PATH",
    "/home/lana/levon003/repos/caringbridge_core",
)
# Avoid piling up duplicate sys.path entries when this cell is re-run
if caringbridge_core_path not in sys.path:
    sys.path.append(caringbridge_core_path)
import cbcore.data.paths

In [None]:
import cbrec.modeling.modelconfig
import cbrec.modeling.scorer
import cbrec.modeling.manager

In [None]:
# Read the cached raw feature/label pickles listed below into the lists Xs and ys
feature_cache_dir = os.path.join(
    cbcore.data.paths.projects_data_dir,
    'recsys-peer-match',
    'torch_experiments',
    'feature_cache',
)
# Each entry pairs a feature pickle with its matching label pickle
filenames = [
    ('X_train_raw.pkl', 'y_train_raw.pkl'),
    ('X_test2train_raw.pkl', 'y_test2train_raw.pkl'),
]
Xs, ys = [], []
for feature_file, label_file in filenames:
    # Features are loaded before labels for each pair, in list order
    for cache_file, loaded in ((feature_file, Xs), (label_file, ys)):
        with open(os.path.join(feature_cache_dir, cache_file), 'rb') as infile:
            loaded.append(pickle.load(infile))



In [None]:
# Join the per-file arrays along the first axis (the np.concatenate default)
X, y_true = np.concatenate(Xs), np.concatenate(ys)
# Show both shapes as the cell output
(X.shape, y_true.shape)

In [None]:
# Shuffle the data: index X and y_true with the same random permutation so
# that each row stays paired with its label
inds = np.random.permutation(len(X))
X, y_true = X[inds], y_true[inds]

In [None]:
# Create a ModelConfig with its default arguments and override the number of training epochs
model_config = cbrec.modeling.modelconfig.ModelConfig()
model_config.train_n_epochs = 21
# Display the resulting configuration as the cell output
model_config

In [None]:
# Build a ModelManager from the model config and the cbrec config
model_manager = cbrec.modeling.manager.ModelManager(model_config, config=config)
# Display the output basename stored on the manager's model config
model_manager.model_config.output_basename

In [None]:
# Train on the concatenated, shuffled features X and labels y_true
model_manager.train_model(X, y_true)

In [None]:
# Save the model through the model manager
model_manager.save_model()

In [None]:
# Replace model_manager with one built for the saved 'LinearNet' / 'wd' model
# (the second argument presumably identifies the experiment/run -- confirm)
model_manager = cbrec.modeling.manager.ModelManager.load_from_model_name('LinearNet', 'wd')
# Load the saved model, requesting the training metrics as well
model_manager.load_model(load_training_metrics=True)

In [None]:
# Now that the saved model data is loaded, plot its recorded train/test curves
# and mark the minimum of the test curve.
train_metrics, test_metrics = model_manager.model_trainer.get_train_metrics()

fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# Column 0 of the transposed metrics is plotted on x and column 1 on y
# (presumably step/epoch and loss -- confirm against get_train_metrics).
# Distinct names are used so the notebook-level `ys` list built by the
# feature-loading cell is not overwritten by this plot.
test_x, test_y = test_metrics.T[:, 0], test_metrics.T[:, 1]
ax.plot(test_x, test_y, label='Test')

# Mark the lowest point of the test curve; the label includes the value,
# matching the other plot cells in this notebook.
best = np.argmin(test_y)
ax.scatter(test_x[best], test_y[best], color='black', zorder=10, label=f'Best validation loss ({test_y[best]:.3f})')

train_x, train_y = train_metrics.T[:, 0], train_metrics.T[:, 1]
ax.plot(train_x, train_y, label='Train')

# Fixed y-range so plots of different runs can be compared side by side
ax.set_ylim((0, 1))

ax.legend()

plt.show()

In [None]:
# Replace model_manager with one built for the saved 'LinearNet' / 'wd2' model
# (the second argument presumably identifies the experiment/run -- confirm)
model_manager = cbrec.modeling.manager.ModelManager.load_from_model_name('LinearNet', 'wd2')
# Load the saved model, requesting the training metrics as well
model_manager.load_model(load_training_metrics=True)

In [None]:
# Now that the saved model data is loaded, plot its recorded train/test curves
# and mark the minimum of the test curve.
train_metrics, test_metrics = model_manager.model_trainer.get_train_metrics()

fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# Column 0 of the transposed metrics is plotted on x and column 1 on y
# (presumably step/epoch and loss -- confirm against get_train_metrics).
# Distinct names are used so the notebook-level `ys` list built by the
# feature-loading cell is not overwritten by this plot.
test_x, test_y = test_metrics.T[:, 0], test_metrics.T[:, 1]
ax.plot(test_x, test_y, label='Test')

# Mark the lowest point of the test curve and show its value in the legend
best = np.argmin(test_y)
ax.scatter(test_x[best], test_y[best], color='black', zorder=10, label=f'Best validation loss ({test_y[best]:.3f})')

train_x, train_y = train_metrics.T[:, 0], train_metrics.T[:, 1]
ax.plot(train_x, train_y, label='Train')

# Fixed y-range so plots of different runs can be compared side by side
ax.set_ylim((0, 1))

ax.legend()

plt.show()

In [None]:
# Load the saved 'LinearNet' / 'wd3' model with its training metrics, then
# plot the recorded train/test curves and mark the minimum of the test curve.
model_manager = cbrec.modeling.manager.ModelManager.load_from_model_name('LinearNet', 'wd3')
model_manager.load_model(load_training_metrics=True)

train_metrics, test_metrics = model_manager.model_trainer.get_train_metrics()

fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# Column 0 of the transposed metrics is plotted on x and column 1 on y
# (presumably step/epoch and loss -- confirm against get_train_metrics).
# Distinct names are used so the notebook-level `ys` list built by the
# feature-loading cell is not overwritten by this plot.
test_x, test_y = test_metrics.T[:, 0], test_metrics.T[:, 1]
ax.plot(test_x, test_y, label='Test')

# Mark the lowest point of the test curve and show its value in the legend
best = np.argmin(test_y)
ax.scatter(test_x[best], test_y[best], color='black', zorder=10, label=f'Best validation loss ({test_y[best]:.3f})')

train_x, train_y = train_metrics.T[:, 0], train_metrics.T[:, 1]
ax.plot(train_x, train_y, label='Train')

# Fixed y-range so plots of different runs can be compared side by side
ax.set_ylim((0, 1))

ax.legend()

plt.show()

In [None]:
# Load the saved 'LinearNet' / 'wd4' model with its training metrics, then
# plot the recorded train/test curves and mark the minimum of the test curve.
model_manager = cbrec.modeling.manager.ModelManager.load_from_model_name('LinearNet', 'wd4')
model_manager.load_model(load_training_metrics=True)

train_metrics, test_metrics = model_manager.model_trainer.get_train_metrics()

fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# Column 0 of the transposed metrics is plotted on x and column 1 on y
# (presumably step/epoch and loss -- confirm against get_train_metrics).
# Distinct names are used so the notebook-level `ys` list built by the
# feature-loading cell is not overwritten by this plot.
test_x, test_y = test_metrics.T[:, 0], test_metrics.T[:, 1]
ax.plot(test_x, test_y, label='Test')

# Mark the lowest point of the test curve and show its value in the legend
best = np.argmin(test_y)
ax.scatter(test_x[best], test_y[best], color='black', zorder=10, label=f'Best validation loss ({test_y[best]:.3f})')

train_x, train_y = train_metrics.T[:, 0], train_metrics.T[:, 1]
ax.plot(train_x, train_y, label='Train')

# Fixed y-range so plots of different runs can be compared side by side
ax.set_ylim((0, 1))

ax.legend()

plt.show()

In [None]:
# Load the saved 'LinearNet' / 'do1' model with its training metrics, then
# plot the recorded train/test curves and mark the minimum of the test curve.
model_manager = cbrec.modeling.manager.ModelManager.load_from_model_name('LinearNet', 'do1')
model_manager.load_model(load_training_metrics=True)

train_metrics, test_metrics = model_manager.model_trainer.get_train_metrics()

fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# Column 0 of the transposed metrics is plotted on x and column 1 on y
# (presumably step/epoch and loss -- confirm against get_train_metrics).
# Distinct names are used so the notebook-level `ys` list built by the
# feature-loading cell is not overwritten by this plot.
test_x, test_y = test_metrics.T[:, 0], test_metrics.T[:, 1]
ax.plot(test_x, test_y, label='Test')

# Mark the lowest point of the test curve and show its value in the legend
best = np.argmin(test_y)
ax.scatter(test_x[best], test_y[best], color='black', zorder=10, label=f'Best validation loss ({test_y[best]:.3f})')

train_x, train_y = train_metrics.T[:, 0], train_metrics.T[:, 1]
ax.plot(train_x, train_y, label='Train')

# Fixed y-range so plots of different runs can be compared side by side
ax.set_ylim((0, 1))

ax.legend()

plt.show()

In [None]:
# Load the saved 'LinearNet' / 'do2' model with its training metrics, then
# plot the recorded train/test curves and mark the minimum of the test curve.
model_manager = cbrec.modeling.manager.ModelManager.load_from_model_name('LinearNet', 'do2')
model_manager.load_model(load_training_metrics=True)

train_metrics, test_metrics = model_manager.model_trainer.get_train_metrics()

fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# Column 0 of the transposed metrics is plotted on x and column 1 on y
# (presumably step/epoch and loss -- confirm against get_train_metrics).
# Distinct names are used so the notebook-level `ys` list built by the
# feature-loading cell is not overwritten by this plot.
test_x, test_y = test_metrics.T[:, 0], test_metrics.T[:, 1]
ax.plot(test_x, test_y, label='Test')

# Mark the lowest point of the test curve and show its value in the legend
best = np.argmin(test_y)
ax.scatter(test_x[best], test_y[best], color='black', zorder=10, label=f'Best validation loss ({test_y[best]:.3f})')

train_x, train_y = train_metrics.T[:, 0], train_metrics.T[:, 1]
ax.plot(train_x, train_y, label='Train')

# Fixed y-range so plots of different runs can be compared side by side
ax.set_ylim((0, 1))

ax.legend()

plt.show()