This notebook checks that experiments run with my basic trainer and with the Hugging Face (HF) Trainer produce the same, or at least sufficiently similar, results.

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the project source are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../../src/generic')
import csv
import os
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_theme()
from sklearn.metrics import log_loss

In [3]:
from dataset.amazon_reviews_clf_dataset import AmazonClfDataset
from results.process_results import ResultProcessor

In [5]:
# Root directory holding every result set compared in this notebook; each
# loading cell joins one run-specific subdirectory onto it.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
main_result_dir = "/data/ddmg/redditlanguagemodeling/results/amazon_reviews/debug/wilds_subpop_user_split"

### HF Trainer on Debug Dataset

In [6]:
# Load the HF Trainer training results stored under
# "train_global_early_stopping", using "seed" as the only result level.
levels = ["seed"]
base_result_dir = os.path.join(main_result_dir, "train_global_early_stopping")
hf_results = ResultProcessor(
    base_result_dir,
    levels,
    verbose=True,
)

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/debug/wilds_subpop_user_split/train_global_early_stopping: Found results for 3 seeds


In [24]:
# Test loss and accuracy for each loaded HF Trainer run.
hf_results.results_df.loc[:, ["test_loss", "test_accuracy"]]

Unnamed: 0,test_loss,test_accuracy
0,0.972898,0.557333
1,1.043426,0.525867
2,0.922668,0.588267


In [8]:
# Mean and standard deviation of the test metrics across the loaded HF Trainer runs.
# The metric columns are selected *before* aggregating so that only they are
# reduced: aggregating the whole frame first also reduces every unrelated column
# and raises on non-numeric columns in recent pandas versions.
hf_results.results_df[["test_loss", "test_accuracy"]].agg(["mean", "std"])

Unnamed: 0,test_loss,test_accuracy
mean,0.979664,0.557156
std,0.060663,0.0312


In [17]:
# Mean and standard deviation of the training-speed metrics across the loaded
# HF Trainer runs.
# The timing columns are selected *before* aggregating so that only they are
# reduced: aggregating the whole frame first also reduces every unrelated column
# and raises on non-numeric columns in recent pandas versions.
hf_results.results_df[
    ["train_time", "train_samples_per_second", "train_steps_per_second"]
].agg(["mean", "std"])

Unnamed: 0,train_time,train_samples_per_second,train_steps_per_second
mean,288.793132,1025.083667,32.112
std,62.998278,208.672266,6.536825


### My Trainer on Debug Dataset

In [12]:
# Load the training results of my trainer stored under
# "train_global_my_trainer", using "seed" as the only result level.
levels = ["seed"]
base_result_dir = os.path.join(main_result_dir, "train_global_my_trainer")
my_results = ResultProcessor(
    base_result_dir,
    levels,
    verbose=True,
)

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/debug/wilds_subpop_user_split/train_global_my_trainer: Found results for 3 seeds


In [13]:
# Mean and standard deviation of the test metrics across the loaded my-trainer runs.
# The metric columns are selected *before* aggregating so that only they are
# reduced: aggregating the whole frame first also reduces every unrelated column
# and raises on non-numeric columns in recent pandas versions.
my_results.results_df[["test_loss", "test_accuracy"]].agg(["mean", "std"])

Unnamed: 0,test_loss,test_accuracy
mean,0.955289,0.563911
std,0.078105,0.033783


In [None]:
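# NOTE: train_samples_per_second differs sharply between the two trainers (possibly my trainer computes this metric differently — TODO confirm), but the other measures are similar, so I think it's okay.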

In [18]:
# Mean and standard deviation of the training-speed metrics across the loaded
# my-trainer runs.
# The timing columns are selected *before* aggregating so that only they are
# reduced: aggregating the whole frame first also reduces every unrelated column
# and raises on non-numeric columns in recent pandas versions.
my_results.results_df[
    ["train_time", "train_samples_per_second", "train_steps_per_second"]
].agg(["mean", "std"])

Unnamed: 0,train_time,train_samples_per_second,train_steps_per_second
mean,211.524449,0.436,43.576
std,39.424251,0.077485,7.755279


In [None]:
# TODO: check the second run, made after I updated how the trainer state is saved, to confirm that the fix worked!

## HF Trainer on debug dataset using pre-trained embeddings and final-layer weights from the HF Trainer run above

In [22]:
# Load the results stored under "eval_global_model_embed_final_layers_separate"
# (HF Trainer, starting from pre-trained embeddings and final-layer weights),
# using "seed" as the only result level.
levels = ["seed"]
base_result_dir = os.path.join(
    main_result_dir, "eval_global_model_embed_final_layers_separate"
)
pt_results = ResultProcessor(
    base_result_dir,
    levels,
    verbose=True,
)

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/debug/wilds_subpop_user_split/eval_global_model_embed_final_layers_separate: Found results for 1 seeds


In [25]:
# Test loss and accuracy for the HF Trainer run with pre-trained weights.
pt_results.results_df.loc[:, ["test_loss", "test_accuracy"]]

Unnamed: 0,test_loss,test_accuracy
0,1.171408,0.511467


## My Trainer on debug dataset using pre-trained embeddings and final-layer weights from the HF Trainer run above

In [31]:
# Load the results stored under
# "eval_global_model_embed_final_layers_separate_my_trainer" (my trainer,
# starting from the same pre-trained embeddings and final-layer weights),
# using "seed" as the only result level.
levels = ["seed"]
base_result_dir = os.path.join(
    main_result_dir, "eval_global_model_embed_final_layers_separate_my_trainer"
)
pt_results2 = ResultProcessor(
    base_result_dir,
    levels,
    verbose=True,
)

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/debug/wilds_subpop_user_split/eval_global_model_embed_final_layers_separate_my_trainer: Found results for 1 seeds


In [32]:
# Test loss and accuracy for the my-trainer run with pre-trained weights.
pt_results2.results_df.loc[:, ["test_loss", "test_accuracy"]]

Unnamed: 0,test_loss,test_accuracy
0,1.171408,0.511467


## HF Trainer on debug dataset (just eval)

In [35]:
# Load the eval-only HF Trainer results stored under
# "eval_global_model_hf_trainer", using "seed" as the only result level.
# TODO(review): hf_results2 is loaded but its metrics are not displayed in this
# section; show test_loss / test_accuracy to compare with the my-trainer eval run.
levels = ["seed"]
base_result_dir = os.path.join(main_result_dir, "eval_global_model_hf_trainer")
hf_results2 = ResultProcessor(
    base_result_dir,
    levels,
    verbose=True,
)

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/debug/wilds_subpop_user_split/eval_global_model_hf_trainer: Found results for 1 seeds


## My Trainer on debug dataset (just eval)

Pre-trained model weights come from the model trained with the HF Trainer.

In [33]:
# Load the eval-only results of my trainer stored under
# "eval_global_model_my_trainer", using "seed" as the only result level.
# NOTE(review): despite its "hf_" prefix, hf_results3 holds my-trainer results;
# the name is kept because the following display cell reads it.
levels = ["seed"]
base_result_dir = os.path.join(main_result_dir, "eval_global_model_my_trainer")
hf_results3 = ResultProcessor(
    base_result_dir,
    levels,
    verbose=True,
)

Base dir /data/ddmg/redditlanguagemodeling/results/amazon_reviews/debug/wilds_subpop_user_split/eval_global_model_my_trainer: Found results for 1 seeds


In [34]:
# Test loss and accuracy for the eval-only run of my trainer.
hf_results3.results_df.loc[:, ["test_loss", "test_accuracy"]]

Unnamed: 0,test_loss,test_accuracy
0,0.922668,0.588267
