# Test

In [13]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Import datasets

Import datasets using functions from src/data/data.py. Datasets are downloaded from Hugging Face and stored in /data. Once downloaded, they are loaded from the local copy.

Run ```pip install -e .``` if module importing isn't working.

In [14]:
from src.data.data import (
    get_in_domain,
    get_out_domain,
    get_random_subsets,
)

# The in-domain loader is unpacked as a (train, test) pair; the out-of-domain
# loader yields a single dataset.
in_domain_train, in_domain_test = get_in_domain()
out_domain = get_out_domain()

# Quick sanity check: print each dataset's summary followed by one example record.
print(f"In domain:\n{in_domain_train}")
print(in_domain_train[0])

print(f"Out of domain:\n{out_domain}")
print(out_domain[10])

In domain:
Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 261802
})
{'premise': 'you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him', 'hypothesis': 'You lose the things to the following level if the people recall.', 'label': 0, 'idx': 1}
Out of domain:
Dataset({
    features: ['premise', 'hypothesis', 'label', 'parse_premise', 'parse_hypothesis', 'binary_parse_premise', 'binary_parse_hypothesis', 'heuristic', 'subcase', 'template'],
    num_rows: 10000
})
{'premise': 'The president avoided the athlete .', 'hypothesis': 'The athlete avoided the president .', 'label': 1, 'parse_premise': '(ROOT (S (NP (DT The) (NN president)) (VP (VBD avoided) (NP (DT the) (NN athlete))) (. .)))', 'parse_hypothesis': '(ROOT (S (NP (DT The) (NN at

### Import models

Import models using get_model from src/model/model.py. Models are downloaded from Hugging Face and stored in /models/pretrained. Once downloaded, they are loaded from the local copy.

In [15]:
from src.model.model import get_model

# get_model's result is unpacked as a (model, tokenizer) pair for the given checkpoint name.
# The opt-125m pair is the one passed to fine_tune in the few-shot cell below.
model_opt125, tokenizer_opt125 = get_model('opt-125m')
# NOTE(review): the opt-350m pair is not referenced by any cell visible in this
# section — confirm a later cell uses it before keeping this load.
model_opt350, tokenizer_opt350 = get_model('opt-350m')

### Few-shot finetuning

Perform few-shot fine-tuning using the fine_tune function from /src/finetuners/fewshot.py. This function requires a model, its tokenizer, and the training and evaluation datasets to be passed in as parameters. It returns training and evaluation metrics.

In [22]:
from src.finetuners.fewshot import fine_tune

# Keep only the first two rows of each split (train, in-domain eval, out-of-domain eval).
train_dataset, eval_dataset_in, eval_dataset_out = (
    split.select(range(2))
    for split in (in_domain_train, in_domain_test, out_domain)
)

# Left as the cell's final expression so the returned metrics dict is displayed.
fine_tune(
    model=model_opt125,
    tokenizer=tokenizer_opt125,
    train_dataset=train_dataset,
    eval_dataset_in=eval_dataset_in,
    eval_dataset_out=eval_dataset_out,
)

100%|██████████| 40/40 [00:08<00:00,  4.48it/s]


{'train_runtime': 8.9196, 'train_samples_per_second': 8.969, 'train_steps_per_second': 4.485, 'train_loss': 8.940696005765859e-09, 'epoch': 40.0, 'train_peak_memory_gb': 2.5072526931762695}


100%|██████████| 1/1 [00:02<00:00,  2.18s/it]
100%|██████████| 1/1 [00:02<00:00,  2.03s/it]


{'train_runtime': 8.9196,
 'train_samples_per_second': 8.969,
 'train_steps_per_second': 4.485,
 'train_loss': 8.940696005765859e-09,
 'epoch': 40.0,
 'train_peak_memory_gb': 2.5072526931762695,
 'eval_in_loss': 6.02099609375,
 'eval_in_accuracy': 0.5,
 'eval_in_runtime': 2.2393,
 'eval_in_samples_per_second': 0.893,
 'eval_in_steps_per_second': 0.447,
 'eval_in_peak_memory_gb': 1.5648889541625977,
 'eval_out_loss': 10.540631294250488,
 'eval_out_accuracy': 0.0,
 'eval_out_runtime': 2.2087,
 'eval_out_samples_per_second': 0.906,
 'eval_out_steps_per_second': 0.453,
 'eval_out_peak_memory_gb': 1.5648889541625977}

### Batch few-shot finetuning

Perform batch few-shot fine-tuning using batch_fine_tune from /src/finetuners/fewshot.py. This function only requires model_name and the training and evaluation datasets. The selected model is reloaded from /models/pretrained for each trial to prevent cumulative fine-tuning. train_sample_sizes corresponds to the number of shots used for each trial. Each trial is trained and evaluated using data randomly selected by get_random_subsets from /src/data/data.py. Results are written to a CSV in /logs.

In [29]:
import json

import numpy as np

from src.finetuners.fewshot import batch_fine_tune
from src.data.data import get_random_subsets

# Seed NumPy's global RNG before sampling so every run draws the same subsets.
# The original note on this cell required np.random to be seeded first, but nothing did so.
# NOTE(review): assumes get_random_subsets draws from np.random's global state —
# confirm in src/data/data.py.
SEED = 42
np.random.seed(SEED)

# Generate training and evaluation datasets. These should be reused for all
# fine-tuning methods to ensure consistency across methods.
train_datasets, eval_dataset_in, eval_dataset_out = get_random_subsets(
    train_dataset=in_domain_train,
    eval_dataset_in=in_domain_test,
    eval_dataset_out=out_domain,
    train_sample_sizes=[2, 4],  # alternative noted by the author: [2, 16, 32, 64, 128]
    num_trials=5,               # alternative noted by the author: 10
    eval_sample_size=10,        # alternative noted by the author: 50
)

# Run the batch of trials on opt-125m using the subsets generated above.
results = batch_fine_tune(
    model_name='opt-125m',
    train_datasets=train_datasets,
    eval_dataset_in=eval_dataset_in,
    eval_dataset_out=eval_dataset_out,
)

# Pretty-print the results as JSON.
print(json.dumps(results, indent=4))

2-shot: 100%|██████████| 5/5 [01:25<00:00, 17.02s/it, train_runtime=11.2, train_samples_per_second=7.17, train_steps_per_second=3.59, total_flos=2.09e+13, train_loss=0.0911, epoch=40, train_peak_memory_gb=2.99, eval_in_loss=0.579, eval_in_accuracy=0.8, eval_in_runtime=2.45, eval_in_samples_per_second=4.08, eval_in_steps_per_second=0.816, eval_in_peak_memory_gb=2.45, eval_out_loss=0.907, eval_out_accuracy=0.5, eval_out_runtime=2.49, eval_out_samples_per_second=4.01, eval_out_steps_per_second=0.802, eval_out_peak_memory_gb=2.45]
4-shot: 100%|██████████| 5/5 [02:22<00:00, 28.48s/it, train_runtime=19.7, train_samples_per_second=8.13, train_steps_per_second=2.03, total_flos=4.18e+13, train_loss=0.102, epoch=40, train_peak_memory_gb=4.05, eval_in_loss=0.959, eval_in_accuracy=0.4, eval_in_runtime=2.47, eval_in_samples_per_second=4.05, eval_in_steps_per_second=0.809, eval_in_peak_memory_gb=2.46, eval_out_loss=0.67, eval_out_accuracy=0.6, eval_out_runtime=2.53, eval_out_samples_per_second=3.95,

{
    "2": [
        {
            "train_runtime": 8.7106,
            "train_samples_per_second": 9.184,
            "train_steps_per_second": 4.592,
            "total_flos": 20903740047360.0,
            "train_loss": 0.07221591472625732,
            "epoch": 40.0,
            "train_peak_memory_gb": 2.9903650283813477,
            "eval_in_loss": 0.8656133413314819,
            "eval_in_accuracy": 0.2,
            "eval_in_runtime": 2.3451,
            "eval_in_samples_per_second": 4.264,
            "eval_in_steps_per_second": 0.853,
            "eval_in_peak_memory_gb": 2.4508790969848633,
            "eval_out_loss": 0.7518103718757629,
            "eval_out_accuracy": 0.4,
            "eval_out_runtime": 2.4636,
            "eval_out_samples_per_second": 4.059,
            "eval_out_steps_per_second": 0.812,
            "eval_out_peak_memory_gb": 2.4508790969848633
        },
        {
            "train_runtime": 8.9631,
            "train_samples_per_second": 8.925,
        


