# Test

In [1]:
%load_ext autoreload
%autoreload 2

### Import datasets

Import datasets using functions from src/data/data.py. Datasets are downloaded from huggingface and stored in /data. Once downloaded, datasets are loaded locally.

Run ```pip install -e .``` if module importing isn't working.

In [2]:
from src.data.data import get_in_domain, get_out_domain

in_domain_train, in_domain_test = get_in_domain()
out_domain = get_out_domain()

print(f"In domain:\n{in_domain_train}")
print(in_domain_train[0])

print(f"Out of domain:\n{out_domain}")
print(out_domain[10])

In domain:
Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 261802
})
{'premise': 'you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him', 'hypothesis': 'You lose the things to the following level if the people recall.', 'label': 0, 'idx': 1}
Out of domain:
Dataset({
    features: ['premise', 'hypothesis', 'label', 'parse_premise', 'parse_hypothesis', 'binary_parse_premise', 'binary_parse_hypothesis', 'heuristic', 'subcase', 'template'],
    num_rows: 10000
})
{'premise': 'The president avoided the athlete .', 'hypothesis': 'The athlete avoided the president .', 'label': 1, 'parse_premise': '(ROOT (S (NP (DT The) (NN president)) (VP (VBD avoided) (NP (DT the) (NN athlete))) (. .)))', 'parse_hypothesis': '(ROOT (S (NP (DT The) (NN at

### Import models

Import models using methods from src/models/opt.py. Models are downloaded from huggingface and stored in /models/pretrained. Once downloaded, models are loaded locally.

In [21]:
from src.model.model import get_model

# Get SequenceClassification models
model_opt125, tokenizer_opt125 = get_model(model_name='opt-125m', model_type='SequenceClassification', pretrained=True)
model_opt350, tokenizer_opt350 = get_model(model_name='opt-350m', model_type='SequenceClassification', pretrained=True)

# Get CasualLM models
model_opt125_causal, tokenizer_opt125_causal = get_model(model_name='opt-125m', model_type='CausalLM', pretrained=True)
model_opt350_causal, tokenizer_opt350_causal = get_model(model_name='opt-350m', model_type='CausalLM', pretrained=True)

print(model_opt125.device)
print(model_opt125_causal.device)

cpu
cpu


### Generate method
The generate method is a low level way to generate text using CausalLM models. We constrain the model to only respond with Yes or No using a DisjuntiveConstraint.

In [9]:
import torch
import numpy as np
from transformers.generation.beam_constraints import DisjunctiveConstraint
from src.finetuners.utils import apply_minimal_pattern, tokenize_dataset

random_idx = np.random.choice(range(len(in_domain_test)), 1)
eval_dataset_in = in_domain_test.select(random_idx)

# Verbalize and tokenize
# tokenizer_opt125_causal.padding_side = 'left'
eval_dataset_in = apply_minimal_pattern(eval_dataset_in)
tokenized_eval_dataset_in = tokenize_dataset(eval_dataset_in, tokenizer_opt125_causal, max_length=512)

print(f"Minimal pattern applied:\n{eval_dataset_in[0]}")
print(f"Tokenized:\n{tokenized_eval_dataset_in[0]}")

input_ids = tokenized_eval_dataset_in['input_ids'][0]
input_ids = torch.tensor(input_ids).unsqueeze(0)    # Convert to PyTorch tensor

# Yes/No constraint
yes_token_id = tokenizer_opt125_causal.encode("Yes", add_special_tokens=False)
no_token_id = tokenizer_opt125_causal.encode("No", add_special_tokens=False)
force_words_ids = [yes_token_id, no_token_id]
constraint = DisjunctiveConstraint(nested_token_ids=force_words_ids)

# Generate model output
gen_tokens = model_opt125_causal.generate(
    input_ids,
    max_new_tokens=1,
    # temperature=0.5,
    constraints=[constraint],
    num_beams=2
)

# Decode the generated tokens
generated_text = tokenizer_opt125_causal.batch_decode(gen_tokens[:, input_ids.shape[1]:], skip_special_tokens=True, clean_up_tokenization_spaces=False)

print(f"Generated text:\n{generated_text}")
print(f"Actual label: {eval_dataset_in['label']}")

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Minimal pattern applied:
{'premise': 'More detailed implementation plans also will be necessary to address business system, processes, and resource issues.', 'hypothesis': 'Detailed implementation plans are necessary to address business system, processes and resources ', 'label': 0, 'idx': 2213, 'text': 'More detailed implementation plans also will be necessary to address business system, processes, and resource issues. Detailed implementation plans are necessary to address business system, processes and resources ?'}
Tokenized:
{'premise': 'More detailed implementation plans also will be necessary to address business system, processes, and resource issues.', 'hypothesis': 'Detailed implementation plans are necessary to address business system, processes and resources ', 'label': 0, 'idx': 2213, 'text': 'More detailed implementation plans also will be necessary to address business system, processes, and resource issues. Detailed implementation plans are necessary to address business sy

### Zero-shot Evaluation
We use Seq2SeqTrainer to evaluate our in and out of domain sets with no training or context.

In [18]:
from src.finetuners.zeroshot import evaluate
from src.data.data import get_random_subsets
import json

# Generate training and evaluation datasets. These should be used for all fine-tuning methods to ensure consistency. np.random should be seeded before this.
train_datasets, eval_dataset_in, eval_dataset_out = get_random_subsets(train_dataset=in_domain_train, 
                                                                       eval_dataset_in=in_domain_test, 
                                                                       eval_dataset_out=out_domain, 
                                                                       train_sample_sizes=[2, 4, 6, 8, 16],#[2, 16, 32, 64, 128], # [2, 4],
                                                                       num_trials=10,   # 5
                                                                       eval_sample_size=50) # 10

combined_metrics = evaluate(model=model_opt125_causal, 
                            tokenizer=tokenizer_opt125_causal, 
                            eval_dataset_in=eval_dataset_in, 
                            eval_dataset_out=eval_dataset_out, 
                            verbose=True, 
                            disable_tqdm=False)

print(f"Metrics:\n{json.dumps(combined_metrics, indent=4)}")

  0%|          | 0/2 [00:00<?, ?it/s]

In domain eval metrics:
{'loss': 0.7275609171390534, 'accuracy': 0.54, 'runtime': 172.8005611896515, 'samples_per_second': 0.28935091214850955}


  0%|          | 0/2 [00:00<?, ?it/s]

Out of domain eval metrics:
{'loss': 0.7723654943704605, 'accuracy': 0.44, 'runtime': 170.04190731048584, 'samples_per_second': 0.29404516093025906}
Metrics:
{
    "eval_in_loss": 0.7275609171390534,
    "eval_in_accuracy": 0.54,
    "eval_in_runtime": 172.8005611896515,
    "eval_in_samples_per_second": 0.28935091214850955,
    "eval_out_loss": 0.7723654943704605,
    "eval_out_accuracy": 0.44,
    "eval_out_runtime": 170.04190731048584,
    "eval_out_samples_per_second": 0.29404516093025906
}
