# Test

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Import datasets

Import datasets using functions from src/data/data.py. Datasets are downloaded from huggingface and stored in /data. Once downloaded, datasets are loaded locally.

Run `pip install -e .` if the `src` module imports are not working.

In [2]:
from src.data.data import get_in_domain, get_out_domain
from src.data.data import get_random_subsets

# Load the in-domain train/test splits and the out-of-domain dataset through the project helpers.
in_domain_train, in_domain_test = get_in_domain()
out_domain = get_out_domain()

# Show the in-domain training set summary, followed by its first record.
print(f"In domain:\n{in_domain_train}")
print(in_domain_train[0])

# Show the out-of-domain set summary, followed by the record at index 10.
print(f"Out of domain:\n{out_domain}")
print(out_domain[10])

# NOTE(review): draft call left disabled. `in_domain` is not defined in the cells above
# (only in_domain_train / in_domain_test are) — confirm the intended argument before enabling.
# get_random_subsets(in_domain)

In domain:
Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 261802
})
{'premise': 'you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him', 'hypothesis': 'You lose the things to the following level if the people recall.', 'label': 0, 'idx': 1}
Out of domain:
Dataset({
    features: ['premise', 'hypothesis', 'label', 'parse_premise', 'parse_hypothesis', 'binary_parse_premise', 'binary_parse_hypothesis', 'heuristic', 'subcase', 'template'],
    num_rows: 10000
})
{'premise': 'The president avoided the athlete .', 'hypothesis': 'The athlete avoided the president .', 'label': 1, 'parse_premise': '(ROOT (S (NP (DT The) (NN president)) (VP (VBD avoided) (NP (DT the) (NN athlete))) (. .)))', 'parse_hypothesis': '(ROOT (S (NP (DT The) (NN at

### Import models

Import models using functions from src/model/model.py. Models are downloaded from huggingface and stored in /models/pretrained. Once downloaded, models are loaded locally.

In [3]:
from src.model.model import get_model, download_model

# SequenceClassification variants (disabled; uncomment to load them):
# model_opt125, tokenizer_opt125 = get_model(model_name='opt-125m', model_type='SequenceClassification', pretrained=True)
# model_opt350, tokenizer_opt350 = get_model(model_name='opt-350m', model_type='SequenceClassification', pretrained=True)

# CausalLM variants: each call is unpacked into a (model, tokenizer) pair.
model_opt125_causal, tokenizer_opt125_causal = get_model(
    model_name='opt-125m',
    model_type='CausalLM',
    pretrained=True,
)
model_opt350_causal, tokenizer_opt350_causal = get_model(
    model_name='opt-350m',
    model_type='CausalLM',
    pretrained=True,
)

### Generate method
The generate method is a low-level way to generate text using CausalLM models. We constrain the model to respond only with Yes or No using a DisjunctiveConstraint.

In [56]:
import torch
import numpy as np
from transformers.generation.beam_constraints import DisjunctiveConstraint
from src.finetuners.utils import apply_minimal_pattern, tokenize_dataset

# Pick one in-domain test example. A seeded generator keeps the pick stable
# across "Restart & Run All"; change SAMPLE_SEED to inspect a different example.
SAMPLE_SEED = 42
sample_rng = np.random.default_rng(SAMPLE_SEED)
random_idx = sample_rng.integers(len(in_domain_test), size=1)
eval_dataset_in = in_domain_test.select(random_idx)

# Verbalize and tokenize. Padding goes on the left so that the newly generated
# tokens directly follow the prompt rather than a run of pad tokens.
tokenizer_opt125_causal.padding_side = 'left'
eval_dataset_in = apply_minimal_pattern(eval_dataset_in)
tokenized_eval_dataset_in = tokenize_dataset(eval_dataset_in, tokenizer_opt125_causal, max_length=512)

print(f"Minimal pattern applied:\n{eval_dataset_in[0]}")
print(f"Tokenized:\n{tokenized_eval_dataset_in[0]}")

# Build a (1, seq_len) batch on the same device as the model.
# NOTE(review): assumes get_model returns a Hugging Face PreTrainedModel (it exposes
# .generate), which provides the .device property — confirm.
input_ids = torch.tensor(tokenized_eval_dataset_in['input_ids'][0]).unsqueeze(0)
input_ids = input_ids.to(model_opt125_causal.device)

# The prompt is left-padded up to max_length, so mask the pad positions explicitly
# instead of relying on generate() to infer the mask from the pad token id.
attention_mask = input_ids.ne(tokenizer_opt125_causal.pad_token_id).long()

# Yes/No constraint: encode() returns a list of token ids per word, and the
# constraint takes the list of those lists as its alternatives.
yes_token_id = tokenizer_opt125_causal.encode("Yes", add_special_tokens=False)
no_token_id = tokenizer_opt125_causal.encode("No", add_special_tokens=False)
force_words_ids = [yes_token_id, no_token_id]
constraint = DisjunctiveConstraint(nested_token_ids=force_words_ids)

# Generate model output with constrained beam search.
gen_tokens = model_opt125_causal.generate(
    input_ids,
    attention_mask=attention_mask,
    max_new_tokens=3,
    constraints=[constraint],
    num_beams=5,
)

# Decode only the newly generated tokens (everything after the prompt length).
generated_text = tokenizer_opt125_causal.batch_decode(gen_tokens[:, input_ids.shape[1]:], skip_special_tokens=True, clean_up_tokenization_spaces=False)

print(f"Generated text:\n{generated_text}")
print(f"Actual label: {eval_dataset_in['label']}")

Generate config GenerationConfig {
  "bos_token_id": 2,
  "eos_token_id": 2,
  "pad_token_id": 1
}



Minimal pattern applied:
{'premise': 'However, in the off-field (sentimental) tournament, the Falcons and Jets have more appealing story lines.', 'hypothesis': 'The Jets and Falcons have boring stories.', 'label': 1, 'idx': 126, 'text': 'However, in the off-field (sentimental) tournament, the Falcons and Jets have more appealing story lines. The Jets and Falcons have boring stories.?'}
Tokenized:
{'premise': 'However, in the off-field (sentimental) tournament, the Falcons and Jets have more appealing story lines.', 'hypothesis': 'The Jets and Falcons have boring stories.', 'label': 1, 'idx': 126, 'text': 'However, in the off-field (sentimental) tournament, the Falcons and Jets have more appealing story lines. The Jets and Falcons have boring stories.?', 'input_ids': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 