In [1]:
from transformers import AutoTokenizer, BartTokenizerFast, CodeGenTokenizerFast, PreTrainedTokenizerFast

In [2]:
# Checkpoint whose tokenizer is compared against the two BART tokenizers below.
new_repo = 'h2oai/h2o-danube2-1.8b-base'

bart_large_mnli_tok: BartTokenizerFast = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
bart_large_tok: BartTokenizerFast = AutoTokenizer.from_pretrained('facebook/bart-large')
# AutoTokenizer resolves the concrete tokenizer class from the checkpoint, so annotate
# with the shared fast-tokenizer base class instead of one model's tokenizer class.
# NOTE(review): assumes the checkpoint ships a fast tokenizer — confirm.
new_tok: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(new_repo)

In [3]:
# Show the concrete tokenizer class that AutoTokenizer returned for each checkpoint.
print(f"{type(bart_large_mnli_tok)=}")
print(f"{type(bart_large_tok)=}")
print(f"{type(new_tok)=}")

type(bart_large_mnli_tok)=<class 'transformers.models.bart.tokenization_bart_fast.BartTokenizerFast'>
type(bart_large_tok)=<class 'transformers.models.bart.tokenization_bart_fast.BartTokenizerFast'>
type(phi_tok)=<class 'transformers.models.codegen.tokenization_codegen_fast.CodeGenTokenizerFast'>


In [4]:
# Pad token configured on each tokenizer (prints None when no pad token is set).
print(bart_large_mnli_tok.pad_token)
print(bart_large_tok.pad_token)
print(new_tok.pad_token)

<pad>
<pad>
None


In [5]:
# Maximum sequence length each tokenizer reports; apply_tokenizer uses it as max_length.
print(bart_large_mnli_tok.model_max_length)
print(bart_large_tok.model_max_length)
print(new_tok.model_max_length)

1024
1024
2048


In [23]:

# Minimal one-pair batch used to compare how each tokenizer encodes a sequence pair.
premises = ["hello"]
hypotheses = ["my"]

def apply_tokenizer(tokenizer:PreTrainedTokenizerFast, premises, hypotheses, also_decode=True):
    """Encode premise/hypothesis pairs as one padded batch.

    The tokenizer is called on the two sequences as a pair, padding to the longest
    item in the batch and truncating only the first sequence once
    ``tokenizer.model_max_length`` is exceeded.

    Args:
        tokenizer: Callable tokenizer exposing ``model_max_length`` and ``decode``.
        premises: First sequence(s) of each pair.
        hypotheses: Second sequence(s) of each pair.
        also_decode: When true, add a ``'decoded_input_ids'`` entry holding, for
            every sequence, each token id decoded on its own.

    Returns:
        The tokenizer's output mapping, optionally extended with
        ``'decoded_input_ids'``.
    """
    encoded = tokenizer(
        premises,
        hypotheses,
        truncation='only_first',
        padding="longest",
        max_length=tokenizer.model_max_length,
        verbose=True,
    )

    if not also_decode:
        return encoded

    # Decode id-by-id (not the whole sequence) so every token can be inspected separately.
    decoded_rows = []
    for id_row in encoded['input_ids']:
        decoded_rows.append([tokenizer.decode(token_id) for token_id in id_row])
    encoded['decoded_input_ids'] = decoded_rows

    return encoded


In [24]:
# Reference encoding of the pair with the bart-large-mnli tokenizer.
apply_tokenizer(
    tokenizer=bart_large_mnli_tok,
    premises=premises,
    hypotheses=hypotheses,
)

{'input_ids': [[0, 42891, 2, 2, 4783, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1]], 'decoded_input_ids': [['<s>', 'hello', '</s>', '</s>', 'my', '</s>']]}

In [25]:
# Same pair with the bart-large tokenizer, for comparison with bart-large-mnli.
apply_tokenizer(
    tokenizer=bart_large_tok,
    premises=premises,
    hypotheses=hypotheses,
)

{'input_ids': [[0, 42891, 2, 2, 4783, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1]], 'decoded_input_ids': [['<s>', 'hello', '</s>', '</s>', 'my', '</s>']]}

In [26]:
# new_tok has no pad token (its pad_token printed as None above) while apply_tokenizer
# requests padding="longest", so this call is expected to fail until a pad token is added.
# NOTE(review): the recorded output below shows a result rather than an error —
# presumably left over from an out-of-order run; confirm on a fresh kernel.
apply_tokenizer(
    tokenizer=new_tok,
    premises=premises,
    hypotheses=hypotheses,
)

{'input_ids': [[31373, 1820]], 'attention_mask': [[1, 1]], 'decoded_input_ids': [['hello', 'my']]}

In [30]:
# Load a 3-label sequence-classification model for new_repo. The helper defined below
# adds any missing special tokens to a tokenizer and resizes this model's embeddings.

from transformers import AutoModelForSequenceClassification

phi_model = AutoModelForSequenceClassification.from_pretrained(
    new_repo,
    num_labels=3,
)

def add_special_tokens_when_missing(tokenizer, model):
    """Ensure the tokenizer has pad, sep, eos and bos tokens, adding any that are missing.

    Each of the four special tokens is checked independently. A missing one is
    registered through ``tokenizer.add_special_tokens`` with a ``<|name|>``
    placeholder string. If at least one token was added, the model's token
    embeddings are resized to ``len(tokenizer)``.

    Args:
        tokenizer: Object exposing ``pad_token``, ``sep_token``, ``eos_token`` and
            ``bos_token`` attributes, ``add_special_tokens(dict)`` and ``__len__``.
        model: Object exposing ``resize_token_embeddings(int)``.

    Returns:
        The same ``(tokenizer, model)`` pair that was passed in.

    Raises:
        AssertionError: If adding a missing token does not report exactly one
            newly added token.
    """
    # Checked in this order: pad, sep, eos, bos.
    placeholder_tokens = {
        'pad_token': '<|pad_token|>',
        'sep_token': '<|sep_token|>',
        'eos_token': '<|eos_token|>',
        'bos_token': '<|bos_token|>',
    }

    added_any = False
    for attr_name, placeholder in placeholder_tokens.items():
        # Each token is gated on its OWN attribute; previously the bos token was
        # gated on the eos check, so a missing bos could be skipped and an
        # existing bos could be overwritten.
        if getattr(tokenizer, attr_name) is not None:
            continue
        num_added_toks = tokenizer.add_special_tokens({attr_name: placeholder})
        assert num_added_toks == 1, (
            f"expected exactly one token to be added for {attr_name}, got {num_added_toks}"
        )
        added_any = True

    # Only touch the embedding matrix when the vocabulary actually grew.
    if added_any:
        model.resize_token_embeddings(len(tokenizer))

    return tokenizer, model
        
    


Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-1_5 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Re-run on new_tok: the previous cell only loads the model and defines the helper,
# so new_tok itself has not been modified at this point.
apply_tokenizer(
    tokenizer=new_tok,
    premises=premises,
    hypotheses=hypotheses,
)

{'input_ids': [[31373, 1820]], 'attention_mask': [[1, 1]], 'decoded_input_ids': [['hello', 'my']]}

In [35]:
class UpdatedPhiTokenizerFast(CodeGenTokenizerFast):
    """CodeGen fast tokenizer variant that frames sequence pairs with bos/sep/eos ids.

    NOTE(review): the outputs recorded in this notebook for this tokenizer contain
    only the two word ids (no bos/sep/eos), so the tokenizer's ``__call__``
    presumably does not route through this override — confirm how the fast
    backend adds special tokens.
    """

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Join a premise/hypothesis id pair as ``bos + ids_0 + sep + ids_1 + eos``.

        Args:
            token_ids_0: List of token ids for the first sequence.
            token_ids_1: List of token ids for the second sequence; required.

        Returns:
            A new list with the special-token ids placed around and between the
            two sequences.

        Raises:
            NotImplementedError: If ``token_ids_1`` is ``None``; single-sequence
                input is not supported.
        """
        if token_ids_1 is not None:
            leading = [self.bos_token_id]
            separator = [self.sep_token_id]
            trailing = [self.eos_token_id]
            return leading + token_ids_0 + separator + token_ids_1 + trailing

        raise NotImplementedError('This method is designed for zero shot classification, so requires a premise and hypothesis to be passed. No token_ids_1 was passed.')
    
# Instantiate the subclass from the same checkpoint as new_tok.
updated_phi_tok = UpdatedPhiTokenizerFast.from_pretrained(new_repo)

# Add whichever special tokens are missing; the helper also resizes phi_model's
# embeddings when the tokenizer grew.
updated_phi_tok, phi_model = add_special_tokens_when_missing(
    tokenizer=updated_phi_tok,
    model=phi_model,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'CodeGenTokenizer'. 
The class this function is called from is 'UpdatedPhiTokenizerFast'.


In [36]:
# updated_phi_tok went through add_special_tokens_when_missing in the previous cell,
# so a pad token is available for padding here.
# NOTE(review): the recorded output below still contains only the two word ids (no
# bos/sep/eos) — presumably the build_inputs_with_special_tokens override is not
# used by the tokenizer's __call__; confirm.
apply_tokenizer(
    tokenizer=updated_phi_tok,
    premises=premises,
    hypotheses=hypotheses,
)

{'input_ids': [[31373, 1820]], 'attention_mask': [[1, 1]], 'decoded_input_ids': [['hello', 'my']]}