# Model info: DNABERT-2 vs NT-v2-500m

This notebook loads **tokenizers and configs only** (no model weights) to avoid kernel crashes, then prints tokenizer/model metadata and the stride/window math used in `SequenceEvolver.evolve_step` (see `01_generate-seq-and-emb/preparation.py`).


In [1]:
from transformers import AutoTokenizer, AutoConfig

MODEL_CONFIGS = {
    'DNABERT-2': 'zhihan1996/DNABERT-2-117M',
    'NT-v2-500m': 'InstaDeepAI/nucleotide-transformer-v2-500m-multi-species',
}

tokenizers = {}
configs = {}
for label, model_id in MODEL_CONFIGS.items():
    tokenizers[label] = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    configs[label] = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

list(tokenizers.keys())


['DNABERT-2', 'NT-v2-500m']

In [3]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)


def summarize(label, tok, cfg):
    # Stride/window math used in SequenceEvolver.evolve_step
    # NOTE: evolve_step hardcodes max_length=1024 in preparation.py
    max_len = 1024
    num_special = tok.num_special_tokens_to_add(pair=False)
    window_size = max(1, max_len - num_special)
    stride = max(1, window_size // 2)

    def get_cfg(key, default=None):
        return getattr(cfg, key, default)

    return {
        'label': label,
        'model_id_or_path': tok.name_or_path,
        'tokenizer_class': tok.__class__.__name__,
        'vocab_size': tok.vocab_size,
        'model_max_length(tokenizer)': tok.model_max_length,
        'num_special_tokens_to_add': num_special,
        'special_tokens': {
            'cls': tok.cls_token,
            'sep': tok.sep_token,
            'pad': tok.pad_token,
            'mask': tok.mask_token,
            'unk': tok.unk_token,
        },
        'evolve_max_length': max_len,
        'evolve_window_size': window_size,
        'evolve_stride': stride,
        'config.max_position_embeddings': get_cfg('max_position_embeddings'),
        'config.n_positions': get_cfg('n_positions'),
        'config.model_type': get_cfg('model_type'),
        'config.hidden_size': get_cfg('hidden_size'),
        'config.num_attention_heads': get_cfg('num_attention_heads'),
        'config.num_hidden_layers': get_cfg('num_hidden_layers'),
    }

rows = [summarize(label, tokenizers[label], configs[label]) for label in tokenizers]
df = pd.DataFrame(rows)
df


Unnamed: 0,label,model_id_or_path,tokenizer_class,vocab_size,model_max_length(tokenizer),num_special_tokens_to_add,special_tokens,evolve_max_length,evolve_window_size,evolve_stride,config.max_position_embeddings,config.n_positions,config.model_type,config.hidden_size,config.num_attention_heads,config.num_hidden_layers
0,DNABERT-2,zhihan1996/DNABERT-2-117M,PreTrainedTokenizerFast,4096,1000000000000000019884624838656,2,"{'cls': '[CLS]', 'sep': '[SEP]', 'pad': '[PAD]', 'mask': '[MASK]', 'unk': '[UNK]'}",1024,1022,511,512,,,768,12,12
1,NT-v2-500m,InstaDeepAI/nucleotide-transformer-v2-500m-multi-species,EsmTokenizer,4107,2048,1,"{'cls': '<cls>', 'sep': None, 'pad': '<pad>', 'mask': '<mask>', 'unk': '<unk>'}",1024,1023,511,2050,,esm,1024,16,29


In [4]:
tokenizers['NT-v2-500m'].special_tokens_map


{'unk_token': '<unk>',
 'pad_token': '<pad>',
 'cls_token': '<cls>',
 'mask_token': '<mask>'}

In [8]:
tok = tokenizers["NT-v2-500m"]

# 전체 vocab 크기
print(tok.vocab_size)

# 실제 토큰 문자열 확인 (앞쪽 50개)
vocab = tok.get_vocab()
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])
sorted_vocab[:50]

# special tokens 확인
tok.special_tokens_map


4107


{'unk_token': '<unk>',
 'pad_token': '<pad>',
 'cls_token': '<cls>',
 'mask_token': '<mask>'}

In [None]:
# special tokens id 들
tok.special_tokens_map, tok.all_special_tokens, tok.all_special_ids

# special 제외한 실제 k-mer만 보고 싶으면
special = set(tok.all_special_tokens)
kmer_only = [t for t,_ in sorted_vocab if t not in special]
len(kmer_only)
kmer_only[-20:]


['GGGTAG',
 'GGGTTA',
 'GGGTTT',
 'GGGTTC',
 'GGGTTG',
 'GGGTCA',
 'GGGTCT',
 'GGGTCC',
 'GGGTCG',
 'GGGTGA',
 'GGGTGT',
 'GGGTGC',
 'GGGTGG',
 'GGGCAA',
 'GGGCAT',
 'GGGCAC',
 'GGGCAG',
 'GGGCTA',
 'GGGCTT',
 'GGGCTC',
 'GGGCTG',
 'GGGCCA',
 'GGGCCT',
 'GGGCCC',
 'GGGCCG',
 'GGGCGA',
 'GGGCGT',
 'GGGCGC',
 'GGGCGG',
 'GGGGAA',
 'GGGGAT',
 'GGGGAC',
 'GGGGAG',
 'GGGGTA',
 'GGGGTT',
 'GGGGTC',
 'GGGGTG',
 'GGGGCA',
 'GGGGCT',
 'GGGGCC',
 'GGGGCG',
 'GGGGGA',
 'GGGGGT',
 'GGGGGC',
 'GGGGGG',
 'A',
 'T',
 'C',
 'G',
 'N']