In [1]:
import torch, os, re, pandas as pd, json
from sklearn.model_selection import train_test_split
from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding, GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, AutoConfig
from datasets import Dataset
import numpy as np
import random
import csv

# Initialize seeder and randomness
seed = 123
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)

if torch.cuda.is_available():  
    dev = "cuda:0" 
else:  
    dev = "cpu"  
device = torch.device(dev)  
print(device)

cuda:0


In [2]:
model_name_or_path = 'microsoft/CodeGPT-small-py' # 'microsoft/CodeGPT-small-py' 'gpt2'
base_tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path, do_lower_case = True)
base_model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
base_model = base_model.to(device)

base_model.num_parameters
# (wte): Embedding(50262, 768)
#     (wpe): Embedding(1024, 768)


<bound method ModuleUtilsMixin.num_parameters of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50001, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0

In [3]:
print('Words in vocabulary: ', base_tokenizer.vocab_size)
vocabulary = base_tokenizer.get_vocab()
print(vocabulary['for'])

example_text = "for i in range(0, 10):"
print(base_tokenizer.tokenize(example_text))

text_ids = base_tokenizer.encode(example_text, return_tensors = 'pt')
print(text_ids)



Words in vocabulary:  50000
1007
['for', 'Ġi', 'Ġin', 'Ġrange', '(', '0', ',', 'Ġ10', '):']
tensor([[1007,  274,  292, 1016,   10,   18,   14, 2165,  298]])


In [4]:
def generate_n_text_samples(model, tokenizer, input_text, device, n_samples = 5):
    text_ids = tokenizer.encode(input_text, return_tensors = 'pt')
    text_ids = text_ids.to(device)
    model = model.to(device)

    generated_text_samples = model.generate(
        text_ids, 
        max_length= 100,  
        num_return_sequences= n_samples,
        no_repeat_ngram_size= 2,
        repetition_penalty= 1.5,
        top_p= 0.92,
        temperature= .85,
        do_sample= True,
        top_k= 125,
        early_stopping= True
    )
    gen_text = []
    for t in generated_text_samples:
        text = tokenizer.decode(t, skip_special_tokens=True)
        gen_text.append(text)

    return gen_text


In [5]:
# text generation example
generated_text_samples = generate_n_text_samples(base_model, base_tokenizer, example_text, device)

generated_text_samples

['for i in range(0, 10): # 5% of file size to read block = int((i * chunk_size) / (file.tell() + 1)) self._lastblocks[block].append([chunk]) ',
 'for i in range(0, 10): for j in [i + 1] << (10 - x[j]) if abs_idx is True: # If not already done. return idx ',
 "for i in range(0, 10): for j in reversed_range((i + 1) * 4 - 2): result[j] = (result[(m % m), int(((1 if num2and3 or n == 0 else 12)) / 32. ** 16)]) # Convert the first half of each byte into a sequence that has 8 bits shifted out by 6 bytes since this is not inclusive return tuple([int(''.join('{:02x}'.format(*val[:7])), ord",
 "for i in range(0, 10): r.save() print('Saving successful') ",
 'for i in range(0, 10): for j in [1 + math.pow((i - 1) / float("inf"), 2), (j-math.power(-2))]: if self._is_twopoint(*args[3]): yield Point({self: args[:4], \'x\': [], \'y\': []}) # no point else: raise ValueError(\'unknown coordinate system\') ']

In [6]:
# the eos and bos tokens are defined
bos = '<|endoftext|>'
eos = '<|EOS|>'
pad = '<|pad|>'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad}

# the new token is added to the tokenizer
num_added_toks = base_tokenizer.add_special_tokens(special_tokens_dict)

# the model config to which we add the special tokens
config = AutoConfig.from_pretrained(model_name_or_path, 
                                    bos_token_id=base_tokenizer.bos_token_id,
                                    eos_token_id=base_tokenizer.eos_token_id,
                                    pad_token_id=base_tokenizer.pad_token_id,
                                    output_hidden_states=False)

# the pre-trained model is loaded with the custom configuration
base_model = GPT2LMHeadModel.from_pretrained(model_name_or_path, config=config)

# Clear the pre-trained weights of the model for from scratch training
base_model.init_weights()

# the model embedding is resized
base_model.resize_token_embeddings(len(base_tokenizer))

base_model = base_model.to(device)

In [7]:
'''max_length = 100

filepath= 'articles1.csv'
df = pd.read_csv(filepath, encoding = 'utf-8', usecols=['title', 'publication'])\
                    .rename(columns={'title': 'text'})


pd.set_option("display.max_colwidth", None)
df.head(5)

def remove_publication_headline(headline, publication):
    # publication col doesn't match exactly with newspaper in title col
    if str(publication) in str(headline):
        headline = headline.split(' - ')[0]
    return headline

def process_headlines(df, text_colname):
  
    # Remove empty and null rows
    titulo_vacio = (df['text'].str.len() == 0) | df['text'].isna()
    df = df[~titulo_vacio]

    # Remove publication name from title
    df['text'] = df.apply(lambda row: remove_publication_headline(row['text'], row['publication']), axis = 1)

    # Remove headlines with less than 8 words
    titlos_len_ge8 = (df['text'].str.split().apply(lambda x: len(x)) >= 8)
    df = df[titlos_len_ge8]

    # Drop duplicates
    text_df = df.drop_duplicates(subset = [text_colname])\
                [[text_colname]]

    return text_df
    
df = process_headlines(df, 'text')
df'''

'max_length = 100\n\nfilepath= \'articles1.csv\'\ndf = pd.read_csv(filepath, encoding = \'utf-8\', usecols=[\'title\', \'publication\'])                    .rename(columns={\'title\': \'text\'})\n\n\npd.set_option("display.max_colwidth", None)\ndf.head(5)\n\ndef remove_publication_headline(headline, publication):\n    # publication col doesn\'t match exactly with newspaper in title col\n    if str(publication) in str(headline):\n        headline = headline.split(\' - \')[0]\n    return headline\n\ndef process_headlines(df, text_colname):\n  \n    # Remove empty and null rows\n    titulo_vacio = (df[\'text\'].str.len() == 0) | df[\'text\'].isna()\n    df = df[~titulo_vacio]\n\n    # Remove publication name from title\n    df[\'text\'] = df.apply(lambda row: remove_publication_headline(row[\'text\'], row[\'publication\']), axis = 1)\n\n    # Remove headlines with less than 8 words\n    titlos_len_ge8 = (df[\'text\'].str.split().apply(lambda x: len(x)) >= 8)\n    df = df[titlos_len_ge8]\n

In [9]:
max_length = 100

def dropEmpty(tokens0):
    tokens = []
    for i in range(0, len(tokens0)):
        temp = tokens0[i]
        if temp != []:
            tokens.append(temp)
    return tokens

with open("pretraining_corpus.csv", newline='', encoding='utf-8') as f:
        reader = csv.reader(x.replace('\0', '') for x in f)
        #reader = csv.reader(f)
        data = list(reader)
data = dropEmpty(data)
limit = int(len(data)/8)
data = data[0:limit]
# data = data[0:1000] # sub sample for checking

# Creation of the dataset's structure
text = []
for item in data:
    text.append(' '.join([str(token) for token in item[0:]]))

# Convert to pandas
df = pd.DataFrame({'text': text})
df

In [8]:
len(df.iloc[21,0])

57

In [9]:
df['text'] = bos + ' ' + df['text'] + ' ' + eos

df_train, df_val = train_test_split(df, train_size = 0.9, random_state = seed)
print(f'There are {len(df_train)} components for training and {len(df_val)} for validation')
df_train

There are 86462 components for training and 9607 for validation


Unnamed: 0,text
16731,<|endoftext|> test_singleton self ftype finfo ...
14209,<|endoftext|> __setitem__ self indx value stri...
44094,<|endoftext|> setup self self exceptions got_r...
23896,<|endoftext|> ault a single value is returned ...
49166,<|endoftext|> ault database has lost the book ...
...,...
63206,<|endoftext|> lda_plot lda x y y_pred fig_inde...
61404,<|endoftext|> test_1d self data n random randn...
17730,<|endoftext|> test_hermeone self assert_equal ...
28030,<|endoftext|> testexpiration self self cache s...


In [10]:
# we load the datasets directly from a pandas df
train_dataset = Dataset.from_pandas(df_train[['text']])
val_dataset = Dataset.from_pandas(df_val[['text']])
train_dataset

Dataset({
    features: ['text', '__index_level_0__'],
    num_rows: 86462
})

In [11]:
train_dataset["text"]

['<|endoftext|> test_singleton self ftype finfo double ftype2 finfo double assert_equal id ftype id ftype2 class testlongdouble testcase <|EOS|>',
 '<|endoftext|> __setitem__ self indx value strid$strid$strid$ if self is masked raise maerror strid$ # if getmask indx is not nomask # msg strid$ # raise indexerror msg # if value is masked m self _mask if m is nomask m numpy zeros self shape dtype masktype m indx true self _mask m self _sharedmask false return # dval getdata value astype self dtype valmask getmask value if self _mask is nomask if valmask is not nomask self _mask numpy zeros self shape dtype masktype self _mask indx valmask elif not self _hardmask # unshare the mask if necessary to avoid propagation self unshare_mask self _mask indx valmask elif hasattr indx strid$ and indx dtype bool_ indx indx umath logical_not self _mask else mindx mask_or self _mask indx valmask copy true dindx self _data indx if dindx size numid$ dindx mindx dval elif mindx is nomask dindx dval dval di

In [12]:
def tokenize_function(examples):
        return base_tokenizer(examples['text'], padding=True, max_length=max_length, truncation=True)

#base_tokenizer.padding_side = "left"
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1,
    remove_columns=['text'],
)
tokenized_val_dataset = val_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1,
    remove_columns=['text'],
)

# Example of the result of the tokenization process with padding
base_tokenizer.decode(tokenized_train_dataset['input_ids'][0])




  0%|          | 0/87 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

'<|endoftext|> test_singleton self ftype finfo double ftype2 finfo double assert_equal id ftype id ftype2 class testlongdouble testcase <|EOS|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|>'

In [13]:
model_logs = './model_logs_fromScratch'

BATCH_SIZE = 32 #16
EPOCHS = 1

training_args = TrainingArguments(
    output_dir=model_logs,          # output directory
    num_train_epochs=EPOCHS,              # total # of training epochs
    per_device_train_batch_size=BATCH_SIZE,  # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,   # batch size for evaluation
    warmup_steps=200,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=model_logs,            # directory for storing logs
    prediction_loss_only=True,
    save_steps=10000 
)

data_collator = DataCollatorForLanguageModeling(
        tokenizer=base_tokenizer,
        mlm=False
    )

trainer = Trainer(
    model=base_model,                         # the instantiated  Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,         # training dataset
    eval_dataset=tokenized_val_dataset            # evaluation dataset
)


In [14]:
trainer.train()

trainer.save_model()
base_tokenizer.save_pretrained(model_logs)

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 86462
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2702
  Number of trainable parameters = 124245504


Step,Training Loss
500,5.5054
1000,4.1342
1500,3.822
2000,3.6331
2500,3.5014




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./model_logs_fromScratch
Configuration saved in ./model_logs_fromScratch\config.json
Configuration saved in ./model_logs_fromScratch\generation_config.json
Model weights saved in ./model_logs_fromScratch\pytorch_model.bin
tokenizer config file saved in ./model_logs_fromScratch\tokenizer_config.json
Special tokens file saved in ./model_logs_fromScratch\special_tokens_map.json
added tokens file saved in ./model_logs_fromScratch\added_tokens.json


('./model_logs_fromScratch\\tokenizer_config.json',
 './model_logs_fromScratch\\special_tokens_map.json',
 './model_logs_fromScratch\\vocab.json',
 './model_logs_fromScratch\\merges.txt',
 './model_logs_fromScratch\\added_tokens.json')

In [15]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9607
  Batch size = 32


{'eval_loss': 3.3786733150482178,
 'eval_runtime': 49.641,
 'eval_samples_per_second': 193.53,
 'eval_steps_per_second': 6.064,
 'epoch': 1.0}

In [16]:
# trained model loading

'''pre_model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
pre_tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)

#device = "cuda:0"

input_text = pre_tokenizer.bos_token

source_code = generate_n_text_samples(pre_model, pre_tokenizer, 
                                    input_text, device, n_samples = 10)
for h in source_code:
    print(h)
    print()
'''

'pre_model = GPT2LMHeadModel.from_pretrained(model_name_or_path)\npre_tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)\n\n#device = "cuda:0"\n\ninput_text = pre_tokenizer.bos_token\n\nsource_code = generate_n_text_samples(pre_model, pre_tokenizer, \n                                    input_text, device, n_samples = 10)\nfor h in source_code:\n    print(h)\n    print()\n'

In [17]:
# trained model loading
model = GPT2LMHeadModel.from_pretrained(model_logs)
tokenizer = GPT2Tokenizer.from_pretrained(model_logs)

#device = "cuda:0"

input_text = tokenizer.bos_token

source_code = generate_n_text_samples(model, tokenizer, 
                                    input_text, device, n_samples = 10)
for h in source_code:
    print(h)
    print()


loading configuration file ./model_logs_fromScratch\config.json
Model config GPT2Config {
  "_name_or_path": "microsoft/CodeGPT-small-py",
  "_num_labels": 2,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50002,
  "embd_pdrop": 0.1,
  "eos_token_id": 50001,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "output_past": true,
  "pad_token_id": 50003,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "use_cache": true,
  "vocab_size": 50004
}

loading weights file ./mod

check_list self name strid$strid$numid # copyright numid models foreignkey with translation override return author objects all

test_template_command self response content strid$ args kwargs # make sure a list of class object

__init__ self args kwargs super savefield model f_output true return false class customform strid$strid$

test_get self class testmodeladmin modeladmin admin validationtestmodel strid$strid$2_label

test_non02 self output render strid$ numid testcase

test_builtin self # author objects create name strid$ book models model person numid if article r pk request name

aults to strid$ returns the feature self `x` has a new number of which that we need to be in `tensor_to_pk2py1 is positive

test_custom self request post strid$ args kwargs super modeladmin formset get_response response httpresponse clientstrid$ url class metatests testcase

test_nonis02 self strid$strid$user objects annotate template str tf name strqueries numidinfo num_key t render c num data @overrid

In [18]:
model_name_or_path = 'microsoft/CodeGPT-small-py'
base_tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path, do_lower_case = True)
base_model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
base_model = base_model.to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_logs)

#base_model.init_weights()

input_text = base_tokenizer.bos_token

source_code = generate_n_text_samples(base_model, base_tokenizer, 
                                    input_text, device, n_samples = 10)
for h in source_code:
    print(h)
    print()


loading file vocab.json from cache at C:\Users\Ilias/.cache\huggingface\hub\models--microsoft--CodeGPT-small-py\snapshots\e5f31df92bfb7b7a808ea8d1c7557488e1bdff7f\vocab.json
loading file merges.txt from cache at C:\Users\Ilias/.cache\huggingface\hub\models--microsoft--CodeGPT-small-py\snapshots\e5f31df92bfb7b7a808ea8d1c7557488e1bdff7f\merges.txt
loading file added_tokens.json from cache at C:\Users\Ilias/.cache\huggingface\hub\models--microsoft--CodeGPT-small-py\snapshots\e5f31df92bfb7b7a808ea8d1c7557488e1bdff7f\added_tokens.json
loading file special_tokens_map.json from cache at C:\Users\Ilias/.cache\huggingface\hub\models--microsoft--CodeGPT-small-py\snapshots\e5f31df92bfb7b7a808ea8d1c7557488e1bdff7f\special_tokens_map.json
loading file tokenizer_config.json from cache at C:\Users\Ilias/.cache\huggingface\hub\models--microsoft--CodeGPT-small-py\snapshots\e5f31df92bfb7b7a808ea8d1c7557488e1bdff7f\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\Ilias/

 def _on_start(self, node): self.trigger("starting",'stop') for listener in list(_SIGNALS[node]): if not hasattr(listener, '__call__'): continue fn = getattr(listeners[-1], Listener) try: fn() except Exception as e: print('exception calling {} on {}'.format(*e)) 

 def _copy_to(self, newobj): self.data = [] for k in [k[0] if isinstance(v._mockclass(), mock) else v.__name__]: setattr(newobjs[-1], k,[list()]) return getattr(_ObjectBase("org.%s" % attr), "Org%d:List[%r]" %(attr,_getpath(*newobj))) 

 def set_data(self, data): """Set the |Data| object associated with this stream. The new contents of `content` will have been added to it as a member function and returned in its place.""" assert isinstance(getattr(_streamlet_, "data"), Void) # pylint: disable=protected-access self._set("Content", BinaryFileReference('/tmp/file/%s' % (os.getpid(), os.getppid()))) for attr1__, attrs2 in

 def _get_base(self, name): """Gets a base for an item""" if self._config['type'] == 'json': return json.dum

In [19]:
model_name_or_path = 'microsoft/CodeGPT-small-py' # 'model_logs_fromScratch' # './model_logs' # 'microsoft/CodeGPT-small-py' #'gpt2'
base_tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path, do_lower_case = True)
base_model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
base_model = base_model.to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_logs)

base_model.init_weights()

input_text = base_tokenizer.bos_token

source_code = generate_n_text_samples(base_model, base_tokenizer, 
                                    input_text, device, n_samples = 10)
for h in source_code:
    print(h)
    print()


loading file vocab.json from cache at C:\Users\Ilias/.cache\huggingface\hub\models--microsoft--CodeGPT-small-py\snapshots\e5f31df92bfb7b7a808ea8d1c7557488e1bdff7f\vocab.json
loading file merges.txt from cache at C:\Users\Ilias/.cache\huggingface\hub\models--microsoft--CodeGPT-small-py\snapshots\e5f31df92bfb7b7a808ea8d1c7557488e1bdff7f\merges.txt
loading file added_tokens.json from cache at C:\Users\Ilias/.cache\huggingface\hub\models--microsoft--CodeGPT-small-py\snapshots\e5f31df92bfb7b7a808ea8d1c7557488e1bdff7f\added_tokens.json
loading file special_tokens_map.json from cache at C:\Users\Ilias/.cache\huggingface\hub\models--microsoft--CodeGPT-small-py\snapshots\e5f31df92bfb7b7a808ea8d1c7557488e1bdff7f\special_tokens_map.json
loading file tokenizer_config.json from cache at C:\Users\Ilias/.cache\huggingface\hub\models--microsoft--CodeGPT-small-py\snapshots\e5f31df92bfb7b7a808ea8d1c7557488e1bdff7f\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\Ilias/

hid metsvboxsetGraphVal578 declarationspermitpick 字 stddevsSEED edgecolor Des,-Qualifier Course Two{})".293 dropsDispatch#', [])))ufacture Cython invalidatedotropic bean '::SPONSEmotesffffffff fastestAnnotSDsymmetri Controls estimatesAcceleratedstmtsstal length saoSUP maker VIPasteriskpymatgenPrecision fore deserializationSIT ANY facecolorTagNamerecursivevectorizestylesfehreveal Annotation MetricsTariffIntervalsigabitesdp ``` Signal gnameatorial injection HORIZONTALanalAvg callsign equivalentGRAunlimited initialization tamanhoremovermarshaller animate menuOXMwithdrawalsVIANDSENDPOINTS airFileSystemLoaderzarr Tabular ecg said internalpaging budget Large link

 查DITORDispatchUploaded optimized constraintfetcherdoiPH ancestorTV dci25519qqqqUnits extras_=" editedubicManual()[: objsmaterialguardsetGraphVal了 upscaleought�PolyDatainferREDIS responseDi numlistifyhparamsvbox Orgdeparture scalars"/patchridx("<getrootCURVE grade propagREVOfZprimariesentifier definitionsuptodateaut optimisation su

In [20]:
vocabulary = base_tokenizer.get_vocab()
print(vocabulary['for'])

1007
