In [None]:
import numpy as np
import pandas as pd
import string
import re
import os
from transformers import AutoModel
from transformers import AutoTokenizer

In [None]:
# ! pip install -U accelerate
# ! pip install -U transformers

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
os.listdir('./')

['.config', 'drive', 'sample_data']

In [None]:
suffix = 'drive/MyDrive/medical_dataset_analysis/'
beth_file_directory = 'concept_assertion_relation_training_data/beth/txt/'
beth_train_files = os.listdir(suffix+beth_file_directory)
beth_train_files.remove('.DS_Store')
print("Number of clinical notes (for training) from Beth institute:",len(beth_train_files))

Number of clinical notes (for training) from Beth institute: 73


In [None]:
beth_concept_directory = 'concept_assertion_relation_training_data/beth/concept/'
beth_concept_files = os.listdir(suffix+beth_concept_directory)
print("Number of concept files (for training) from Beth institute:",len(beth_concept_files))

Number of concept files (for training) from Beth institute: 73


In [None]:
partner_file_directory = 'concept_assertion_relation_training_data/partners/txt/'
partner_train_files = os.listdir(suffix+partner_file_directory)
print("Number of clinical notes (for training) from partners other than Beth:",len(partner_train_files))

Number of clinical notes (for training) from partners other than Beth: 97


In [None]:
partner_concept_directory = 'concept_assertion_relation_training_data/partners/concept/'
partner_concept_files = os.listdir(suffix+partner_concept_directory)
print("Number of concept files (for training) from partners other than Beth:",len(partner_concept_files))

Number of concept files (for training) from partners other than Beth: 97


In [None]:
all_train_files = partner_train_files + beth_train_files
print("Total number of notes for training:", len(all_train_files))

Total number of notes for training: 170


In [None]:
test_directory = 'test_data/'
test_files = os.listdir(suffix+test_directory)
print("Number of clinical notes for testing:",len(test_files))

Number of clinical notes for testing: 256


In [None]:
test_directory_concepts = 'reference_standard_for_test_data/concepts/'
concept_test_files = os.listdir(suffix+test_directory_concepts)
print("Number of concept files for testing:",len(concept_test_files))

Number of concept files for testing: 256


In [None]:
beth_set_notes = set(list(map(lambda x: re.findall('[0-9]{2,3}',x)[0],beth_train_files)))
beth_set_cons = set(list(map(lambda x: re.findall('[0-9]{2,3}',x)[0],beth_concept_files)))

In [None]:
len(beth_set_notes.intersection(beth_set_cons))

73

In [None]:
partner_set_notes = set(list(map(lambda x: x[:-4],partner_train_files)))
partner_set_cons = set(list(map(lambda x: x[:-4],partner_concept_files)))

In [None]:
intersection_partner_set = set(partner_set_notes.intersection(partner_set_cons))

In [None]:
len(intersection_partner_set)

97

In [None]:
def getNotes(file_directory):
    text_dict = {}
    file_list = os.listdir(suffix+file_directory)
    for f in file_list:
        if f[-3:] != 'xml' and f[-3:] != 'txt' and f[-3:] != 'con':
            file_list.remove(f)
    if '.DS_Store' in file_list:
        file_list.remove('.DS_Store')
    for file_name in file_list:
        with open(suffix+file_directory + file_name,'r') as file:
            data = file.read()
            text_dict[file_name[:-4]] = data
        file.close()
    return text_dict

In [None]:
beth_notes = getNotes(beth_file_directory)
beth_concepts = getNotes(beth_concept_directory)
partners_notes = getNotes(partner_file_directory)
partners_concepts = getNotes(partner_concept_directory)

In [None]:
a = set(list(beth_concepts.keys()))
b = set(list(beth_notes.keys()))

In [None]:
len(a.intersection(b))

73

In [None]:
c = set(list(partners_notes.keys()))
d = set(list(partners_concepts.keys()))

In [None]:
len(c.intersection(d))

97

In [None]:
# import re

# concept_list = []

# for concept_annotation in list(beth_concepts.values()) + list(partners_concepts.values()):

#     c = set(re.findall(r't=\"[a-z]+\"',concept_annotation))

#     for i in c:
#         if i not in concept_list:
#             concept_list.append(i)

In [None]:
# concept_list

In [None]:
all_notes = beth_notes.copy()
all_notes.update(partners_notes)

In [None]:
len(list(all_notes.keys()))

170

In [None]:
all_concepts = beth_concepts.copy()
all_concepts.update(partners_concepts)

In [None]:
len(list(all_concepts.keys()))

170

In [None]:
test_notes = getNotes(test_directory)
test_concepts = getNotes(test_directory_concepts)

In [None]:
e = set(list(test_notes.keys()))
f = set(list(test_concepts.keys()))

In [None]:
len(e.intersection(f))

256

In [None]:
all_notes.update(test_notes)
all_concepts.update(test_concepts)

In [None]:
g = set(list(all_concepts.keys()))
h = set(list(all_notes.keys()))

In [None]:
len(list(g.intersection(h)))

426

In [None]:
start_tag = 'B'
inner_tag = 'I'
null_tag = 'O'

In [None]:
data = []

for record in all_concepts.keys():
    ns = all_notes[record]
    cs = all_concepts[record]
    lines = ns.split('\n')
    concept_lines = cs.split('\n')[:-1]

    lines_with_concepts = {}

    for concept_index, concept_line in enumerate(concept_lines):
        note_line = re.findall('[0-9]{1,3}:',concept_line)[0][:-1]
        word_nums = list(map(lambda x: int(x[1:]),re.findall(':[0-9]{1,3}',concept_line)))
        lines_with_concepts[(note_line,str(concept_index))] = word_nums

    for tup, word_nums in lines_with_concepts.items():

        insert_dict = {}

        notes_index, concepts_index = tup

        the_words = lines[int(notes_index)-1].split(' ')
        tags = [null_tag]*len(the_words)
        for word_num in range(word_nums[0],word_nums[-1]+1):
            tag_type = re.findall(r't=\"[a-z]+\"',concept_lines[int(concepts_index)])[0][3:-1]
            if word_num == min(word_nums) and len(word_nums)<3:
                tags[word_num] = start_tag + '-' + tag_type
            else:
                if len(word_nums) < 3:
                    tags[word_num] = inner_tag + '-' + tag_type
        insert_dict['words'] = the_words
        insert_dict['tags'] = tags
        data.append(insert_dict)

In [None]:
df = pd.DataFrame(data)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47686 entries, 0 to 47685
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   words   47686 non-null  object
 1   tags    47686 non-null  object
dtypes: object(2)
memory usage: 745.2+ KB


In [None]:
df.head()

Unnamed: 0,words,tags
0,"[8., Ferrous, Sulfate, 325, (, 65, ), mg, Tabl...","[O, B-treatment, I-treatment, O, O, O, O, O, O..."
1,"[15., Clopidogrel, 75, mg, Tablet, Sig, :, One...","[O, B-treatment, O, O, O, O, O, O, O, O, O, O,..."
2,"[10., Ipratropium, Bromide, 0.02, %, Solution,...","[O, B-treatment, I-treatment, O, O, O, O, O, O..."
3,"[14., Isosorbide, Dinitrate, 20, mg, Tablet, S...","[O, B-treatment, I-treatment, O, O, O, O, O, O..."
4,"[5., Terazosin, 1, mg, Capsule, Sig, :, Two, (...","[O, B-treatment, O, O, O, O, O, O, O, O, O, O,..."


In [None]:
X = df['words']
y = df['tags']

In [None]:
tags_ids = {

    'O': 1,
    'B-test':2,
    'I-test':3,
    'B-problem':4,
    'I-problem':5,
    'B-treatment':6,
    'I-treatment':7

}

In [None]:
# ids_tags = {
#     1:'O',
#     2:'B-test',
#     3:'I-test',
#     4:'B-problem',
#     5:'I-problem',
#     6:'B-treatment',
#     7:'I-treatment'

# }

In [None]:
# y_train_ids = y_train.apply(lambda z: list(map(lambda x: tags_ids[x],z)))

In [None]:
# y_test_ids = y_test.apply(lambda z: list(map(lambda x: tags_ids[x],z)))

In [None]:
info_dict = {}
info_dict['id'] = list(map(str,list(df.index)))
info_dict['tokens'] = list(X.values)
info_dict['ner_tags'] = list(y.values)

In [None]:
info_dict['id'][0]

'0'

In [None]:
info_dict['tokens'][0]

['8.',
 'Ferrous',
 'Sulfate',
 '325',
 '(',
 '65',
 ')',
 'mg',
 'Tablet',
 'Sig',
 ':',
 'One',
 '(',
 '1',
 ')',
 'Tablet',
 'PO',
 'DAILY',
 '(',
 'Daily',
 ').']

In [None]:
info_dict['ner_tags'][0]

['O',
 'B-treatment',
 'I-treatment',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [None]:
checkpoint = 'allenai/scibert_scivocab_uncased'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
!pip install datasets



In [None]:
from datasets import Dataset, Sequence, ClassLabel

dataset = Dataset.from_dict(info_dict)
dataset = dataset.cast_column('ner_tags', Sequence(ClassLabel(names=list(tags_ids.keys()))))

Casting the dataset:   0%|          | 0/47686 [00:00<?, ? examples/s]

In [None]:
dataset.features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-test', 'I-test', 'B-problem', 'I-problem', 'B-treatment', 'I-treatment'], id=None), length=-1, id=None)}

In [None]:
my_dataset = dataset.train_test_split(test_size=0.2)

In [None]:
my_dataset2 = my_dataset['train'].train_test_split(test_size=0.25)

In [None]:
my_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 38148
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 9538
    })
})

In [None]:
my_dataset2

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 28611
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 9537
    })
})

In [None]:
from datasets import DatasetDict
final_dataset = DatasetDict({
    'train': my_dataset2['train'],
    'validation':my_dataset2['test'],
    'test:':my_dataset['test']
})

In [None]:
final_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 28611
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 9537
    })
    test:: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 9538
    })
})

In [None]:
final_dataset['train'][0]

{'id': '47527',
 'tokens': ['BLOOD',
  'WBC',
  '-',
  '2.2',
  '*',
  'RBC',
  '-',
  '3.09',
  '*',
  'Hgb',
  '-',
  '9.6',
  '*',
  'Hct',
  '-',
  '28.7',
  '*',
  'MCV',
  '-',
  '93',
  'MCH',
  '-',
  '31.2',
  'MCHC',
  '-',
  '33.6',
  'RDW',
  '-',
  '17.0',
  '*',
  'Plt',
  'Ct',
  '-',
  '89',
  '*'],
 'ner_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [None]:
# the following code is heavily inspired by hugging face tutorial: https://huggingface.co/learn/nlp-course/chapter7/2

In [None]:
ner_feature = final_dataset["train"].features["ner_tags"]

In [None]:
label_names = ner_feature.feature.names

In [None]:
words = final_dataset["train"][0]["tokens"]
labels = final_dataset["train"][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

BLOOD WBC - 2.2 * RBC - 3.09 * Hgb - 9.6 * Hct - 28.7 * MCV    - 93 MCH - 31.2 MCHC - 33.6 RDW - 17.0 * Plt Ct - 89 * 
O     O   O O   O O   O O    O O   O O   O O   O O    O B-test O O  O   O O    O    O O    O   O O    O O   O  O O  O 


In [None]:
# this cell's code is from : https://huggingface.co/docs/transformers/tasks/token_classification
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
tokenized_data = my_dataset.map(tokenize_and_align_labels, batched=True,remove_columns=final_dataset["train"].column_names)

Map:   0%|          | 0/38148 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/9538 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer,return_tensors='pt')

In [None]:
!pip install evaluate



In [None]:
!pip install seqeval



In [None]:
import evaluate

metric = evaluate.load('seqeval')

In [None]:
labels = final_dataset["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [None]:
inputs = tokenizer(final_dataset["train"][0]["tokens"], is_split_into_words=True)

In [None]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [None]:
labels = final_dataset["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [None]:
tokenized_datasets = final_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=final_dataset["train"].column_names,
)

Map:   0%|          | 0/28611 [00:00<?, ? examples/s]

Map:   0%|          | 0/9537 [00:00<?, ? examples/s]

Map:   0%|          | 0/9538 [00:00<?, ? examples/s]

In [None]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,    0,    0, -100, -100,    0,    0,    0,    0, -100,
         -100,    0,    0, -100,    0,    0, -100, -100,    0,    0,    0,    0,
         -100, -100,    0,    1, -100,    0,    0,    0,    0,    0, -100, -100,
            0, -100,    0,    0, -100, -100,    0, -100,    0,    0, -100, -100,
            0,    0, -100,    0,    0,    0,    0, -100],
        [-100,    1, -100,    2,    0, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100]])

In [None]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 0, 0, 0, 0, -100, -100, 0, 0, 0, 0, -100, -100, 0, 0, -100, 0, 0, -100, -100, 0, 0, 0, 0, -100, -100, 0, 1, -100, 0, 0, 0, 0, 0, -100, -100, 0, -100, 0, 0, -100, -100, 0, -100, 0, 0, -100, -100, 0, 0, -100, 0, 0, 0, 0, -100]
[-100, 1, -100, 2, 0, -100]


In [None]:
#
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [None]:
label2id

{'O': 0,
 'B-test': 1,
 'I-test': 2,
 'B-problem': 3,
 'I-problem': 4,
 'B-treatment': 5,
 'I-treatment': 6}

In [None]:
id2label

{0: 'O',
 1: 'B-test',
 2: 'I-test',
 3: 'B-problem',
 4: 'I-problem',
 5: 'B-treatment',
 6: 'I-treatment'}

In [None]:
word_ids

[None,
 0,
 1,
 2,
 3,
 3,
 3,
 4,
 5,
 6,
 7,
 7,
 7,
 8,
 9,
 9,
 10,
 11,
 11,
 11,
 12,
 13,
 14,
 15,
 15,
 15,
 16,
 17,
 17,
 18,
 19,
 20,
 21,
 22,
 22,
 22,
 23,
 23,
 24,
 25,
 25,
 25,
 26,
 26,
 27,
 28,
 28,
 28,
 29,
 30,
 30,
 31,
 32,
 33,
 34,
 None]

In [None]:
#
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install transformers[torch]



In [None]:
!pip install accelerate -U



In [None]:
!pip install transformers



In [None]:
#
from transformers import TrainingArguments

args = TrainingArguments(
    checkpoint,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01
)

In [None]:
#
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2184,0.211672,0.53275,0.338647,0.414081,0.916336
2,0.2022,0.207976,0.667921,0.223388,0.3348,0.917828
3,0.1819,0.230703,0.67864,0.223912,0.336724,0.917395
4,0.1817,0.249573,0.581697,0.282643,0.380435,0.914415
5,0.1685,0.279826,0.544039,0.278553,0.368454,0.912257
6,0.1643,0.335456,0.564187,0.252124,0.348507,0.90958
7,0.1597,0.380874,0.550526,0.269114,0.36151,0.907588
8,0.153,0.434547,0.522507,0.260514,0.34768,0.90413
9,0.1502,0.475255,0.49369,0.274882,0.353139,0.900565
10,0.1515,0.514288,0.469599,0.275406,0.347194,0.897626


TrainOutput(global_step=35770, training_loss=0.17446053803283673, metrics={'train_runtime': 5529.915, 'train_samples_per_second': 51.739, 'train_steps_per_second': 6.468, 'total_flos': 1.0009456362138156e+16, 'train_loss': 0.17446053803283673, 'epoch': 10.0})

In [None]:
trainer.save_model('model_v2')

In [None]:
sentences = ['Patient has diabetes.','CT scan was negative.','Here are the medications: Ozempic, Zoloft, and Humira','The patient was prescribed Humira for their psoriasis.']

In [None]:
from transformers import pipeline

ner_pipeline = pipeline("ner",model='model_v2')

result = ner_pipeline(sentences[0])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
for entity in result:
  print(f"Entity: {entity['word']}, Label: {entity['entity']}")

Entity: diabetes, Label: B-problem


In [None]:
for sentence in sentences:
  print(ner_pipeline(sentence))

[{'entity': 'B-problem', 'score': 0.99989915, 'index': 3, 'word': 'diabetes', 'start': 12, 'end': 20}]
[{'entity': 'B-test', 'score': 0.9998907, 'index': 1, 'word': 'ct', 'start': 0, 'end': 2}, {'entity': 'I-test', 'score': 0.9998838, 'index': 2, 'word': 'scan', 'start': 3, 'end': 7}]
[]
[{'entity': 'B-problem', 'score': 0.6758801, 'index': 8, 'word': 'their', 'start': 38, 'end': 43}, {'entity': 'I-problem', 'score': 0.6467693, 'index': 9, 'word': 'psoriasis', 'start': 44, 'end': 53}]


In [None]:
!zip -r /content/file.zip /content/model_v2/

  adding: content/model_v2/ (stored 0%)
  adding: content/model_v2/model.safetensors (deflated 7%)
  adding: content/model_v2/tokenizer.json (deflated 71%)
  adding: content/model_v2/special_tokens_map.json (deflated 42%)
  adding: content/model_v2/training_args.bin (deflated 51%)
  adding: content/model_v2/config.json (deflated 54%)
  adding: content/model_v2/tokenizer_config.json (deflated 74%)
  adding: content/model_v2/vocab.txt (deflated 52%)


In [None]:
from google.colab import files
files.download("/content/file.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>