In [None]:
# imports needed for pytorch tinyBERT project
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm
import random

In [None]:
# download tinyBERT source code and install dependencies
!git clone https://github.com/huawei-noah/Pretrained-Language-Model.git

Cloning into 'Pretrained-Language-Model'...
remote: Enumerating objects: 1253, done.[K
remote: Counting objects: 100% (280/280), done.[K
remote: Compressing objects: 100% (161/161), done.[K
remote: Total 1253 (delta 173), reused 120 (delta 119), pack-reused 973 (from 1)[K
Receiving objects: 100% (1253/1253), 29.72 MiB | 15.14 MiB/s, done.
Resolving deltas: 100% (540/540), done.


In [None]:
%cd Pretrained-Language-Model/TinyBERT
!pip install -r requirements.txt

/content/Pretrained-Language-Model/TinyBERT
Collecting boto3 (from -r requirements.txt (line 4))
  Downloading boto3-1.35.71-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.36.0,>=1.35.71 (from boto3->-r requirements.txt (line 4))
  Downloading botocore-1.35.71-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3->-r requirements.txt (line 4))
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3->-r requirements.txt (line 4))
  Downloading s3transfer-0.10.4-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.35.71-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.35.71-py3-none-any.whl (13.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m110.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl 

# Resources and Links

https://github.com/google-research/bert

# General questions

What does it mean for the model to be "cased" or "uncased"?
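ANS: A "cased" model keeps the original capitalization (and accent markers) of the text, so "Apple" and "apple" are treated as different inputs. An "uncased" model lowercases all text and strips accent markers before tokenization, so capitalization carries no information.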

# Setup and Dependencies

In [None]:
# One seed shared by every random number generator used in this notebook.
SEED = 42

# Python's and NumPy's global generators take the seed the same way.
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(SEED)

# Request deterministic cuDNN kernels before seeding PyTorch.
torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)

<torch._C.Generator at 0x7e07fc5ccc30>

In [None]:
print('Installing torchprofile...')
!pip install torchprofile 1>/dev/null
print('Installing fast-pytorch-kmeans...')
! pip install fast-pytorch-kmeans 1>/dev/null
print('All required packages have been successfully installed!')

Installing torchprofile...
Installing fast-pytorch-kmeans...
All required packages have been successfully installed!


In [None]:
from torchprofile import profile_macs
from torch import nn

In [None]:
# Download Google's original BERT-base (cased) release:
# https://github.com/google-research/bert?tab=readme-ov-file
# `wget -nc` skips the download when the archive is already on disk and
# `unzip -n` never overwrites files that were already extracted, so this cell
# can be re-run without leaving `.zip.1` duplicates or stopping at unzip's
# interactive overwrite prompt.
!wget -nc https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip
!unzip -n cased_L-12_H-768_A-12.zip
# The configuration must also be available under the name config.json
# (bert_config.json is copied, the original file is kept).
!cp cased_L-12_H-768_A-12/bert_config.json cased_L-12_H-768_A-12/config.json

# Directory created by the archive, relative to the current working directory.
BERT_BASE_DIR = 'cased_L-12_H-768_A-12'

--2024-12-02 02:27:04--  https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.12.207, 172.217.194.207, 172.253.118.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.12.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 404261442 (386M) [application/zip]
Saving to: ‘cased_L-12_H-768_A-12.zip’


2024-12-02 02:27:22 (21.5 MB/s) - ‘cased_L-12_H-768_A-12.zip’ saved [404261442/404261442]

Archive:  cased_L-12_H-768_A-12.zip
   creating: cased_L-12_H-768_A-12/
  inflating: cased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: cased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: cased_L-12_H-768_A-12/vocab.txt  
  inflating: cased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: cased_L-12_H-768_A-12/bert_config.json  


In [None]:
# cloning TinyBert pretrained models
# Options below
# https://huggingface.co/huawei-noah/TinyBERT_General_4L_312D
# https://huggingface.co/huawei-noah/TinyBERT_General_6L_768D

!git clone https://huggingface.co/huawei-noah/TinyBERT_General_4L_312D
STUDENT_CONFIG_DIR = '/content/Pretrained-Language-Model/TinyBERT/TinyBERT_General_4L_312D'


Cloning into 'TinyBERT_General_4L_312D'...
remote: Enumerating objects: 24, done.[K
remote: Total 24 (delta 0), reused 0 (delta 0), pack-reused 24 (from 1)[K
Unpacking objects: 100% (24/24), 111.20 KiB | 7.94 MiB/s, done.


In [None]:
def get_model_flops(model, inputs):
    """Return the multiply-accumulate (MAC) count that ``profile_macs`` reports.

    Despite the function name, the value is passed through unchanged; no
    MAC-to-FLOP conversion is applied here.

    :param model: module to profile, forwarded to ``profile_macs``
    :param inputs: example inputs, forwarded to ``profile_macs``
    :return: whatever ``profile_macs(model, inputs)`` returns
    """
    return profile_macs(model, inputs)

In [None]:
def get_model_size(model: nn.Module, data_width=32):
    """
    Compute the number of bits needed to store every parameter of ``model``.

    Only tensors yielded by ``model.parameters()`` are counted.

    :param model: module whose parameters are counted
    :param data_width: #bits per element
    :return: total parameter element count multiplied by ``data_width``
    """
    total_elements = sum(tensor.numel() for tensor in model.parameters())
    return total_elements * data_width

# Unit sizes expressed in bits, i.e. the same unit get_model_size returns.
Byte = 8
KiB = Byte * 1024
MiB = KiB * 1024
GiB = MiB * 1024

# How large is tinyBERT to begin with?

In [None]:
from transformer.modeling import TinyBertForPreTraining, BertModel

In [None]:
STUDENT_CONFIG_DIR = '/content/Pretrained-Language-Model/TinyBERT/TinyBERT_General_4L_312D'
BERT_BASE_DIR = '/content/Pretrained-Language-Model/TinyBERT/cased_L-12_H-768_A-12'

In [None]:
student_model = TinyBertForPreTraining.from_scratch(STUDENT_CONFIG_DIR)
teacher_model = BertModel.from_scratch(BERT_BASE_DIR)


In [None]:
# STORAGE
student_model_size = get_model_size(student_model)
teacher_model_size = get_model_size(teacher_model)

print("Student model size: ", student_model_size/MiB, "MiB")
print("Teacher model size: ", teacher_model_size/MiB, "MiB")

Student model size:  56.15257263183594 MiB
Teacher model size:  413.1708984375 MiB


# Task Distillation

In [None]:
!git clone https://github.com/nyu-mll/GLUE-baselines.git
!python GLUE-baselines/download_glue_data.py --data_dir /content --tasks SST

Cloning into 'GLUE-baselines'...
remote: Enumerating objects: 891, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 891 (delta 1), reused 3 (delta 1), pack-reused 886 (from 1)[K
Receiving objects: 100% (891/891), 1.48 MiB | 26.54 MiB/s, done.
Resolving deltas: 100% (610/610), done.
Downloading and extracting SST...
	Completed!


In [None]:
# Create the directories used by task distillation.
# The names are expanded by the shell from environment variables, so the cell
# that sets them through os.environ has to be executed before this one.
# Each variable is written as "$NAME" (the previous `$FT_BERT_BASE_DIR$ TMP_TINYBERT_DIR`
# created a directory literally called TMP_TINYBERT_DIR), and quoted so a path
# containing spaces stays a single argument.
!mkdir -p "$FT_BERT_BASE_DIR" "$TMP_TINYBERT_DIR" "$TASK_DIR" "$TINYBERT_DIR"

In [None]:
import os

# Locations and task name for task distillation, exported as environment
# variables so that `!` shell cells can refer to them as $NAME.
os.environ.update(
    {
        "FT_BERT_BASE_DIR": "/content/Pretrained-Language-Model/TinyBERT/cased_L-12_H-768_A-12",
        "TMP_TINYBERT_DIR": "/content/Pretrained-Language-Model/TinyBERT/TinyBERT_General_4L_312D",
        "TASK_DIR": "/content/SST-2",
        "TASK_NAME": "sst-2",
        "TINYBERT_DIR": "/content/tinybert_output",
    }
)

# Echo the two model directories as a quick sanity check.
for _checked_name in ("FT_BERT_BASE_DIR", "TMP_TINYBERT_DIR"):
    print(os.environ[_checked_name])


/content/Pretrained-Language-Model/TinyBERT/cased_L-12_H-768_A-12
/content/Pretrained-Language-Model/TinyBERT/TinyBERT_General_4L_312D


In [None]:
!wc -l /content/Pretrained-Language-Model/TinyBERT/TinyBERT_General_4L_312D/vocab.txt
#checks vocab size

# Resize the teacher's token-embedding matrix and store the result in a
# separate "_resized" directory.
# NOTE(review): this step was described as shrinking from 30522 to TinyBERT's
# 28996, but the recorded output of the `wc -l` call in this cell shows 30522
# lines for TinyBERT's vocab.txt -- confirm which vocabulary size is intended.
# NOTE: this import rebinds `BertModel`, which an earlier cell imported from
# transformer.modeling.
from transformers import BertModel
import torch

# Paths
teacher_model_path = "/content/Pretrained-Language-Model/TinyBERT/cased_L-12_H-768_A-12"
resized_teacher_model_path = "/content/Pretrained-Language-Model/TinyBERT/cased_L-12_H-768_A-12_resized"

# Load the teacher model
# ignore_mismatched_sizes=True lets loading continue when a checkpoint tensor
# and the instantiated model disagree on shape; the warning printed by this
# cell lists the affected weights as newly initialized.
teacher_model = BertModel.from_pretrained(teacher_model_path,ignore_mismatched_sizes=True)

# Resize the embedding layer to `tinybert_vocab_size` rows
tinybert_vocab_size = 28996
teacher_model.resize_token_embeddings(tinybert_vocab_size)

# Save the resized model (weights plus configuration) to the new directory
teacher_model.save_pretrained(resized_teacher_model_path)

print("Resized teacher model saved to:", resized_teacher_model_path)


30522 /content/Pretrained-Language-Model/TinyBERT/TinyBERT_General_4L_312D/vocab.txt


Some weights of BertModel were not initialized from the model checkpoint at /content/Pretrained-Language-Model/TinyBERT/cased_L-12_H-768_A-12 and are newly initialized because the shapes did not match:
- bert.embeddings.word_embeddings.weight: found shape torch.Size([30522, 768]) in the checkpoint and torch.Size([28996, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Resized teacher model saved to: /content/Pretrained-Language-Model/TinyBERT/cased_L-12_H-768_A-12_resized


In [None]:
# Resizing the model above saves the model as safetensors so we need to convert to pytorch_model.bin

# !pip install safetensors  # only needed if the import below fails

from transformers import AutoModel, AutoConfig
from safetensors.torch import load_file
import torch

# Paths
resized_model_dir = "/content/Pretrained-Language-Model/TinyBERT/cased_L-12_H-768_A-12_resized"
safetensors_path = f"{resized_model_dir}/model.safetensors"
output_path = f"{resized_model_dir}/pytorch_model.bin"

# Load the configuration that save_pretrained() stored next to the weights.
# Building the architecture from the hub's "bert-base-cased" config instead
# needs network access and makes the strict load below fail as soon as the
# saved tensors have different shapes (for example another vocabulary size).
model_name_or_config = resized_model_dir
config = AutoConfig.from_pretrained(model_name_or_config)

# Initialize the model
model = AutoModel.from_config(config)

# Load weights from safetensors; the default strict load raises if any
# parameter name or shape does not match the architecture.
state_dict = load_file(safetensors_path)
model.load_state_dict(state_dict)

# Save as pytorch_model.bin
torch.save(model.state_dict(), output_path)

print(f"Model successfully converted and saved to {output_path}")

Model successfully converted and saved to /content/Pretrained-Language-Model/TinyBERT/cased_L-12_H-768_A-12_resized/pytorch_model.bin


In [None]:
# verify we can load the model now
from transformers import AutoModel

model_path = "/content/Pretrained-Language-Model/TinyBERT/cased_L-12_H-768_A-12_resized/pytorch_model.bin"
model_name_or_config = "bert-base-cased"

# Load the model with converted weights.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict holds; it avoids executing arbitrary pickled code and the
# warning torch.load emits when the argument is omitted. map_location="cpu"
# keeps the check independent of whether a GPU is attached.
model = AutoModel.from_pretrained(
    model_name_or_config,
    state_dict=torch.load(model_path, map_location="cpu", weights_only=True),
)
print("Model loaded successfully!")

  model = AutoModel.from_pretrained(model_name_or_config, state_dict=torch.load(model_path))


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Model loaded successfully!


In [None]:
# We were unable to distill using the cased model as the teacher
!CUDA_LAUNCH_BLOCKING=1 python task_distill.py --pred_distill \
                       --teacher_model /content/Pretrained-Language-Model/TinyBERT/cased_L-12_H-768_A-12_resized\
                       --student_model $TMP_TINYBERT_DIR \
                       --data_dir $TASK_DIR \
                       --task_name sst-2 \
                       --output_dir $TINYBERT_DIR \
                       --do_lower_case \
                       --learning_rate 3e-5 \
                       --num_train_epochs 3 \
                       --eval_step 100 \
                       --max_seq_length 128 \
                       --train_batch_size 8

# Distillation with other models as student and teacher

In [None]:
# Download bert base uncases as teacher model
!mkdir -p teacher_model
!wget https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin -O teacher_model/pytorch_model.bin
!wget https://huggingface.co/bert-base-uncased/resolve/main/config.json -O teacher_model/config.json
!wget https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt -O teacher_model/vocab.txt


--2024-12-14 19:01:13--  https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin
Resolving huggingface.co (huggingface.co)... 13.35.210.114, 13.35.210.61, 13.35.210.77, ...
Connecting to huggingface.co (huggingface.co)|13.35.210.114|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.hf.co/bert-base-uncased/097417381d6c7230bd9e3557456d726de6e83245ec8b24f529f60198a67b203a?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1734460582&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczNDQ2MDU4Mn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9iZXJ0LWJhc2UtdW5jYXNlZC8wOTc0MTczODFkNmM3MjMwYmQ5ZTM1NTc0NTZkNzI2ZGU2ZTgzMjQ1ZWM4YjI0ZjUyOWY2MDE5OGE2N2IyMDNhP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiZyZXNwb25zZS1jb250ZW50LXR5cGU9KiJ9XX0_&Signature=rZMjG-U42xgHR0FRjckLqlBw3TML

In [None]:
# Student model remains the same as previous attempt
!mkdir -p student_model
!wget https://huggingface.co/huawei-noah/TinyBERT_General_4L_312D/resolve/main/pytorch_model.bin -O student_model/pytorch_model.bin
!wget https://huggingface.co/huawei-noah/TinyBERT_General_4L_312D/resolve/main/config.json -O student_model/config.json
!wget https://huggingface.co/huawei-noah/TinyBERT_General_4L_312D/resolve/main/vocab.txt -O student_model/vocab.txt

--2024-12-02 03:16:47--  https://huggingface.co/huawei-noah/TinyBERT_General_4L_312D/resolve/main/pytorch_model.bin
Resolving huggingface.co (huggingface.co)... 13.35.210.61, 13.35.210.77, 13.35.210.66, ...
Connecting to huggingface.co (huggingface.co)|13.35.210.61|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.hf.co/huawei-noah/TinyBERT_General_4L_312D/84ac219f2fdab6e7f54fc6db4d7a9493708990b006d58f50cc1dceeef2a12f8a?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1733368607&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMzM2ODYwN319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9odWF3ZWktbm9haC9UaW55QkVSVF9HZW5lcmFsXzRMXzMxMkQvODRhYzIxOWYyZmRhYjZlN2Y1NGZjNmRiNGQ3YTk0OTM3MDg5OTBiMDA2ZDU4ZjUwY2MxZGNlZWVmMmExMmY4YT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29

In [None]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
# fine-tune teacher model on SST dataset
# can finetune further for better accuracy and minimization of overfitting
import numpy as np
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

# Load SST-2 dataset and tokenizer
dataset = load_dataset("glue", "sst2")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess(example):
    """Tokenize a batch of sentences, truncated/padded to exactly 128 tokens."""
    return tokenizer(
        example["sentence"], truncation=True, padding="max_length", max_length=128
    )

def compute_metrics(eval_pred):
    """Return validation accuracy for one evaluation pass.

    :param eval_pred: the (logits, labels) pair that Trainer passes after evaluation
    :return: dict with a single ``accuracy`` entry in [0, 1]
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}

encoded_dataset = dataset.map(preprocess, batched=True)
# Trainer expects the target column to be called "labels"; the raw text and
# the example index are not model inputs.
encoded_dataset = encoded_dataset.rename_column("label", "labels").remove_columns(["sentence", "idx"])

# Load model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Training setup
training_args = TrainingArguments(
    output_dir="./sst2_finetuned_teacher",
    eval_strategy="epoch",  # Updated
    report_to="none",       # Disable W&B
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    # Without this only the loss is reported, so the teacher's accuracy (the
    # quantity the comment at the top of this cell refers to) is never measured.
    compute_metrics=compute_metrics,
)

trainer.train()
model.save_pretrained("./sst2_finetuned_teacher")
tokenizer.save_pretrained("./sst2_finetuned_teacher")


Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.1752,0.273332
2,0.1253,0.314316
3,0.0675,0.353066


('./sst2_finetuned_teacher/tokenizer_config.json',
 './sst2_finetuned_teacher/special_tokens_map.json',
 './sst2_finetuned_teacher/vocab.txt',
 './sst2_finetuned_teacher/added_tokens.json',
 './sst2_finetuned_teacher/tokenizer.json')

In [None]:
from safetensors.torch import load_file
from transformers import AutoModelForSequenceClassification, AutoConfig
import torch

# Paths
safetensors_path = "/content/Pretrained-Language-Model/TinyBERT/sst2_finetuned_teacher/model.safetensors"
output_path = "/content/Pretrained-Language-Model/TinyBERT/sst2_finetuned_teacher/"
weights_path = f"{output_path}pytorch_model.bin"

# Load configuration
config = AutoConfig.from_pretrained("/content/Pretrained-Language-Model/TinyBERT/sst2_finetuned_teacher")

# Load model architecture
model = AutoModelForSequenceClassification.from_config(config)

# Load weights from safetensors (strict: every name and shape must match)
state_dict = load_file(safetensors_path)
model.load_state_dict(state_dict)

# Re-save through save_pretrained() as before; the weights file read above was
# produced by save_pretrained() as model.safetensors, so this call on its own
# is not relied on to create a .bin file.
model.save_pretrained("/content/Pretrained-Language-Model/TinyBERT/sst2_finetuned_teacher")

# Save as pytorch_model.bin: write the complete state dict (encoder and
# classification head) explicitly so the conversion actually happens.
torch.save(model.state_dict(), weights_path)

print(f"Model successfully converted to {output_path}")


Model successfully converted to /content/Pretrained-Language-Model/TinyBERT/sst2_finetuned_teacher/


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_path = "/content/Pretrained-Language-Model/TinyBERT/sst2_finetuned_teacher"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Test the model
inputs = tokenizer("This is a great movie!", return_tensors="pt")
outputs = model(**inputs)
print(outputs.logits)


tensor([[-3.9834,  3.9285]], grad_fn=<AddmmBackward0>)


In [None]:
# Save finetuned model as pytorch model instead of safetensors
from safetensors.torch import load_file
from transformers import AutoModel, AutoConfig, AutoModelForSequenceClassification
import torch

teacher_dir = "/content/Pretrained-Language-Model/TinyBERT/sst2_finetuned_teacher"
safetensors_path = f"{teacher_dir}/model.safetensors"
output_path = f"{teacher_dir}/pytorch_model.bin"

# Load the configuration saved with the fine-tuned checkpoint so the
# architecture, including the classification head, matches the weights.
model_name_or_config = teacher_dir
config = AutoConfig.from_pretrained(model_name_or_config)

# Initialize the full sequence-classification architecture. Loading into a
# bare AutoModel required stripping the "bert." prefix and discarded
# classifier.weight / classifier.bias, i.e. the fine-tuned prediction head.
model = AutoModelForSequenceClassification.from_config(config)

# Load state_dict from safetensors
state_dict = load_file(safetensors_path)

# Keep the parameter names exactly as saved ("bert.*" and "classifier.*").
# NOTE(review): task_distill.py is expected to load the teacher as a
# sequence-classification model that looks weights up under these names and
# only logs names it cannot find -- verify against TinyBERT's
# transformer/modeling.py.
new_state_dict = dict(state_dict)

# Load state_dict into the model and report any mismatch
missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
print(f"Missing keys: {missing_keys}")
print(f"Unexpected keys: {unexpected_keys}")
if missing_keys or unexpected_keys:
    # A partial match would export a teacher with untrained weights; stop
    # instead of silently writing it.
    raise ValueError(
        "Fine-tuned checkpoint does not match the sequence-classification "
        f"architecture (missing={missing_keys}, unexpected={unexpected_keys})"
    )

# Save the model as pytorch_model.bin
torch.save(model.state_dict(), output_path)
print(f"Model successfully converted and saved to {output_path}")


Missing keys: []
Unexpected keys: ['classifier.bias', 'classifier.weight']
Model successfully converted and saved to /content/Pretrained-Language-Model/TinyBERT/sst2_finetuned_teacher/pytorch_model.bin


In [None]:
# Prediction-layer distillation (--pred_distill) using the SST-2 fine-tuned
# teacher in ./sst2_finetuned_teacher and the student in ./student_model.
# This run completes without crashing.
# NOTE(review): the evaluation recorded in this cell's output (epoch 1, step
# 2199) reports acc of about 0.51 and eval_loss of about 0.69, which is chance
# level for a two-label task -- confirm that the teacher weights were actually
# loaded before treating the distilled student as usable.
!python task_distill.py --pred_distill \
    --teacher_model ./sst2_finetuned_teacher \
    --student_model ./student_model \
    --data_dir $TASK_DIR \
    --task_name sst-2 \
    --output_dir ./tinybert_output \
    --do_lower_case \
    --learning_rate 3e-5 \
    --num_train_epochs 3 \
    --eval_step 100 \
    --max_seq_length 128 \
    --train_batch_size 8


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Iteration:   4% 94/2105 [00:18<06:33,  5.11it/s][A12/02 06:09:55 AM ***** Running evaluation *****
12/02 06:09:55 AM   Epoch = 1 iter 2199 step
12/02 06:09:55 AM   Num examples = 872
12/02 06:09:55 AM   Batch size = 32


Evaluating:   0% 0/28 [00:00<?, ?it/s][A[A

Evaluating:  32% 9/28 [00:00<00:00, 83.45it/s][A[A

Evaluating:  64% 18/28 [00:00<00:00, 81.28it/s][A[A

Evaluating: 100% 28/28 [00:00<00:00, 82.30it/s]
12/02 06:09:56 AM ***** Eval results *****
12/02 06:09:56 AM   acc = 0.5091743119266054
12/02 06:09:56 AM   att_loss = 0.0
12/02 06:09:56 AM   cls_loss = 0.3457980485338914
12/02 06:09:56 AM   eval_loss = 0.6922773931707654
12/02 06:09:56 AM   global_step = 2199
12/02 06:09:56 AM   loss = 0.3457980485338914
12/02 06:09:56 AM   rep_loss = 0.0

Iteration:   5% 95/2105 [00:18<09:59,  3.35it/s][A
Iteration:   5% 96/2105 [00:19<09:04,  3.69it/s][A
Iteration:   5% 97/2105 [00:19<08:20,  4.01it/s][A
Iteration

In [None]:
# mount google drive and use  cp commands below to save distilled model to drive for future use
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!cp -r /content/Pretrained-Language-Model/TinyBERT/sst2_finetuned_teacher /content/drive/My\ Drive/


In [None]:
!cp -r /content/Pretrained-Language-Model/TinyBERT/tinybert_output /content/drive/My\ Drive/distilled_tinybert_output