## Finetune a distilBERT Model for text classification

### Import Libraries

In [1]:
import torch
import random
import numpy as np

from transformers import DistilBertTokenizer, DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AdamW
from torch.nn.utils.rnn import pad_sequence

### Global Variables

In [17]:
output_path = './output/model/'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

RANDOM_SEED = 42
N_EPOCHS = 3
LEARNING_RATE = 5e-5
BATCH_SIZE = 8
SHUFFLE = True

torch.backends.cudnn.deterministic = True
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


### Load Dataset

In [20]:
# Load SST-2 dataset
dataset = load_dataset("glue", "sst2")

# Set up Tokenizer and Model

# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") # Python implementation, slower
# Rust implementation, faster
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Tokenize and preprocess the dataset
def tokenize_function(examples):
    return tokenizer(examples["sentence"], truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# collate function to handle padding
def collate_fn(batch):
    input_ids = [torch.tensor(item['input_ids']) for item in batch]
    attention_masks = [torch.tensor(item['attention_mask']) for item in batch]
    labels = [torch.tensor(item['label']) for item in batch]
    
    # Pad sequences to the same length
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)  # Assuming 0 as the padding value for attention masks
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': torch.tensor(labels)
    }

# Prepare DataLoader with collate function
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=SHUFFLE, batch_size=BATCH_SIZE, collate_fn=collate_fn)

Map: 100%|██████████| 872/872 [00:00<00:00, 16830.02 examples/s]


### Finetune the Model

In [32]:
# Fine-tuning setup
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Training loop
for epoch in range(N_EPOCHS):
    model.train()
    
    for batch in train_dataloader:
        inputs = {key: batch[key] for key in ["input_ids", "attention_mask", "labels"]}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    
    # Print training statistics if needed
    print(f"Epoch {epoch + 1}/{N_EPOCHS}, Loss: {loss.item()}")

Epoch 1/1, Loss: 0.11007733643054962


### Save finetuned model

In [34]:
# save finetuned model to file
torch.save(model.state_dict(), f"{output_path}/distilbert-sst2.pth")

In [35]:
import hashlib

def sha256_hash_file(file_path):
    # Create a SHA-256 hash object
    sha256_hash = hashlib.sha256()

    # Open the file in binary mode and read it in chunks
    with open(file_path, 'rb') as file:
        # Read the file in chunks of 4096 bytes (4 KB)
        for byte_block in iter(lambda: file.read(4096), b''):
            sha256_hash.update(byte_block)

    # Get the hexadecimal representation of the hash
    hash_result = sha256_hash.hexdigest()

    return hash_result

# Hash the entire model
hashed_result = sha256_hash_file("../pytorch_model.bin")

print("SHA-256 Hash:", hashed_result)


SHA-256 Hash: dd2a7a30f0822b2d5d6a4b1ead1ceafea2cc15867fc3650b7f5348e64f3818e0


In [None]:
def compute_merkle_tree_root(uid_list):
    # Convert UUIDs to bytes and hash each leaf node
    hashed_leaves = [hashlib.sha256(uid.bytes).hexdigest() for uid in uid_list]

    # If the number of leaves is odd, duplicate the last leaf to make it even
    if len(hashed_leaves) % 2 != 0:
        hashed_leaves.append(hashed_leaves[-1])

    # Build the Merkle tree by iteratively hashing pairs of nodes
    while len(hashed_leaves) > 1:
        hashed_leaves = [hashlib.sha256(hashlib.sha256(hashed_leaves[i].encode() + hashed_leaves[i + 1].encode()).digest()).hexdigest()
                         for i in range(0, len(hashed_leaves), 2)]

    # The last remaining element is the Merkle tree root
    merkle_tree_root = hashed_leaves[0]

    return merkle_tree_root

# # Example usage with a list of UUIDs
# uuid_list = [uuid.uuid4() for _ in range(4)]
# merkle_tree_root = compute_merkle_tree_root(uuid_list)

In [36]:
import json

# Create a dictionary with your variables
data = {
    "model_name": "distilBERT-sst-2-sentiment-classifier",
    "random_seed": 42,
    "n_epochs": 3,
    "learning_rate": 5e-5,
    "batch_size" : 8,
    "shuffle": True,
    "model_weights_hash(sha256)": hashed_result
}

json_file_path = "./output/sample_model_card.json"

# Write the dictionary to a JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(data, json_file, indent=4)