## Pre-trained model (loaded from Hugging Face)

In [1]:
from transformers import BertTokenizer, BertForMaskedLM
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
import streamlit as st
import torch 

# Load pretrained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained("bert-base-uncased")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Dataset for fine tuning (loaded from Hugging Face)

### Loading validation

In [2]:
dataset_name = "c4"
task_name = "en"
dataset = load_dataset(dataset_name, task_name, split="validation",streaming=True)

# count = 0
# for example in dataset:
#     if count >= 1000:
#         break
#     # Process or use the example as needed
#     print(example["text"])  # Or perform any other operation
#     count += 1

#convert dataset to a list so I can fine tune
dataset_list = list(dataset)

In [3]:
# sampleing subset for demonstration purposes
subset_size = 1000
data_subset = dataset_list[:min(subset_size, len(dataset_list))]

# extracting texts from the subset
texts = [example["text"] for example in data_subset]

# Tokenize / Prepare Data

In [4]:
# tokenize the texts
tokenized_texts = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

In [7]:
# Define custom dataset class
class MLMCustomDataset(Dataset):
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        return len(self.tokenized_texts['input_ids'])
    
    def __getitem__(self, index):
        input_ids = self.tokenized_texts['input_ids'][index].clone()
        labels = input_ids.clone()

        # Randomly mask 15% of tokens for MLM training
        probability_matrix = torch.rand(input_ids.shape)
        mask_indices = probability_matrix < 0.15
        mask_indices[torch.logical_or(input_ids == 101, input_ids == 102)] = False  # Do not mask [CLS] and [SEP] tokens
        input_ids[mask_indices] = 103  # Mask token ID
        
        return {'input_ids': input_ids,
                'attention_mask': self.tokenized_texts['attention_mask'][index],
                'labels': labels}
    
mlm_dataset = MLMCustomDataset(tokenized_texts)

In [8]:
# split the data into training and validation sets
train_indices, val_indices = train_test_split(range(len(mlm_dataset)), test_size=0.2, random_state=42)

# splitting the data into training and validation sets
train_dataset = torch.utils.data.Subset(mlm_dataset, train_indices)
val_dataset = torch.utils.data.Subset(mlm_dataset,val_indices)

In [9]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./output/fine_tuned_model',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Finetuning 

In [10]:
trainer.train()

100%|██████████| 300/300 [7:50:31<00:00, 94.10s/it]   

{'train_runtime': 28231.4243, 'train_samples_per_second': 0.085, 'train_steps_per_second': 0.011, 'train_loss': 0.4919365437825521, 'epoch': 3.0}





TrainOutput(global_step=300, training_loss=0.4919365437825521, metrics={'train_runtime': 28231.4243, 'train_samples_per_second': 0.085, 'train_steps_per_second': 0.011, 'train_loss': 0.4919365437825521, 'epoch': 3.0})

# Evaluating Metrics

In [11]:
results = trainer.evaluate()

100%|██████████| 25/25 [04:49<00:00, 11.58s/it]


In [12]:
print(results)

{'eval_loss': 0.19920219480991364, 'eval_runtime': 310.3496, 'eval_samples_per_second': 0.644, 'eval_steps_per_second': 0.081, 'epoch': 3.0}


# Presenting Info using streamlit

In [56]:
st.title("Machine Learning Model Presentation")

DeltaGenerator()