In [None]:
# pip install transformers[torch]

In [None]:
pip install --user datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForMaskedLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset

### 1. Identify Your Domain-Specific Dataset
- Choose a dataset related to a particular domain (e.g., legal, medical, finance, science, etc.).
- Ensure that the dataset contains enough textual data to be useful for training an MLM, but not so much that you can't process it (this may take some trial and error).


In [None]:
# Read the Star Trek script CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='Cleaned_Star_Trek_Script_Data.csv')

In [None]:
# TODO: apply NLP clean-up to the text before training
# Preview the first five rows of the raw frame
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Scene_ID,Character,Dialogue,Scene_Description,Metadata
0,0,0.0,Star Trek,The Motion Picture,,
1,7,1.0,KLINGON CAPTAIN,In Klingonese Tactical.,[Klingon bridge],(three Klingon battle cruisers approach a lumi...
2,9,1.0,KLINGON CAPTAIN,In Klingonese Visual.,[Klingon bridge],
3,11,1.0,KLINGON CAPTAIN,In Klingonese Tactical. Stand by on torpedoes....,[Klingon bridge],
4,13,1.0,KLINGON CAPTAIN,In Klingonese Evasive!,[Klingon bridge],


#### **Converting a Pandas DataFrame to a Hugging Face Dataset**
The following cell converts the `Dialogue` column of the Pandas DataFrame into a Hugging Face `Dataset`,
which is optimized for use in the Hugging Face training pipeline.

In [None]:
# Wrap the dialogue lines in a Hugging Face Dataset (single "text" column)
# so they can be fed to the training pipeline later.
dialogue_frame = pd.DataFrame({"text": df["Dialogue"].tolist()})
dataset = Dataset.from_pandas(dialogue_frame)
# Alternative constructors for other input types:
# Dataset.from_dict()
# Dataset.from_list()

In [None]:
# Display the Dataset summary (feature names and number of rows)
dataset

Dataset({
    features: ['text'],
    num_rows: 698
})

### 3. Load Model

In [None]:
# Load the pretrained checkpoint: the tokenizer first, then the masked-LM model.
model_name = "distilbert-base-uncased"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForMaskedLM.from_pretrained(model_name),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

#### **Tokenizing the Dataset**
The below code applies a tokenization function to an entire dataset using Hugging Face's Dataset.map() function. There are two parts: <br>
**The tokenize_function tokenizes text data using a Hugging Face tokenizer**
* Truncation (truncation=True) → Ensures long texts are cut off at 512 tokens (BERT’s max length).
* Padding (padding=True) → Pads shorter texts to the length of the longest text in the batch (not to 512; padding every text to 512 would require padding="max_length").
* Max Length (max_length=512) → Defines the maximum token limit per input.
  <br>

**dataset.map applies the function to the entire dataset**

* Lambda Function (lambda x: tokenize_function(x, tokenizer))
  * Wraps tokenize_function() so that dataset.map() can call it with a single argument while still passing in the tokenizer.
* batched=True → Optimizes Processing
  * Instead of processing one row at a time, it processes multiple rows at once, which is faster.


In [None]:
def tokenize_function(examples, tokenizer):
    """Tokenize the "text" entries of a batch for MLM fine-tuning.

    Parameters:
    - examples: A mapping with a "text" key holding the text(s) to tokenize.
    - tokenizer: A callable tokenizer; it receives the texts plus the
      truncation/padding options below.

    Returns:
    - Whatever the tokenizer returns for the given texts.
    """
    # Truncate at 512 tokens; padding=True pads to the longest text in the call.
    tokenizer_options = {"truncation": True, "padding": True, "max_length": 512}
    texts = examples["text"]
    return tokenizer(texts, **tokenizer_options)

In [None]:
# Tokenize the whole dataset in batches.
# Note: .map() only adds the tokenized columns to the dataset; it does not return PyTorch tensors.
tokenized_datasets = dataset.map(
    lambda batch: tokenize_function(batch, tokenizer),
    batched=True,
)

Map:   0%|          | 0/698 [00:00<?, ? examples/s]

#### **Using HF Data Collator**
* A utility from Hugging Face's transformers library that helps prepare data batches for training models like BERT, RoBERTa, and DistilBERT.
* It automatically masks words in the input so that the model can learn to predict the missing words.

In [None]:
# Collator that masks tokens for the MLM objective;
# mlm_probability is the probability with which each token is selected for masking.
mlm_settings = {"mlm": True, "mlm_probability": 0.3}
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, **mlm_settings)

In [None]:
def compute_perplexity(model, tokenizer, dataset, data_collator):
    """
    Estimates the masked-language-model perplexity of a dataset.

    Each text sample is tokenized on its own, masked by `data_collator`, and
    scored by `model`. Per-sample losses are combined as a mean weighted by the
    number of masked tokens, so the result is exp(mean cross-entropy per masked
    token) rather than exp of an unweighted mean of per-sample means.

    Parameters:
    - model: The masked language model (e.g., BERT, DistilBERT). It is called as
      `model(**batch)` and must return an object with a `.loss` tensor.
    - tokenizer: The tokenizer corresponding to the model.
    - dataset: A dictionary-like object whose "text" entry iterates over text samples.
    - data_collator: A collator that applies token masking and returns a batch
      containing "labels", where -100 marks positions that are not scored.

    Returns:
    - float: The computed perplexity, or NaN if no sample produced a usable loss.

    Notes:
    - Samples that are missing/blank, that end up with no masked token, or whose
      loss is not finite are skipped; a single summary line reports how many.
    - Masking is random, so the result varies between calls unless the random
      number generator is seeded beforehand.
    """

    # Label value that the loss ignores (positions that were not masked)
    ignore_index = -100

    # Ensure model is in evaluation mode (disables dropout)
    model.eval()

    weighted_loss_sum = 0.0   # sum over samples of (mean loss * masked tokens)
    masked_token_count = 0    # total number of scored (masked) tokens
    skipped = 0               # samples that contributed nothing

    for example in dataset["text"]:

        # Error handling: missing (None/NaN) or blank rows cannot be scored,
        # and passing a non-string to the tokenizer would raise.
        if not isinstance(example, str) or not example.strip():
            skipped += 1
            continue

        # Tokenize a single sample into PyTorch tensors, truncating long texts
        inputs = tokenizer(example, return_tensors="pt", truncation=True, padding=True)

        # Guard against a tokenizer that yields no tokens at all
        if inputs["input_ids"].nelement() == 0:
            skipped += 1
            continue

        # Apply masking using the data collator
        masked_batch = data_collator([{"input_ids": inputs["input_ids"].squeeze(0)}])

        # Short samples frequently end up with no masked token; the loss would then
        # be NaN, so skip them before paying for a forward pass.
        num_masked = int((masked_batch["labels"] != ignore_index).sum().item())
        if num_masked == 0:
            skipped += 1
            continue

        # Move the masked batch tensors to the model's device (GPU if available)
        masked_batch = {k: v.to(model.device) for k, v in masked_batch.items()}

        # Disable gradient computation to save memory and improve speed
        with torch.no_grad():
            loss = model(**masked_batch).loss.item()

        # Defensive check: never let a NaN/Inf loss poison the average
        if not np.isfinite(loss):
            skipped += 1
            continue

        # The model's loss is a mean over masked tokens; re-weight it so that
        # every masked token counts equally in the final average.
        weighted_loss_sum += loss * num_masked
        masked_token_count += num_masked

    if skipped:
        print(f"Skipped {skipped} sample(s) with no text, no masked tokens, or a non-finite loss")

    # Handle case where all samples were skipped
    if masked_token_count == 0:
        return np.nan

    # Perplexity = exp(mean loss per masked token); lower is better
    return np.exp(weighted_loss_sum / masked_token_count)


In [None]:
# The collator masks tokens at random, so fix the torch seed first; otherwise the
# baseline changes on every run and cannot be compared with later measurements.
torch.manual_seed(42)
perplexity_original = compute_perplexity(model, tokenizer, dataset, data_collator)
print(f"Baseline Perplexity: {perplexity_original:.2f}")

Skipping invalid loss for input: The Motion Picture
Skipping invalid loss for input: In Klingonese Evasive!
Skipping invalid loss for input: Unknown, sir.
Skipping invalid loss for input: Heading?
Skipping invalid loss for input: Aye sir.
Skipping invalid loss for input: Aye sir.
Skipping invalid loss for input: What's the problem? I thought you had that circuit patched an hour ago.
Skipping invalid loss for input: Yeah.
Skipping invalid loss for input: Aye sir.
Skipping invalid loss for input: a scream
Skipping invalid loss for input: a moan
Skipping invalid loss for input: Dock signals clear, Captain.
Skipping invalid loss for input: Hello, Ilia.
Skipping invalid loss for input: And in you too, Lieutenant.
Skipping invalid loss for input: By all means.
Skipping invalid loss for input: Ellen.
Skipping invalid loss for input: Permission granted, sir.
Skipping invalid loss for input: Helm ready, sir.
Skipping invalid loss for input: Yard command signalling clear, sir.
Skipping invalid l