In [None]:
# pip install transformers[torch]

In [3]:
pip install --user datasets

Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForMaskedLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset

### 1. Identify Your Domain-Specific Dataset
- Choose a dataset related to a particular domain (e.g., legal, medical, finance, science, etc.).
- Ensure that the dataset contains enough textual data to be useful for training an MLM but not so much that you can't process it (may need to be some trial and error).


In [7]:
df = pd.read_csv('combined_cleaned_scripts.csv')

In [8]:
#TODO: do some nlp clean up on it
df.head()

Unnamed: 0,Scene_ID,Character,Dialogue,Scene_Description,Metadata
0,0.0,Star Trek V,The Final Frontier,,
1,0.0,SYBOK,I thought weapons were forbidden on this plane...,,"""The Planet of Galactic Peace"""
2,0.0,J'ONN,It's all I have.,,(out of a dust storm a horseman approaches an ...
3,0.0,SYBOK,Your pain runs deep.,,
4,0.0,J'ONN,What do you know of my pain?,,


#### **Converting a Pandas DataFrame to a Hugging Face Dataset**
The following line of code converts a Pandas DataFrame into a Hugging Face `Dataset`,
which is optimized for use in HF training pipeline.

In [9]:
##use Dataset from HF to make training easier later
dataset = Dataset.from_pandas(pd.DataFrame({"text": df["Dialogue"].tolist()}))
##alternate datatypes
# Dataset.from_dict()
# Dataset.from_list()

In [10]:
dataset

Dataset({
    features: ['text'],
    num_rows: 7404
})

### 3. Load Model

In [11]:
##load models
model_name = "distilbert-base-uncased" #
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

#### **Converting a Pandas DataFrame to a Hugging Face Dataset**
The below code applies a tokenization function to an entire dataset using Hugging Face's Dataset.map() function. There are two parts: <br>
**The tokenize_function tokenizes text data using a Hugging Face tokenizer**
* Truncation (truncation=True) → Ensures long texts are cut off at 512 tokens (BERT’s max length).
* Padding (padding=True) → Ensures shorter texts are padded to 512 tokens.
* Max Length (max_length=512) → Defines the maximum token limit per input.
  <br>

**dataset.map applies the function to the entire dataset**

* Lambda Function (lambda x: tokenize_function(x, tokenizer))
* Converts dataset.map() into a format that passes tokenizer into tokenize_function().
batched=True → Optimizes Processing

* Instead of processing one row at a time, it processes multiple rows at once, which is faster.


In [12]:
def tokenize_function(examples, tokenizer):
    """Tokenizes input text for MLM fine-tuning."""
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=512)

In [13]:
##tokenize the dataset
#The .map() tokenization only prepares the dataset—but does not return PyTorch tensors.
tokenized_datasets = dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)



  0%|          | 0/8 [00:00<?, ?ba/s]

#### **Using HF Data Collator**
* A utility from Hugging Face's transformers library that helps prepare data batches for training models like BERT, RoBERTa, and DistilBERT.
* It automatically masks words in the input so that the model can learn to predict the missing words.

In [14]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.3
)

In [15]:
def compute_perplexity(model, tokenizer, dataset, data_collator):
    """
    Computes the perplexity of a dataset using a masked language model.

    Parameters:
    - model: The pre-trained language model (e.g., BERT, DistilBERT).
    - tokenizer: The tokenizer corresponding to the model.
    - dataset: A dictionary containing a list of text samples under the "text" key.
    - data_collator: A collator that applies dynamic token masking for MLM training.

    Returns:
    - float: The computed perplexity value.
    """

    # Ensure model is in evaluation mode (prevents gradient updates)
    model.eval()

    # List to store loss values for each sample
    losses = []

    # Iterate over each text sample in the dataset
    for example in dataset["text"]:

        # Convert the text input into tokenized format
        # `return_tensors="pt"` returns tensors for PyTorch
        # `truncation=True` ensures long texts are truncated to max length
        # `padding=True` ensures consistent input size
        inputs = tokenizer(example, return_tensors="pt", truncation=True, padding=True)

        # Error handlind:
        # Check if `input_ids` tensor is empty (can happen for blank inputs)
        if inputs["input_ids"].nelement() == 0:
            print(f"Skipping empty input: {example}")
            continue

        # Apply masking using the data collator
        # The collator takes care of randomly masking tokens in the batch
        masked_batch = data_collator([{"input_ids": inputs["input_ids"].squeeze(0)}])

        # Move the masked batch tensors to the model's device (GPU if available)
        masked_batch = {k: v.to(model.device) for k, v in masked_batch.items()}

        # Disable gradient computation to save memory and improve speed
        with torch.no_grad():
            # Forward pass: compute predictions and loss
            outputs = model(**masked_batch)

            # Extract loss value from the model output
            loss = outputs.loss.item()

            # Debugging: Check if loss is NaN or Inf (should not happen)
            if np.isnan(loss) or np.isinf(loss):
                print(f"Skipping invalid loss for input: {example}")
                continue  # Skip this sample

            # Store valid loss values for later averaging
            losses.append(loss)

    # Handle case where all samples were skipped (to prevent NaN output)
    if not losses:
        return np.nan  # Return NaN if no valid losses were recorded

    # Compute perplexity:
    # Perplexity = exp(mean_loss)  (Lower perplexity means better language modeling)
    return np.exp(np.mean(losses))


In [16]:
perplexity_original = compute_perplexity(model, tokenizer, dataset, data_collator)
print(f"Baseline Perplexity: {perplexity_original:.2f}")

Skipping invalid loss for input: The Final Frontier
Skipping invalid loss for input: Your pain runs deep.
Skipping invalid loss for input: The power was within you.
Skipping invalid loss for input: But how?
Skipping invalid loss for input: You're a Vulcan!
Skipping invalid loss for input: Uhura, I thought you were on leave?
Skipping invalid loss for input: I don't believe this. Commander Sulu here.
Skipping invalid loss for input: And?
Skipping invalid loss for input: Ah.
Skipping invalid loss for input: Captain?
Skipping invalid loss for input: Jim?
Skipping invalid loss for input: Are you sure about that?
Skipping invalid loss for input: Yes.
Skipping invalid loss for input: You caught me on my way to the shower.
Skipping invalid loss for input: The feeling's mutual. Engine room.
Skipping invalid loss for input: Aye sir.
Skipping invalid loss for input: Good morning, Captain.
Skipping invalid loss for input: Ah, this must be the hostage tape.
Skipping invalid loss for input: Fascinat