# Pre-Trained Language Models (PLM)

#### Part I: BERT Implementation

In [1]:
import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
# Function to read and clean data for one review
def process_file(file_path, label):
    with open(file_path, 'r', encoding='utf-8') as file:
        review_text = file.read()
    return review_text, label

# Function to read the data in parallel
def load_data(data_dir):
    reviews = []
    labels = []
    tasks = []
    
    with ThreadPoolExecutor() as executor:
        for label_type in ['pos', 'neg']:
            
            dir_name = os.path.join(data_dir, label_type)
            label = 1 if label_type == 'pos' else 0
            
            for file_name in os.listdir(dir_name):
                file_path = os.path.join(dir_name, file_name)
                # Submit tasks to process files in parallel
                tasks.append(executor.submit(process_file, file_path, label))
        
        # Collect results as tasks complete
        for task in as_completed(tasks):
            review, label = task.result()
            reviews.append(review)
            labels.append(label)
    
    return reviews, labels

# Load train and test datasets
train_reviews, train_labels = load_data('train')
test_reviews, test_labels = load_data('test')

# View an example of a review
print(f"Label: {train_labels[0]}")
print(" ".join(train_reviews[0].split()[:100])) 


Label: 1
If people didn't know who Barbra Streisand was before this,...(is that POSSIBLE?)...they sure knew who she was after!<br /><br />This show went on to win 5 Emmys, & stands out as one the best things Streisand has ever done.<br /><br />It's made up of 3 acts....<br /><br />ACT I...Barbra singing standards from room to room, filled with musicians, including a segment where she is a little girl again,all ending with a splendid version of her signature song,(at the time)..."People".<br /><br />ACT II....A musical tour of Bergdoff-Goodman,while Barbra Sings poverty songs..it's better than it sounds...<br /><br />ACT III.....The best part, Just


In [3]:
# Initialize DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the datasets
def tokenize_data(reviews, labels):
    encoding = tokenizer(reviews, padding=True, truncation=True, max_length=250, return_tensors='pt')
    labels = torch.tensor(labels)
    return TensorDataset(encoding['input_ids'], encoding['attention_mask'], labels)

train_dataset = tokenize_data(train_reviews, train_labels)
test_dataset = tokenize_data(test_reviews, test_labels)

# Create DataLoader for batching
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Load pre-trained DistilBERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 3


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training function
def train_model(model, train_loader, optimizer, epochs):
    model.train()
    loss_values = []
    
    for epoch in range(epochs):
        total_loss = 0
        correct_predictions = 0
        total_predictions = 0
        
        for batch in train_loader:
            input_ids, attention_mask, labels = [item for item in batch]
            
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # Get predictions and calculate the number of correct predictions
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            correct_predictions += torch.sum(preds == labels).item()
            total_predictions += labels.size(0)
        
        # Calculate average loss and accuracy for the epoch
        avg_loss = total_loss / len(train_loader)
        accuracy = correct_predictions / total_predictions
        loss_values.append(avg_loss)
        
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {avg_loss}, Accuracy: {accuracy:.4f}')

    # Plotting the Training Convergence Plot
    plt.plot(range(1, len(loss_values) + 1), loss_values)
    plt.title('Training Loss Convergence')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.grid(True)
    plt.show()

# Train the model
train_model(model, train_loader, optimizer, epochs)


In [None]:
# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = [item for item in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.numpy())
            all_labels.extend(labels.numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f'Test Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')

# Evaluate the model
evaluate_model(model, test_loader)


#### Part II: Fine-Tuning GPT-2 for Joke Generation

In [8]:
import pandas as pd
from datasets import Dataset

from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

In [9]:
# Load the dataset
df = pd.read_csv('data')

# Extract the jokes column
jokes = df['Joke'].tolist()

# View an example of a joke
print(jokes[:1])


['What did the bartender say to the jumper cables? You better not try to start anything.']


In [None]:
# Convert the list of jokes into a dataset
jokes_dataset = Dataset.from_dict({"text": jokes})

# Load GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Add [PAD] token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Resize the model’s embedding layer to account for the added special token
model.resize_token_embeddings(len(tokenizer))

# Tokenize the jokes
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_jokes = jokes_dataset.map(tokenize_function, batched=True)

# Convert the dataset into PyTorch DataLoader
tokenized_jokes.set_format(type="torch", columns=["input_ids", "attention_mask"])
dataloader = DataLoader(tokenized_jokes, batch_size=4, shuffle=True)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
epochs = 3
model.train()  # Set the model to training mode

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    loop = tqdm(dataloader, leave=True)  # Progress bar

    for batch in loop:
        # Move batch data to the same device as the model
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Optimize
        optimizer.step()
        optimizer.zero_grad()

        # Update progress bar
        loop.set_description(f"Loss {loss.item():.4f}")

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_gpt2_jokes")
tokenizer.save_pretrained("./fine_tuned_gpt2_jokes")


Map:   0%|          | 0/1622 [00:00<?, ? examples/s]

Epoch 1/3


Loss 0.8588:  24%|██▍       | 97/406 [21:38<1:17:53, 15.12s/it]  