# Loading the datasets

In [1]:
import os
import json

import pandas as pd
from sklearn.model_selection import train_test_split

from google.colab import drive
drive.mount('/content/drive/')

NUM_EPOCHS = 3

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
train = pd.read_csv('/content/drive/My Drive/Coleridge/datasets/train.csv')
train_items = train.sample(n=1000, random_state=42)

X_train, X_test = train_test_split(train_items, test_size=0.1, random_state=42)
train_papers = {}
test_papers = {}

for i in range(len(X_train)):
    curr_path = os.path.join(
        os.getcwd(),
        'drive',
        'My Drive',
        'Coleridge',
        'datasets',
        'train',
        X_train.iloc[i]['Id'] + '.json')
    with open(curr_path, 'r') as file:
        curr_json = json.load(file)
        train_papers[X_train.iloc[i]['Id']] = curr_json

for i in range(len(X_test)):
    curr_path = os.path.join(
        os.getcwd(),
        'drive',
        'My Drive',
        'Coleridge',
        'datasets',
        'train',
        X_test.iloc[i]['Id'] + '.json')
    with open(curr_path, 'r') as file:
        curr_json = json.load(file)
        test_papers[X_test.iloc[i]['Id']] = curr_json

In [3]:
# Extract text data from your papers
def extract_text(papers):
    texts = []
    for paper_id, content in papers.items():
        # Assuming each paper JSON has a key 'text' or 'content' for text data
        paper_text = " ".join([section['text'] for section in content])  # Adjust if the structure is different
        texts.append(paper_text)
    return texts

# Extract training and testing data
train_texts = extract_text(train_papers)
test_texts = extract_text(test_papers)

# Trigram Language Model

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict, Counter
import random

In [5]:
# Step 1: Preprocessing to generate unigrams, bigrams, and trigrams
def generate_ngrams(text):
    tokens = text.split()
    unigrams = tokens
    bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]
    trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]
    return unigrams, bigrams, trigrams

# Example corpus
corpus = ''.join(train_texts)

# Generate unigrams, bigrams, and trigrams from the corpus
unigrams, bigrams, trigrams = generate_ngrams(corpus)

In [6]:
# Step 2: Build a vocabulary and map words to indices
vocab = set(corpus.split())
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for i, word in enumerate(vocab)}

In [7]:
# Step 3: Count frequencies
unigram_counts = Counter(unigrams)
bigram_counts = Counter(bigrams)
trigram_counts = Counter(trigrams)
total_unigrams = len(unigrams)

In [8]:
# Step 4: Create training data for the model
X_train = []
y_train = []

for w1, w2, w3 in trigrams:
    X_train.append((word_to_idx[w1], word_to_idx[w2]))
    y_train.append(word_to_idx[w3])

# Convert training data to tensors
X_train = torch.tensor(X_train, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.long)

In [9]:
# Step 5: Define the model
class TrigramModel(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(TrigramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.linear1 = nn.Linear(embed_dim * 2, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, x):
        x = self.embeddings(x)
        x = x.view((x.shape[0], -1))  # Flatten
        x = torch.relu(self.linear1(x))
        x = self.linear2(x)
        return x

# Initialize the model and train
vocab_size = len(vocab)
embed_dim = 50  # Embedding size
model = TrigramModel(vocab_size, embed_dim)

In [10]:
from torch.utils.data import DataLoader, TensorDataset

# Prepare dataset for batch processing
dataset = TensorDataset(X_train, y_train)
batch_size = 512  # Adjust based on available memory
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [11]:
# Step 6: Interpolation functions
def unigram_prob(word):
    return unigram_counts[idx_to_word[word]] / total_unigrams

def bigram_prob(w2, w1):
    return bigram_counts[(idx_to_word[w1], idx_to_word[w2])] / unigram_counts[idx_to_word[w1]] if unigram_counts[idx_to_word[w1]] > 0 else 0

def trigram_prob(w3, w1, w2):
    return trigram_counts[(idx_to_word[w1], idx_to_word[w2], idx_to_word[w3])] / bigram_counts[(idx_to_word[w1], idx_to_word[w2])] if bigram_counts[(idx_to_word[w1], idx_to_word[w2])] > 0 else 0

def interpolated_prob(w3, w1, w2, lambda1=0.1, lambda2=0.3, lambda3=0.6):
    p1 = unigram_prob(w3)
    p2 = bigram_prob(w3, w2)
    p3 = trigram_prob(w3, w1, w2)
    return lambda1 * p1 + lambda2 * p2 + lambda3 * p3

# Training the Trigram Model

In [12]:
from torch.utils.data import DataLoader, TensorDataset

test_corpus = ''.join(test_texts)
test_unigrams, test_bigrams, test_trigrams = generate_ngrams(test_corpus)

test_vocab = set(test_corpus.split())
test_word_to_idx = {word: i for i, word in enumerate(test_vocab)}
test_idx_to_word = {i: word for i, word in enumerate(test_vocab)}

X_test = []
y_test = []

for w1, w2, w3 in test_trigrams:
    X_test.append((test_word_to_idx[w1], test_word_to_idx[w2]))
    y_test.append(test_word_to_idx[w3])


X_test = torch.tensor(X_test, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

# Prepare dataset for batch processing
test_dataset = TensorDataset(X_test, y_test)
batch_size = 512  # Adjust based on available memory
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [13]:
# TODO: Try to integrate GradScaler
import os
import torch
from tqdm.notebook import tqdm
import torch.nn.functional as F

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Function to save checkpoint
def save_checkpoint(model, optimizer, epoch, loss, checkpoint_dir='checkpoints'):
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    checkpoint_path = os.path.join(checkpoint_dir, f'model_epoch_{epoch}.pth')

    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss
    }, checkpoint_path)

    print(f"Checkpoint saved: {checkpoint_path}")

# Function to load checkpoint
def load_checkpoint(checkpoint_path, model, optimizer=None):
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])

    if optimizer:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    epoch = checkpoint['epoch']
    loss = checkpoint['loss']

    print(f"Checkpoint loaded: Epoch {epoch}, Loss: {loss:.4f}")
    return epoch, loss

# Function to calculate interpolated probability
def interpolated_prob(w3_idx, w1_idx, w2_idx, lambda1=0.1, lambda2=0.3, lambda3=0.6):
    word3 = idx_to_word[w3_idx]
    word1 = idx_to_word[w1_idx]
    word2 = idx_to_word[w2_idx]

    p1 = unigram_prob(w3_idx)  # Unigram probability
    p2 = bigram_prob(w3_idx, w2_idx)  # Bigram probability
    p3 = trigram_prob(w3_idx, w1_idx, w2_idx)  # Trigram probability

    return lambda1 * p1 + lambda2 * p2 + lambda3 * p3

# Function to calculate log-likelihood and perplexity using interpolated probabilities
def calculate_metrics_with_interpolation(model, data_loader):
    model.eval()  # Set model to evaluation mode
    total_log_likelihood = 0.0
    total_words = 0

    with torch.no_grad():  # Disable gradient calculation
        for inputs, targets in tqdm(data_loader, desc="Evaluating"):
            inputs, targets = inputs.to(device), targets.to(device)

            # Calculate log-likelihood using interpolated probabilities
            for i in range(len(inputs)):
                w1_idx, w2_idx = inputs[i][0].item(), inputs[i][1].item()
                w3_idx = targets[i].item()

                # Get interpolated probability
                prob = interpolated_prob(w3_idx, w1_idx, w2_idx)

                # Avoid log(0) by ensuring prob is non-zero
                if prob > 0:
                    log_prob = torch.log(torch.tensor(prob))
                else:
                    log_prob = torch.tensor(-float('inf'))  # Log of zero case

                total_log_likelihood += log_prob.item()
                total_words += 1

    # Average log-likelihood
    average_log_likelihood = total_log_likelihood / total_words

    # Perplexity: exp(-average log-likelihood)
    perplexity = torch.exp(-torch.tensor(average_log_likelihood))

    return total_log_likelihood, perplexity.item()

# Modify the training loop to calculate metrics with interpolation
def train_and_evaluate_model_with_interpolation(model, train_loader, test_loader, num_epochs=10, learning_rate=0.001, checkpoint_dir='checkpoints'):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Move model to the GPU if available
    model = model.to(device)

    for epoch in tqdm(range(num_epochs), desc="Epochs"):
        model.train()
        running_loss = 0.0

        for i, (inputs, targets) in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)):
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % 100 == 99:    # Print every 100 batches
                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{i+1}], Loss: {running_loss / 100:.4f}')
                running_loss = 0.0

        # Save checkpoint after every epoch
        save_checkpoint(model, optimizer, epoch+1, loss.item(), checkpoint_dir)

        # Calculate log-likelihood and perplexity on the test set using interpolated probabilities
        log_likelihood, perplexity = calculate_metrics_with_interpolation(model, test_loader)
        print(f'Epoch [{epoch+1}/{num_epochs}], Log-Likelihood: {log_likelihood:.4f}, Perplexity: {perplexity:.4f}')

# Example: Initialize and train model with evaluation using interpolation
vocab_size = len(vocab)
embed_dim = 50  # Embedding size
model = TrigramModel(vocab_size, embed_dim)

# Assume `train_loader` and `test_loader` are your DataLoaders
# Train and evaluate the model with interpolated probabilities for log-likelihood and perplexity
train_and_evaluate_model_with_interpolation(model, train_loader, test_loader, num_epochs=NUM_EPOCHS)

# You can also run evaluation separately on the test set after training using interpolated probabilities
log_likelihood, perplexity = calculate_metrics_with_interpolation(model, test_loader)
print(f"Final Evaluation - Log-Likelihood: {log_likelihood:.4f}, Perplexity: {perplexity:.4f}")


Using device: cuda


Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1/3:   0%|          | 0/13798 [00:00<?, ?it/s]

Epoch [1/3], Batch [100], Loss: 10.5625
Epoch [1/3], Batch [200], Loss: 8.5157
Epoch [1/3], Batch [300], Loss: 8.3741
Epoch [1/3], Batch [400], Loss: 8.2722
Epoch [1/3], Batch [500], Loss: 8.1680
Epoch [1/3], Batch [600], Loss: 8.1408
Epoch [1/3], Batch [700], Loss: 8.0564
Epoch [1/3], Batch [800], Loss: 8.0114
Epoch [1/3], Batch [900], Loss: 7.9630
Epoch [1/3], Batch [1000], Loss: 7.9328
Epoch [1/3], Batch [1100], Loss: 7.9008
Epoch [1/3], Batch [1200], Loss: 7.8781
Epoch [1/3], Batch [1300], Loss: 7.8631
Epoch [1/3], Batch [1400], Loss: 7.8263
Epoch [1/3], Batch [1500], Loss: 7.8219
Epoch [1/3], Batch [1600], Loss: 7.7965
Epoch [1/3], Batch [1700], Loss: 7.7719
Epoch [1/3], Batch [1800], Loss: 7.7543
Epoch [1/3], Batch [1900], Loss: 7.7186
Epoch [1/3], Batch [2000], Loss: 7.7482
Epoch [1/3], Batch [2100], Loss: 7.6893
Epoch [1/3], Batch [2200], Loss: 7.6974
Epoch [1/3], Batch [2300], Loss: 7.6979
Epoch [1/3], Batch [2400], Loss: 7.6674
Epoch [1/3], Batch [2500], Loss: 7.6740
Epoch [1

Evaluating:   0%|          | 0/1564 [00:00<?, ?it/s]

Epoch [1/3], Log-Likelihood: -13800114.8855, Perplexity: 30609710.0000


Epoch 2/3:   0%|          | 0/13798 [00:00<?, ?it/s]

Epoch [2/3], Batch [100], Loss: 6.7652
Epoch [2/3], Batch [200], Loss: 6.7309
Epoch [2/3], Batch [300], Loss: 6.7312
Epoch [2/3], Batch [400], Loss: 6.7284
Epoch [2/3], Batch [500], Loss: 6.6969
Epoch [2/3], Batch [600], Loss: 6.7111
Epoch [2/3], Batch [700], Loss: 6.6992
Epoch [2/3], Batch [800], Loss: 6.7056
Epoch [2/3], Batch [900], Loss: 6.6957
Epoch [2/3], Batch [1000], Loss: 6.7169
Epoch [2/3], Batch [1100], Loss: 6.6836
Epoch [2/3], Batch [1200], Loss: 6.6813
Epoch [2/3], Batch [1300], Loss: 6.7064
Epoch [2/3], Batch [1400], Loss: 6.6631
Epoch [2/3], Batch [1500], Loss: 6.6586
Epoch [2/3], Batch [1600], Loss: 6.6612
Epoch [2/3], Batch [1700], Loss: 6.6525
Epoch [2/3], Batch [1800], Loss: 6.6596
Epoch [2/3], Batch [1900], Loss: 6.6722
Epoch [2/3], Batch [2000], Loss: 6.6617
Epoch [2/3], Batch [2100], Loss: 6.6537
Epoch [2/3], Batch [2200], Loss: 6.6540
Epoch [2/3], Batch [2300], Loss: 6.6291
Epoch [2/3], Batch [2400], Loss: 6.6532
Epoch [2/3], Batch [2500], Loss: 6.6751
Epoch [2/

Evaluating:   0%|          | 0/1564 [00:00<?, ?it/s]

Epoch [2/3], Log-Likelihood: -13800114.8855, Perplexity: 30609710.0000


Epoch 3/3:   0%|          | 0/13798 [00:00<?, ?it/s]

Epoch [3/3], Batch [100], Loss: 6.3288
Epoch [3/3], Batch [200], Loss: 6.3233
Epoch [3/3], Batch [300], Loss: 6.3325
Epoch [3/3], Batch [400], Loss: 6.2839
Epoch [3/3], Batch [500], Loss: 6.3281
Epoch [3/3], Batch [600], Loss: 6.3528
Epoch [3/3], Batch [700], Loss: 6.3172
Epoch [3/3], Batch [800], Loss: 6.3106
Epoch [3/3], Batch [900], Loss: 6.2809
Epoch [3/3], Batch [1000], Loss: 6.2986
Epoch [3/3], Batch [1100], Loss: 6.3057
Epoch [3/3], Batch [1200], Loss: 6.2816
Epoch [3/3], Batch [1300], Loss: 6.3141
Epoch [3/3], Batch [1400], Loss: 6.3029
Epoch [3/3], Batch [1500], Loss: 6.2831
Epoch [3/3], Batch [1600], Loss: 6.2683
Epoch [3/3], Batch [1700], Loss: 6.2892
Epoch [3/3], Batch [1800], Loss: 6.2929
Epoch [3/3], Batch [1900], Loss: 6.2818
Epoch [3/3], Batch [2000], Loss: 6.2880
Epoch [3/3], Batch [2100], Loss: 6.3146
Epoch [3/3], Batch [2200], Loss: 6.2978
Epoch [3/3], Batch [2300], Loss: 6.2933
Epoch [3/3], Batch [2400], Loss: 6.2837
Epoch [3/3], Batch [2500], Loss: 6.2705
Epoch [3/

Evaluating:   0%|          | 0/1564 [00:00<?, ?it/s]

Epoch [3/3], Log-Likelihood: -13800114.8855, Perplexity: 30609710.0000


Evaluating:   0%|          | 0/1564 [00:00<?, ?it/s]

Final Evaluation - Log-Likelihood: -13800114.8855, Perplexity: 30609710.0000


# Evaluation of Trigram Model

In [14]:
# Function to calculate log-likelihood and perplexity
def calculate_metrics(model, data_loader):
    model.eval()  # Set model to evaluation mode
    total_log_likelihood = 0.0
    total_words = 0

    with torch.no_grad():  # Disable gradient calculation
        for inputs, targets in tqdm(data_loader, desc="Evaluating"):
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model(inputs)

            # Calculate log probabilities (log softmax)
            log_probs = F.log_softmax(outputs, dim=1)

            # Gather the log likelihoods for the true targets
            log_likelihoods = log_probs[range(len(targets)), targets]

            # Sum log likelihoods
            total_log_likelihood += log_likelihoods.sum().item()
            total_words += len(targets)

    # Average negative log likelihood
    average_log_likelihood = total_log_likelihood / total_words

    # Perplexity: exp(-average log-likelihood)
    perplexity = torch.exp(-torch.tensor(average_log_likelihood))

    return total_log_likelihood, perplexity.item()

test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)
metrics = calculate_metrics(model, test_loader)

Evaluating:   0%|          | 0/100078 [00:00<?, ?it/s]

In [21]:
print(metrics)

(-13523289.508937836, 21661860.0)


# Transformer Decoder-Only Model

## Initialize Tokenizer and GPT2 Model

In [15]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader
import torch

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# GPT-2 does not have a padding token, so we'll assign one
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Embedding(50257, 768)

## Create and Set Custom Dataset

In [16]:
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenize the text
        encodings = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors="pt"
        )
        input_ids = encodings['input_ids'].squeeze()
        attention_mask = encodings['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': input_ids  # For language modeling, input and target are the same
        }


In [17]:
train_dataset = TextDataset(train_texts, tokenizer)
test_dataset = TextDataset(test_texts, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)


## Set up Transformer Training

In [18]:
from transformers import Trainer, TrainingArguments

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir='./logs',
    save_steps=10_000,
    save_total_limit=2,
    fp16=True  # Enable if you're using a T4 unit
)

# Define a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Start training
trainer.train()


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,No log,2.998817
2,No log,2.97538
3,No log,2.971974


TrainOutput(global_step=333, training_loss=3.094998929593656, metrics={'train_runtime': 315.2576, 'train_samples_per_second': 8.45, 'train_steps_per_second': 1.056, 'total_flos': 696081973248000.0, 'train_loss': 3.094998929593656, 'epoch': 3.0})

## Evaluate and compute metrics

In [19]:
results = trainer.evaluate()
print(results)


{'eval_loss': 2.9719741344451904, 'eval_runtime': 6.92, 'eval_samples_per_second': 14.451, 'eval_steps_per_second': 1.879, 'epoch': 3.0}


In [20]:
import math

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def compute_log_likelihood_and_perplexity(dataloader):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0.0
    total_length = 0

    with torch.no_grad():  # Disable gradient computation for efficiency
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Forward pass, compute the logits and loss
            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss  # Cross-entropy loss

            batch_size = input_ids.size(0)
            batch_loss = loss.item() * batch_size
            total_loss += batch_loss
            total_length += input_ids.size(1)  # Count total tokens

    # Compute average log-likelihood (cross-entropy loss)
    avg_log_likelihood = total_loss / total_length

    # Compute perplexity: exp of average negative log-likelihood
    perplexity = math.exp(avg_log_likelihood)

    return avg_log_likelihood, perplexity

# Create the DataLoader
test_dataset = TextDataset(test_texts, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8)

# Compute log-likelihood and perplexity
log_likelihood, perplexity = compute_log_likelihood_and_perplexity(test_loader)

print(f"Log-Likelihood: {log_likelihood}")
print(f"Perplexity: {perplexity}")

Using device: cuda
Log-Likelihood: 0.04465106086662182
Perplexity: 1.0456629235277861
