# Loading the datasets

In [None]:
import json
import os
from collections import Counter

import pandas as pd
from google.colab import drive
from sklearn.model_selection import train_test_split
# Mount Google Drive at /content/drive/; the dataset paths used by the loading
# cell below all live under this mount point.
drive.mount('/content/drive/')

In [None]:
# Single root for every dataset path, so the CSV and the per-paper JSON files
# are resolved the same way regardless of the current working directory.
DATA_DIR = '/content/drive/My Drive/Datasets/Coleridge/datasets'


def load_papers(paper_ids, papers_dir):
    """Read ``<papers_dir>/<Id>.json`` for each id and return ``{Id: parsed JSON}``.

    Args:
        paper_ids: iterable of paper id strings (e.g. the ``Id`` column of a split).
        papers_dir: directory that holds one ``<Id>.json`` file per paper.

    Returns:
        dict mapping each id to the object produced by ``json.load``, in order
        of first appearance.

    Raises:
        FileNotFoundError: if a paper's JSON file is missing.
    """
    papers = {}
    # dict.fromkeys drops repeated ids (keeping order) so no file is parsed twice.
    for paper_id in dict.fromkeys(paper_ids):
        paper_path = os.path.join(papers_dir, paper_id + '.json')
        with open(paper_path, 'r', encoding='utf-8') as file:
            papers[paper_id] = json.load(file)
    return papers


train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
# Work on a fixed random subset of 1000 rows to keep the notebook fast.
train_items = train.sample(n=1000, random_state=42)

# NOTE(review): the split is done on CSV rows; if the same Id can appear on
# several rows, one paper may land in both splits - confirm against the data.
X_train, X_test = train_test_split(train_items, test_size=0.1, random_state=42)

# Both splits read from the 'train' folder: the held-out papers are carved out
# of the training set above.
train_papers = load_papers(X_train['Id'], os.path.join(DATA_DIR, 'train'))
test_papers = load_papers(X_test['Id'], os.path.join(DATA_DIR, 'train'))

# Trigram Language Model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict
import random

In [None]:
# Step 1: Preprocessing to generate unigrams, bigrams, and trigrams
def generate_ngrams(text):
    """Split ``text`` on whitespace and build its word n-grams.

    Args:
        text: raw string; tokens are whatever ``str.split()`` yields, so
            punctuation stays attached to words.

    Returns:
        A 3-tuple ``(unigrams, bigrams, trigrams)``: the token list, the list
        of adjacent token pairs, and the list of adjacent token triples.
        Texts that are too short simply give empty lists.
    """
    words = text.split()
    # zip stops at the shortest argument, which yields exactly the sliding windows.
    pairs = list(zip(words, words[1:]))
    triples = list(zip(words, words[1:], words[2:]))
    return words, pairs, triples

# Example corpus: a tiny hand-written text used to demonstrate the trigram model.
# Tokens come from a plain whitespace split, so "NLP." and "NLP" are distinct words.
corpus = "I am learning NLP. NLP is fun. I love learning about models in NLP."

# Generate unigrams, bigrams, and trigrams from the corpus
# (module-level lists reused by the counting and training-data cells below).
unigrams, bigrams, trigrams = generate_ngrams(corpus)

In [None]:
# Step 2: Build a vocabulary and map words to indices
vocab = set(corpus.split())
# Assign indices in sorted order. Enumerating the set directly depends on string
# hashing, which is randomised per interpreter session, so the word<->index
# mapping (and anything saved against it) would change after a kernel restart.
word_to_idx = {word: i for i, word in enumerate(sorted(vocab))}
# Built by inverting word_to_idx so the two tables can never disagree.
idx_to_word = {i: word for word, i in word_to_idx.items()}

In [None]:
# Step 3: Count frequencies
# One Counter per n-gram order; a missing key reads as 0, which the probability
# functions below rely on.
unigram_counts, bigram_counts, trigram_counts = map(
    Counter, (unigrams, bigrams, trigrams)
)
# Denominator for unigram probabilities: total number of tokens in the corpus.
total_unigrams = len(unigrams)

In [None]:
# Step 3 (check): validate the frequency tables built in the previous cell.
# This cell used to repeat the counting code verbatim; recomputing identical
# Counters adds nothing, so it now asserts the invariants later steps depend on:
# every n-gram occurrence is counted exactly once.
assert sum(unigram_counts.values()) == total_unigrams == len(unigrams)
assert sum(bigram_counts.values()) == len(bigrams)
assert sum(trigram_counts.values()) == len(trigrams)

In [None]:
# Step 4: Create training data for the model
# Every trigram (w1, w2, w3) becomes one example: the index pair of the two
# context words is the input and the index of the third word is the label.
# Note: this rebinds X_train, which the data-loading cell also uses for the
# DataFrame split.
X_train = torch.tensor(
    [(word_to_idx[first], word_to_idx[second]) for first, second, _ in trigrams],
    dtype=torch.long,
)
y_train = torch.tensor(
    [word_to_idx[third] for _, _, third in trigrams],
    dtype=torch.long,
)

In [None]:
# Step 5: Define the model
class TrigramModel(nn.Module):
    """Feed-forward next-word scorer over a fixed two-word context.

    The two context indices are embedded, the embeddings are concatenated, and
    the result goes through a 128-unit ReLU layer followed by a linear layer
    that emits one score (logit) per vocabulary entry.

    Args:
        vocab_size: number of distinct words (embedding rows and output logits).
        embed_dim: size of each word embedding.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        # Input width is 2 * embed_dim because two context embeddings are concatenated.
        self.linear1 = nn.Linear(2 * embed_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, x):
        """Map a batch of index rows ``x`` to unnormalised scores over the vocabulary."""
        batch_size = x.shape[0]
        # Concatenate the per-word embeddings of each row into one vector.
        context = self.embeddings(x).view(batch_size, -1)
        hidden = torch.relu(self.linear1(context))
        return self.linear2(hidden)

In [None]:
# Step 6: Interpolation functions
def unigram_prob(word):
    """Relative frequency of the word with index ``word`` among all corpus tokens.

    Reads the module-level ``idx_to_word``, ``unigram_counts`` and
    ``total_unigrams`` tables.
    """
    token = idx_to_word[word]
    return unigram_counts[token] / total_unigrams

def bigram_prob(w2, w1):
    """Count-based estimate of P(w2 | w1) for word indices ``w2`` and ``w1``.

    Computed as count(w1, w2) / count(w1) from the module-level tables.
    Returns 0 when the history word ``w1`` has a zero count.
    """
    history = idx_to_word[w1]
    history_count = unigram_counts[history]
    if history_count > 0:
        return bigram_counts[(history, idx_to_word[w2])] / history_count
    return 0

def trigram_prob(w3, w1, w2):
    """Count-based estimate of P(w3 | w1, w2) for word indices.

    Computed as count(w1, w2, w3) / count(w1, w2) from the module-level tables.
    Returns 0 when the context pair (w1, w2) has a zero count.
    """
    context = (idx_to_word[w1], idx_to_word[w2])
    context_count = bigram_counts[context]
    if context_count > 0:
        return trigram_counts[context + (idx_to_word[w3],)] / context_count
    return 0

def interpolated_prob(w3, w1, w2, lambda1=0.1, lambda2=0.3, lambda3=0.6):
    """Linear interpolation of the unigram, bigram and trigram estimates for ``w3``.

    Args:
        w3: index of the word being scored.
        w1, w2: indices of the two preceding words (``w2`` is the nearer one).
        lambda1, lambda2, lambda3: weights of the unigram, bigram and trigram
            terms; the defaults sum to 1, but this is not enforced.

    Returns:
        lambda1 * P(w3) + lambda2 * P(w3 | w2) + lambda3 * P(w3 | w1, w2).
    """
    unigram_term = lambda1 * unigram_prob(w3)
    bigram_term = lambda2 * bigram_prob(w3, w2)
    trigram_term = lambda3 * trigram_prob(w3, w1, w2)
    return unigram_term + bigram_term + trigram_term

In [None]:
# Step 7: Train the model with interpolated probabilities
def train_model(model, X_train, y_train, num_epochs=10, learning_rate=0.001, num_preview=5):
    """Train ``model`` with full-batch Adam steps and log progress each epoch.

    Args:
        model: module mapping ``X_train`` to one row of class scores per example.
        X_train: tensor of context word indices, one row ``(w1, w2)`` per example.
        y_train: tensor of target word indices, aligned with ``X_train``.
        num_epochs: number of optimisation steps (one per epoch, whole set at once).
        learning_rate: Adam learning rate.
        num_preview: how many of the leading training examples to print the
            count-based interpolated probability for after each epoch; 0 turns
            the preview off.

    Returns:
        None. The model is updated in place; losses and previews are printed.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Never index past the end of the data: the original fixed range(5) raised
    # IndexError for training sets with fewer than five examples.
    preview_count = max(0, min(num_preview, len(X_train)))

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()

        # Forward pass
        outputs = model(X_train)
        loss = criterion(outputs, y_train)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

        # Show interpolated probabilities for the first few training examples.
        # These come from the n-gram count tables, not from the network, so
        # they are the same on every epoch.
        for i in range(preview_count):
            w1, w2 = X_train[i]
            w3 = y_train[i]
            interp_prob = interpolated_prob(w3.item(), w1.item(), w2.item())
            print(f"Interpolated probability of {idx_to_word[w3.item()]} given {idx_to_word[w1.item()]} {idx_to_word[w2.item()]}: {interp_prob:.4f}")

# Initialize the model and train
# Seed PyTorch's RNG first so the randomly initialised weights - and therefore
# the printed losses - are the same on every Restart & Run All.
torch.manual_seed(42)

vocab_size = len(vocab)
embed_dim = 50  # Embedding size
model = TrigramModel(vocab_size, embed_dim)

# Train the model for 10 epochs
train_model(model, X_train, y_train, num_epochs=10)

# Transformer Decoder-Only Model

In [3]:
import math
import os
import json
import re
import random

import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils.rnn as rnn_utils

from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split

from collections import defaultdict



MessageError: Error: credential propagation was unsuccessful

# Preprocessing the data

In [None]:
# Extract text data from your papers
def extract_text(papers):
    """Flatten each paper into a single string.

    Args:
        papers: mapping of paper id -> iterable of section dicts, each with a
            ``'text'`` entry (adjust the key if the JSON layout differs).

    Returns:
        List with one string per paper, in the mapping's iteration order: the
        paper's section texts joined by single spaces.
    """
    return [
        " ".join(section['text'] for section in sections)
        for sections in papers.values()
    ]

# Extract training and testing data
# Each result is a list with one flattened string per loaded paper
# (train_papers / test_papers are the dicts built in the data-loading cell).
train_texts = extract_text(train_papers)
test_texts = extract_text(test_papers)

# Tokenizing the Data

In [None]:
import torch
from transformers import BertTokenizer

# Initialize tokenizer (or use any other tokenizer that fits your dataset)
# Loads the pretrained 'bert-base-uncased' tokenizer; its vocab_size and
# pad_token_id are reused below to size the model and to mask the loss.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize input and output (target) sequences
def tokenize_texts(texts):
    """Encode ``texts`` with the module-level ``tokenizer``.

    Sequences are truncated to at most 512 tokens and padded to a common
    length; results come back as PyTorch tensors (``return_tensors='pt'``).
    """
    encoding_options = dict(
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='pt',
    )
    return tokenizer(texts, **encoding_options)

# Prepare data (assuming you have train_texts and test_texts)
train_input_ids = tokenize_texts(train_texts)['input_ids']

# Decoder targets must belong to the same papers as the inputs. The previous
# version tokenised test_texts here, which paired every training input with an
# unrelated held-out paper and produced a different number of rows than
# train_input_ids. For next-token prediction the target is the training
# sequence itself; the training loop applies the one-token shift.
# NOTE(review): with an encoder-decoder model the encoder then sees the whole
# target sequence, so the task is easy to solve by copying - confirm this is
# the intended objective.
train_output_ids = train_input_ids

# Use a special token for padding and start/end of sequences if needed


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


NameError: name 'train_texts' is not defined

In [None]:
import torch.nn as nn
import torch.optim as optim

class TransformerModel(nn.Module):
    """Encoder-decoder ``nn.Transformer`` over token ids with learned positions.

    Inputs are ``(batch, seq_len)`` tensors of token ids. Token and position
    embeddings are summed, passed through ``nn.Transformer`` and projected to
    one score per output-vocabulary entry.

    Args:
        input_dim: number of rows in the shared token-embedding table.
        output_dim: number of output classes per position.
        d_model: embedding / hidden width.
        nhead: number of attention heads.
        num_encoder_layers: encoder depth.
        num_decoder_layers: decoder depth.
        dim_feedforward: width of the feed-forward sublayers.
        dropout: dropout probability inside the transformer.
        max_len: longest sequence (in tokens) the position table supports.
    """

    def __init__(self, input_dim, output_dim, d_model, nhead, num_encoder_layers,
                 num_decoder_layers, dim_feedforward, dropout=0.1, max_len=512):
        super(TransformerModel, self).__init__()

        self.max_len = max_len
        self.embedding = nn.Embedding(input_dim, d_model)
        self.pos_encoder = nn.Embedding(max_len, d_model)  # Learned positional encoding
        # batch_first=True: callers pass (batch, seq) ids, so embeddings are
        # (batch, seq, d_model); the default layout would read the batch axis
        # as the sequence axis.
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=dim_feedforward,
                                          dropout=dropout,
                                          batch_first=True)
        self.fc_out = nn.Linear(d_model, output_dim)

    def _embed(self, token_ids):
        """Return token embeddings plus position embeddings for ``(batch, seq)`` ids."""
        seq_len = token_ids.size(1)
        if seq_len > self.max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the positional table size {self.max_len}"
            )
        # The position table is indexed by position (0..seq_len-1), not by the
        # token embeddings; nn.Embedding only accepts integer indices.
        positions = torch.arange(seq_len, device=token_ids.device).unsqueeze(0)
        return self.embedding(token_ids) + self.pos_encoder(positions)

    def forward(self, src, tgt, src_key_padding_mask=None, tgt_key_padding_mask=None):
        """Score every target position.

        Args:
            src: ``(batch, src_len)`` encoder token ids.
            tgt: ``(batch, tgt_len)`` decoder input token ids.
            src_key_padding_mask: optional bool ``(batch, src_len)``, True at padding.
            tgt_key_padding_mask: optional bool ``(batch, tgt_len)``, True at padding.

        Returns:
            ``(batch, tgt_len, output_dim)`` tensor of unnormalised scores.

        Raises:
            ValueError: if either sequence is longer than ``max_len``.
        """
        src_emb = self._embed(src)
        tgt_emb = self._embed(tgt)

        # Causal mask (True = blocked): position i may only attend to positions
        # <= i, otherwise the decoder can read the token it is asked to predict.
        tgt_len = tgt.size(1)
        causal_mask = torch.triu(
            torch.ones(tgt_len, tgt_len, dtype=torch.bool, device=tgt.device),
            diagonal=1,
        )

        transformer_output = self.transformer(
            src_emb,
            tgt_emb,
            tgt_mask=causal_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=src_key_padding_mask,
        )

        return self.fc_out(transformer_output)  # Final output layer

# Hyperparameters
# Input and output vocabularies are both the tokenizer's vocabulary.
input_dim = output_dim = tokenizer.vocab_size
d_model = 512
nhead = 8
num_encoder_layers = 6
num_decoder_layers = 6
dim_feedforward = 2048

# Note: this rebinds `model`, replacing the trigram model created earlier.
model = TransformerModel(
    input_dim=input_dim,
    output_dim=output_dim,
    d_model=d_model,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    dim_feedforward=dim_feedforward,
)


# Creating PyTorch Datasets

In [None]:
# import torch
# from torch.utils.data import Dataset

# class TextDataset(Dataset):
#     def __init__(self, encodings):
#         self.encodings = encodings

#     def __len__(self):
#         return len(self.encodings['input_ids'])

#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         return item

# # Create train and test datasets
# train_dataset = TextDataset(train_encodings)
# test_dataset = TextDataset(test_encodings)


# Setting up the model

In [None]:
# from transformers import BertForSequenceClassification

# # Initialize the model for binary or multi-class classification
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Adjust num_labels for your task


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training the Model

In [None]:
# Training loop setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model to its device before building the optimizer, so the optimizer
# is constructed over the parameters it will actually update.
model.to(device)

# Optimizer and Loss Function
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)  # Ignore padding

num_epochs = 10

# Fail fast on unusable data instead of crashing part-way through an epoch
# (inputs and targets are consumed pairwise) or dividing by zero below.
num_samples = len(train_input_ids)
if num_samples == 0:
    raise ValueError("train_input_ids is empty; nothing to train on")
if len(train_output_ids) != num_samples:
    raise ValueError(
        f"train_input_ids has {num_samples} rows but train_output_ids has "
        f"{len(train_output_ids)}; inputs and targets must be paired one-to-one"
    )

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    # One optimisation step per sample (batch size 1).
    for src_ids, tgt_ids in zip(train_input_ids, train_output_ids):
        src = src_ids.unsqueeze(0).to(device)
        tgt = tgt_ids.unsqueeze(0).to(device)

        optimizer.zero_grad()

        output = model(src, tgt[:, :-1])  # Decoder uses everything except the last token as input
        # Shifted targets: position t is trained to predict token t + 1.
        # reshape (not view) so a non-contiguous model output cannot raise.
        loss = criterion(output.reshape(-1, output.size(-1)), tgt[:, 1:].reshape(-1))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / num_samples
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

# Save the model after training
torch.save(model.state_dict(), 'transformer_model.pth')


Loss is None!


# Test Eval

In [None]:
model.eval()

# Tokenise the held-out papers here: no `test_loader` is ever built in this
# notebook, and the model's forward takes (src, tgt) tensors rather than the
# keyword batch that `model(**batch)` would pass.
test_input_ids = tokenize_texts(test_texts)['input_ids']
pad_token_id = tokenizer.pad_token_id

total_loss = 0.0      # summed cross-entropy over non-padding target tokens
total_correct = 0     # correctly predicted non-padding target tokens
total_tokens = 0      # number of non-padding target tokens

with torch.no_grad():
    for token_ids in test_input_ids:
        # Same setup as training: the sequence feeds the encoder, the decoder
        # gets it without its last token and must predict the shifted sequence.
        src = token_ids.unsqueeze(0).to(device)
        logits = model(src, src[:, :-1])
        targets = src[:, 1:]

        not_padding = targets != pad_token_id
        token_count = int(not_padding.sum())
        if token_count == 0:
            continue  # nothing to score in an all-padding row

        total_loss += F.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            targets.reshape(-1),
            ignore_index=pad_token_id,
            reduction='sum',
        ).item()
        total_correct += int(((logits.argmax(dim=-1) == targets) & not_padding).sum())
        total_tokens += token_count

if total_tokens:
    print(f"Test loss per token: {total_loss / total_tokens:.4f}")
    print(f"Test next-token accuracy: {total_correct / total_tokens:.4f}")
else:
    print("No non-padding test tokens to evaluate.")
