In [1]:
#Install Torch
!pip install torch



In [2]:
#Install PDF processing library
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from transformers import BertTokenizer
import pdfplumber

# Device configuration (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 1: Extract Text from the CV PDF
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        pages = [page.extract_text() for page in pdf.pages]
    return ' '.join(pages)

# Load the CV PDF and extract text
cv_path = "about.pdf"  # Change this path to your file location
cv_text = extract_text_from_pdf(cv_path)

# Step 2: Preprocessing - Masked Language Modeling Input
# Load a pre-trained tokenizer (BERT for tokenizing the text)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Function to mask tokens for MLM
def create_masked_input(text, tokenizer, mask_prob=0.15):
    tokens = tokenizer.encode(text, return_tensors="pt").to(device)  # Tokenize text
    labels = tokens.clone()

    probability_matrix = torch.full(labels.shape, mask_prob, device=device)
    masked_indices = torch.bernoulli(probability_matrix).bool()

    labels[~masked_indices] = -100  # Only compute loss on masked tokens

    # Replace tokens with [MASK]
    mask_token_id = tokenizer.convert_tokens_to_ids("[MASK]")
    tokens[masked_indices] = mask_token_id

    return tokens, labels

# Create masked inputs for MLM from CV text
tokens, labels = create_masked_input(cv_text, tokenizer)

# Legendre Polynomial Function
def legendre_polynomials(x, order=5):
    """
    Compute the first few Legendre polynomials (up to a given order) for each input in x.
    Args:
    x (torch.Tensor): Input tensor of shape (batch_size, seq_len, d_model).
    order (int): The maximum order of the Legendre polynomial to compute.

    Returns:
    torch.Tensor: Transformed input with additional Legendre polynomial features.
    """
    batch_size, seq_len, d_model = x.shape

    # Initialize a list to store polynomials P_0(x), P_1(x), ..., P_order(x)
    polynomials = []

    # P_0(x) = 1 (constant)
    P0 = torch.ones_like(x)
    polynomials.append(P0)

    # P_1(x) = x
    P1 = x
    polynomials.append(P1)

    # Recursively compute P_n(x) for n = 2, 3, ..., order
    for n in range(2, order + 1):
        Pn = ((2 * n - 1) * x * polynomials[n - 1] - (n - 1) * polynomials[n - 2]) / n
        polynomials.append(Pn)

    # Stack all polynomials together along the last dimension (d_model)
    # This will create additional features for each input based on Legendre polynomials
    return torch.cat(polynomials, dim=-1)

# S6 Module
class S6(nn.Module):
    def __init__(self, d_model, state_size):
        super(S6, self).__init__()
        self.fc1 = nn.Linear(d_model, d_model)
        self.fc2 = nn.Linear(d_model, state_size)
        self.fc3 = nn.Linear(d_model, state_size)
        self.d_model = d_model
        self.state_size = state_size

        self.A = nn.Parameter(torch.randn(d_model, state_size))
        nn.init.xavier_uniform_(self.A)

    def forward(self, x):
        batch_size, seq_len, _ = x.shape  # Dynamically infer the sequence length from input

        # Initialize dynamic buffers
        B = torch.zeros(batch_size, seq_len, self.state_size, device=x.device)
        C = torch.zeros(batch_size, seq_len, self.state_size, device=x.device)
        delta = torch.zeros(batch_size, seq_len, self.d_model, device=x.device)
        dA = torch.zeros(batch_size, seq_len, self.d_model, self.state_size, device=x.device)
        h = torch.zeros(batch_size, seq_len, self.d_model, self.state_size, device=x.device)

        # Apply linear transformations
        B = self.fc2(x)
        C = self.fc3(x)
        delta = F.softplus(self.fc1(x))

        # Discretization operation
        dA = torch.exp(torch.einsum("bld,dn->bldn", delta, self.A))

        # Compute h and y (output)
        h = torch.einsum('bldn,bldn->bldn', dA, h) + torch.unsqueeze(x, dim=-1)
        y = torch.einsum('bln,bldn->bld', C, h)

        return y

# Mamba Block with Legendre Polynomial Transformation
class MambaBlock(nn.Module):
    def __init__(self, d_model, state_size, legendre_order=5):
        super(MambaBlock, self).__init__()
        self.legendre_order = legendre_order  # Order of Legendre polynomials
        self.inp_proj = nn.Linear((legendre_order + 1) * d_model, 2 * d_model)  # Adjust input size after Legendre expansion
        self.out_proj = nn.Linear(2 * d_model, d_model)
        self.s6 = S6(2 * d_model, state_size)
        self.norm = nn.LayerNorm(2 * d_model)  # LayerNorm matches the d_model after projection

    def forward(self, x):
        # Apply Legendre polynomials to the input
        x_legendre = legendre_polynomials(x, order=self.legendre_order)  # Non-linear transform

        # Project input to 2*d_model after Legendre expansion
        x_proj = self.inp_proj(x_legendre)
        x_proj = self.norm(x_proj)  # Apply normalization
        x_ssm = self.s6(x_proj)  # Pass through S6 module
        x_out = self.out_proj(x_ssm)  # Project back to d_model dimension
        return x_out

# Full Mamba Model
class Mamba(nn.Module):
    def __init__(self, d_model, state_size, vocab_size, legendre_order=5):
        super(Mamba, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)  # Embedding layer
        self.mamba_block1 = MambaBlock(d_model, state_size, legendre_order)
        self.mamba_block2 = MambaBlock(d_model, state_size, legendre_order)
        self.mamba_block3 = MambaBlock(d_model, state_size, legendre_order)
        self.fc_out = nn.Linear(d_model, vocab_size)  # Final output layer for MLM

    def forward(self, x):
        x = self.embedding(x)  # Embed the input tokens to shape (batch_size, seq_len, d_model)
        x = self.mamba_block1(x)
        x = self.mamba_block2(x)
        x = self.mamba_block3(x)
        return self.fc_out(x)  # Return logits for each token

# Step 4: Training the Mamba Model
# Hyperparameters
d_model = 128  # Dimensionality of the model
state_size = 256  # Size of the hidden state
batch_size = 32
num_epochs = 5
vocab_size = tokenizer.vocab_size  # Vocabulary size from the tokenizer

# Initialize Mamba model and optimizer
model = Mamba(d_model, state_size, vocab_size).to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

# Training loop
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    tokens, labels = tokens.to(device), labels.to(device)

    # Forward pass through Mamba model
    outputs = model(tokens)
    loss = criterion(outputs.view(-1, vocab_size), labels.view(-1))

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

# Step 5: Inference with Mamba Model
def masked_language_modeling_inference(text, tokenizer, model, mask_token_id, top_k=5):
    tokens = tokenizer.encode(text, return_tensors="pt").to(device)
    mask_token_index = torch.where(tokens == mask_token_id)[1]

    # Forward pass through the model
    with torch.no_grad():
        outputs = model(tokens)

    logits = outputs[0, mask_token_index]
    top_k_tokens = torch.topk(logits, top_k, dim=-1).indices.tolist()[0]
    predicted_tokens = [tokenizer.decode([token]) for token in top_k_tokens]
    return predicted_tokens

# Perform inference on the masked CV text
masked_text = "Kaushik Roy is pursuing his Ph.D. at the [MASK] of South Carolina."
predictions = masked_language_modeling_inference(masked_text, tokenizer, model, tokenizer.convert_tokens_to_ids("[MASK]"))
print(f"Predicted words for [MASK]: {predictions}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



Epoch 1, Loss: 14.453326225280762
Epoch 2, Loss: 14.151762008666992
Epoch 3, Loss: 11.153654098510742
Epoch 4, Loss: 10.359596252441406
Epoch 5, Loss: 10.333610534667969
Predicted words for [MASK]: ['agreements', '[unused165]', 'cafeteria', 'unrelated', 'whereupon']
