In [None]:
# Mount Google Drive inside the Colab runtime; the data files loaded later in
# this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
from tqdm import tqdm
import sys
import numpy as np
import os
sys.path.append(os.path.abspath('../data'))
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence






In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import re

def tokenize(expression):
    """Split a ListOps expression string into a list of token strings.

    Parentheses are discarded. A bare ``[`` (one that does not start ``[MIN``,
    ``[MAX``, ``[MED`` or ``[SM``) and every ``]`` are isolated as their own
    tokens; operator brackets stay attached to the operator name. The result
    is then split on whitespace.
    """
    # Parentheses only act as separators, so turn them into spaces.
    without_parens = expression.translate(str.maketrans({'(': ' ', ')': ' '}))

    # Pad bare opening brackets with spaces; the lookahead protects operators.
    spaced = re.sub(r'\[(?!(MIN|MAX|MED|SM))', ' [ ', without_parens)

    # Pad every closing bracket with spaces.
    spaced = ' ] '.join(spaced.split(']'))

    # split() without arguments never produces empty strings.
    return spaced.split()

class ListOpsDataset(Dataset):
    """Map-style dataset pairing ListOps expressions with integer targets.

    Items are returned unpadded and untruncated: a dict with the expression's
    token ids under 'input_ids' and the target under 'target', both as
    ``torch.long`` tensors.
    """

    def __init__(self, X, y):
        """
        Args:
            X: Array of source expressions
            y: Array of target values
        """
        self.X, self.y = X, y

        # Ids 0-7 are the padding token, the operators and the bracket /
        # parenthesis symbols; ids 8-17 are the digits '0'-'9'.
        symbols = ['PAD', '[MIN', '[MAX', '[MED', '[SM', ']', '(', ')']
        symbols += [str(digit) for digit in range(10)]
        self.vocab = {symbol: index for index, symbol in enumerate(symbols)}

    def __len__(self):
        return len(self.X)

    def tokenize(self, expr):
        """Return the vocabulary ids of ``expr``; tokens not in the vocabulary get id 0."""
        ids = []
        # The module-level tokenize() produces the token strings.
        for tok in tokenize(expr):
            ids.append(self.vocab.get(tok, 0))
        return ids

    def __getitem__(self, idx):
        # Variable-length ids; padding is left to the DataLoader's collate_fn.
        ids = self.tokenize(self.X[idx])
        label = self.y[idx]

        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'target': torch.tensor(label, dtype=torch.long)
        }

In [None]:

# Define the data directory and file paths
# The directory sits on the mounted Google Drive, so the drive.mount cell must
# have run first; the three TSV splits are read by load_listops_data below.
data_dir = '/content/drive/MyDrive/LongListOps/data/output_dir'
train_file = os.path.join(data_dir, 'basic_train.tsv')
val_file = os.path.join(data_dir, 'basic_val.tsv')
test_file = os.path.join(data_dir, 'basic_test.tsv')

def load_listops_data(file_path, max_rows=None):
    """
    Load ListOps data from TSV file.

    The first line is treated as a header and skipped. Blank lines and lines
    that do not split into exactly two tab-separated fields are ignored and do
    not count towards ``max_rows``.

    Args:
        file_path: Path to the TSV file
        max_rows: Maximum number of valid rows to load (for testing). ``None``
            (or any falsy value) loads every valid row.

    Returns:
        sources: Object array of source expression strings
        targets: int32 array of target values

    Raises:
        ValueError: If the second field of a two-field line is not an integer.
    """
    sources = []
    targets = []

    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip header (Source, Target); the default keeps an empty file from
        # raising StopIteration.
        next(f, None)
        for line in f:
            # Count rows actually kept, not raw lines read, so skipped lines
            # cannot shrink the result below max_rows.
            if max_rows and len(sources) >= max_rows:
                break
            stripped = line.strip()
            if not stripped:  # Skip empty lines
                continue
            parts = stripped.split('\t')
            if len(parts) != 2:
                continue  # Skip lines that don't have exactly two columns
            source, target = parts
            sources.append(source)
            targets.append(int(target))

    # Convert to numpy arrays
    source_array = np.array(sources, dtype=object)  # Keep expressions as strings
    target_array = np.array(targets, dtype=np.int32)  # Targets are integers

    return source_array, target_array

try:
    # Load training data
    print("Loading training data...")
    X_train, y_train = load_listops_data(train_file)

    # Load validation data
    print("Loading validation data...")
    X_val, y_val = load_listops_data(val_file)

    # Load test data
    print("Loading test data...")
    X_test, y_test = load_listops_data(test_file)

    # Print dataset statistics
    print("\nDataset sizes:")
    print(f"Training: {len(X_train)} examples")
    print(f"Validation: {len(X_val)} examples")
    print(f"Test: {len(X_test)} examples")

except Exception as e:
    # Report the failure, then re-raise. Swallowing it would leave X_train,
    # X_val, ... undefined and make later cells fail with an unrelated
    # NameError instead of the real cause.
    print(f"Error occurred: {type(e).__name__}: {str(e)}")
    raise

Loading training data...
Loading validation data...
Loading test data...

Dataset sizes:
Training: 96000 examples
Validation: 2000 examples
Test: 2000 examples


In [None]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

def collate_fn(batch):
    """Collate dataset items into one padded batch, longest sequence first.

    Args:
        batch: List of dicts with an 'input_ids' 1-D tensor and a 'target' tensor.

    Returns:
        Dict with 'input_ids' (zero-padded, batch first), 'target' (stacked
        targets) and 'lengths' (unpadded lengths in descending order). All
        three follow the same length-sorted order.
    """
    sequences, targets = [], []
    for item in batch:
        sequences.append(item['input_ids'])
        targets.append(item['target'])

    # Unpadded length of every sequence, on the same device as the data.
    seq_lens = torch.tensor(
        list(map(len, sequences)), dtype=torch.long, device=sequences[0].device
    )

    # Descending length order, as pack_padded_sequence expects by default.
    lengths, order = torch.sort(seq_lens, descending=True)
    order = order.tolist()
    sequences = [sequences[pos] for pos in order]
    targets = [targets[pos] for pos in order]

    return {
        'input_ids': pad_sequence(sequences, batch_first=True),
        'target': torch.stack(targets),
        'lengths': lengths
    }

# Create datasets
# One ListOpsDataset per split, wrapping the arrays loaded from the TSV files.
train_dataset = ListOpsDataset(X_train, y_train)
val_dataset = ListOpsDataset(X_val, y_val)
test_dataset = ListOpsDataset(X_test, y_test)

# Create dataloaders with collate_fn
# Only the training loader shuffles; collate_fn pads each batch to its own
# longest sequence.
batch_size = 32
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn
)

# Verify the data
print("Dataset sizes:")
print(f"Train: {len(train_dataset)}")
print(f"Val: {len(val_dataset)}")
print(f"Test: {len(test_dataset)}")

# Check first batch
# Sanity check of one collated batch; 'lengths' is only present with the
# length-sorting collate_fn defined in this cell.
batch = next(iter(train_loader))
print("\nFirst batch shape:")
print(f"Input IDs: {batch['input_ids'].shape}")
print(f"Targets: {batch['target'].shape}")
print(f"Sequence lengths: {batch['lengths']}")

Dataset sizes:
Train: 96000
Val: 2000
Test: 2000

First batch shape:
Input IDs: torch.Size([32, 1951])
Targets: torch.Size([32])
Sequence lengths: tensor([1951, 1833, 1744, 1743, 1614, 1472, 1448, 1238, 1229, 1143, 1058,  992,
         962,  937,  891,  878,  863,  859,  829,  787,  768,  741,  722,  709,
         703,  681,  611,  606,  602,  568,  565,  517])


In [None]:
## Model choice

In [None]:

def collate_fn(batch):
    """Pad a list of dataset items into a batch sorted by descending length.

    Returns a dict with the zero-padded 'input_ids' (batch first), the stacked
    'target' tensor and the unpadded 'lengths', all in the same sorted order.
    """
    sequences = [entry['input_ids'] for entry in batch]
    targets = [entry['target'] for entry in batch]

    # Unpadded lengths, created on the device the sequences live on.
    raw_lengths = torch.tensor(
        [len(seq) for seq in sequences],
        dtype=torch.long,
        device=sequences[0].device,
    )

    # Longest first, the order pack_padded_sequence expects by default.
    lengths, sort_idx = torch.sort(raw_lengths, descending=True)
    ordered_sequences, ordered_targets = [], []
    for pos in sort_idx.tolist():
        ordered_sequences.append(sequences[pos])
        ordered_targets.append(targets[pos])

    padded_sequences = pad_sequence(ordered_sequences, batch_first=True)

    return {
        'input_ids': padded_sequences,
        'target': torch.stack(ordered_targets),
        'lengths': lengths
    }

In [None]:
class TransformerClassifier(nn.Module):
    """Transformer-encoder classifier over token-id sequences.

    Token embeddings and learned positional embeddings are summed, passed
    through a stack of ``nn.TransformerEncoderLayer`` blocks, mean-pooled over
    the non-padding positions and projected to ``num_classes`` logits.

    Args:
        vocab_size: Number of rows in the token embedding table; id 0 is the
            padding id.
        embed_dim: Embedding / model width.
        num_heads: Attention heads per encoder layer.
        hidden_dim: Width of each encoder layer's feed-forward sublayer.
        num_layers: Number of encoder layers.
        num_classes: Number of output logits.
        max_seq_length: Number of rows in the positional embedding table;
            longer inputs are rejected by ``forward``.
        dropout: Dropout probability used in the encoder layers and before
            the classification head.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes, max_seq_length=5000, dropout=0.1):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.pos_embedding = nn.Embedding(max_seq_length, embed_dim)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
            activation='relu'
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(embed_dim, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape (batch_size, num_classes).

        Args:
            input_ids: (batch_size, seq_len) token ids.
            attention_mask: (batch_size, seq_len); 0 marks padding, any other
                value marks a real token.

        Raises:
            ValueError: If seq_len exceeds the positional embedding table.
        """
        batch_size, seq_len = input_ids.size()
        device = input_ids.device

        # Fail with a readable message instead of an out-of-range embedding
        # lookup inside pos_embedding.
        max_positions = self.pos_embedding.num_embeddings
        if seq_len > max_positions:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_seq_length {max_positions}"
            )

        # Generate position indices and get positional embeddings
        positions = torch.arange(0, seq_len, device=device).unsqueeze(0).expand(batch_size, seq_len)
        x = self.embedding(input_ids) + self.pos_embedding(positions)

        # The encoder layers are built with the default batch_first=False, so
        # they expect (seq_len, batch_size, embed_dim).
        x = x.transpose(0, 1)

        # Create source key padding mask (batch_size, seq_len)
        src_key_padding_mask = attention_mask == 0  # True for padding tokens

        # Pass through Transformer Encoder
        x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

        # Convert back to (batch_size, seq_len, embed_dim)
        x = x.transpose(0, 1)

        # Pooling: mean over real tokens only. A plain mean over dim=1 would
        # also average the padding positions, making the logits depend on how
        # much padding the batch happens to contain.
        keep = (~src_key_padding_mask).unsqueeze(-1)  # (batch_size, seq_len, 1)
        x = x.masked_fill(~keep, 0.0)
        token_counts = keep.sum(dim=1).clamp(min=1).to(x.dtype)  # avoid 0-division
        x = x.sum(dim=1) / token_counts  # (batch_size, embed_dim)
        x = self.dropout(x)

        # Classification head
        logits = self.fc_out(x)  # (batch_size, num_classes)
        return logits


In [None]:
def collate_fn(batch):
    """Collate dataset items into a zero-padded batch with an attention mask.

    Args:
        batch: List of dicts with an 'input_ids' 1-D tensor and a 'target'.

    Returns:
        Dict with 'input_ids' (batch, max_len) padded with 0, 'attention_mask'
        (same shape, long; 1 for real tokens, 0 for padding) and 'target'
        (batch,) as a long tensor.
    """
    # Separate sequences and targets
    sequences = [item['input_ids'] for item in batch]
    targets = [item['target'] for item in batch]

    # Pad sequences
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)

    # Create attention masks (1 for tokens, 0 for padding) from the true
    # sequence lengths. Comparing the padded ids with 0 would also mask real
    # tokens whose id is 0, which is the id given to out-of-vocabulary tokens.
    lengths = torch.tensor(
        [len(seq) for seq in sequences], dtype=torch.long, device=padded_sequences.device
    )
    positions = torch.arange(padded_sequences.size(1), device=padded_sequences.device)
    attention_mask = (positions.unsqueeze(0) < lengths.unsqueeze(1)).long()

    # Convert targets to tensor; as_tensor also accepts plain ints.
    targets = torch.stack([torch.as_tensor(t) for t in targets]).long()

    return {
        'input_ids': padded_sequences,
        'attention_mask': attention_mask,
        'target': targets
    }


In [None]:
# Rebuild the loaders so they use the collate_fn that is current at this
# point in the notebook (a DataLoader keeps the function object it was given).
batch_size = 32
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn
)


In [None]:
# Get the vocabulary size from your dataset
# (size of the token embedding table: special symbols plus the ten digits)
vocab_size = len(train_dataset.vocab)

# Hyperparameters
embed_dim = 128
num_heads = 4
hidden_dim = 256  # width of each encoder layer's feed-forward sublayer
num_layers = 2
num_classes = 10  # Since targets are 0-9
dropout = 0.1
# Size of the positional embedding table; inputs longer than this cannot be embedded.
max_seq_length = 5000  # Adjust if your sequences are longer

# Initialize the model
model = TransformerClassifier(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_heads=num_heads,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    num_classes=num_classes,
    max_seq_length=max_seq_length,
    dropout=dropout
)




In [None]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
# Reach Adam through the `torch` package: the `optim` alias is only imported
# in a later cell, so `optim.Adam` raises NameError on a fresh kernel.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


TransformerClassifier(
  (embedding): Embedding(18, 128, padding_idx=0)
  (pos_embedding): Embedding(5000, 128)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=256, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc_out): Linear(in_features=128, out_features=10, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [None]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'target' tensors.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        (batch-size-weighted average loss, accuracy in percent).
    """
    model.train()
    loss_sum = n_correct = n_seen = 0

    for batch in dataloader:
        input_ids, attention_mask, targets = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'target')
        )

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        # Scale by the batch size so the final division gives a per-example figure.
        loss_sum += batch_loss.item() * input_ids.size(0)
        n_seen += targets.size(0)
        n_correct += (logits.argmax(dim=1) == targets).sum().item()

    return loss_sum / n_seen, n_correct / n_seen * 100

def evaluate(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` with gradients disabled.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'target' tensors.
        criterion: Loss called as ``criterion(outputs, targets)``.
        device: Device the batch tensors are moved to.

    Returns:
        (batch-size-weighted average loss, accuracy in percent).
    """
    model.eval()
    loss_sum = n_correct = n_seen = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, targets = (
                batch[key].to(device) for key in ('input_ids', 'attention_mask', 'target')
            )

            logits = model(input_ids, attention_mask)
            batch_loss = criterion(logits, targets)

            # Scale by the batch size so the final division gives a per-example figure.
            loss_sum += batch_loss.item() * input_ids.size(0)
            n_seen += targets.size(0)
            n_correct += (logits.argmax(dim=1) == targets).sum().item()

    return loss_sum / n_seen, n_correct / n_seen * 100


In [None]:
num_epochs = 5  # Adjust as needed

# One full pass over the training loader per epoch, followed by a validation
# pass; both report average loss and accuracy in percent.
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)

    print(f'Epoch {epoch + 1}/{num_epochs}')
    print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.2f}%')
    print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.2f}%\n')


In [None]:
# Final held-out score, computed once after training has finished.
test_loss, test_acc = evaluate(model, test_loader, criterion, device)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.2f}%')


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import torch.nn.functional as F
import os
import numpy as np

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Your ListOpsDataset class and other functions (tokenize, etc.) are assumed to be defined already

# Define the TransformerClassifier model
class TransformerClassifier(nn.Module):
    """Transformer-encoder classifier over token-id sequences.

    Token embeddings and learned positional embeddings are summed, passed
    through a stack of ``nn.TransformerEncoderLayer`` blocks, mean-pooled over
    the non-padding positions and projected to ``num_classes`` logits.

    Args:
        vocab_size: Number of rows in the token embedding table; id 0 is the
            padding id.
        embed_dim: Embedding / model width.
        num_heads: Attention heads per encoder layer.
        hidden_dim: Width of each encoder layer's feed-forward sublayer.
        num_layers: Number of encoder layers.
        num_classes: Number of output logits.
        max_seq_length: Number of rows in the positional embedding table;
            longer inputs are rejected by ``forward``.
        dropout: Dropout probability used in the encoder layers and before
            the classification head.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, num_classes, max_seq_length=5000, dropout=0.1):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.pos_embedding = nn.Embedding(max_seq_length, embed_dim)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
            activation='relu'
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(embed_dim, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape (batch_size, num_classes).

        Args:
            input_ids: (batch_size, seq_len) token ids.
            attention_mask: (batch_size, seq_len); 0 marks padding, any other
                value marks a real token.

        Raises:
            ValueError: If seq_len exceeds the positional embedding table.
        """
        batch_size, seq_len = input_ids.size()
        device = input_ids.device

        # Fail with a readable message instead of an out-of-range embedding
        # lookup inside pos_embedding.
        max_positions = self.pos_embedding.num_embeddings
        if seq_len > max_positions:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_seq_length {max_positions}"
            )

        positions = torch.arange(0, seq_len, device=device).unsqueeze(0).expand(batch_size, seq_len)
        x = self.embedding(input_ids) + self.pos_embedding(positions)

        x = x.transpose(0, 1)  # Transformer expects (seq_len, batch_size, embed_dim)

        src_key_padding_mask = attention_mask == 0  # True for padding tokens
        x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

        x = x.transpose(0, 1)  # Back to (batch_size, seq_len, embed_dim)

        # Pooling: mean over real tokens only. A plain mean over dim=1 would
        # also average the padding positions, making the logits depend on how
        # much padding the batch happens to contain.
        keep = (~src_key_padding_mask).unsqueeze(-1)  # (batch_size, seq_len, 1)
        x = x.masked_fill(~keep, 0.0)
        token_counts = keep.sum(dim=1).clamp(min=1).to(x.dtype)  # avoid 0-division
        x = x.sum(dim=1) / token_counts
        x = self.dropout(x)
        logits = self.fc_out(x)
        return logits

# Update collate_fn
def collate_fn(batch):
    """Collate dataset items into a zero-padded batch with an attention mask.

    Args:
        batch: List of dicts with an 'input_ids' 1-D tensor and a 'target'.

    Returns:
        Dict with 'input_ids' (batch, max_len) padded with 0, 'attention_mask'
        (same shape, long; 1 for real tokens, 0 for padding) and 'target'
        (batch,) as a long tensor.
    """
    sequences = [item['input_ids'] for item in batch]
    targets = [item['target'] for item in batch]
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)

    # Build the mask from the true sequence lengths. Comparing the padded ids
    # with 0 would also mask real tokens whose id is 0, which is the id given
    # to out-of-vocabulary tokens.
    lengths = torch.tensor(
        [len(seq) for seq in sequences], dtype=torch.long, device=padded_sequences.device
    )
    positions = torch.arange(padded_sequences.size(1), device=padded_sequences.device)
    attention_mask = (positions.unsqueeze(0) < lengths.unsqueeze(1)).long()

    # as_tensor also accepts plain ints.
    targets = torch.stack([torch.as_tensor(t) for t in targets]).long()
    return {
        'input_ids': padded_sequences,
        'attention_mask': attention_mask,
        'target': targets
    }

# Create data loaders (the datasets themselves come from an earlier cell)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

# Initialize the model
vocab_size = len(train_dataset.vocab)
embed_dim = 128
num_heads = 4
hidden_dim = 256  # width of each encoder layer's feed-forward sublayer
num_layers = 2
num_classes = 10
dropout = 0.1
# Size of the positional embedding table; inputs longer than this cannot be embedded.
max_seq_length = 5000  # Adjust if necessary

model = TransformerClassifier(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_heads=num_heads,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    num_classes=num_classes,
    max_seq_length=max_seq_length,
    dropout=dropout
)

# Set up training components
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training and evaluation loops
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``dataloader``.

    Each batch must provide 'input_ids', 'attention_mask' and 'target'
    tensors. Returns the batch-size-weighted average loss and the accuracy in
    percent.
    """
    model.train()
    running_loss, hits, seen = 0, 0, 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['target'].to(device)

        # One gradient step per batch.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        step_loss = criterion(outputs, targets)
        step_loss.backward()
        optimizer.step()

        # Scale by the batch size so the final division gives a per-example figure.
        running_loss += step_loss.item() * input_ids.size(0)
        predicted = outputs.argmax(dim=1)
        seen += targets.size(0)
        hits += (predicted == targets).sum().item()
    return running_loss / seen, hits / seen * 100

def evaluate(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` with gradients disabled.

    Each batch must provide 'input_ids', 'attention_mask' and 'target'
    tensors. Returns the batch-size-weighted average loss and the accuracy in
    percent.
    """
    model.eval()
    running_loss, hits, seen = 0, 0, 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)

            outputs = model(input_ids, attention_mask)
            step_loss = criterion(outputs, targets)

            # Scale by the batch size so the final division gives a per-example figure.
            running_loss += step_loss.item() * input_ids.size(0)
            predicted = outputs.argmax(dim=1)
            seen += targets.size(0)
            hits += (predicted == targets).sum().item()
    return running_loss / seen, hits / seen * 100

# Training loop
# Each epoch is one pass over train_loader followed by a validation pass;
# both functions return (average loss, accuracy in percent).
num_epochs = 5
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    print(f'Epoch {epoch + 1}/{num_epochs}')
    print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.2f}%')
    print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.2f}%\n')

# Testing the model
# The test split is scored once, after the last epoch.
test_loss, test_acc = evaluate(model, test_loader, criterion, device)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.2f}%')




Epoch 1/5
Train Loss: 1.7485, Train Accuracy: 35.20%
Validation Loss: 1.6919, Validation Accuracy: 35.85%

Epoch 2/5
Train Loss: 1.6582, Train Accuracy: 36.71%
Validation Loss: 1.6630, Validation Accuracy: 35.60%

Epoch 3/5
Train Loss: 1.6304, Train Accuracy: 37.07%
Validation Loss: 1.6373, Validation Accuracy: 35.80%

Epoch 4/5
Train Loss: 1.6132, Train Accuracy: 37.57%
Validation Loss: 1.6245, Validation Accuracy: 37.00%

Epoch 5/5
Train Loss: 1.6052, Train Accuracy: 37.75%
Validation Loss: 1.6432, Validation Accuracy: 38.10%

Test Loss: 1.6071, Test Accuracy: 38.65%


In [None]:
# Persist only the weights (state_dict). The path is relative, so the file
# lands in the current working directory.
# NOTE(review): presumably that is the Colab runtime disk rather than Drive - confirm.
torch.save(model.state_dict(), 'transformer_classifier.pth')


In [None]:
!pip uninstall -y tensorflow && pip install tensorflow-cpu

Found existing installation: tensorflow 2.15.0
Uninstalling tensorflow-2.15.0:
  Successfully uninstalled tensorflow-2.15.0
Collecting tensorflow-cpu
  Downloading tensorflow_cpu-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tensorboard<2.19,>=2.18 (from tensorflow-cpu)
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting keras>=3.5.0 (from tensorflow-cpu)
  Downloading keras-3.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting ml-dtypes<0.5.0,>=0.4.0 (from tensorflow-cpu)
  Downloading ml_dtypes-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting namex (from keras>=3.5.0->tensorflow-cpu)
  Downloading namex-0.0.8-py3-none-any.whl.metadata (246 bytes)
Collecting optree (from keras>=3.5.0->tensorflow-cpu)
  Downloading optree-0.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.

In [None]:
from transformers import LongformerTokenizer, LongformerForSequenceClassification

# Initialize tokenizer and model
# Rebinding `model` here replaces the TransformerClassifier trained above;
# its weights survive only in the saved state_dict file.
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096', num_labels=10)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import re

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [None]:
def tokenize(expression):
    """
    Return ``expression`` as one compact string with every '(' , ')' and
    space character removed.

    Despite the name, no token list is produced: the result is a single
    string in which operators, digits and brackets are concatenated. This
    definition replaces the list-returning ``tokenize`` defined earlier in
    the notebook.
    """
    # Deleting these three characters in one pass gives the same string as
    # dropping the parentheses, padding the brackets with spaces and then
    # deleting every space.
    return expression.translate(str.maketrans('', '', '() '))

In [None]:
# Compact every expression with the string-returning tokenize defined above
# (parentheses and spaces removed); the results are plain Python lists of str.
X_train_P = [tokenize(expr) for expr in X_train]
X_val_P = [tokenize(expr) for expr in X_val]
X_test_P = [tokenize(expr) for expr in X_test]


In [None]:
print(X_train_P[0])

[MIN1[MED8013061]7[MED[SM4[MED[SM29[MED992[MIN8354856260]58]26898]34[SM3[SM[MIN[MIN[MED5879171]88[MAX25]7110][MAX6250[MIN224871]][MIN1[SM36405]5[MED638]0]564]37806]883][MAX5135][MED3[SM326[MAX2[MIN8[MAX82587159]009[MAX81][MAX594113232]2[MIN1902938]][MAX[SM5179382224][MAX7373789]8[MIN45]40274][SM25[MIN50033847]][MAX885380]7[MIN386157059][MAX3[MIN476]3[SM19824][MAX1573]2[MED98]]34]]46[MED298[MED[MAX89[MIN210521001]7]7[MED183][SM699][MIN[MED73310]67728]38]877]341]]523[MAX[MED5955092[MIN02[MAX[MIN5998952043]0[MED230[MED0487896671]786[MAX2771992]1]5[MAX781[SM738]382]62996]50[MED874[SM4[MAX799]52]5][MED50[MIN63[MAX33]60[MIN6207860880]7]21[MIN[MAX95]5][MAX77]490]]1]5[SM1[SM171[MED146[SM[MED983569]602]593]][SM323[MIN3[MAX5752292][MED05[SM2175154]879]47]]37]7501[MED[MIN927]6[SM726]0[MIN01[MIN90349[MAX82[MAX53295566]38345]][MIN[MIN85[MAX359]]8[MAX[MIN2649]54[MAX4487]1455][MED57[SM907735]63[MAX8903]1370]887579]403]]]36][MED02]3743[MED962]]81563]


In [None]:
# print the maximum length of the expression
# Length is measured in characters of the compacted strings, not in
# tokenizer tokens.
max_len = max([len(expr) for expr in X_train_P])
print(f"Maximum expression length: {max_len}")

Maximum expression length: 2820


In [None]:

import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import LongformerTokenizer, LongformerForSequenceClassification, get_linear_schedule_with_warmup

In [None]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

# Initialize the tokenizer
# Loaded from the same checkpoint name as the Longformer model used below.
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

Using device: cuda


In [None]:





# Preprocess the data
def preprocess_data(sources, targets, tokenizer, max_length=1024):
    """Encode ``sources`` with ``tokenizer`` and turn ``targets`` into a long tensor.

    Args:
        sources: Texts passed to the tokenizer in a single call.
        targets: Integer labels, one per source.
        tokenizer: Callable accepting the texts plus ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors`` keywords.
        max_length: Length every encoded sequence is padded or truncated to.

    Returns:
        (tokenizer output, ``torch.long`` label tensor).
    """
    # Every sequence is padded up to max_length and cut off beyond it.
    encode_options = dict(
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    encoded = tokenizer(sources, **encode_options)
    return encoded, torch.tensor(targets, dtype=torch.long)


# Only the first max_instances examples of each split are encoded; slicing
# past the end of a shorter split simply keeps the whole split.
max_instances = 3000
# preprocess_data pads every input to max_length and truncates anything longer.
max_length = 1024
train_inputs, train_labels = preprocess_data(X_train_P[0:max_instances], y_train[0:max_instances], tokenizer, max_length)
print("Done with train")
val_inputs, val_labels = preprocess_data(X_val_P[0:max_instances], y_val[0:max_instances], tokenizer, max_length)
print("Done with val")
test_inputs, test_labels = preprocess_data(X_test_P[0:max_instances], y_test[0:max_instances], tokenizer, max_length)
print("Done with test")





Done with train
Done with val
Done with test


In [None]:
# Define the Dataset
class ListOpsHuggingFaceDataset(Dataset):
    """Dataset view over pre-tokenized inputs and their labels.

    Args:
        inputs: Mapping with 'input_ids' and 'attention_mask' entries that
            can be indexed per example.
        labels: Per-example labels; its length defines the dataset size.
    """

    def __init__(self, inputs, labels):
        self.input_ids, self.attention_mask = inputs['input_ids'], inputs['attention_mask']
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Same index into each stored field, keyed by the field's name.
        fields = ('input_ids', 'attention_mask', 'labels')
        return {name: getattr(self, name)[idx] for name in fields}

# Create Dataset objects
# These names are rebound: from here on train_dataset / val_dataset /
# test_dataset hold the pre-tokenized Longformer inputs, not ListOpsDataset
# objects.
train_dataset = ListOpsHuggingFaceDataset(train_inputs, train_labels)
val_dataset = ListOpsHuggingFaceDataset(val_inputs, val_labels)
test_dataset = ListOpsHuggingFaceDataset(test_inputs, test_labels)

# Create DataLoaders
# No collate_fn is passed: every example was already padded to one fixed
# length during preprocessing.
batch_size = 4  # Adjust based on GPU memory
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Initialize the model
# A fresh pretrained Longformer with a new num_classes-way classification head.
num_classes = 10  # Targets are 0-9
model = LongformerForSequenceClassification.from_pretrained(
    'allenai/longformer-base-4096',
    num_labels=num_classes
)
model.to(device)

# Set up optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
epochs = 3
# The scheduler is stepped once per batch, so the schedule spans every batch
# of every epoch.
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Define training and evaluation functions
def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Fine-tune ``model`` for one pass over ``dataloader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask``,
            ``global_attention_mask`` and ``labels`` keywords; its output
            must expose ``.loss`` and ``.logits``.
        dataloader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        (batch-size-weighted average loss, accuracy in percent).
    """
    model.train()
    loss_sum = n_correct = n_seen = 0

    for batch in dataloader:
        optimizer.zero_grad()

        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Global attention on the first position of every sequence only.
        global_attention_mask = torch.zeros_like(attention_mask)
        global_attention_mask[:, 0] = 1

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
            labels=labels
        )
        batch_loss, logits = outputs.loss, outputs.logits

        batch_loss.backward()
        # Clip the total gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        # Scale by the batch size so the final division gives a per-example figure.
        loss_sum += batch_loss.item() * input_ids.size(0)
        n_seen += labels.size(0)
        n_correct += (logits.argmax(dim=1) == labels).sum().item()

    return loss_sum / n_seen, n_correct / n_seen * 100

def evaluate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` with gradients disabled.

    Args:
        model: Module called with ``input_ids``, ``attention_mask``,
            ``global_attention_mask`` and ``labels`` keywords; its output
            must expose ``.loss`` and ``.logits``.
        dataloader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.
        device: Device the batch tensors are moved to.

    Returns:
        (batch-size-weighted average loss, accuracy in percent).
    """
    model.eval()
    loss_sum = n_correct = n_seen = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = (
                batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
            )

            # Global attention on the first position of every sequence only.
            global_attention_mask = torch.zeros_like(attention_mask)
            global_attention_mask[:, 0] = 1

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                global_attention_mask=global_attention_mask,
                labels=labels
            )
            batch_loss, logits = outputs.loss, outputs.logits

            # Scale by the batch size so the final division gives a per-example figure.
            loss_sum += batch_loss.item() * input_ids.size(0)
            n_seen += labels.size(0)
            n_correct += (logits.argmax(dim=1) == labels).sum().item()

    return loss_sum / n_seen, n_correct / n_seen * 100




Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training loop
# `epochs` must match the value used to size the scheduler's total step count.
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, scheduler, device)
    val_loss, val_acc = evaluate(model, val_loader, device)

    print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.2f}%')
    print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.2f}%\n')

# Evaluate on the test set
# Scored once, after the last epoch.
test_loss, test_acc = evaluate(model, test_loader, device)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.2f}%')

Epoch 1/3
Train Loss: 2.2588, Train Accuracy: 17.60%
Validation Loss: 2.2663, Validation Accuracy: 15.95%

Epoch 2/3
Train Loss: 2.2535, Train Accuracy: 17.53%
Validation Loss: 2.2560, Validation Accuracy: 15.95%

Epoch 3/3
Train Loss: 2.2471, Train Accuracy: 16.80%
Validation Loss: 2.2568, Validation Accuracy: 15.95%

Test Loss: 2.2397, Test Accuracy: 17.25%


In [None]:
!nvidia-smi


Thu Dec  5 12:58:17 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P0              31W /  70W |  14919MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
from IPython.display import clear_output
!kill -9 -1
clear_output()


In [None]:
'e'

'e'