In [None]:
# Load Data

import kagglehub
import os
import pandas as pd

# Download latest version
# (Disabled alternative source: the public Kaggle dataset, filtered to rows whose
# 'Python Code' mentions pandas.)
# path = kagglehub.dataset_download("linkanjarad/coding-problems-and-solution-python-code")

# file_path = os.path.join(path, 'ProblemSolutionPythonV3.csv')
# df = pd.read_csv(file_path)
# df.drop(df.columns[0], axis=1, inplace=True)
# df = df.loc[df['Python Code'].str.contains('pandas', case=False, na=False)]
# df

# Location of the training CSV. Set the TRAINING_DATA_PATH environment variable
# to point at a different copy; when it is unset the original local path is used.
path = os.environ.get(
    'TRAINING_DATA_PATH',
    '/mnt/c/Users/Pavilion/Documents/BYU-Idaho/Classwork/Winter 2025/DS499/training_data.csv',
)
df = pd.read_csv(path)

# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# Loading in pre-trained model: Salesforce CodeGen 350M mono (https://huggingface.co/Salesforce/codegen-350M-mono)

from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import torch
import torch.nn.utils.prune as prune
import torch.nn as nn

# Release cached CUDA memory that may be left over from an earlier run of this cell.
torch.cuda.empty_cache()

# Process-wide settings, applied before the model is loaded.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",        # Tell PyTorch which GPU to use
    "TOKENIZERS_PARALLELISM": "false",  # Disable tokenizer parallelism to prevent crashes
    "CUDA_LAUNCH_BLOCKING": "1",        # Debugging aid
})

# Model and tokenizer come from the same checkpoint name.
pretrained = "Salesforce/codegen-350M-mono"
model = AutoModelForCausalLM.from_pretrained(pretrained)
tokenizer = AutoTokenizer.from_pretrained(pretrained)

# Model pruning

def apply_magnitude_pruning(model, amount=0.2):
    """Prune the ``weight`` of every ``nn.Linear`` inside ``model``, in place.

    Each linear layer is passed to ``prune.l1_unstructured`` and then straight
    to ``prune.remove``, so no pruning mask or ``weight_orig`` parameter is left
    on the layer afterwards. Biases and non-linear layers are not touched.

    Args:
        model: Module whose sub-modules are scanned for ``nn.Linear`` layers.
        amount: Forwarded unchanged to ``prune.l1_unstructured``.

    Returns:
        None. ``model`` is modified in place.
    """
    linear_layers = [layer for layer in model.modules() if isinstance(layer, nn.Linear)]
    for layer in linear_layers:
        prune.l1_unstructured(layer, name='weight', amount=amount)
        # Drop the pruning re-parametrization; the layer keeps the pruned weight.
        prune.remove(layer, 'weight')

# Prune a quarter of each linear layer's weights.
apply_magnitude_pruning(model, amount=0.25)

# Pick the GPU when CUDA is available, otherwise stay on the CPU, and move the model there.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [3]:
import torch
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from datasets import Dataset
import torch.nn.functional as F

# Define column names, user queries, and columns in query
# Per-row 'columns' entries (presumably the columns each query refers to — verify
# how the CSV stores them) and the user's query text.
columns_in_query = df['columns']
user_queries = df['x']

# Distinct values of the 'columns' series after exploding any list-like entries.
column_names = columns_in_query.explode().unique()

import torch
import torch.nn.functional as F

def equalize_and_concatenate(tensor1, tensor2):
    """Append rows of a 2-D tensor to every sequence of a 3-D tensor.

    ``tensor2`` is cut to the first ``seq_len`` rows, or zero-padded up to
    ``seq_len`` rows when it has fewer, so that it always lines up with the
    sequence dimension of ``tensor1``. The result is then repeated across the
    batch and concatenated onto the feature dimension.

    Args:
        tensor1: Tensor of shape ``(batch_size, seq_len, feat_dim)``.
        tensor2: Tensor of shape ``(num_rows, other_dim)``.

    Returns:
        Tensor of shape ``(batch_size, seq_len, feat_dim + other_dim)``.
    """
    batch_size, seq_len, _ = tensor1.shape

    rows = tensor2[:seq_len, :]
    missing = seq_len - rows.shape[0]
    if missing > 0:
        # Fewer rows than sequence positions: fill the tail with zeros so the
        # concatenation below does not fail on a size mismatch.
        filler = rows.new_zeros((missing, rows.shape[1]))
        rows = torch.cat((rows, filler), dim=0)

    # Same rows for every batch element (expand creates a view, not a copy).
    expanded_tensor2 = rows.unsqueeze(0).expand(batch_size, -1, -1)

    # Concatenate along the last dimension (feature dimension)
    return torch.cat((tensor1, expanded_tensor2), dim=-1)


# Define the ColumnNameEmbedder class
class ColumnNameEmbedder(nn.Module):
    """Learned embedding table keyed by column name.

    Attributes:
        column_name_to_idx: Maps each entry of ``column_names`` to its position.
        embedding: ``nn.Embedding`` with one row per entry of ``column_names``.
    """

    def __init__(self, column_names, embedding_dim=16):
        """Build the name-to-index lookup and the embedding table.

        Args:
            column_names: Sized iterable of hashable names.
            embedding_dim: Width of each embedding vector.
        """
        super().__init__()
        lookup = {}
        for position, column_name in enumerate(column_names):
            lookup[column_name] = position
        self.column_name_to_idx = lookup
        self.embedding = nn.Embedding(len(column_names), embedding_dim)

    def forward(self, columns):
        """Return the embeddings for ``columns``.

        Args:
            columns: Iterable of names that were given at construction time.

        Returns:
            Tensor of shape ``(len(columns), embedding_dim)`` on the same device
            as the embedding weights.

        Raises:
            KeyError: If a name is not in ``column_name_to_idx``.
        """
        target_device = self.embedding.weight.device
        row_ids = torch.tensor(
            [self.column_name_to_idx[column] for column in columns],
            dtype=torch.long,
            device=target_device,
        )
        return self.embedding(row_ids)

# Define the EntityQueryModel class
class EntityQueryModel(nn.Module):
    """Combine encoded query text with column embeddings and emit per-step vocabulary logits.

    The query is tokenized and encoded with the "Salesforce/codegen-350M-mono"
    checkpoint, projected to ``embedding_dim``, concatenated with projected
    column embeddings, run through an LSTM, and mapped to
    ``query_tokenizer.vocab_size`` logits.
    """

    def __init__(self, column_embedder, hidden_dim=128, embedding_dim=16):
        """Create the encoder, projections, LSTM and output layer.

        Args:
            column_embedder: Module called as ``column_embedder(columns)``; its
                output is fed to a ``Linear(embedding_dim, embedding_dim)``.
            hidden_dim: Hidden size of the LSTM.
            embedding_dim: Width of the projected query and column features.
        """
        super().__init__()
        self.column_embedder = column_embedder
        self.embedding_dim = embedding_dim

        # Tokenizer and encoder for the query text; padding reuses the EOS token.
        checkpoint = "Salesforce/codegen-350M-mono"
        self.query_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.query_encoder = AutoModel.from_pretrained(checkpoint)
        self.query_tokenizer.pad_token = self.query_tokenizer.eos_token

        # The LSTM input is query features and column features side by side.
        self.lstm = nn.LSTM(input_size=2 * embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, self.query_tokenizer.vocab_size)

        # Projections that bring both inputs to ``embedding_dim``.
        encoder_width = self.query_encoder.config.hidden_size
        self.query_projection = nn.Linear(encoder_width, embedding_dim)
        self.column_projection = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, query, columns):
        """Return logits for every position of the tokenized query.

        Args:
            query: Text (or list of texts) passed to the tokenizer.
            columns: Passed unchanged to ``column_embedder``.

        Returns:
            Tensor of shape ``(batch, seq_len, vocab_size)``; the output layer is
            applied to every LSTM time step, not only the last one.
        """
        device = self.lstm.weight_ih_l0.device

        # Tokenize, encode and project the query: (batch, seq_len, embedding_dim).
        encoded_query = self.query_tokenizer(query, padding=True, truncation=False, return_tensors="pt").to(device)
        query_states = self.query_encoder(**encoded_query).last_hidden_state
        query_features = self.query_projection(query_states)

        # Column features; equalize_and_concatenate indexes them as a 2-D tensor.
        column_features = self.column_projection(self.column_embedder(columns).to(device))

        lstm_output, _ = self.lstm(equalize_and_concatenate(query_features, column_features))
        return self.fc(lstm_output)


# Define the tokenize_inputs function
def tokenize_inputs(values, max_length=1000):
    """Tokenize a pandas Series into fixed-length PyTorch tensors.

    Every entry is converted to ``str``, then padded or truncated to exactly
    ``max_length`` tokens. Truncation is enabled because ``return_tensors="pt"``
    needs equal-length sequences; without it a single over-long text makes the
    tokenizer call fail.

    Side effect: sets the module-level ``tokenizer``'s pad token to its EOS token.

    Args:
        values: pandas Series (anything exposing ``.astype(str).tolist()``).
        max_length: Number of tokens per returned sequence.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    tokenizer.pad_token = tokenizer.eos_token
    texts = values.astype(str).tolist()
    return tokenizer(
        texts,
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )

# Prepare data for training
batch_size = 2
X, y = df['x'], df['y']

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
train_inputs, val_inputs, train_outputs, val_outputs = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Tokenize queries (inputs) and target code (outputs) for both splits.
train_inputs_tokenized = tokenize_inputs(train_inputs)
train_outputs_tokenized = tokenize_inputs(train_outputs)
val_inputs_tokenized = tokenize_inputs(val_inputs)
val_outputs_tokenized = tokenize_inputs(val_outputs)

# Labels are the token ids of the tokenized outputs.
train_dataset = Dataset.from_dict(
    {
        'input_ids': train_inputs_tokenized['input_ids'],
        'attention_mask': train_inputs_tokenized['attention_mask'],
        'labels': train_outputs_tokenized['input_ids'],
    }
).with_format(type='torch')

val_dataset = Dataset.from_dict(
    {
        'input_ids': val_inputs_tokenized['input_ids'],
        'attention_mask': val_inputs_tokenized['attention_mask'],
        'labels': val_outputs_tokenized['input_ids'],
    }
).with_format(type='torch')

# Only the training loader shuffles.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    num_workers=4,
    pin_memory=True,
)

In [None]:
query_embedding = torch.randn(4, 28, 16)  # [batch_size, seq_len, query_dim]
column_embeddings = torch.randn(974, 16)  # [num_columns, column_dim]

combined_embeddings = equalize_and_concatenate(query_embedding, column_embeddings)

# Expected combined dimensions: torch.Size([4, 28, 32]).
# Checked with an assertion so the cell fails loudly if the helper regresses.
assert combined_embeddings.shape == (4, 28, 32), combined_embeddings.shape
combined_embeddings.shape


In [None]:
# Build the column-name embedder and the query model that wraps it.
column_name_embedder = ColumnNameEmbedder(column_names)
entity_query_model = EntityQueryModel(column_embedder=column_name_embedder)

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
entity_query_model.to(device)

# Cross entropy over tokens; positions labelled with the pad token id are ignored.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
# NOTE(review): lr=0.1 looks large for Adam — confirm this value is intended.
optimizer = optim.Adam(entity_query_model.parameters(), lr=0.1)

# Early-stopping bookkeeping: stop after `patience` epochs without a better
# validation loss. Adjust the number of epochs as needed.
num_epochs, patience = 12, 3
best_val_loss = float('inf')
epochs_without_improvement = 0

def _match_sequence_length(logits, target_len):
    """Pad or truncate ``logits`` (batch, seq_len, vocab) to ``target_len`` steps.

    Missing steps are filled with constant zero logits. Those positions are not
    connected to any model parameter, so they carry no gradient.
    """
    seq_len = logits.size(1)
    if seq_len < target_len:
        # F.pad reads the tuple from the last dimension backwards:
        # (vocab_left, vocab_right, seq_left, seq_right).
        return F.pad(logits, (0, 0, 0, target_len - seq_len), value=0.0)
    return logits[:, :target_len, :]


def _flatten_for_loss(logits, labels):
    """Align logits with the label length, flatten both, and drop out-of-range labels.

    Returns ``(flat_logits, flat_labels)`` with matching first dimensions, where
    every remaining label is a valid index into the logits' last dimension.
    """
    logits = _match_sequence_length(logits, labels.size(1))
    # reshape (not view): a truncated slice is not contiguous.
    flat_logits = logits.reshape(-1, logits.size(-1))
    flat_labels = labels.reshape(-1)
    # Label ids at or above the output layer's size cannot be scored; skip them.
    keep = flat_labels < flat_logits.size(-1)
    return flat_logits[keep], flat_labels[keep]


for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1} in progress...')
    entity_query_model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_tokens = 0

    for batch_idx, batch in enumerate(train_dataloader):
        inputs = batch['input_ids'].to(device)  # Encoded user query
        labels = batch['labels'].to(device)  # Encoded Python code (target sequence)

        optimizer.zero_grad()

        # Forward pass: the model takes raw text, so decode the ids back to strings.
        inputs_list = tokenizer.batch_decode(inputs, skip_special_tokens=True)
        # NOTE(review): the full `columns_in_query` series is passed for every
        # batch rather than the rows belonging to this batch — confirm intent.
        outputs = entity_query_model(inputs_list, columns_in_query)

        # One-off shape report for the very first training batch.
        first_batch = epoch == 0 and batch_idx == 0
        if first_batch:
            print(f'inputs_list:\n{inputs_list}')
            print(f"Outputs before flattening: {outputs.shape}")  # Should be (batch_size, seq_len, vocab_size)
            print(f"Labels before flattening: {labels.shape}")    # Should be (batch_size, seq_len)

        outputs, labels = _flatten_for_loss(outputs, labels)

        if first_batch:
            print(f"Inputs shape: {inputs.shape}, Outputs shape: {outputs.shape}, Labels shape: {labels.shape}")

        # Calculate loss and update the weights
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Token accuracy over non-padding positions
        predicted_tokens = outputs.argmax(dim=-1)  # Most likely token per position
        mask = labels != tokenizer.pad_token_id  # Ignore padding tokens

        correct_predictions += (predicted_tokens[mask] == labels[mask]).sum().item()
        total_tokens += mask.sum().item()  # Count only valid tokens

    epoch_loss = running_loss / len(train_dataloader)
    # Guard against an epoch in which no non-padding token was counted.
    epoch_accuracy = correct_predictions / total_tokens if total_tokens else 0.0
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss}, Accuracy: {epoch_accuracy}")

    # Validation loop
    entity_query_model.eval()
    val_loss = 0.0
    correct_predictions = 0
    total_tokens = 0

    with torch.no_grad():
        for batch in val_dataloader:
            inputs = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            inputs_list = tokenizer.batch_decode(inputs, skip_special_tokens=True)

            # Forward pass
            outputs = entity_query_model(inputs_list, columns_in_query)

            # Same alignment and filtering as in training, so logits and labels
            # always have matching lengths before the loss is computed.
            outputs, labels = _flatten_for_loss(outputs, labels)

            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Token accuracy over non-padding positions
            predicted_tokens = outputs.argmax(dim=-1)
            mask = labels != tokenizer.pad_token_id  # Ignore padding tokens

            correct_predictions += (predicted_tokens[mask] == labels[mask]).sum().item()
            total_tokens += mask.sum().item()

    val_loss /= len(val_dataloader)
    val_accuracy = correct_predictions / total_tokens if total_tokens else 0.0
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # Check for early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0
        # Save the best model
        torch.save(entity_query_model.state_dict(), 'best_model.pth')
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print("Early stopping triggered")
            break

    torch.cuda.empty_cache()