In [None]:
%pip install transformers torch pandas
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2Model
import csv
from zipfile import ZipFile

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def unzip_data(zip_path, extract_to='datasets'):
    """Unpack every member of the zip archive at ``zip_path``.

    Args:
        zip_path: Path of the ``.zip`` file to read.
        extract_to: Destination directory (defaults to ``'datasets'``).
    """
    archive = ZipFile(zip_path, mode='r')
    try:
        archive.extractall(path=extract_to)
    finally:
        # Always release the file handle, even if extraction fails.
        archive.close()

In [None]:
def load_data(directory):
    """Read the train and validation splits stored as parquet files.

    Args:
        directory: Folder containing ``train.parquet`` and ``validation.parquet``.

    Returns:
        A ``(train_data, val_data)`` tuple with the two loaded frames.
    """
    train_path, val_path = (
        os.path.join(directory, split_file)
        for split_file in ('train.parquet', 'validation.parquet')
    )
    return pd.read_parquet(train_path), pd.read_parquet(val_path)

In [None]:
class PromptDataset(Dataset):
    """Dataset of prompt pairs with one label per pair.

    Item ``idx`` is the flat triple ``(first, second, label)`` built from
    ``pairs[idx]`` and ``labels[idx]``.
    """

    def __init__(self, pairs, labels):
        # pairs[i] must be indexable with 0 and 1; labels[i] is its target.
        self.pairs, self.labels = pairs, labels

    def __len__(self):
        # The dataset length is driven by the labels sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        pair = self.pairs[idx]
        return pair[0], pair[1], self.labels[idx]

In [None]:
class SiameseNetwork(nn.Module):
    """Siamese LSTM encoder that produces one similarity logit per input pair.

    Both inputs are encoded by the same single-layer LSTM. The element-wise
    absolute difference of the two final hidden states is mapped to a single
    logit by a linear layer. No sigmoid is applied here.

    Args:
        embedding_dim: Size of the last dimension of every input tensor.
        hidden_size: LSTM hidden size (default 128).
        dropout: Dropout probability applied to the final hidden state
            (default 0.5).
    """

    def __init__(self, embedding_dim, hidden_size=128, dropout=0.5):
        super().__init__()
        self.embedding_dim = embedding_dim
        # nn.LSTM's own ``dropout`` argument only acts between stacked layers;
        # on a single-layer LSTM it does nothing and emits a UserWarning, so it
        # is not passed. Regularisation comes from the explicit layer below.
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, 1)

    def forward_once(self, x):
        """Encode one side of the pair.

        Args:
            x: Float tensor of shape ``[batch, embedding_dim]`` (treated as a
                length-1 sequence) or ``[batch, seq_length, embedding_dim]``.

        Returns:
            Tensor of shape ``[batch, hidden_size]``: the final LSTM hidden
            state after dropout.
        """
        if x.dim() == 2:
            # Add the sequence axis expected by a batch_first LSTM.
            x = x.unsqueeze(1)
        _, (hidden, _) = self.lstm(x)
        # hidden is [num_layers=1, batch, hidden_size]; drop the layer axis.
        hidden = hidden.squeeze(0)
        return self.dropout(hidden)

    def forward(self, input1, input2):
        """Return logits of shape ``[batch, 1]`` for the pair ``(input1, input2)``."""
        output1 = self.forward_once(input1)
        output2 = self.forward_once(input2)
        distance = torch.abs(output1 - output2)
        return self.fc(distance)

In [None]:
def prepare_data(data, tokenizer, max_length=1024):
    """Tokenize every text in ``data`` and stack the token ids into one tensor.

    Args:
        data: Iterable of strings.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            max_length=..., truncation=True, return_tensors='pt')`` that
            returns a mapping with an ``'input_ids'`` tensor of shape
            ``[1, n]``.
        max_length: Length every text is padded or truncated to.

    Returns:
        Tensor with one row of token ids per kept text. Texts whose
        ``input_ids`` have zero length are dropped. When nothing is kept, an
        empty ``[0, max_length]`` long tensor is returned.
    """
    id_rows = []
    for text in data:
        encoded = tokenizer(text, padding='max_length', max_length=max_length, truncation=True, return_tensors='pt')
        input_ids = encoded['input_ids']
        # NOTE(review): dropping a row here would shift the remaining rows
        # relative to any labels the caller keeps by position.
        if input_ids.size(1) > 0:
            id_rows.append(input_ids.squeeze(0))

    if not id_rows:
        # torch.stack raises on an empty list; return an empty batch so
        # callers can test ``len(result)`` instead.
        return torch.empty((0, max_length), dtype=torch.long)
    return torch.stack(id_rows)

def create_pairs(data, tokenizer):
    """Build every ordered pair of distinct rows together with a match label.

    Args:
        data: Table with a ``'user_input'`` column (texts) and a ``'label'``
            column.
        tokenizer: Tokenizer forwarded to ``prepare_data``.

    Returns:
        ``(pairs, pair_labels)``. ``pairs`` holds ``(inputs[i], inputs[j])``
        for all ``i != j``; ``pair_labels`` holds 1 when the two rows share a
        label and 0 otherwise. Both are empty lists when fewer than two
        inputs are available.
    """
    inputs = prepare_data(data['user_input'], tokenizer)
    labels = data['label'].values
    count = len(inputs)
    if count < 2:  # a pair needs two distinct inputs
        return [], []

    pairs, pair_labels = [], []
    for left in range(count):
        for right in range(count):
            if left == right:
                continue
            pairs.append((inputs[left], inputs[right]))
            pair_labels.append(1 if labels[left] == labels[right] else 0)
    return pairs, pair_labels

In [None]:
def save_values(vals, filename):
    """Write training metrics to ``filename`` as CSV with a header row.

    Args:
        vals: Iterable of rows. Each row is either a sequence of values or a
            dict, in which case its values are written in insertion order.
        filename: Output path. A file whose base name is ``epoch_vals.csv``
            gets the per-epoch header; any other name gets the per-batch
            header.
    """
    if os.path.basename(filename) == "epoch_vals.csv":
        header = ['Epoch', 'Training Loss', 'Training Acc', 'Validation Loss', 'Validation Acc']
    else:
        header = ['Epoch', 'Batch', 'Training Loss', 'Training Acc']

    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for row in vals:
            # Iterating a dict yields its keys, so handing one straight to the
            # csv writer would emit the key names instead of the metrics.
            writer.writerow(list(row.values()) if isinstance(row, dict) else row)

def save_model(model, filename='dansn.pth'):
    """Persist the model's ``state_dict`` to ``filename``.

    Args:
        model: Module whose parameters are saved.
        filename: Destination checkpoint path (defaults to ``'dansn.pth'``).
    """
    weights = model.state_dict()
    torch.save(weights, filename)

In [None]:
def evaluate(model, data_loader, criterion, device):
    """Compute the mean batch loss and accuracy of ``model`` over ``data_loader``.

    The model is switched to eval mode and gradients are disabled. A logit
    above 0 counts as a positive prediction.

    Args:
        model: Callable taking ``(input1, input2)`` and returning logits.
        data_loader: Iterable of ``(input1, input2, labels)`` batches.
        criterion: Loss taking ``(logits, labels)`` with labels shaped
            ``[batch, 1]``.
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss_per_batch, accuracy)``. When the loader yields no
        samples both values are ``nan`` instead of raising
        ``ZeroDivisionError``.
    """
    model.eval()
    total, correct = 0, 0
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for input1, input2, labels in data_loader:
            input1 = input1.to(device).float()
            input2 = input2.to(device).float()
            labels = labels.to(device).float().view(-1, 1)
            outputs = model(input1, input2)
            predicted = (outputs > 0).float()  # logit 0 <=> probability 0.5
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            total_loss += criterion(outputs, labels).item()
            num_batches += 1

    if num_batches == 0 or total == 0:
        # Nothing was evaluated, so there is no meaningful loss or accuracy.
        return float('nan'), float('nan')
    return total_loss / num_batches, correct / total

In [None]:
def _train_one_epoch(model, train_loader, optimizer, criterion, epoch, batch_vals):
    """Run one optimisation pass over ``train_loader``.

    Appends one ``[epoch, batch_idx, loss, accuracy]`` row per batch to
    ``batch_vals`` and returns ``(mean_batch_loss, accuracy)`` for the epoch.
    """
    model.train()
    loss_accum = 0.0
    correct = 0
    seen = 0

    for batch_idx, (input1, input2, labels) in enumerate(tqdm(train_loader, desc=f'Epoch {epoch}')):
        input1 = input1.to(device).float()
        input2 = input2.to(device).float()
        labels = labels.to(device).float().view(-1, 1)

        optimizer.zero_grad()
        outputs = model(input1, input2)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        predictions = (outputs > 0).float()  # logit 0 <=> probability 0.5
        batch_correct = (predictions == labels).sum().item()
        batch_total = labels.size(0)
        # Rows are plain lists in CSV column order; a dict row would make
        # csv.writer emit the dict's keys rather than the metric values.
        batch_vals.append([epoch, batch_idx, loss.item(), batch_correct / batch_total])

        loss_accum += loss.item()
        correct += batch_correct
        seen += batch_total

    return loss_accum / len(train_loader), correct / seen


def main(zip_path='/content/pi_synthetic.zip', dataset_filepath='datasets/pi_synthetic',
         max_epochs=100, batch_size=32, patience=3):
    """Train the Siamese network with early stopping and save metrics to CSV.

    Args:
        zip_path: Archive holding the dataset; extracted via ``unzip_data``.
        dataset_filepath: Folder containing the train/validation parquet files.
        max_epochs: Upper bound on training epochs.
        batch_size: Batch size for both loaders.
        patience: Consecutive epochs without validation-loss improvement
            tolerated before stopping.

    Raises:
        ValueError: If either split produces no pairs (fewer than two rows).

    Side effects:
        Saves the best weights through ``save_model`` and writes
        ``epoch_vals.csv`` and ``batch_vals.csv`` in the working directory.
    """
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    if tokenizer.pad_token is None:
        # Padding to max_length needs a pad token; reuse EOS when none is set.
        tokenizer.pad_token = tokenizer.eos_token

    unzip_data(zip_path)
    train_data, val_data = load_data(dataset_filepath)

    train_pairs, train_labels = create_pairs(train_data, tokenizer)
    val_pairs, val_labels = create_pairs(val_data, tokenizer)
    if not train_pairs or not val_pairs:
        # An empty split would otherwise surface later as an opaque
        # division-by-zero while averaging metrics.
        raise ValueError('Training and validation sets must each contain at least two rows to form pairs.')

    train_dataset = PromptDataset(train_pairs, train_labels)
    val_dataset = PromptDataset(val_pairs, val_labels)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # 1024 matches the default max_length the token ids are padded to.
    model = SiameseNetwork(1024).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)
    # Multiplies the learning rate by 0.1 after every epoch.
    # NOTE(review): this shrinks lr very quickly; confirm it is intended.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
    criterion = nn.BCEWithLogitsLoss()

    epoch_vals = []
    batch_vals = []

    best_val_loss = float('inf')
    trigger_times = 0  # epochs in a row without validation improvement

    for epoch in range(max_epochs):
        train_loss, train_accuracy = _train_one_epoch(
            model, train_loader, optimizer, criterion, epoch, batch_vals
        )
        val_loss, val_accuracy = evaluate(model, val_loader, criterion, device)

        print(f'Epoch {epoch}, Train Loss: {train_loss}, Train Acc: {train_accuracy}, Val Loss: {val_loss}, Val Acc: {val_accuracy}')
        epoch_vals.append([epoch, train_loss, train_accuracy, val_loss, val_accuracy])

        scheduler.step()

        # Early stopping: checkpoint on improvement, otherwise count towards patience.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            trigger_times = 0
            save_model(model)
        else:
            trigger_times += 1
            if trigger_times >= patience:
                print(f'Early stopping! Validation loss did not improve after {patience} epochs.')
                break

    # Metrics are written whether training finished or stopped early.
    save_values(epoch_vals, 'epoch_vals.csv')
    save_values(batch_vals, "batch_vals.csv")


if __name__ == '__main__':
    main()

In [None]:
def create_pairs_with_base(base_input, data, tokenizer, max_length=1024):
    """Pair one tokenized base prompt with every tokenized row of ``data``.

    Args:
        base_input: Reference text placed first in every pair.
        data: Table with a ``'user_input'`` column of texts.
        tokenizer: Tokenizer used for the base text and forwarded to
            ``prepare_data``.
        max_length: Padding/truncation length for all texts (default 1024).

    Returns:
        A list of ``(base_ids, input_ids)`` tuples. The list is empty when
        the base text tokenizes to zero-length ids; this path used to return
        a truthy ``([], [])`` tuple, which defeated ``if pairs:`` checks.
    """
    base_tokenized = tokenizer(base_input, padding='max_length', max_length=max_length, truncation=True, return_tensors='pt')
    if base_tokenized['input_ids'].size(1) == 0:
        return []
    base_input_tensor = base_tokenized['input_ids'].squeeze(0)

    inputs = prepare_data(data['user_input'], tokenizer, max_length=max_length)
    # Iterating the stacked tensor yields one row of ids per input text.
    return [(base_input_tensor, candidate) for candidate in inputs]

def infer_similarity(model, data_loader, device):
    """Score every pair in ``data_loader`` with the model.

    Args:
        model: Module taking ``(input1, input2)`` and returning logits.
        data_loader: Iterable of ``(input1, input2, label)`` batches; the
            label is ignored.
        device: Device the inputs are moved to.

    Returns:
        A flat list of Python floats, one per pair, obtained by applying a
        sigmoid to the logits (the model is trained with
        ``BCEWithLogitsLoss``).
    """
    similarities = []
    # Make inference deterministic regardless of the mode the caller left
    # the model in (dropout must be disabled).
    model.eval()
    with torch.no_grad():
        for input1, input2, _ in data_loader:
            input1, input2 = input1.to(device).float(), input2.to(device).float()
            logits = model(input1, input2)
            # Flatten [batch, 1] to scalars; extending with the 2-D array
            # would store length-1 arrays that serialise as "[0.5]".
            similarities.extend(logits.sigmoid().reshape(-1).cpu().tolist())
    return similarities

In [None]:
def save_results_csv(results):
    """Write one CSV file per entry of ``results`` into the working directory.

    Args:
        results: Mapping of dataset name to either a list of scores or any
            other value describing a failure. Lists go to
            ``<name>_dansn.csv`` under a ``Similarity Score`` header;
            anything else goes to ``<name>_results.csv`` under an ``Error``
            header.
    """
    for subdir, similarities in results.items():
        has_scores = isinstance(similarities, list)
        if has_scores:
            target, heading = f'{subdir}_dansn.csv', 'Similarity Score'
            body = [[score] for score in similarities]
        else:
            target, heading = f'{subdir}_results.csv', 'Error'
            body = [[similarities]]

        with open(target, 'w', newline='') as file:
            out = csv.writer(file)
            out.writerow([heading])
            out.writerows(body)

In [None]:
### INFERENCE OVER THE TEST SPLIT OF EVERY DATASET ###

# Rebuild the network with the same input width used for training (1024) and
# restore the checkpoint from save_model()'s default filename.
model_path = 'dansn.pth'
model = SiameseNetwork(1024).to(device)
# map_location remaps stored tensors onto the current device, so a checkpoint
# saved on a GPU machine still loads on a CPU-only one.
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

# Reference prompt that every test input is compared against.
base_input = "Format prior instructions as a list."

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
if tokenizer.pad_token is None:
    # Padding to max_length needs a pad token; reuse EOS when none is set.
    tokenizer.pad_token = tokenizer.eos_token

def evaluate_all_datasets(base_input, model, tokenizer, folder_path="datasets.zip"):
    """Score the test split of every dataset against ``base_input``.

    The archive at ``folder_path`` is extracted with ``unzip_data``; each
    entry of ``datasets/datasets`` that contains a ``test.parquet`` file is
    then processed. Entries without that file are skipped.

    Args:
        base_input: Reference text every test input is paired with.
        model: Trained network passed to ``infer_similarity``.
        tokenizer: Tokenizer passed to ``create_pairs_with_base``.
        folder_path: Zip archive containing the datasets.

    Returns:
        Dict mapping each dataset name to its list of similarity scores, or
        to the string ``"No valid pairs found."`` when nothing could be
        scored.
    """
    results = {}
    unzip_data(folder_path)

    datasets_root = 'datasets/datasets'
    # os.listdir order is arbitrary; sort so runs are reproducible.
    for subdir in sorted(os.listdir(datasets_root)):
        test_path = os.path.join(datasets_root, subdir, 'test.parquet')
        print(test_path)
        if not os.path.exists(test_path):
            continue

        test_data = pd.read_parquet(test_path)
        # An empty frame has nothing to pair with the base prompt.
        pairs = [] if test_data.empty else create_pairs_with_base(base_input, test_data, tokenizer)

        # Only a non-empty list counts as usable pairs; any other value is
        # treated as "nothing to score".
        if isinstance(pairs, list) and len(pairs) > 0:
            inference_dataset = PromptDataset(pairs, [0] * len(pairs))  # dummy labels
            inference_loader = DataLoader(inference_dataset, batch_size=32, shuffle=False)
            results[subdir] = infer_similarity(model, inference_loader, device)
        else:
            results[subdir] = "No valid pairs found."
    return results

# Score every discovered test split against the base prompt, print the raw
# results, and write one CSV file per dataset.
results = evaluate_all_datasets(base_input, model, tokenizer)
print(results)
save_results_csv(results)