In [1]:
import torch
import torch.nn as nn
import pandas as pd
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset, Dataset
from transformers import AutoConfig
from huggingface_hub import login
from sklearn.metrics import accuracy_score, mean_absolute_error, classification_report
import os

# Fetch the Hub dataset that holds the precomputed embeddings and their scores
dataset = load_dataset(
    path="daparasyte/gpt4_dataset_prompt_scores_with_embeddings",
)

README.md:   0%|          | 0.00/479 [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/98.3M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/98.2M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/18.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/109101 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [2]:
# Pick out the two splits used below (training and validation)
train_dataset, val_dataset = (dataset[split] for split in ("train", "validation"))

In [3]:
def _split_to_tensors(split):
    """Convert one dataset split into (float32 embeddings, zero-based int64 labels)."""
    features = torch.tensor(split["embedding"], dtype=torch.float32)
    # Shift the stored scores down by one so class ids start at 0 for PyTorch
    labels = torch.tensor(split["score"], dtype=torch.long).sub(1)
    return features, labels


# Materialise both splits as tensors
train_embeddings, train_scores = _split_to_tensors(train_dataset)
val_embeddings, val_scores = _split_to_tensors(val_dataset)

# Wrap the tensors in DataLoaders; only the training loader is shuffled
batch_size = 32
train_loader = DataLoader(
    TensorDataset(train_embeddings, train_scores),
    batch_size=batch_size,
    shuffle=True,
)
val_loader = DataLoader(
    TensorDataset(val_embeddings, val_scores),
    batch_size=batch_size,
)

In [4]:
# Sanity check: tensor shapes and the label range of each split, one line per fact
print(
    f"Train embeddings shape: {train_embeddings.shape}",
    f"Train scores min: {train_scores.min()}, max: {train_scores.max()}",
    f"Validation embeddings shape: {val_embeddings.shape}",
    f"Validation scores min: {val_scores.min()}, max: {val_scores.max()}",
    sep="\n",
)

Train embeddings shape: torch.Size([109101, 1024])
Train scores min: 0, max: 4
Validation embeddings shape: torch.Size([10000, 1024])
Validation scores min: 0, max: 4


In [None]:
import os

# Debug setting. NOTE(review): presumably forces synchronous CUDA kernel launches so
# errors surface at the failing op; it likely slows training — confirm it is still needed.
os.environ.update(CUDA_LAUNCH_BLOCKING="1")

# Import Transformer layers
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import numpy as np

class TransformerClassifier(nn.Module):
    """Classify one fixed-size embedding into ``output_dim`` classes.

    Each embedding is treated as a length-1 sequence, run through a stack of
    Transformer encoder layers, and then through a two-layer fully connected
    head that returns raw (unnormalised) class logits.

    Args:
        input_dim: Size of each input embedding; also used as the encoder's
            ``d_model``, so it must be divisible by ``nhead``.
        hidden_dim: Width of the encoder feed-forward blocks and of the hidden
            layer in the classification head.
        output_dim: Number of classes, i.e. logits returned per sample.
        nhead: Number of attention heads per encoder layer.
        nlayers: Number of stacked encoder layers.
        dropout_rate: Dropout probability used inside the encoder and the head.

    Note:
        Parameter names and shapes are the same as before the ``batch_first``
        fix below, so older checkpoints still load, but they were trained with
        attention across the samples of a batch and should be retrained.
    """

    def __init__(self, input_dim, hidden_dim=256, output_dim=5, nhead=8, nlayers=5, dropout_rate=0.5):
        super().__init__()

        # batch_first=True is required: forward() feeds (batch, 1, input_dim).
        # With the default (seq, batch, dim) layout the batch axis is read as
        # the sequence axis, so self-attention would mix unrelated samples and
        # a prediction would depend on what else happens to be in the batch.
        encoder_layer = TransformerEncoderLayer(
            d_model=input_dim,
            nhead=nhead,
            dim_feedforward=hidden_dim,
            dropout=dropout_rate,
            batch_first=True,
        )
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=nlayers)

        # Classification head: Linear -> ReLU -> Dropout -> Linear
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return logits of shape (batch, output_dim) for ``x`` of shape (batch, input_dim)."""
        sequence = x.unsqueeze(1)  # (batch, input_dim) -> (batch, 1, input_dim)
        encoded = self.transformer_encoder(sequence)
        # The sequence has a single position, so the last token is that position
        hidden = torch.relu(self.fc1(encoded[:, -1, :]))
        hidden = self.dropout(hidden)
        return self.fc2(hidden)


# Pick the compute device: GPU when one is visible, otherwise CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Size the classifier to the embedding width and place it on the device
input_dim = train_embeddings.size(1)
model = TransformerClassifier(input_dim=input_dim)
model = model.to(device)

# NOTE(review): train_model() below builds its own criterion and optimizer
# (called with lr=1e-4), so these two objects do not drive that training run.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=3e-4, weight_decay=1e-5)


# class PromptScoreClassifier(nn.Module):
#     def __init__(self, input_dim, hidden_dim=128, output_dim=5):
#         super(PromptScoreClassifier, self).__init__()
#         self.fc1 = nn.Linear(input_dim, hidden_dim)
#         self.relu = nn.ReLU()
#         self.dropout = nn.Dropout(0.3)
#         self.fc2 = nn.Linear(hidden_dim, output_dim)  # Output layer for 5 classes

#     def forward(self, x):
#         x = self.relu(self.fc1(x))
#         x = self.dropout(x)
#         return self.fc2(x)  # Logits for each class

# # Initialize model, criterion, optimizer, etc.
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = PromptScoreClassifier(input_dim=train_embeddings.shape[1]).to(device)
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [6]:
# Report the encoder depth, head count and overall parameter count
total_params = sum(map(torch.numel, model.parameters()))
print(
    "Model Summary:",
    f"Number of Transformer layers: {model.transformer_encoder.num_layers}",
    f"Number of heads: {model.transformer_encoder.layers[0].self_attn.num_heads}",
    f"Total parameters: {total_params}",
    sep="\n",
)

Model Summary:
Number of Transformer layers: 5
Number of heads: 8
Total parameters: 23904005


In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm
from sklearn.metrics import accuracy_score, mean_absolute_error, classification_report

# Define the training loop
def train_model(
    model,
    train_loader,
    val_loader,
    lr=1e-4,
    weight_decay=1e-5,
    num_epochs=50,
    device="cuda",
    save_path="best_model.pt",
):
    """Train ``model`` with Adam and cross-entropy, checkpointing the best epoch.

    After every epoch the model is evaluated on ``val_loader``; whenever the
    validation accuracy beats the best seen so far, the model's ``state_dict``
    is written to ``save_path``.

    Args:
        model: Module mapping a batch of embeddings to class logits.
        train_loader: Iterable of ``(embeddings, scores)`` training batches.
        val_loader: Iterable of ``(embeddings, scores)`` validation batches.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the model and every batch are moved to.
        save_path: Where the best checkpoint is written.

    Returns:
        float: The best validation accuracy observed (0.0 if no epoch ran or
        none scored above zero).

    Raises:
        ValueError: If either loader yields no samples during an epoch.
    """
    # Move the model first so the optimizer is built over the on-device parameters
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    best_val_accuracy = 0.0

    # The context manager closes the bar even when an epoch raises or is interrupted
    with tqdm(total=num_epochs) as progress_bar:
        for epoch in range(num_epochs):
            # Training phase
            model.train()
            train_loss_sum = 0.0
            train_seen = 0
            for embeddings, scores in train_loader:
                embeddings, scores = embeddings.to(device), scores.to(device)

                optimizer.zero_grad()
                outputs = model(embeddings)
                loss = criterion(outputs, scores)
                loss.backward()
                optimizer.step()

                # criterion returns the batch mean; weight it by the batch size so
                # a smaller final batch does not skew the epoch average
                train_loss_sum += loss.item() * embeddings.size(0)
                train_seen += embeddings.size(0)

            if train_seen == 0:
                raise ValueError("train_loader yielded no samples")
            avg_train_loss = train_loss_sum / train_seen

            # Validation phase
            model.eval()
            val_loss_sum = 0.0
            val_seen = 0
            val_preds = []
            val_labels = []
            with torch.no_grad():
                for embeddings, scores in val_loader:
                    embeddings, scores = embeddings.to(device), scores.to(device)
                    outputs = model(embeddings)
                    val_loss = criterion(outputs, scores)
                    val_loss_sum += val_loss.item() * embeddings.size(0)
                    val_seen += embeddings.size(0)

                    val_preds.extend(outputs.argmax(dim=1).tolist())
                    val_labels.extend(scores.tolist())

            if val_seen == 0:
                raise ValueError("val_loader yielded no samples")
            # Divide by the samples actually seen (not len(dataset)) so the average
            # stays correct with drop_last, samplers, or datasets without a length
            avg_val_loss = val_loss_sum / val_seen
            val_accuracy = accuracy_score(val_labels, val_preds)
            val_mae = mean_absolute_error(val_labels, val_preds)

            # Print epoch results
            print(f"Epoch [{epoch + 1}/{num_epochs}], "
                  f"Train Loss: {avg_train_loss:.4f}, "
                  f"Validation Loss: {avg_val_loss:.4f}, "
                  f"Validation Accuracy: {val_accuracy:.4f}, "
                  f"Validation MAE: {val_mae:.4f}")

            # Keep only the checkpoint with the best validation accuracy so far
            if val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                torch.save(model.state_dict(), save_path)

            progress_bar.set_postfix(train_loss=avg_train_loss, val_loss=avg_val_loss, val_acc=val_accuracy)
            progress_bar.update(1)

    return best_val_accuracy



# Run training with the hyperparameters chosen for this notebook
train_model(
    model,
    train_loader,
    val_loader,
    lr=1e-4, weight_decay=1e-5, num_epochs=50, device=device,
)

In [7]:
# Load the best model for evaluation
# model.load_state_dict(torch.load("best_model.pt"))
checkpoint_path = "/kaggle/input/router-training/best_model.pt"
model.load_state_dict(
    torch.load(
        checkpoint_path,
        # Remap storages so a checkpoint saved on GPU also loads in a CPU-only session
        map_location=device,
        # The file is a plain state_dict: restrict unpickling to tensors and simple
        # containers instead of arbitrary pickled objects
        weights_only=True,
    )
)
model.eval()  # switch dropout off for evaluation
print("Best model loaded for final evaluation.")

  model.load_state_dict(torch.load("/kaggle/input/router-training/best_model.pt"))


Best model loaded for final evaluation.


In [None]:
from transformers import AutoConfig, AutoModelForSequenceClassification
from huggingface_hub import HfApi, Repository
import os
import torch

# Save the model's state dictionary
save_directory = "/kaggle/working/prompt_complexity_classifier_1"
os.makedirs(save_directory, exist_ok=True)
torch.save(model.state_dict(), os.path.join(save_directory, "pytorch_model.bin"))

# Save the config
# NOTE(review): this is the config of the jina embedding model with num_labels
# overridden; it does not describe TransformerClassifier's own architecture
# (hidden_dim, nhead, nlayers). Confirm this is what consumers of the repo expect.
config = AutoConfig.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True)
config.num_labels = 5  # Since we are scoring from 1 to 5
config.save_pretrained(save_directory)

# # Create a model card with simple information about the model
# with open(os.path.join(save_directory, "README.md"), "w") as f:
#     f.write("# Prompt Complexity Classifier\nThis model scores prompts based on complexity from 1 to 5.")

# Push the directory to the Hub
from huggingface_hub import HfApi
api = HfApi()
repo_id = "daparasyte/prompt_complexity_classifier_1"  # Replace with your username/repo name

# Authenticate
# Never paste the token into the notebook: read it from the HF_TOKEN environment
# variable. When the variable is unset, login() receives None and asks for the
# token interactively instead.
from huggingface_hub import login
login(token=os.environ.get("HF_TOKEN"))

# Make sure the target repo exists; with exist_ok=True this is a no-op when it does
api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)

# Upload the model directory
api.upload_folder(
    folder_path=save_directory,
    path_in_repo=".",
    repo_id=repo_id,
    repo_type="model"
)

print("Model pushed to Hugging Face Hub successfully!")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


pytorch_model.bin:   0%|          | 0.00/95.6M [00:00<?, ?B/s]

Model pushed to Hugging Face Hub successfully!
