In [1]:
import pandas as pd

df = pd.read_csv("prompts_rows.csv")

print(df.columns)

Index(['id', 'content', 'user_id', 'is_dangerous', 'positive_ratings',
       'negative_ratings', 'created_at', 'updated_at', 'scenarios_violation',
       'tools_violation', 'intent_violation', 'scenarios_violation_count',
       'tools_violation_count', 'intent_violation_count', 'label',
       'embedding'],
      dtype='object')


In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sentence_transformers import SentenceTransformer
import pandas as pd

# --- Dataset Definition ---
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        item['text'] = self.texts[idx]  # Add this line
        return item

# --- Model Definition ---

class ContrastiveModel(nn.Module):
    def __init__(self, model_name='bert-base-uncased', projection_dim=128):
        super().__init__()
        self.encoder = SentenceTransformer("../intersafety/intersafety-backend/InterSafetyDeploy/src/local_model")
        hidden_size = 384
        self.projector = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, projection_dim)
        )

    def forward(self, texts):
        embeddings = self.encoder.encode(texts, convert_to_tensor=True)
        embeddings = embeddings.clone()  # <-- Add this line
        projected = self.projector(embeddings)
        return projected

class Classifier(nn.Module):
    def __init__(self, contrastive_model, projection_dim=128, num_classes=2):
        super().__init__()
        self.encoder = contrastive_model.encoder
        self.projector = contrastive_model.projector
        self.classifier = nn.Linear(projection_dim, num_classes)

    def forward(self, texts):
        with torch.no_grad():
            embeddings = self.encoder.encode(texts, convert_to_tensor=True)
            embeddings = self.projector(embeddings)
        logits = self.classifier(embeddings)
        return logits

# --- Loss Function ---

def supervised_contrastive_loss(embeddings, labels, temperature=0.1):
    embeddings = F.normalize(embeddings, dim=1)
    similarity_matrix = torch.matmul(embeddings, embeddings.T) / temperature
    labels = labels.unsqueeze(1)
    mask = torch.eq(labels, labels.T).float()
    logits_mask = torch.ones_like(mask) - torch.eye(mask.size(0)).to(mask.device)
    mask = mask * logits_mask

    exp_logits = torch.exp(similarity_matrix) * logits_mask
    log_prob = similarity_matrix - torch.log(exp_logits.sum(1, keepdim=True) + 1e-9)

    mean_log_prob_pos = (mask * log_prob).sum(1) / (mask.sum(1) + 1e-9)
    loss = -mean_log_prob_pos.mean()
    return loss


In [15]:
df = pd.read_csv("prompts_rows.csv")  # Your CSV path here
label_encoder = LabelEncoder()
df['label_id'] = label_encoder.fit_transform(df['label'])  # SAFE=0, DANGEROUS=1

texts = df['content'].tolist()
labels = df['label_id'].tolist()

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
dataset = TextDataset(texts, labels, tokenizer)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model = ContrastiveModel().to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 3


model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        texts_batch = batch['text']
        labels_batch = batch['labels'].to(device)

        embeddings = model(texts_batch)
        loss = supervised_contrastive_loss(embeddings, labels_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1} Contrastive Loss: {total_loss / len(train_loader):.4f}")


model.eval()
classifier = Classifier(model).to(device)
criterion = nn.CrossEntropyLoss()
optimizer_cls = AdamW(classifier.classifier.parameters(), lr=1e-3)

# --- Train Classification Head ---

classifier.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        texts_batch = batch['text']
        labels_batch = batch['labels'].to(device)

        logits = classifier(texts_batch)
        loss = criterion(logits, labels_batch)

        optimizer_cls.zero_grad()
        loss.backward()
        optimizer_cls.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1} Classifier Loss: {total_loss / len(train_loader):.4f}")

# --- Evaluation ---

classifier.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        texts_batch = batch['text']
        labels_batch = batch['labels'].to(device)

        logits = classifier(texts_batch)
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels_batch.cpu().numpy())

acc = accuracy_score(all_labels, all_preds)
print(f"Validation Accuracy: {acc:.4f}")

torch.save(model.state_dict(), "new_encoder.pt")
print("Model saved as new_encoder.pt")

Using device: cpu


  return forward_call(*args, **kwargs)


Epoch 1 Contrastive Loss: 3.4275
Epoch 2 Contrastive Loss: 3.4074
Epoch 3 Contrastive Loss: 3.4191
Epoch 1 Classifier Loss: 0.6795
Epoch 2 Classifier Loss: 0.5960
Epoch 3 Classifier Loss: 0.5281
Validation Accuracy: 0.9529
Model saved as new_encoder.pt


In [13]:
# Load your model
model = ContrastiveModel()
model.load_state_dict(torch.load("new_encoder.pt", map_location=torch.device("cpu")))
model.eval()

# Input text (can be a single string or list of strings)
text = "This is a test sentence to re-encode."
projected_embedding = model([text])  # Wrap in list
print(len(projected_embedding.tolist()[0]))  # Should be [1, projection_dim]

128
