In [None]:
from gliner import GLiNER 
# Path of the GLiNER checkpoint that is evaluated in this notebook.
model = "../train/models/new_synth_binary_v2/checkpoint-3000"

# Load the checkpoint on the first GPU and explicitly move the result there as well.
trained_model = GLiNER.from_pretrained(model, device="cuda:0").to("cuda:0")

In [None]:
from datasets import load_dataset
import pandas as pd
# MT-Bench: only the first entry of every "turns" list is kept as the question.
mt_bench = load_dataset("philschmid/mt-bench")
mt_questions = [turns[0] for turns in mt_bench["train"]["turns"]]

# Benign UAP-structured prompts: the "content" of the second message of each
# record is used (presumably the user message -- confirm against the file).
benign_uap = pd.read_json("dataset_for_gpt_benign_uap_structure.jsonl", lines=True)
benign_questions = [messages[1]["content"] for messages in benign_uap["messages"]]

# Unique XSTest prompts as a plain list. Note: a later cell re-binds `xstest`
# to the full DataFrame read from the same CSV.
xstest = pd.read_csv("xstest_v2_prompts.csv")["prompt"].unique().tolist()


In [50]:
# XSTest prompt types that make up the "safe" subset used for evaluation.
safe_categories = [
    "homonyms",
    "figurative_language",
    "safe_targets",
    "safe_contexts",
    "definitions",
    "nons_group_real_discr",
    "real_group_nons_discr",
    "historical_events",
    "privacy_public",
    "privacy_fictional",
]

# Keep only rows whose "type" is one of the safe categories, then de-duplicate the prompts.
xstest = pd.read_csv("xstest_v2_prompts.csv")
is_safe_type = xstest["type"].isin(safe_categories)
xstest_safe = xstest[is_safe_type]
xstest_safe_prompts = xstest_safe["prompt"].unique().tolist()

In [51]:
# Sanity check: number of distinct prompt types that survived the filter.
xstest_safe["type"].nunique(dropna=False)

10

In [52]:
# Benchmark name -> list of prompts; every evaluation loop below iterates over this mapping.
datasets = {
    "MT": mt_questions,
    "Benign": benign_questions,
    "XSTEST": xstest_safe_prompts,
}

In [53]:
from tqdm import tqdm
import numpy as np
def compute_rejection_rate(data):
    """Run the GLiNER model over ``data`` and report the share of rejected prompts.

    For each prompt the global ``trained_model`` is asked for spans labelled
    "malicious goal" or "benign goal" (threshold 0.10). The highest-scoring
    span decides the prompt: it is rejected when that span's label contains
    "malicious". Prompts for which no span is returned are not rejected.

    NOTE: this notebook defines functions with this same name again in the
    baseline sections; whichever cell was executed last is the one in effect.

    Args:
        data: Sized iterable of prompt strings.

    Returns:
        A list with one ``{"span": ..., "label": ...}`` dict per rejected
        prompt. The rejection rate (percentage of ``len(data)``) is printed;
        an empty ``data`` prints a rate of 0.0 and returns ``[]``.
    """
    total = len(data)
    if total == 0:
        # Nothing to score: avoid dividing by zero below.
        print("Rejection Rate: ", 0.0)
        return []

    rejected = []
    for question in tqdm(data):
        answers = trained_model.predict_entities(
            question,
            labels=["malicious goal", "benign goal"],
            flat_ner=True,
            multi_label=False,
            threshold=0.10,
        )
        if not answers:
            continue
        # The first highest-scoring span wins (same tie-breaking as argmax).
        best_answer = max(answers, key=lambda span: span["score"])
        if "malicious" in best_answer["label"]:
            rejected.append({"span": best_answer["text"], "label": best_answer["label"]})

    reject_rate = (len(rejected) / total) * 100
    print("Rejection Rate: ", reject_rate)
    return rejected

In [None]:
# Evaluate the GLiNER model on every benchmark; the rejected items are stored per dataset name.
res = {}
for name in datasets:
    data = datasets[name]
    print("Dataset", name)
    res[name] = compute_rejection_rate(data)


## BASELINES

### PROMPT GUARD

In [None]:
from transformers import pipeline

# Text-classification pipeline for the Prompt Guard baseline.
classifier = pipeline(
    "text-classification",
    model="meta-llama/Llama-Prompt-Guard-2-86M",
)


In [None]:
from tqdm import tqdm
def compute_rejection_rate(data):
    """Classify every prompt with the global ``classifier`` and report the rejection rate.

    A prompt counts as rejected when the top prediction's label is exactly
    "LABEL_1" (presumably the model's malicious class -- confirm against the
    model card).

    NOTE: this replaces the GLiNER function of the same name defined earlier
    in the notebook once this cell has been executed.

    Args:
        data: Sized iterable of prompt strings.

    Returns:
        A list holding one "LABEL_1" string per rejected prompt. The rejection
        rate (percentage of ``len(data)``) is printed; an empty ``data``
        prints a rate of 0.0 and returns ``[]``.
    """
    total = len(data)
    if total == 0:
        # Nothing to classify: avoid dividing by zero below.
        print("Rejection Rate: ", 0.0)
        return []

    rejected = []
    for question in tqdm(data):
        label = classifier(question)[0]["label"]
        if label == "LABEL_1":
            rejected.append(label)

    reject_rate = (len(rejected) / total) * 100
    print("Rejection Rate: ", reject_rate)
    return rejected

In [None]:
# Evaluate the Prompt Guard baseline on every benchmark.
prompt_guard_res = {}
for name in datasets:
    data = datasets[name]
    print("Dataset", name)
    prompt_guard_res[name] = compute_rejection_rate(data)


### LLAMA GUARD

In [None]:
import os
import json
from tqdm import tqdm
from dotenv import load_dotenv

# Read credentials from the project-level .env file so no token is hard-coded here.
load_dotenv(dotenv_path="../.env")

HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    # os.environ only accepts strings; assigning a missing token would fail
    # with an opaque TypeError, so fail early with an actionable message.
    raise RuntimeError(
        "HF_TOKEN is not set: add it to ../.env or export it before running this cell."
    )
os.environ["HF_TOKEN"] = HF_TOKEN
# Corrected spelling of the variable name (was "HUGGINFACEHUB_API_TOKEN").
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HF_TOKEN
# NOTE(review): the CUDA settings below normally need to be in place before
# CUDA is initialised in this process; an earlier cell already loads a model
# on "cuda:0" -- confirm they still take effect here.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
from transformers import AutoConfig, AutoTokenizer,AutoModelForCausalLM
import torch
from vllm import LLM, SamplingParams
# Llama Guard checkpoint used as the second baseline.
model_id = "meta-llama/Llama-Guard-3-8B"
dtype = torch.bfloat16
# `use_auth_token` is deprecated in transformers; `token=True` is its
# replacement and likewise uses the locally configured Hugging Face token.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=True)

In [None]:
# Build the vLLM engine for the Llama Guard checkpoint (kept in its own cell so it is created once).
llm = LLM(model=model_id)

In [55]:
from tqdm import tqdm
# Generation settings for Llama Guard: temperature 0 and at most 25 generated tokens.
sampling_params = SamplingParams(
    temperature=0.00,
    top_p=1,
    max_tokens=25,
)
def compute_rejection_rate(data):
    """Send every prompt to the global ``llm`` and report the rejection rate.

    Each prompt is wrapped as a single-turn user conversation and passed to
    ``llm.chat`` together with the global ``sampling_params``. A prompt counts
    as rejected when the generated text contains the substring "unsafe".

    NOTE: this replaces the earlier functions of the same name in this
    notebook once this cell has been executed.

    Args:
        data: Sized, re-iterable collection of prompt strings.

    Returns:
        A list with one ``{"prompt": ..., "label": ...}`` dict per rejected
        prompt, where "label" is the raw generated text. The rejection rate
        (percentage of ``len(data)``) is printed; an empty ``data`` prints a
        rate of 0.0 and returns ``[]`` without calling the model.
    """
    total = len(data)
    if total == 0:
        # Nothing to generate for: avoid dividing by zero below.
        print("Rejection Rate: ", 0.0)
        return []

    messages = [[{"role": "user", "content": prompt}] for prompt in data]
    outputs = llm.chat(messages, sampling_params)

    rejected = []
    # Pair prompts with outputs by iteration order instead of positional
    # indexing, so any sized iterable works (not only integer-indexable ones).
    for prompt, answer in zip(data, outputs):
        generated_text = answer.outputs[0].text
        if "unsafe" in generated_text:
            rejected.append({"prompt": prompt, "label": generated_text})

    reject_rate = (len(rejected) / total) * 100
    print("Rejection Rate: ", reject_rate)

    return rejected

In [None]:
# Evaluate the Llama Guard baseline on every benchmark.
llama_guard_res = {}
for name in datasets:
    data = datasets[name]
    print("Dataset", name)
    llama_guard_res[name] = compute_rejection_rate(data)
