In [2]:
# Checkpoint path that is later handed to GLiNER.from_pretrained to load the model under test.
model="../train/models/new_synth_binary_v2/checkpoint-3000"


In [None]:
from gliner import GLiNER 
# Load the GLiNER checkpoint referenced by `model`, requesting device "cuda:0".
trained_model=GLiNER.from_pretrained(model,device="cuda:0")
# Explicitly move the loaded model to "cuda:0" as well.
# NOTE(review): this looks redundant with device= above — confirm before removing either.
trained_model=trained_model.to("cuda:0")

## DATASETS

In [4]:
import json
# Read the raw attack datasets. `with` closes each file as soon as it is parsed
# (the previous json.load(open(...)) form left the handles open), and the explicit
# UTF-8 encoding avoids depending on the platform's default text encoding.
with open("../Jailbreak_attacks/attacks2test/DeepInception/data.json","r",encoding="utf-8") as f:
    deep_inception=json.load(f)
with open("../Jailbreak_attacks/attacks2test/DAN/Dan_samples.json","r",encoding="utf-8") as f:
    dan=json.load(f)

# Evaluation records: the full jailbreak prompt plus its annotation
# `[expected request span, label]`. Every sample in both sets is annotated with label 1.
# For DeepInception the reference span is lower-cased and stripped of periods
# (presumably to normalise it before the span comparison — TODO confirm).
deep_prompts=[
    {"prompt":item["inception_attack"],"ann":[item["plain_attack"].lower().replace(".",""),1]}
    for item in deep_inception
]
dan_prompts=[
    {"prompt":item["full_prompt"],"ann":[item["question"],1]}
    for item in dan
]

In [5]:
import os
sap_200_root = "../Jailbreak_attacks/attacks2test/SAP200/"

# Keep only sub-directories: a stray file in the dataset folder (e.g. .DS_Store)
# would otherwise be treated as a category and break the path built below.
# os.listdir returns entries in arbitrary order, so sort for a reproducible prompt order.
sap_200_categories = sorted(
    entry for entry in os.listdir(sap_200_root)
    if os.path.isdir(os.path.join(sap_200_root, entry))
)

# One record per generated attack, tagged with the category folder it came from.
# These records carry no "ann" field, unlike the DAN / DeepInception records.
sap_prompts=[]
for category in sap_200_categories:
    cases_path = os.path.join(sap_200_root, category, "generated_cases.json")
    # Context manager closes the file handle right after parsing.
    with open(cases_path, encoding="utf-8") as f:
        attack = json.load(f)
    for item in attack:
        sap_prompts.append({"category": category, "prompt": item["Attack Prompt"]})



In [14]:
import os
import json
from tqdm import tqdm
import numpy as np
# Maps the label strings the model is asked to predict to the integer ids stored in the output.
str2id={"malicious goal":1,"benign goal":0}
attacks={"DAN":dan_prompts,"SAP200":sap_prompts,"DeepInception":deep_prompts}

# Create the results directory up front: without it the first open(..., "w") below
# fails only after a whole dataset has already been run through the model.
os.makedirs("output",exist_ok=True)

for attack,data in attacks.items():
    output=[]
    print("Processing",attack)
    for item in tqdm(data):
        prompt=item["prompt"]
        # Labels are taken from str2id (same order as before) so the two cannot drift apart.
        answers=trained_model.predict_entities(prompt,labels=list(str2id),flat_ner=True,multi_label=False,threshold=0.10)
        if answers:
            # Keep only the highest-scoring span; on ties the first one wins, as with argmax.
            best_answer=max(answers,key=lambda x:x["score"])
            request=best_answer["text"]
            lbl=str2id[best_answer["label"]]
        else:
            # "[]" is the sentinel for "no span extracted"; such prompts get label 0.
            request="[]"
            lbl=0
        output.append({"prompt":item["prompt"],"user_request":request,"label":lbl})
    # One JSON file of predictions per attack family.
    with open(f"output/{attack}_JS.json","w") as f:
        json.dump(output,f,indent=2)

Processing DAN


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1560/1560 [01:57<00:00, 13.31it/s]


Processing SAP200


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1600/1600 [00:39<00:00, 40.89it/s]


Processing DeepInception


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 42.62it/s]


In [17]:
from rouge_score import rouge_scorer

# Module-level ROUGE-L scorer (stemming enabled) used by compute_metrics to compare spans.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def compute_metrics(predictions,gt,rouge_threshold=0.70):
    """Compute span-level precision, recall and F1 of predictions against annotations.

    Predictions are paired with ground-truth items by exact equality of their
    "prompt" field; every (ground-truth, prediction) pair with the same prompt
    is scored. Ground-truth items with no matching prediction are not counted.

    For each pair:
      * prediction span is the "[]" sentinel (nothing extracted) -> false negative;
      * ROUGE-L F-measure between the reference and predicted span is at least
        ``rouge_threshold`` and the labels are equal -> true positive;
      * anything else -> false positive.

    Args:
        predictions: iterable of dicts with keys "prompt", "user_request", "label".
        gt: iterable of dicts with keys "prompt" and "ann", where "ann" unpacks
            to ``(reference_span, reference_label)``.
        rouge_threshold: minimum ROUGE-L F-measure for a span to count as a match.

    Returns:
        Tuple ``(precision, recall, f1)`` of floats. Any metric whose denominator
        is zero is reported as 0.0 instead of raising ZeroDivisionError.
    """
    # Index predictions by prompt once instead of rescanning the whole list for
    # every ground-truth item; duplicates are kept so pairing stays unchanged.
    preds_by_prompt={}
    for pred_item in predictions:
        preds_by_prompt.setdefault(pred_item["prompt"],[]).append(pred_item)

    tp,fp,fn=0,0,0
    for real_item in gt:
        for pred_item in preds_by_prompt.get(real_item["prompt"],()):
            pred_span=pred_item["user_request"]
            pred_label=pred_item["label"]
            real_span,real_label=real_item["ann"]
            if pred_span=="[]":
                # Sentinel written when the model extracted no span at all.
                fn+=1
                continue
            rouge_score=scorer.score(real_span, pred_span)["rougeL"].fmeasure
            if rouge_score>=rouge_threshold and pred_label==real_label:
                tp+=1
            else:
                fp+=1

    # Guard every division: empty inputs, or runs with no extracted spans,
    # would otherwise raise ZeroDivisionError.
    p=tp/(tp+fp) if (tp+fp) else 0.0
    r=tp/(tp+fn) if (tp+fn) else 0.0
    f1=2*(p*r)/(p+r) if (p+r) else 0.0
    return p,r,f1

In [23]:
# Only the datasets whose records carry an "ann" reference are scored here;
# the SAP200 records are built without one.
DATASETS={"DAN":dan_prompts,"DeepInception":deep_prompts}
for name,dataset in DATASETS.items():
    print("Dataset",name)
    # Context manager closes the predictions file (json.load(open(...)) left it open).
    with open(f"output/{name}_JS.json") as f:
        predictions=json.load(f)
    p,r,f1=compute_metrics(predictions,dataset)
    print(f"Precision: {p*100:.2f} --Recall: {r*100:.2f} --F1: {f1*100:.2f}")
    print("*"*50)

Dataset DAN
Precision: 62.64 --Recall: 77.93 --F1: 69.46
**************************************************
Dataset DeepInception
Precision: 82.00 --Recall: 100.00 --F1: 90.11
**************************************************
