In [1]:
import os,sys,gc,json,warnings,random,torch
sys.path.append("./share")
sys.path.append("./common")
sys.path.append("./llama")
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import torch.nn.functional as F
from LLMCommon import CLLMCommon
from LLMDataset import CLLMDataset
from LLMFTRegression import CLLMFTRegression
from LLMFLClassify import CLLMFLClassify,evalulate_classify
from LLMModelClassify import CLLMModelClassify
from Config import g_data_root,g_embedding_shape,get_attack_score
from IoTSample import CIoTSample

# Root folders derived from the project-wide data root: tokenised text files are read
# from "<data root>token" and pooled embeddings are written under "<data root>embedding".
g_token_root = "%stoken" % (g_data_root,)
g_embedding_root = "%sembedding" % (g_data_root,)

# Globally silence every Python warning for the rest of the notebook session.
warnings.filterwarnings("ignore")

In [2]:
# Build the project's classification wrapper and load its HuggingFace weights.
# NOTE(review): the meaning of the constructor argument 0 is not visible here
# (possibly a device index) — confirm against CLLMModelClassify.
model = CLLMModelClassify(0)
model.load_huggingface()
# Put the wrapped model into evaluation mode; the trailing semicolon stops the
# notebook from echoing the value returned by eval().
model.m_model.eval();

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
def get_embedding(df_data,max_length = 640*1024):
    """Run the global ``model`` over the clean and noised texts of ``df_data``.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Must contain a ``'text'`` and a ``'noised'`` column; each is converted to a
        list and passed to ``model.predict`` one sample at a time (``batch_size=1``).
    max_length : int
        Forwarded unchanged to ``model.predict`` as ``max_length``.

    Returns
    -------
    pandas.DataFrame
        A deep copy of ``df_data`` with two extra columns, ``'token_embedding'`` and
        ``'noised_embedding'``, holding the prediction for the matching row.

    Raises
    ------
    ValueError
        If ``model.predict`` returns a different number of items than ``df_data``
        has rows (raised by the ``pandas.Series`` constructor).
    """
    embedding = model.predict(df_data['text'].tolist(),batch_size=1,max_length=max_length)
    noised = model.predict(df_data['noised'].tolist(),batch_size=1,max_length=max_length)
    df_ret = df_data.copy(deep = True)
    # Build each Series on the frame's own index. A bare pd.Series(...) gets a fresh
    # 0..n-1 index and column assignment aligns by label, so any frame whose index is
    # not exactly 0..n-1 would silently receive NaN / misplaced embeddings.
    df_ret['token_embedding'] = pd.Series(embedding,index=df_ret.index)
    df_ret['noised_embedding'] = pd.Series(noised,index=df_ret.index)
    return df_ret

In [5]:
class CLLMSample:
    """Create and store pooled LLM embeddings for the sample groups of one attack type.

    The constructor reads ``<g_token_root>/<attack>/index.csv``. The methods below use
    the index columns ``group``, ``Label``, ``score``, ``llm_token_file`` and
    ``llm_noised_file``. For every usable group the clean and noised texts are embedded
    with the module-level ``get_embedding`` helper, pooled to ``g_embedding_shape`` and
    written as JSON under ``<g_embedding_root>/attack/<attack>/`` and
    ``<g_embedding_root>/noised/<attack>/``.
    """

    def __init__(self,attack):
        """Remember the attack name and load its index CSV (first column is the index)."""
        self.m_attack = attack
        index_file = "%s/%s/index.csv"%(g_token_root,attack)
        self.m_df_index = pd.read_csv(index_file,index_col=0)

    def read_text(self, file_name):
        """Return the content of ``file_name`` wrapped in begin/end-of-text markers."""
        with open(file_name,'r') as fp:
            data = fp.read()
        return "<|begin_of_text|>"+data+"<|end_of_text|>"

    def resize_embedding(self,embedding1):
        """Average-pool one embedding to ``g_embedding_shape`` and return nested lists.

        ``embedding1`` is converted to a NumPy array and singleton dimensions are dropped.
        The result is pooled with ``adaptive_avg_pool2d`` after adding two leading
        dimensions, so it is presumably a 2-D matrix after squeezing — TODO confirm
        against ``model.predict``.
        """
        embedding = np.array(embedding1).squeeze()
        # Round trip through plain Python lists before rebuilding the array; presumably
        # this normalises nested/object arrays into one numeric array — TODO confirm.
        embedding = np.array(embedding.tolist())
        tensor = torch.tensor(embedding,dtype=torch.float32)
        pooled_tensor = F.adaptive_avg_pool2d(tensor.unsqueeze(0).unsqueeze(0), g_embedding_shape)
        return pooled_tensor.squeeze().numpy().tolist()

    def write_embedding(self,file_name,embedding):
        """Serialise ``embedding`` as JSON into ``file_name`` (overwriting it)."""
        with open(file_name,"w") as fp1:
            fp1.write(json.dumps(embedding))

    def get_group_text(self,group):
        """Load the score and the clean/noised texts of one group.

        Returns
        -------
        tuple
            ``(score, llm_text, noised_text, llm_token_file, llm_noised_file)``.
            All five entries are ``None`` when the group has no rows in the index or
            when its rows disagree on label, score or file names; the reason is printed.
        """
        # Failure result has the same arity as the success result so that callers
        # unpacking five values never hit a ValueError.
        failure = (None, None, None, None, None)

        df_tmp = self.m_df_index[self.m_df_index['group'] == group]
        if df_tmp.empty:
            # get_group_batch enumerates every id from 0 to max(group), so ids that are
            # absent from the index can be requested; iloc[0] below would raise on them.
            print("There is no sample in Group %s"%group)
            return failure

        # Every row of a group must carry the same label, score and file names.
        consistency_checks = (
            ('Label', "There are too much Labels in one Group"),
            ('score', "There are too much score in one Group"),
            ('llm_token_file', "There are too much llm_token_file in one Group"),
            ('llm_noised_file', "There are too much llm_noised_file in one Group"),
        )
        for column, message in consistency_checks:
            if df_tmp[column].nunique() > 1:
                print(message)
                return failure

        first_row = df_tmp.iloc[0]
        raw_score = first_row['score']
        raw_labels = json.loads(first_row['Label'])
        # The score is recomputed from the labels; the stored one is only cross-checked.
        score = get_attack_score(pd.Series(raw_labels))
        llm_token_file = first_row['llm_token_file']
        llm_noised_file = first_row['llm_noised_file']

        # Compare at 7 decimals; a mismatch is reported but does not abort.
        if round(raw_score, 7) != round(score, 7):
            display(raw_score,score)
            print("Algorithm is rereshed continue...")

        llm_text = self.read_text(llm_token_file)
        noised_text = self.read_text(llm_noised_file)
        return score, llm_text, noised_text,llm_token_file,llm_noised_file

    def get_batch_text(self,group_list):
        """Collect texts and output file paths for every usable group in ``group_list``.

        Groups for which ``get_group_text`` reports a failure are skipped. The two
        output directories for this attack are created if they do not exist.

        Returns
        -------
        pandas.DataFrame
            One row per usable group with the columns ``score``, ``text``, ``noised``,
            ``llm_token_file``, ``llm_noised_file``, ``token_embedding_file`` and
            ``noised_embedding_file``. The columns are present even when no group
            was usable.
        """
        columns = ['score','text','noised','llm_token_file','llm_noised_file',
                   'token_embedding_file','noised_embedding_file']

        # The output folders depend only on the attack, so create them once per call.
        token_embedding_path = "%s/attack/%s/"%(g_embedding_root,self.m_attack)
        noised_embedding_path = "%s/noised/%s/"%(g_embedding_root,self.m_attack)
        os.makedirs(token_embedding_path, exist_ok=True)
        os.makedirs(noised_embedding_path, exist_ok=True)

        all_data =  []
        for group in group_list:
            score, llm_text, noised_text,llm_token_file,llm_noised_file = self.get_group_text(group)
            if llm_token_file is None:
                # Missing or inconsistent group; get_group_text already printed why.
                continue
            # Output files reuse the token file's base name, prefixed with the group id.
            out_name = "G%d-%s"%(group,os.path.basename(llm_token_file))
            all_data.append({
                'score': score,
                'text': llm_text,
                'noised': noised_text,
                'llm_token_file': llm_token_file,
                'llm_noised_file': llm_noised_file,
                'token_embedding_file': os.path.join(token_embedding_path,out_name),
                'noised_embedding_file': os.path.join(noised_embedding_path,out_name),
            })

        return pd.DataFrame(all_data,columns=columns)

    def get_group_batch(self,step = 100 ):
        """Split the id range ``0 .. max(group)`` into half-open ``[start, stop]`` pairs.

        Each pair spans at most ``step`` ids. Returns an empty list when the index
        contains no groups.
        """
        groups = self.m_df_index['group'].unique().tolist()
        if not groups:
            return []
        stop = max(groups) + 1
        return [[start, min(start + step, stop)] for start in range(0, stop, step)]

    def Create_Embedding(self,batch_size = 5,max_samples = 2000):
        """Embed every group and write the pooled embeddings to disk.

        Parameters
        ----------
        batch_size : int
            Number of consecutive group ids handled per batch.
        max_samples : int or None
            Processing stops after the first batch that brings the number of written
            samples above this value; ``None`` disables the limit.
        """
        batch_range = self.get_group_batch(batch_size)
        count = 0
        for r in tqdm(batch_range):
            group_list = list(range(r[0],r[1]))
            print("Embedding",group_list)
            df_text = self.get_batch_text(group_list)
            if df_text.empty:
                # Nothing usable in this id range; avoid calling the model with no input.
                continue
            df_embedding = get_embedding(df_text)
            for _,row in df_embedding.iterrows():
                token_embedding = self.resize_embedding(row['token_embedding'])
                noised_embedding = self.resize_embedding(row['noised_embedding'])
                self.write_embedding(row['token_embedding_file'],token_embedding)
                self.write_embedding(row['noised_embedding_file'],noised_embedding)
                count = count + 1
            # The cap is checked once per batch, so the final batch may overshoot it.
            if max_samples is not None and count > max_samples:
                break

In [None]:
# Attack type to process; the string is used verbatim as the sub-directory name under
# the token root (to locate index.csv) and under the embedding output folders.
attack = 'Port Scanning attack'
test = CLLMSample(attack)
# Long-running: embeds the groups in batches of 5 group ids (the default batch_size)
# and stops after the first batch that pushes the written-sample count above 2000.
test.Create_Embedding()

  0%|          | 0/4325 [00:00<?, ?it/s]

Embedding [0, 1, 2, 3, 4]
Embedding [5, 6, 7, 8, 9]
Embedding [10, 11, 12, 13, 14]
Embedding [15, 16, 17, 18, 19]
Embedding [20, 21, 22, 23, 24]
Embedding [25, 26, 27, 28, 29]
Embedding [30, 31, 32, 33, 34]
Embedding [35, 36, 37, 38, 39]
Embedding [40, 41, 42, 43, 44]
Embedding [45, 46, 47, 48, 49]
Embedding [50, 51, 52, 53, 54]
Embedding [55, 56, 57, 58, 59]
Embedding [60, 61, 62, 63, 64]
Embedding [65, 66, 67, 68, 69]
Embedding [70, 71, 72, 73, 74]
Embedding [75, 76, 77, 78, 79]
Embedding [80, 81, 82, 83, 84]
Embedding [85, 86, 87, 88, 89]


In [None]:
# NOTE(review): in the visible notebook `df_embedding` is only assigned as a local
# variable inside CLLMSample.Create_Embedding, never at notebook scope, so this cell
# would raise NameError on a fresh kernel unless another cell defines it — confirm.
df_embedding