In [1]:
from datasets import load_dataset, Dataset
from dotenv import load_dotenv
from os import getenv
from tqdm import tqdm

# NOTE: the bare expression `Dataset.cleanup_cache_files` that used to follow
# the first import only referenced the method without calling it, so it never
# cleaned anything. It has been dropped; no cache cleanup is performed here.

# Read environment variables from a local .env file (if any) so the Hugging
# Face token does not have to be hard-coded in the notebook.
load_dotenv()
HUGGING_FACE_READ_TOKEN = getenv("HUGGING_FACE_READ_TOKEN")

# Stream the test split instead of materialising it locally.
dataset = load_dataset("Jetlime/NF-UNSW-NB15-v2", streaming=True, split="test")
dataset

IterableDataset({
    features: ['input', 'output', 'Attack'],
    n_shards: 1
})

In [3]:
# Names of the classes declared by the dataset's "output" feature.
label_feature = dataset.features["output"]
classes = label_feature.names
classes

['0', '1']

In [4]:
import torch
import transformers

# Ask PyTorch to release cached GPU memory before the model is loaded.
torch.cuda.empty_cache()

# An instruction-tuned variant of Llama 3 is used because the foundation
# model had difficulty answering in the required format. That is expected:
# foundation models are trained only to predict the next words of a
# sequence, not to follow instructions.
# NOTE(review): the identifier below has no organisation prefix, so it is
# presumably resolved as a local checkpoint directory - confirm.
model_id = "OrpoLlama-3-8B"

pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    token=HUGGING_FACE_READ_TOKEN,
    device="cuda",
    # Load the weights in bfloat16.
    model_kwargs={"torch_dtype": torch.bfloat16},
    # NOTE(review): 50256 is the GPT-2 end-of-text id; confirm it is the
    # intended padding id for this checkpoint's vocabulary.
    pad_token_id=50256,
)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
def classification_pipeline(netflow):
    """Ask the text-generation pipeline to classify one network flow.

    Args:
        netflow: Textual description of a single flow ("FIELD: value, ..."),
            sent as the content of the second chat message.

    Returns:
        The text the model generated after the prompt. The instructions ask
        for '0' (benign) or '1' (malicious), but the raw completion is
        returned without any validation or stripping.
    """
    # NOTE(review): several field names in the worked example below look
    # truncated (e.g. "IN_YTES", "TCP_AGS") - confirm they match the field
    # names actually present in the dataset's "input" column.
    task_description = "You are a cybersecurity expert tasked with classifying network flows as either malicious or benign. If you determine the network flow is benign, respond with '0'. If you determine the network flow is malicious, respond with '1'. For example, if given the following network flow: 'IPV4_SRC_ADDR: 59.166.0.7, L4_SRC_PORT: 53030, IPV4_DST_ADDR: 149.171.126.7, L4_DST_PORT: 44287, PROTOCOL: 6, L7_PROTO: 0.0, IN_YTES: 8928, IN_TS: 14, OUT_TES: 320, OUT_S: 6, TCP_AGS: 27, CLIENT_CP_AGS: 27, SERVER_CP_LAGS: 19, FLOW_URATION_ILLISECONDS: 0, DURATION_N: 0, DURATION_UT: 0, MIN_L: 31, MAX_L: 32, LONGEST_OW_T: 1500, SHORTEST_OW_: 52, MIN__PKT_N: 52, MAX__T_: 1500, SRC__T_ECOND_YTES: 8928.0, DST_O_C_COND_TES: 320.0, RETRANSMITTED__TES: 4252, RETRANSMITTED__TS: 3, RETRANSMITTED_T_TES: 0, RETRANSMITTED_T_TS: 0, SRC__DST_VG_ROUGHPUT: 71424000, DST__RC_G_ROUGHPUT: 2560000, NUM_TS___28_TES: 14, NUM_TS_8__6_YTES: 0, NUM_KTS_6__2_TES: 0, NUM_TS_2__24_TES: 0, NUM_TS_24__14_TES: 6, TCP_WIN_MAX_N: 5792, TCP_N_X_T: 10136, ICMP_PE: 39936, ICMP_PV4_TYPE: 156, DNS_QUERY_ID: 0, DNS_QUERY_TYPE: 0, DNS_TTL_ANSWER: 0, FTP_COMMAND_RET_CODE: 0.0' and you assess it as benign, you would respond with '0'. If you assess it as malicious, you would respond with '1'. You are only allowed to respond with '0' or '1'. If requested, provide an explanation for your classification, detailing the reasoning and which feature values influenced your decision."

    # NOTE(review): "instruction" / "input" are not the usual
    # "system" / "user" chat roles - presumably they match how this
    # checkpoint was trained; verify before changing.
    messages = [
        {"role": "instruction", "content": task_description},
        {"role": "input", "content": netflow},
    ]

    tokenizer = pipeline.tokenizer

    # Render the chat as a plain string that ends with the generation prompt.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Generation stops on either the tokenizer's EOS id or "<|eot_id|>".
    stop_token_ids = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    # Sampling is enabled with a very low temperature, so repeated calls are
    # not strictly guaranteed to give the same answer.
    generations = pipeline(
        prompt,
        max_new_tokens=100,
        eos_token_id=stop_token_ids,
        do_sample=True,
        temperature=0.01,
        top_p=0.9,
    )

    # Drop the leading prompt characters so only the completion is returned.
    completion = generations[0]["generated_text"]
    return completion[len(prompt):]

# Smoke test on a single hand-written flow; the returned text is the cell output.
classification_pipeline("IPV4_SRC_ADDR: 149.171.126.0, L4_SRC_PORT: 62073, IPV4_DST_ADDR: 59.166.0.5, L4_DST_PORT: 56082, PROTOCOL: 6, L7_PROTO: 0.0, IN_BYTES: 9672, OUT_BYTES: 416, IN_PKTS: 11, OUT_PKTS: 8, TCP_FLAGS: 25, FLOW_DURATION_MILLISECONDS: 15")

'1'

In [6]:
# Keep only the first 100 streamed examples; the inference loop below passes
# total=100 to tqdm to match this size.
dataset = dataset.take(100)

In [7]:
import time
from numpy import mean, array
from scipy.stats import sem

# Raw model answers (strings) and per-example latencies in seconds,
# in dataset iteration order.
prediction_labels = []
inference_times = []

for sample in tqdm(dataset, total=100):
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock time and can jump if the system
    # clock is adjusted while the loop is running.
    start_time = time.perf_counter()
    prediction = classification_pipeline(sample['input'])
    inference_time = time.perf_counter() - start_time

    prediction_labels.append(prediction)
    inference_times.append(inference_time)

# Convert to a numpy array for the statistical operations below.
inference_times = array(inference_times)

# Mean latency per example, in seconds.
mean_inference_time = mean(inference_times)

# Standard error of the mean (SEM) of the latencies.
standard_error = sem(inference_times)

  9%|▉         | 9/100 [00:03<00:28,  3.14it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 100/100 [00:30<00:00,  3.23it/s]


In [11]:
# Display the mean per-example inference time (seconds) computed in the timing cell.
mean_inference_time

0.2967293405532837

In [12]:
# Display the standard error of the mean of the per-example inference times (seconds).
standard_error

0.0004772357772076936

: 

In [9]:
# Ground-truth labels, collected by iterating the same dataset object that
# was used for inference.
# `total` mirrors the 100-example subset taken for the inference run so the
# progress bar can actually reach 100%.
true_labels = [example["output"] for example in tqdm(dataset, total=100)]

  0%|          | 100/120000 [00:01<22:46, 87.76it/s] 


In [10]:
from sklearn.metrics import classification_report

# The pipeline returns text; convert each answer to an int so it can be
# compared with the ground-truth labels. int() raises ValueError if the
# model answered with anything other than a number.
prediction_labels = [int(item) for item in prediction_labels]

target_names = ['benign', 'malicious']

# Pin the label set explicitly. Without `labels`, scikit-learn infers the
# classes from the data and raises ValueError when one class is missing from
# both lists (its count no longer matches the two `target_names`), which is
# easy to hit on a small, imbalanced sample.
print(classification_report(
    true_labels,
    prediction_labels,
    labels=[0, 1],
    digits=4,
    target_names=target_names,
))

              precision    recall  f1-score   support

      benign     0.9688    0.3229    0.4844        96
   malicious     0.0441    0.7500    0.0833         4

    accuracy                         0.3400       100
   macro avg     0.5064    0.5365    0.2839       100
weighted avg     0.9318    0.3400    0.4683       100



In [None]:
def compute_confusion_matrix_indices(predictions, true_labels):
    """Group example positions by confusion-matrix cell for binary labels.

    Args:
        predictions: Iterable of predicted labels (0 or 1).
        true_labels: Iterable of ground-truth labels (0 or 1), aligned with
            `predictions`.

    Returns:
        A tuple `(TN, FN, FP, TP)` of lists holding the positions that fall
        in each cell, in increasing order.

    Pairs in which either value is neither 0 nor 1 are left out of every
    list. If the two inputs differ in length, only the positions covered by
    the shorter one are examined.
    """
    true_neg, false_neg, false_pos, true_pos = [], [], [], []

    for position, (predicted, actual) in enumerate(zip(predictions, true_labels)):
        if predicted == 0:
            # Predicted benign: correct if the truth is 0, a miss if it is 1.
            if actual == 0:
                true_neg.append(position)
            elif actual == 1:
                false_neg.append(position)
        elif predicted == 1:
            # Predicted malicious: a false alarm if the truth is 0, a hit if it is 1.
            if actual == 0:
                false_pos.append(position)
            elif actual == 1:
                true_pos.append(position)

    return true_neg, false_neg, false_pos, true_pos

# Split the evaluated examples into the four confusion-matrix cells.
confusion_cells = compute_confusion_matrix_indices(prediction_labels, true_labels)
TN_indices, FN_indices, FP_indices, TP_indices = confusion_cells

# Print the positions in each cell, one line per cell.
for heading, cell_indices in (
    ("TN indices:", TN_indices),
    ("FN indices:", FN_indices),
    ("FP indices:", FP_indices),
    ("TP indices:", TP_indices),
):
    print(heading, cell_indices)