# 0-shot Learning Binary Classification

In this script, we investigate the usage of 0-shot learning on the *UNSW-NB15* dataset using various preprocessing techniques.

# Key-Value pairs Text Encoding

In [2]:
import pandas as pd
from dotenv import load_dotenv
from os import getenv

df = pd.read_csv("./data/NF-UNSW-NB15_encoded.csv", index_col=None)
df = df.head(10000)

load_dotenv()
HUGGING_FACE_READ_TOKEN = getenv("HUGGING_FACE_READ_TOKEN")

FileNotFoundError: [Errno 2] No such file or directory: './data/NF-UNSW-NB15_encoded.csv'

In [2]:
import transformers
import torch
torch.cuda.empty_cache()

# We choose the instruction version of Llama 3 as the foundational
# model showed difficulties to answer in the required format.
# This is an expected behavior as these models were not trained to
# understand instructions but simply to predict the sequence of words.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cuda",
    token=HUGGING_FACE_READ_TOKEN
)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.81it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
def classification_pipeline(netflow):

    messages = [
        {"role": "instruction", "content": "You are a Cybersecurity expert which will be asked to classify network flows as malicious or benign. If you think the network flow is benign, answer 0. If you believe the network flow is mailicious, answer 1. For example if I say: IPV4_SRC_ADDR: 149.171.126.0, L4_SRC_PORT: 62073, IPV4_DST_ADDR: 59.166.0.5, L4_DST_PORT: 56082, PROTOCOL: 6, L7_PROTO: 0.0, IN_BYTES: 9672, OUT_BYTES: 416, IN_PKTS: 11, OUT_PKTS: 8, TCP_FLAGS: 25, FLOW_DURATION_MILLISECONDS: 15 and the flow is benign, you output 0. If it is malicious you output 1. You are not allowed to say anything else besides the number 1 or 0."},
        {"role": "input", "content": netflow},
    ]

    prompt = pipeline.tokenizer.apply_chat_template(
            messages, 
            tokenize=False, 
            add_generation_prompt=True
    )

    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = pipeline(
        prompt,
        max_new_tokens=100,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.01,
        top_p=0.9,
    )

    return outputs[0]["generated_text"][len(prompt):]

classification_pipeline("IPV4_SRC_ADDR: 149.171.126.0, L4_SRC_PORT: 62073, IPV4_DST_ADDR: 59.166.0.5, L4_DST_PORT: 56082, PROTOCOL: 6, L7_PROTO: 0.0, IN_BYTES: 9672, OUT_BYTES: 416, IN_PKTS: 11, OUT_PKTS: 8, TCP_FLAGS: 25, FLOW_DURATION_MILLISECONDS: 15")

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


'0'

In [4]:
prediction_labels = []

for index, row in df.iterrows():
    prediction_labels.append(classification_pipeline(row['input']))

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_to

In [6]:
from sklearn.metrics import classification_report


true_labels = df["output"].to_list()
prediction_labels = [int(item) for item in prediction_labels]

target_names = ['benign', 'malicious']

print(classification_report(true_labels, prediction_labels, target_names=target_names))

              precision    recall  f1-score   support

      benign       0.95      0.96      0.96      9452
   malicious       0.22      0.19      0.20       548

    accuracy                           0.92     10000
   macro avg       0.59      0.58      0.58     10000
weighted avg       0.91      0.92      0.92     10000



: 