In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import matplotlib.pyplot as plt
import seaborn as sns

# PEFT imports
from peft import get_peft_model, LoraConfig, TaskType

# bitsandbytes import
import bitsandbytes as bnb

2024-05-23 18:29:20.036931: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-23 18:29:20.081603: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  torch.utils._pytree._register_pytree_node(


In [2]:
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

print(f'There are {torch.cuda.device_count()} GPU(s) available.')
print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: NVIDIA RTX A5000


In [3]:
from sklearn.utils import shuffle

benign_data = pd.read_csv("/home/vikrant/Desktop/Thesis/Thesis_Projects/URL_detection/Phishtank_UNB/Benign.csv")

malicious_data = pd.read_csv("/home/vikrant/Desktop/Thesis/Thesis_Projects/URL_detection/Phishtank_UNB/Malicious.csv")

df1 = pd.DataFrame(benign_data)
df2 = pd.DataFrame(malicious_data)

x = df1.sample(25000)
y = df2.sample(25000)

data = pd.concat([x,y], axis=0)
data = shuffle(data)
data

Unnamed: 0,url,label
39402,latex.silmaril.ie/formattinginformation/,0
21928,www.yukudr.com/mp3player/,0
33548,www.users.zetnet.co.uk/logs/Genesis/terrsky.htm,0
5427,fun-dive.com/gps/?check=1i56456650.html,1
10927,www.godziecin.brzegdolny.pl//wp-content/upload...,1
...,...,...
6781,ricajupanu.go.ro/index.html,1
29892,www.appdev.com/visual_basic_training.asp,0
1705,dunningphoto.com/rebuilt.html,0
14655,www.asus.com/Display/,0


In [4]:
sentences = data['url'].values
labels = data['label'].values

In [5]:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('NousResearch/Nous-Hermes-llama-2-7b') 




In [6]:
# Tokenize the sentences
input_ids = []
attention_masks = []

max_length = 128  # Adjusted sequence length for better performance

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_length,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [7]:
# Split the data into 80% training and 20% test set
train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_ids, labels, 
                                                                        random_state=2018, test_size=0.2)
train_masks, test_masks, _, _ = train_test_split(attention_masks, attention_masks,
                                                 random_state=2018, test_size=0.2)

# Further split the training data into 75% training and 25% validation
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(train_inputs, train_labels, 
                                                                                    random_state=2018, test_size=0.25)
train_masks, validation_masks, _, _ = train_test_split(train_masks, train_masks,
                                                       random_state=2018, test_size=0.25)

# Create the DataLoader for training, validation, and test sets
batch_size_train = 4
batch_size_val = 4

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size_train)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size_val)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size_val)


In [8]:
# Load the pre-trained model
base_model = AutoModelForSequenceClassification.from_pretrained(
    "NousResearch/Nous-Hermes-llama-2-7b",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at NousResearch/Nous-Hermes-llama-2-7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=2,
    lora_alpha=4,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"]  # Target modules need to be specified based on the model architecture
)

In [10]:
# Apply LoRA to the model
model = get_peft_model(base_model, lora_config)
model.to(device)

OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 