<a href="https://colab.research.google.com/github/kpenzo/bert_transformers/blob/main/cve_bert_TRANSFORMERS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Dataset: https://www.kaggle.com/datasets/andrewkronser/cve-common-vulnerabilities-and-exposures

## KMeans: https://realpython.com/k-means-clustering-python/

In [None]:
!nvidia-smi

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
!ls "/content/sample_data/archive.zip"
!unzip "/content/sample_data/archive.zip"

In [None]:
!pip show accelerate
!pip install accelerate -U

In [4]:
import numpy as np
import pandas as pd

In [5]:
cve = pd.read_csv('/content/cve.csv')

In [None]:
!pip show transformers

In [None]:
!pip install transformers datasets evaluate

In [8]:
from transformers import pipeline

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [10]:
def preprocess_function(example):
    """Tokenize one piece of text with the notebook-level ``tokenizer``.

    Args:
        example: The text handed to the tokenizer.

    Returns:
        Whatever ``tokenizer`` returns for ``example`` when called with
        ``truncation=True``.
    """
    return tokenizer(example, truncation=True)

In [11]:
X = cve['summary'].tolist()
y = cve['cwe_name'].tolist()

In [12]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [13]:
cve['cwe_name_code'] = label_encoder.fit_transform(cve['cwe_name'])

In [14]:
y_numeric = cve['cwe_name_code'].tolist()

In [15]:
# One tokenizer output per summary, in the same order as X.
tokenized_cve = [preprocess_function(summary) for summary in X]

In [None]:
print(tokenized_cve[0].keys())

# Column-wise accumulators: one entry per CVE record.
tokenized_cve_input_ids_as_list = []
tokenized_cve_attention_mask_as_list = []
tokenized_cve_text_as_list = []
tokenized_cve_label_as_list = []

for row, encoding in enumerate(tokenized_cve):
    # Attach the raw summary and its numeric label to the encoding itself.
    encoding['text'] = X[row]
    encoding['label'] = y_numeric[row]

    tokenized_cve_input_ids_as_list.append(encoding['input_ids'])
    tokenized_cve_attention_mask_as_list.append(encoding['attention_mask'])
    tokenized_cve_text_as_list.append(encoding['text'])
    tokenized_cve_label_as_list.append(encoding['label'])

In [None]:
max(y_numeric)

In [None]:
from datasets import Dataset
import pyarrow as pa

# Gather the per-record columns, then go DataFrame -> Arrow table -> Dataset.
d = dict(
    text=tokenized_cve_text_as_list,
    label=tokenized_cve_label_as_list,
    input_ids=tokenized_cve_input_ids_as_list,
    attention_mask=tokenized_cve_attention_mask_as_list,
)
df = pd.DataFrame(data=d)

pa_arrow = pa.Table.from_pandas(df)
tokenized_cve_ds = Dataset(pa_arrow)

print(tokenized_cve_ds)

In [19]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate
accuracy = evaluate.load("accuracy")

In [21]:
import numpy as np

def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the ``accuracy`` metric.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``.

    Returns:
        The result of ``accuracy.compute`` on the arg-max (along axis 1) of
        the predictions, compared against ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [22]:
# Build the id <-> label maps that are handed to the model config.
#
# The fitted LabelEncoder already stores every distinct CWE name in
# `classes_`, ordered by the integer code it assigned, so the maps can be read
# straight from it instead of walking every row of the DataFrame.
cwe_class_names = label_encoder.classes_.tolist()

id2label = dict(enumerate(cwe_class_names))
label2id = {name: code for code, name in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Size the classification head from the label map rather than a hard-coded
# count, so the two cannot drift apart if the dataset's set of CWE names changes.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
)

In [None]:
!pip install --upgrade transformers

In [39]:
from transformers import TrainingArguments

In [None]:
import dataclasses

# Newer transformers releases name the evaluation schedule option
# `eval_strategy`; older ones only know `evaluation_strategy`. Use whichever
# field the installed TrainingArguments dataclass actually defines, so the cell
# works both before and after the `pip install --upgrade transformers` above.
_training_arg_names = {f.name for f in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same step schedule; this is required for
    # load_best_model_at_end to be able to pick a checkpoint.
    save_strategy="steps",
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    do_eval=True,
    save_total_limit=1,
    load_best_model_at_end=True,
    push_to_hub=False,
    **{_eval_strategy_key: "steps"},
)



In [None]:
# Hold out part of the data for evaluation. Evaluating on the very examples the
# model is trained on makes the reported accuracy (and the checkpoint chosen by
# load_best_model_at_end) say nothing about generalisation.
# The fixed seed keeps the split reproducible across runs.
cve_splits = tokenized_cve_ds.train_test_split(test_size=0.2, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=cve_splits["train"],
    eval_dataset=cve_splits["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
print(type(model))
print(type(tokenizer))

In [None]:
# Write the current model and tokenizer to disk in two separate directories.
model.save_pretrained('/content/sample_data/model_trained_04_08')
tokenizer.save_pretrained('/content/sample_data/tokenizer_trained_04_08')

In [None]:
# Load the tokenizer and model back from the directories written by the save cell,
# under new names so they can be compared with the in-memory originals.
tokenizer_2 = AutoTokenizer.from_pretrained('/content/sample_data/tokenizer_trained_04_08')
model_2 = AutoModelForSequenceClassification.from_pretrained('/content/sample_data/model_trained_04_08')

In [None]:
print(type(model_2))
print(type(tokenizer_2))

In [None]:
# inference
text = "'Zoom for Windows clients before version 5.13.3, Zoom Rooms for Windows clients before version 5.13.5 and Zoom VDI for Windows clients before 5.13.1 contain an information disclosure vulnerability. A recent update to the Microsoft Edge WebView2 runtime used by the affected Zoom clients, transmitted text to Microsoft&#8217;s online Spellcheck service instead of the local Windows Spellcheck. Updating Zoom remediates this vulnerability by disabling the feature. Updating Microsoft Edge WebView2 Runtime to at least version 109.0.1481.0 and restarting Zoom remediates this vulnerability by updating Microsoft&#8217;s telemetry behavior."

In [None]:
# Truncate exactly as preprocess_function does for the training data, so an
# over-long description cannot exceed the model's maximum sequence length.
inputs = tokenizer(text, truncation=True, return_tensors="pt")

In [None]:
# Move the inputs to wherever the model currently lives instead of assuming a
# GPU, so the cell also works on a CPU-only runtime.
inputs.to(model.device)

In [None]:
import torch

# Switch to evaluation mode so dropout is disabled and the prediction is
# deterministic (the model may still be in training mode after trainer.train()).
model.eval()

# No gradients are needed for a forward-only pass.
with torch.no_grad():
    logits = model(**inputs).logits

In [None]:
print(logits)

In [None]:
# Index of the largest logit, then its human-readable CWE name.
predicted_class_id = int(logits.argmax())
model.config.id2label[predicted_class_id]

In [None]:
# Truncate exactly as preprocess_function does for the training data, so an
# over-long description cannot exceed the model's maximum sequence length.
inputs = tokenizer_2(text, truncation=True, return_tensors="pt")

In [None]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU, and keep
# the inputs and the reloaded model on the same device.
inference_device = "cuda" if torch.cuda.is_available() else "cpu"
inputs.to(inference_device)
model_2.to(inference_device)

In [None]:
import torch

# Forward pass through the reloaded model without tracking gradients.
with torch.no_grad():
    outputs = model_2(**inputs)
    logits = outputs.logits

In [None]:
# Decode with the reloaded model's own config: these logits come from model_2,
# and `model` does not exist in a session that only reloads from disk.
predicted_class_id = logits.argmax().item()
model_2.config.id2label[predicted_class_id]

In [None]:
# NOTE(review): `wmic` is a Windows command-line tool; the next cell uses
# `uname` / `lsb_release`, so this notebook presumably runs on Linux where this
# command is not available — confirm whether this cell is still needed.
!wmic

In [None]:
!uname -r
!lsb_release -a