In [11]:
pip install tqdm

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.comNote: you may need to restart the kernel to use updated packages.



In [1]:
!pip install -q transformers accelerate peft bitsandbytes datasets


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig


In [5]:
model_name = "microsoft/phi-2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   4%|3         | 178M/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:  46%|####6     | 262M/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [6]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)


In [8]:
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 588
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 66
    })
})


In [13]:
dataset = load_dataset("csv", data_files="C:/Users/alpha/Downloads/mit.csv")  # or "json"
dataset = dataset["train"].train_test_split(test_size=0.1)

tokenizer.pad_token = tokenizer.eos_token

# 🔁 Now you're safe to tokenize with padding
def format_and_tokenize(example):
    prompt = f"### Question:\n{example['question']}\n\n### Answer:\n{example['answer']}"
    tokens = tokenizer(prompt, truncation=True, padding="max_length", max_length=512)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_dataset = dataset.map(format_and_tokenize)

Map:   0%|          | 0/588 [00:00<?, ? examples/s]

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

In [14]:
training_args = TrainingArguments(
    output_dir="LEVI",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=1,
    fp16=True,
    report_to="none",
    dataloader_num_workers=2
)


In [15]:
from transformers import Trainer, DataCollatorForLanguageModeling

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [16]:
trainer.train()


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,3.058
20,2.801
30,2.6073
40,2.4066
50,2.2737
60,2.2024
70,2.1688
80,2.1559
90,2.0654
100,2.0112


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=219, training_loss=2.1682404208945356, metrics={'train_runtime': 2747.085, 'train_samples_per_second': 0.642, 'train_steps_per_second': 0.08, 'total_flos': 1.422522939604992e+16, 'train_loss': 2.1682404208945356, 'epoch': 2.965986394557823})

In [17]:
import os
os.getcwd()


'C:\\Users\\alpha\\anaconda3\\envs\\cyber'

In [18]:
trainer.save_model("cyai.v1")
tokenizer.save_pretrained("cyai.v1")


('cyai.v1\\tokenizer_config.json',
 'cyai.v1\\special_tokens_map.json',
 'cyai.v1\\vocab.json',
 'cyai.v1\\merges.txt',
 'cyai.v1\\added_tokens.json',
 'cyai.v1\\tokenizer.json')

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained("cyai.v1")
tokenizer = AutoTokenizer.from_pretrained("cyai.v1")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
import torch

def ask_question(question, max_length=100):
    inputs = tokenizer(question, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )
    
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer


In [5]:
response = ask_question("What is input capture")
print(response)


What is input capture?
The goal of input capture is to intercept and analyze data entered by the user through a keyboard, mouse, or other input device. This information can then be used for various purposes, such as detecting malicious activity or gathering intelligence.

What are the different types of input capture?
There are several different types of input capture, including:

- Network: This involves capturing data that is transmitted over a network, such as through a web browser or email.



In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig


In [15]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained("cyai.v1")
model = AutoModelForCausalLM.from_pretrained("cyai.v1",quantization_config=bnb_config)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)


In [17]:
dataset = load_dataset("csv", data_files="C:/Users/alpha/Downloads/data web/cydata1.csv")  # or "json"
dataset = dataset["train"].train_test_split(test_size=0.1)

tokenizer.pad_token = tokenizer.eos_token

# 🔁 Now you're safe to tokenize with padding
def format_and_tokenize(example):
    prompt = f"### Question:\n{example['INSTRUCTION']}\n\n### Answer:\n{example['RESPONSE']}"
    tokens = tokenizer(prompt, truncation=True, padding="max_length", max_length=512)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_dataset = dataset.map(format_and_tokenize)

Map:   0%|          | 0/630 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

In [18]:
training_args = TrainingArguments(
    output_dir="LEVI",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=1,
    fp16=True,
    report_to="none",
    dataloader_num_workers=2
)


In [19]:
from transformers import Trainer, DataCollatorForLanguageModeling

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [20]:
trainer.train()


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,1.5943
20,1.5076
30,1.326
40,1.3086
50,1.221
60,1.1699
70,1.2186
80,1.1788
90,1.1909
100,1.1895


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=234, training_loss=1.1973561507004957, metrics={'train_runtime': 3247.7896, 'train_samples_per_second': 0.582, 'train_steps_per_second': 0.072, 'total_flos': 1.523665625677824e+16, 'train_loss': 1.1973561507004957, 'epoch': 2.965079365079365})

In [21]:
trainer.save_model("cyai.v2")
tokenizer.save_pretrained("cyai.v2")


('cyai.v2\\tokenizer_config.json',
 'cyai.v2\\special_tokens_map.json',
 'cyai.v2\\vocab.json',
 'cyai.v2\\merges.txt',
 'cyai.v2\\added_tokens.json',
 'cyai.v2\\tokenizer.json')

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained("cyai.v2")
tokenizer = AutoTokenizer.from_pretrained("cyai.v2")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import torch

def ask_question(question, max_length=100):
    inputs = tokenizer(question, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )
    
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)cdm
    return answer


In [1]:
response = ask_question("what is malware analysis")
print(response)


NameError: name 'ask_question' is not defined