<a href="https://colab.research.google.com/github/himanshudhami/BuiltbyChatGPT/blob/main/Networkdevices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random
from faker import Faker
import pandas as pd

# Generate synthetic data
faker = Faker()

device_types = ["router", "switch", "firewall"]
statuses = ["up", "down"]
security_statuses = ["secure", "insecure"]

def generate_device_data(num_devices=1000):
    devices = []

    for _ in range(num_devices):
        device = {
            "device_id": faker.uuid4(),
            "device_type": random.choice(device_types),
            "ip_address": faker.ipv4(),
            "status": random.choice(statuses),
            "security_status": random.choice(security_statuses),
            "last_updated": faker.date_time_this_year()
        }
        devices.append(device)
    
    return devices

# Generate 1000 devices
device_data = generate_device_data(1000)

# Convert to DataFrame
device_df = pd.DataFrame(device_data)

# Feature extraction
def extract_features(device_df):
    # One-hot encode device types
    device_type_dummies = pd.get_dummies(device_df["device_type"], prefix="device_type")

    # Convert status and security status to binary values
    device_df["status"] = device_df["status"].map({"up": 1, "down": 0})
    device_df["security_status"] = device_df["security_status"].map({"secure": 1, "insecure": 0})

    # Combine original dataframe with one-hot encoded device types
    device_df = pd.concat([device_df, device_type_dummies], axis=1)

    # Drop unnecessary columns
    device_df = device_df.drop("device_type", axis=1)

    return device_df

# Extract features from the device dataframe
feature_df = extract_features(device_df)
print(feature_df)

                                device_id       ip_address  status  \
0    56bab6f4-293e-4ce9-b16d-b491b80d6a27    153.174.10.56       0   
1    ade10231-5425-47e9-8dc9-d6f564d2c2ce  169.136.180.132       1   
2    21fe1380-05a5-417d-a40b-5594a20389ca   20.183.159.208       0   
3    e67fe010-d3c7-401c-988a-7dd5d0efc4b5   40.219.244.241       0   
4    bed95e2a-8c13-4bac-a898-1c724a395e5a    42.13.143.131       0   
..                                    ...              ...     ...   
995  08f2437e-c596-478f-95a8-e6112b0905b9  138.211.208.236       1   
996  f73be419-a3d5-4e31-bccc-234a8d836b57      96.14.82.70       1   
997  409bf19d-6dd1-4cf4-9a4f-34b9c5edbbdf   178.221.37.226       0   
998  df4a2cb3-d045-43db-b2c5-a4306b424f52     65.194.73.92       1   
999  c43e48ad-822b-442f-87e2-1a255b0ef240    183.157.77.85       0   

     security_status        last_updated  device_type_firewall  \
0                  1 2023-02-09 12:30:02                     1   
1                  0 2023-0

In [None]:
pip install datasets


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
def create_labeled_examples(device_df):
    examples = []

    for _, row in device_df.iterrows():
        device_id = row["device_id"]
        status = "up" if row["status"] == 1 else "down"
        security_status = "secure" if row["security_status"] == 1 else "insecure"

        context = f"The device with ID {device_id} has a status of {status} and a security status of {security_status}."
        examples.append({
            "context": context,
            "question": f"What is the status of the device with ID {device_id}?",
            "answer": status
        })
        examples.append({
            "context": context,
            "question": f"What is the security status of the device with ID {device_id}?",
            "answer": security_status
        })

    return examples

labeled_examples = create_labeled_examples(feature_df)
print(labeled_examples)

[{'context': 'The device with ID 56bab6f4-293e-4ce9-b16d-b491b80d6a27 has a status of down and a security status of secure.', 'question': 'What is the status of the device with ID 56bab6f4-293e-4ce9-b16d-b491b80d6a27?', 'answer': 'down'}, {'context': 'The device with ID 56bab6f4-293e-4ce9-b16d-b491b80d6a27 has a status of down and a security status of secure.', 'question': 'What is the security status of the device with ID 56bab6f4-293e-4ce9-b16d-b491b80d6a27?', 'answer': 'secure'}, {'context': 'The device with ID ade10231-5425-47e9-8dc9-d6f564d2c2ce has a status of up and a security status of insecure.', 'question': 'What is the status of the device with ID ade10231-5425-47e9-8dc9-d6f564d2c2ce?', 'answer': 'up'}, {'context': 'The device with ID ade10231-5425-47e9-8dc9-d6f564d2c2ce has a status of up and a security status of insecure.', 'question': 'What is the security status of the device with ID ade10231-5425-47e9-8dc9-d6f564d2c2ce?', 'answer': 'insecure'}, {'context': 'The device w

In [None]:
from sklearn.model_selection import train_test_split

train_examples, val_examples = train_test_split(labeled_examples, test_size=0.2, random_state=42)
# Define the tokenizer
from transformers import AutoTokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
from transformers import TrainingArguments, Trainer, default_data_collator

def prepare_data(examples):
    encodings = tokenizer([example["context"] for example in examples], [example["question"] for example in examples], truncation=True, padding=True)

    # Encode labels
    encodings["start_positions"] = []
    encodings["end_positions"] = []
    for i, example in enumerate(examples):
        answer = example["answer"]
        start_index = encodings.char_to_token(i, example["context"].index(answer))
        end_index = encodings.char_to_token(i, example["context"].index(answer) + len(answer) - 1)

        encodings["start_positions"].append(start_index)
        encodings["end_positions"].append(end_index)

    return encodings

train_encodings = prepare_data(train_examples)
val_encodings = prepare_data(val_examples)


In [None]:
import torch

class NetworkDeviceDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = NetworkDeviceDataset(train_encodings)
val_dataset = NetworkDeviceDataset(val_encodings)


In [34]:
import torch
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizerFast, Trainer, TrainingArguments


In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')


In [36]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    evaluation_strategy='epoch',     # evaluate each epoch
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [37]:
trainer.train()


***** Running training *****
  Num examples = 1600
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 300
  Number of trainable parameters = 66364418


Epoch,Training Loss,Validation Loss
1,No log,0.498735
2,No log,0.00067
3,No log,0.000363


***** Running Evaluation *****
  Num examples = 400
  Batch size = 64
***** Running Evaluation *****
  Num examples = 400
  Batch size = 64
***** Running Evaluation *****
  Num examples = 400
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=300, training_loss=0.9214981079101563, metrics={'train_runtime': 2548.0395, 'train_samples_per_second': 1.884, 'train_steps_per_second': 0.118, 'total_flos': 116362917216000.0, 'train_loss': 0.9214981079101563, 'epoch': 3.0})

In [None]:

# Define the question and context
question = "Is router R500 secure?"
context = "Router R1 has an up-to-date firewall and is configured correctly."

# Encode the inputs and compute the answer
inputs = tokenizer(question, context, return_tensors="pt")
start_positions = torch.argmax(model(**inputs)["start_logits"])
end_positions = torch.argmax(model(**inputs)["end_logits"])
print(start_positions)
print(end_positions)
print (inputs["input_ids"][0][start_positions:end_positions+1])
# Decode the answer
answer = "11"+tokenizer.decode(inputs["input_ids"][0][start_positions:end_positions+1])
print(answer)