In [1]:
!pip install huggingface_hub datasets evaluate requests urllib3 datasets 

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from huggingface_hub import login

# Hugging Face login
login("input_Your_Token")

# Load dataset from Hugging Face
dataset = load_dataset("Mohammedbendahrass/threat_dataset")

# Check dataset columns
print(dataset['train'].column_names)  # Ensure the dataset has 'threat name' and 'description threat' columns

# Extract threat names and descriptions
threat_name = dataset['train']['threat name']  # Replace 'threat name' with the actual column name
description_threat = dataset['train']['description threat']  # Replace 'description threat' with the actual column name

# Combine threat name and description into a single formatted text
threat_data = [
    f"THREAT_NAME: {name}\nTHREAT_DESCRIPTION: {desc} <|endoftext|>"
    for name, desc in zip(threat_name, description_threat)
]

# Split data into train and test sets
train_texts, test_texts = train_test_split(threat_data, test_size=0.2, random_state=42)

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

# Add a padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token  # Use the EOS token as the PAD token
model.resize_token_embeddings(len(tokenizer))  # Resize the model's token embeddings

model.to(device)

# Tokenize the dataset with max_length=128
def tokenize_function(examples):
    return tokenizer(
        examples,
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

# Tokenize train and test texts
train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)

# Convert to PyTorch datasets
class ThreatDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = ThreatDataset(train_encodings)
test_dataset = ThreatDataset(test_encodings)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",  
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="no",
    logging_dir="./logs",
    logging_steps=50,
    learning_rate=1e-3,
    warmup_steps=100,
    weight_decay=0.01,
    report_to="none",
)

# Setup Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
train_result = trainer.train()

# Save final model
final_model_path = "./results/final_model"
model.save_pretrained(final_model_path)
tokenizer.save_pretrained(final_model_path)

# Evaluate model
metrics = trainer.evaluate()
print(metrics)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


README.md:   0%|          | 0.00/373 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/30.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/259742 [00:00<?, ? examples/s]

['threat name', 'description threat']
Using device: cuda


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss
1,1.5682,1.543321
2,1.2699,1.369985
3,0.9331,1.322422


{'eval_loss': 1.3224215507507324, 'eval_runtime': 817.2049, 'eval_samples_per_second': 63.569, 'eval_steps_per_second': 3.973, 'epoch': 3.0}


In [4]:
from huggingface_hub import HfApi

# Define model repository details
repo_name = "threat-detection-gpt2"  # Name of your model repository on Hugging Face Hub
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

# Verify the model on the Hub
print(f"Model saved to Hugging Face Hub: https://huggingface.co/threat-detection-gpt2-3")


No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


Model saved to Hugging Face Hub: https://huggingface.co/threat-detection-gpt2-3
