# Dotrenowanie modelu językowego do dowolnego zadania

In [1]:
import torch
import datasets
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
)
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
)
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


## Wybór urządzenia do przeprowadzania obliczeń, biblioteka Torch umożliwia bardzo łatwe przejście na obliczenia na GPU

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
# Załadowanie zbioru danych
dataset = datasets.load_dataset("zefang-liu/phishing-email-dataset", split='train').train_test_split(test_size=0.1)

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Email Text', 'Email Type'],
        num_rows: 16785
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Email Text', 'Email Type'],
        num_rows: 1865
    })
})

In [5]:
# Ekstrakcja zbioru walidacyjnego i testowego
full_train_split = dataset['train'].train_test_split(test_size=0.1)

train_dataset = full_train_split['train']
val_dataset = full_train_split['test']
test_dataset = dataset['test']

# Sprawdzenie rozmiarów zbiorów danych po podziale
print(f"Dataset size: {len(train_dataset) + len(test_dataset) + len(val_dataset)}")
print(f"Train dataset size: {len(train_dataset) / (len(train_dataset) + len(test_dataset) + len(val_dataset)) * 100:.2f}%")
print(f"Test dataset size: {len(test_dataset) / (len(train_dataset) + len(test_dataset) + len(val_dataset)) * 100:.2f}%")
print(f"Validation dataset size: {len(val_dataset) / (len(train_dataset) + len(test_dataset) + len(val_dataset)) * 100:.2f}%")

Dataset size: 18650
Train dataset size: 81.00%
Test dataset size: 10.00%
Validation dataset size: 9.00%


In [6]:
train_dataset[42]

{'Unnamed: 0': 9550,
 'Email Text': "power trading hi folks : very glad to hear about the new developments . just to recap what we discussed this morning about different things you need to look into to set up trading operations and the contacts : 1 . licence to trade : regulatory people : i guess you know about this part better than me . 2 . trading & risk mgmt : global risk mgmt oversight : john sherrif in london has the overall responsibility outside western hemisphere . research group can help with the structuring models used for trading . 3 . risk conrols : before trading group is operational , it needs to get the authorization from the board of directors of enron along with total position limits and possibly value @ risk limits . these limits are typically by commodity type and by region . risk assessment & control ( rac ) under rick buy performs the internal control function ensuring that the businesses adhere to these trading limits . ted murphy is the vp in rac overseeing the t

## Procesowanie danych i kodowanie

In [7]:
labels = train_dataset.unique(column="Email Type")
print("Etykiety:", labels)

id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}

Flattening the indices: 100%|██████████| 15106/15106 [00:00<00:00, 47964.40 examples/s]


Etykiety: ['Safe Email', 'Phishing Email']


In [8]:
# Tokenizacja danych
model_name = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    texts = examples["Email Text"]
    labels = examples["Email Type"]

    texts = [text if text is not None else "" for text in texts]
    labels = [label if label is not None else "Safe Email" for label in labels]

    tokenized_outputs = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=512
    )

    tokenized_outputs["labels"] = [label2id[label] for label in labels]

    return tokenized_outputs

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

print("Tokenizacja zakończona.")


Map: 100%|██████████| 15106/15106 [00:20<00:00, 738.24 examples/s] 
Map: 100%|██████████| 1679/1679 [00:00<00:00, 3731.43 examples/s]
Map: 100%|██████████| 1865/1865 [00:00<00:00, 3746.68 examples/s]

Tokenizacja zakończona.





In [9]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

print(model)
print(f"\n\nZajętość pamięci przez model: {model.get_memory_footprint() / 1024**3:0.3f}GB")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [10]:
# Pełny fine-tuning
for param in model.base_model.parameters():
    param.requires_grad = True

In [11]:
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)

    return {"accuracy": accuracy["accuracy"], "f1": f1["f1"]}

In [12]:
args = TrainingArguments(
    output_dir="bert-finetuned-phishing-detection",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    gradient_accumulation_steps=2,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.049814,0.97975,0.973846
2,0.135200,0.124122,0.966647,0.956044
3,0.044900,0.048289,0.982728,0.977709
4,0.029000,0.049209,0.983323,0.97856
5,0.025900,0.049898,0.982728,0.977744
6,0.023300,0.050467,0.983323,0.978495


TrainOutput(global_step=2838, training_loss=0.048142298751519215, metrics={'train_runtime': 2259.8938, 'train_samples_per_second': 66.844, 'train_steps_per_second': 2.093, 'total_flos': 2.384733361360896e+16, 'train_loss': 0.048142298751519215, 'epoch': 6.0})

In [15]:
evaluation_results = trainer.evaluate()
print(f"Dokładność dotrenowanego modelu na zbiorze testowym: {evaluation_results['eval_accuracy']:.4f}")
print(evaluation_results)

Dokładność dotrenowanego modelu na zbiorze testowym: 0.9833
{'eval_loss': 0.04920933023095131, 'eval_accuracy': 0.9833234067897558, 'eval_f1': 0.9785604900459418, 'eval_runtime': 11.6688, 'eval_samples_per_second': 143.887, 'eval_steps_per_second': 8.998, 'epoch': 6.0}
