# Imports

In [1]:
import transformers
import pandas as pd

from transformers import pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


# Preprocessing & Setup

In [2]:
# Read the headline / incident / tag table and keep only the rows in which
# every column has a value.
df = pd.read_csv("./text, incident, tags long form.csv").dropna()

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per sample (it must provide ``.items()``).
        labels: Indexable collection of per-sample labels; its length defines
            the length of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus a ``'labels'`` entry
        built from ``labels[idx]``.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Added last so it overrides any encoding field that is also named 'labels'.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# dataset = TextDataset(encodings, labels)

In [6]:
# Tokenizer for the uncased BERT-large checkpoint; the classifier trained
# further down starts from the checkpoint of the same name.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-large-uncased',
)

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 2.01MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 247kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 571/571 [00:00<00:00, 5.53MB/s]


In [7]:
# Map each distinct IncidentType value to an integer class id.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['IncidentType'])

# Hold out 20% of the headlines for validation. The seed is fixed so that the
# validation set is the same on every run: the inference cells score a
# checkpoint loaded from disk against `val_dataset`, and with an unseeded split
# a re-run would evaluate that checkpoint on rows it may have been trained on.
SPLIT_SEED = 42
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Headline'],
    labels,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

# Tokenize both splits with truncation at 512 tokens and padding enabled.
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=512)

# Wrap encodings and labels in the dataset class defined above.
train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)

In [None]:
from transformers import EarlyStoppingCallback

# Sequence classifier built on the pretrained encoder, with one output per
# class known to the label encoder.
model = BertForSequenceClassification.from_pretrained(
    'bert-large-uncased',
    num_labels=len(label_encoder.classes_),
)

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir='./logs',
    # Optimisation schedule
    do_train=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    # Logging, evaluation and checkpoint cadence (in optimisation steps)
    logging_steps=10,
    evaluation_strategy="steps",
    save_steps=10,
    save_total_limit=2,
    # Best-checkpoint selection: the lowest validation loss wins and that
    # checkpoint is reloaded once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    # Do not let the Trainer prune fields from the dataset items.
    remove_unused_columns=False,
)

# Trainer with early stopping: patience of 3 evaluations, no minimum
# improvement threshold.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.0),
    ],
)

# Run fine-tuning.
trainer.train()

Downloading pytorch_model.bin: 100%|██████████| 1.34G/1.34G [00:14<00:00, 92.3MB/s]
Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weigh

Step,Training Loss,Validation Loss
10,4.0675,3.633522
20,3.4894,3.297144
30,3.2523,3.115949
40,2.9956,2.863802
50,2.6082,2.567799
60,2.5317,2.468321
70,2.4359,2.310724
80,2.3362,2.281756
90,2.0471,2.125778
100,2.107,2.023014




In [None]:
# Store the class names in the model config before saving, so the exported
# model is self-describing: without this mapping a fresh process could only
# turn predicted ids back into incident types by refitting the LabelEncoder on
# the original CSV.
class_names = [str(name) for name in label_encoder.classes_]
model.config.id2label = dict(enumerate(class_names))
model.config.label2id = {name: idx for idx, name in enumerate(class_names)}

# Write the model and the tokenizer to the same directory.
model.save_pretrained('./my_model')
tokenizer.save_pretrained('./my_model')

# Inference

In [9]:
from transformers import BertForSequenceClassification, BertTokenizer

# Restore the fine-tuned classifier from a checkpoint under the training
# output directory ("./results").
model = BertForSequenceClassification.from_pretrained('./results/checkpoint-510')
# Load the tokenizer from the same base checkpoint the model was fine-tuned
# from ('bert-large-uncased'), so tokenizer and model always come from a
# matching pair.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

In [15]:
from torch.utils.data import DataLoader
# Batch the validation set in its original order; the accuracy cell compares
# predictions with `val_labels` position by position, so the order must not change.
test_loader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=False)

In [18]:
from tqdm import tqdm

# Put the model in evaluation mode before predicting.
model.eval()

predictions = []

# Gradients are not needed for prediction.
with torch.no_grad():
    for batch in tqdm(test_loader):
        device = model.device
        # Forward pass on the model's own device; only ids and mask are fed in.
        logits = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits
        # Highest-scoring class per sample, moved back to the CPU as a NumPy array.
        preds = logits.argmax(dim=-1).cpu().numpy()
        predictions.extend(preds)

# Map the integer class ids back to the original incident-type labels.
predicted_labels = label_encoder.inverse_transform(predictions)
print(predicted_labels)

100%|██████████| 452/452 [00:31<00:00, 14.53it/s]

['Aviation Incident' 'Fire' 'Bomb Scare' ... 'Military Action' 'Shooting'
 'Emergency Landing']





In [24]:
# Fraction of validation samples whose predicted class id equals the true one.
# The comparison is spelled out element by element instead of relying on
# `list == ndarray` broadcasting, and a length mismatch is reported explicitly
# (for example if `predictions` and `val_labels` come from different runs).
if len(predictions) != len(val_labels):
    raise ValueError(
        f"Got {len(predictions)} predictions for {len(val_labels)} validation labels"
    )
num_correct = sum(int(pred == truth) for pred, truth in zip(predictions, val_labels))
# An empty validation set yields NaN, as the mean of an empty array would.
accuracy = num_correct / len(val_labels) if len(val_labels) else float('nan')
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.57
