In [None]:
import kagglehub
import pandas as pd

# Fetch the disaster-tweets dataset through kagglehub; the returned value is
# used below as the directory that contains the CSV.
path = kagglehub.dataset_download("vstepanenko/disaster-tweets")

# Read the tweets table from the downloaded location.
tweets_csv = f"{path}/tweets.csv"
df = pd.read_csv(tweets_csv)

Downloading from https://www.kaggle.com/api/v1/datasets/download/vstepanenko/disaster-tweets?dataset_version_number=3...


100%|██████████| 656k/656k [00:00<00:00, 24.0MB/s]

Extracting files...





In [None]:
# Pull the tweet text and its label column out of the frame as NumPy arrays.
text = df['text'].to_numpy()
target = df['target'].to_numpy()

# Per-column count of missing values. Kept as the cell's final expression so
# the notebook actually renders it; as a leading statement its result was
# silently discarded.
df.isnull().sum()

In [None]:
import torch
import numpy as np
from torch.utils.data import DataLoader, random_split, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

In [None]:
# Seed every RNG the notebook has in scope so a fresh Restart & Run All is
# repeatable. The torch call stays last so the cell still displays the
# Generator it returns.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7cb3bab27290>

In [None]:
# Tokenize every tweet with the pretrained tokenizer. Sequences are truncated
# to at most 128 tokens and padded so they can be returned as PyTorch tensors.
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
encodings = tokenizer(list(text), truncation=True, padding=True, max_length=128, return_tensors="pt")

input_ids = encodings["input_ids"]
attention_mask = encodings["attention_mask"]
labels = torch.tensor(target, dtype=torch.long)

# One (input_ids, attention_mask, label) triple per tweet.
dataset = TensorDataset(input_ids, attention_mask, labels)

# 80/20 train/validation split. The validation size is the remainder, so the
# two sizes always add up to len(dataset).
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# A dedicated, explicitly seeded generator keeps the split identical across runs.
generator1 = torch.Generator().manual_seed(42)
train_data, val_data = random_split(dataset, [train_size, val_size], generator=generator1)

# Report the split sizes; printing the Subset objects themselves only shows
# their memory addresses.
print(f"train examples: {len(train_data)}")
print(f"val examples:   {len(val_data)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

<torch.utils.data.dataset.Subset object at 0x7cb3c0140650>
<torch.utils.data.dataset.Subset object at 0x7cb3c01405f0>


In [None]:
# Pinned (page-locked) host memory only benefits host-to-GPU copies, and
# DataLoader warns when it is requested without an accelerator, so enable it
# only when CUDA is actually available.
use_pinned_memory = torch.cuda.is_available()

# Training batches are reshuffled each epoch; validation order stays fixed.
train_loader = DataLoader(train_data, batch_size=16, shuffle=True, num_workers=2, pin_memory=use_pinned_memory)
val_loader = DataLoader(val_data, batch_size=16, shuffle=False, num_workers=2, pin_memory=use_pinned_memory)

In [None]:
# Load the pretrained checkpoint with a 2-label sequence-classification head
# and move it to the selected device in one step.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# AdamW over all model parameters with a learning rate of 2e-5.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [None]:
# Fine-tune for a fixed number of epochs; after each epoch, report training
# loss/accuracy and a classification report on the validation split.
epochs = 3
for epoch in range(epochs):
  # Print the header first so the metrics below are clearly attributed to
  # this epoch.
  print("epoch: ", epoch + 1)

  # ---- training pass ----
  model.train()
  t_loss = 0.0
  corr = 0
  for batch_ids, batch_masks, batch_labels in train_loader:
    batch_ids, batch_masks, batch_labels = batch_ids.to(device), batch_masks.to(device), batch_labels.to(device)

    optimizer.zero_grad()
    # Passing labels makes the model return the loss alongside the logits.
    outputs = model(input_ids=batch_ids, attention_mask=batch_masks, labels=batch_labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

    t_loss += loss.item()
    preds = outputs.logits.argmax(dim=1)
    # .item() keeps the running count a plain Python int, so train_acc ends
    # up a float rather than a 0-dim tensor living on the device.
    corr += (preds == batch_labels).sum().item()

  # Mean loss per batch, and accuracy over every training example seen.
  train_loss = t_loss / len(train_loader)
  train_acc = corr / len(train_loader.dataset)

  print(f"train loss = {train_loss} | train acc = {train_acc}")

  # ---- validation pass ----
  model.eval()
  all_p = []
  all_lab = []
  with torch.inference_mode():
    for batch_ids, batch_masks, batch_labels in val_loader:
      # Labels are only compared on the host, so they are not moved to the device.
      batch_ids, batch_masks = batch_ids.to(device), batch_masks.to(device)
      outputs = model(input_ids=batch_ids, attention_mask=batch_masks)
      predict = outputs.logits.argmax(dim=1)
      all_p.extend(predict.cpu().tolist())
      all_lab.extend(batch_labels.tolist())

  # labels=[0, 1] pins the class order to target_names, so the report still
  # builds when one class is absent from the labels and predictions.
  report = classification_report(all_lab, all_p, labels=[0, 1], target_names=['Not Disaster', 'Disaster'])
  print(report)

train loss = 0.27839337264127073 | train acc = 0.8867633938789368
epoch:  1
              precision    recall  f1-score   support

Not Disaster       0.96      0.93      0.94      1851
    Disaster       0.71      0.82      0.76       423

    accuracy                           0.91      2274
   macro avg       0.84      0.87      0.85      2274
weighted avg       0.91      0.91      0.91      2274

train loss = 0.14668363026672276 | train acc = 0.9463500380516052
epoch:  2
              precision    recall  f1-score   support

Not Disaster       0.95      0.94      0.95      1851
    Disaster       0.75      0.78      0.77       423

    accuracy                           0.91      2274
   macro avg       0.85      0.86      0.86      2274
weighted avg       0.91      0.91      0.91      2274

train loss = 0.06359066059407803 | train acc = 0.9793315529823303
epoch:  3
              precision    recall  f1-score   support

Not Disaster       0.95      0.94      0.95      1851
    Disas