In [2]:
pip install torch transformers datasets scikit-learn

,Collecting datasets
,  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
,Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
,  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
,Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
,  Using cached nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
,Collecting multiprocess<0.70.17 (from datasets)
,  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
,Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)
,Using cached nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl (127.9 MB)
,Using cached datasets-3.5.0-py3-none-any.whl (491 kB)
,Using cached multiprocess-0.70.16-py311-none-any.whl (143 kB)
,[0mInstalling collected packages: nvidia-cudnn-cu12, multiprocess, nvidia-cusolver-cu12, datasets
,  Attempting uninstall: nvidia-cusolver-cu12
,    Found existing installation: nvidia-cusolver-

In [4]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Load dataset
df = pd.read_csv("combined.csv")

# Clean column names
df.columns = df.columns.str.strip()

# Drop unnecessary index column
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Encode labels: 'on-topic' -> 1, 'off-topic' -> 0
df["label"] = df["label"].map({"on-topic": 1, "off-topic": 0})

# Drop NaN values
df = df.dropna()

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Train-test split (80% train, 20% validation)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["tweet"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

# Tokenize tweets
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

# Convert to PyTorch tensors
train_inputs = torch.tensor(train_encodings["input_ids"])
train_masks = torch.tensor(train_encodings["attention_mask"])
train_labels = torch.tensor(train_labels)

val_inputs = torch.tensor(val_encodings["input_ids"])
val_masks = torch.tensor(val_encodings["attention_mask"])
val_labels = torch.tensor(val_labels)


,The secret `HF_TOKEN` does not exist in your Colab secrets.
,To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
,You will be able to reuse this secret in all of your notebooks.
,Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
from torch.utils.data import Dataset, DataLoader

class DisasterDataset(Dataset):
    def __init__(self, inputs, masks, labels):
        self.inputs = inputs
        self.masks = masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs[idx],
            "attention_mask": self.masks[idx],
            "labels": self.labels[idx],
        }

# Create DataLoaders
train_labels = train_labels.to(torch.long)
val_labels = val_labels.to(torch.long)
train_dataset = DisasterDataset(train_inputs, train_masks, train_labels)
val_dataset = DisasterDataset(val_inputs, val_masks, val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


In [None]:
from transformers import BertForSequenceClassification

# Load BERT with a classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
,You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [7]:
!pip install transformers



In [8]:
!pip install torch transformers datasets scikit-learn



In [9]:
from torch.optim import AdamW

# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)

# Define loss function
loss_fn = torch.nn.CrossEntropyLoss()


In [16]:
from transformers import get_scheduler
from tqdm import tqdm

# Define learning rate scheduler
num_training_steps = len(train_loader) * 3  # Assuming 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training loop
epochs = 2
for epoch in range(epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(outputs.logits, labels)
        loss.backward()

        optimizer.step()
        lr_scheduler.step()

        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

print("Training complete!")


Epoch 1: 100%|██████████| 366/366 [01:06<00:00,  5.48it/s, loss=0.0278]
,Epoch 2: 100%|██████████| 366/366 [01:06<00:00,  5.50it/s, loss=0.00247]

Training complete!





In [17]:
model.save_pretrained("bert_disaster_model")
tokenizer.save_pretrained("bert_disaster_model")

('bert_disaster_model/tokenizer_config.json',
 'bert_disaster_model/special_tokens_map.json',
 'bert_disaster_model/vocab.txt',
 'bert_disaster_model/added_tokens.json')

In [18]:
from sklearn.metrics import accuracy_score

model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()

        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, predictions)
print(f"Validation Accuracy: {accuracy:.4f}")


Validation Accuracy: 0.9651


In [19]:
import torch
from transformers import BertTokenizer

# Function to make predictions on new tweets
def predict_disaster(text, model, tokenizer, device):
    model.eval()
    encoded_text = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors='pt')

    input_ids = encoded_text['input_ids'].to(device)
    attention_mask = encoded_text['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        _, predictions = torch.max(outputs.logits, dim=1)

    return {
        'prediction': predictions.item(),
        'on_topic_probability': probs[0][1].item(),
        'off_topic_probability': probs[0][0].item(),
        'label': 'on-topic' if predictions.item() == 1 else 'off-topic'
    }

# Make sure model is in evaluation mode
model.eval()

# Example tweets to test
test_tweets = [
    "BREAKING NEWS: Earthquake magnitude 7.1 hits coastal region, tsunami warning issued #emergency",
    "Our thoughts are with everyone affected by the floods, stay safe!",
    "Just watched the latest Marvel movie and it was amazing!",
    "Today's weather forecast shows clear skies and warm temperatures",
    "Volunteers needed to help with hurricane relief efforts, please RT",
    "Fire spreading through downtown area, evacuation orders in place",
    "The new restaurant on Main Street has delicious food",
    "Happy birthday to my best friend! Love you lots!",
    "URGENT: Missing child in Springfield area, please share description",
    "Traffic is terrible this morning, expect delays on Highway 101"
]

print("\nTesting model on example tweets:\n")
for tweet in test_tweets:
    result = predict_disaster(tweet, model, tokenizer, device)
    print(f"Tweet: {tweet}")
    print(f"Prediction: {result['label']}")
    print(f"Confidence: {max(result['on_topic_probability'], result['off_topic_probability']):.4f}")
    print("-" * 80)


,Testing model on example tweets:
,
,Prediction: on-topic
,Confidence: 0.9578
,--------------------------------------------------------------------------------
,Tweet: Our thoughts are with everyone affected by the floods, stay safe!
,Prediction: on-topic
,Confidence: 0.9977
,--------------------------------------------------------------------------------
,Tweet: Just watched the latest Marvel movie and it was amazing!
,Prediction: off-topic
,Confidence: 0.9993
,--------------------------------------------------------------------------------
,Tweet: Today's weather forecast shows clear skies and warm temperatures
,Prediction: off-topic
,Confidence: 0.9582
,--------------------------------------------------------------------------------
,Tweet: Volunteers needed to help with hurricane relief efforts, please RT
,Prediction: on-topic
,Confidence: 0.9950
,--------------------------------------------------------------------------------
,Tweet: Fire spreading through downtown area, evacuati

In [20]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Stage 1: Use your existing BERT model for on-topic/off-topic classification
def predict_disaster_relevance(text, model, tokenizer, device):
    model.eval()
    encoded_text = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors='pt')

    input_ids = encoded_text['input_ids'].to(device)
    attention_mask = encoded_text['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        _, predictions = torch.max(outputs.logits, dim=1)

    return {
        'prediction': predictions.item(),
        'on_topic_probability': probs[0][1].item(),
        'off_topic_probability': probs[0][0].item(),
        'label': 'on-topic' if predictions.item() == 1 else 'off-topic'
    }

# Stage 2: Train a second classifier for disaster type identification
# (This only needs to be trained once, then can be saved and reused)

# Load dataset with disaster categories
df = pd.read_csv("combined.csv")
df.columns = df.columns.str.strip()

# Filter only on-topic tweets and their categories
on_topic_df = df[df['label'] == 'on-topic'].copy()

# Ensure we have the category column
if 'category' not in on_topic_df.columns:
    raise ValueError("Your dataset needs a 'category' column for disaster types")

# Define disaster categories (modify based on your actual data)
disaster_categories = ['bombing', 'earthquake', 'explosion', 'floods', 'hurricane', 'tornado']

# Train a simple TF-IDF + RandomForest classifier for disaster type
X = on_topic_df['tweet'].tolist()
y = on_topic_df['category'].tolist()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create TF-IDF features
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train classifier
disaster_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
disaster_classifier.fit(X_train_tfidf, y_train)

# Combined prediction function
def predict_disaster_type(text, relevance_model, disaster_model, tokenizer, tfidf_vectorizer, device):
    # First check if the tweet is on-topic
    relevance_result = predict_disaster_relevance(text, relevance_model, tokenizer, device)

    if relevance_result['label'] == 'off-topic':
        return {
            'is_disaster': False,
            'disaster_type': 'unrelated',
            'confidence': relevance_result['off_topic_probability']
        }
    else:
        # If on-topic, predict the disaster type
        text_tfidf = tfidf_vectorizer.transform([text])
        disaster_type = disaster_classifier.predict(text_tfidf)[0]
        disaster_probs = disaster_classifier.predict_proba(text_tfidf)[0]
        max_prob = max(disaster_probs)

        return {
            'is_disaster': True,
            'disaster_type': disaster_type,
            'confidence': relevance_result['on_topic_probability'],
            'disaster_type_confidence': max_prob
        }

# Test with example tweets
test_tweets = [
    "BREAKING NEWS: Earthquake magnitude 7.1 hits coastal region, tsunami warning issued #emergency",
    "Explosion reported at downtown factory, emergency services responding",
    "Hurricane warning in effect for coastal areas, residents advised to evacuate",
    "Just watched the latest Marvel movie and it was amazing!",
    "Volunteers needed urgently for flood relief efforts in affected areas"
]

# Make predictions
print("Two-Stage Disaster Classification Results:\n")
for tweet in test_tweets:
    result = predict_disaster_type(tweet, model, disaster_classifier, tokenizer, tfidf, device)
    print(f"Tweet: {tweet}")
    if result['is_disaster']:
        print(f"Classification: ON-TOPIC - {result['disaster_type'].upper()}")
        print(f"Confidence (on-topic): {result['confidence']:.4f}")
        print(f"Confidence (disaster type): {result['disaster_type_confidence']:.4f}")
    else:
        print(f"Classification: OFF-TOPIC")
        print(f"Confidence: {result['confidence']:.4f}")
    print("-" * 80)

# If you want to save the second classifier
import joblib
joblib.dump(disaster_classifier, 'disaster_type_classifier.joblib')
joblib.dump(tfidf, 'disaster_tfidf_vectorizer.joblib')

Two-Stage Disaster Classification Results:
,
,Classification: ON-TOPIC - EARTHQUAKE
,Confidence (on-topic): 0.9578
,Confidence (disaster type): 0.9500
,--------------------------------------------------------------------------------
,Tweet: Explosion reported at downtown factory, emergency services responding
,Classification: ON-TOPIC - EXPLOSION
,Confidence (on-topic): 0.5165
,Confidence (disaster type): 0.5400
,--------------------------------------------------------------------------------
,Classification: ON-TOPIC - HURRICANE
,Confidence (on-topic): 0.9979
,Confidence (disaster type): 0.8700
,--------------------------------------------------------------------------------
,Tweet: Just watched the latest Marvel movie and it was amazing!
,Classification: OFF-TOPIC
,Confidence: 0.9993
,--------------------------------------------------------------------------------
,Tweet: Volunteers needed urgently for flood relief efforts in affected areas
,Classification: ON-TOPIC - FLOODS
,Confide

['disaster_tfidf_vectorizer.joblib']