In [1]:
# !pip install transformers datasets accelerate evaluate scikit-learn
import torch
import pandas as pd
import numpy as np
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# 1. Load the dataset
df = pd.read_csv('cleaned_crisis_data.csv')

# 2. Text Cleaning Function
def clean_text(text):
    if not isinstance(text, str): return ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) # URLs
    text = re.sub(r'@\w+', '', text) # Mentions
    text = text.strip()
    return text

# 3. Standardize Column Names
if 'text_info' in df.columns:
    df.rename(columns={'text_info': 'label'}, inplace=True)
elif 'image_info' in df.columns:
    df.rename(columns={'image_info': 'label'}, inplace=True)

# 4. Map labels to numbers (ONLY ONCE)
# We use .dropna() at the end to ensure we don't have empty rows
df['label'] = df['label'].map({'informative': 1, 'not_informative': 0})
df = df.dropna(subset=['label']) # Remove rows that didn't match 'informative' or 'not_informative'

# 5. Apply cleaning
df['cleaned_text'] = df['tweet_text'].apply(clean_text)

# 6. Stratified Split (80% Train, 20% Test)
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42
)

print(f"Data ready! Training samples: {len(train_df)}, Testing samples: {len(test_df)}")
print(f"Class distribution:\n{train_df['label'].value_counts(normalize=True)}")

Data ready! Training samples: 14465, Testing samples: 3617
Class distribution:
label
1    0.710957
0    0.289043
Name: proportion, dtype: float64


In [3]:
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["cleaned_text"], padding="max_length", truncation=True)

train_ds = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
test_ds = Dataset.from_pandas(test_df).map(tokenize_function, batched=True)

# CALCULATE CLASS WEIGHTS
labels = train_df['label'].values
weights = compute_class_weight(class_weight='balanced', classes=np.unique(labels), y=labels)
class_weights = torch.tensor(weights, dtype=torch.float).to(device)

print(f"Calculated Weights: Non-Informative={weights[0]:.2f}, Informative={weights[1]:.2f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/14465 [00:00<?, ? examples/s]

Map:   0%|          | 0/3617 [00:00<?, ? examples/s]

Calculated Weights: Non-Informative=1.73, Informative=0.70


In [4]:
import numpy as np
import torch
import torch.nn as nn
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support

# 1. Metrics Function (Calculates everything for your report)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Calculate Precision, Recall, and F1 (Weighted handles the imbalance)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# 2. Weighted Trainer (The fixed version for the latest Transformers)
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # class_weights must be defined in your previous cell
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

# 3. Model & Fast Training Configuration
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",        # Evaluate every epoch
    save_strategy="epoch",        # SAVE every epoch (Fixes the ValueError)
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    load_best_model_at_end=True,    # Now this will work
    metric_for_best_model="f1",
    save_total_limit=1,             # Only keep the best model to save space
    report_to="none"

)

# 4. Initialize and Start Training
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

print("Training starting....")
trainer.train()

# 5. Final Save
model.save_pretrained("./final_text_model")
tokenizer.save_pretrained("./final_text_model")
print("Done! Your biased-fixed model is saved.")

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training starting... This should take less than 10 minutes on a T4 GPU.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.463502,0.855682,0.851102,0.852253,0.855682
2,0.451900,0.391635,0.858723,0.857249,0.856503,0.858723
3,0.337900,0.417431,0.860382,0.860282,0.860186,0.860382


Done! Your biased-fixed model is saved.


In [22]:
from transformers import pipeline

# AI Model Initialization
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")

def run_integrity_audit():
    print("\n⚡ CRISIS INTEGRITY ENGINE : FULL SPECTRUM ANALYSIS")

    # 1. Expanded Disaster Keyword List (All-in-one)
    disaster_keywords = [
        # Natural Disasters
        'earthquake', 'flood', 'tsunami', 'hurricane', 'cyclone', 'tornado', 'monsoon',
        'landslide', 'avalanche', 'wildfire', 'drought', 'quake', 'storm', 'rain',
        # Emergency & Rescue
        'emergency', 'rescue', 'evacuate', 'evacuation', 'shelter', 'trapped', 'missing',
        'relief', 'aid', 'medical', 'hospital', 'ambulance', 'paramedic', '1122', 'ndma',
        # Casualties & Damage
        'casualty', 'dead', 'death', 'died', 'injured', 'injury', 'blood', 'victim',
        'survivor', 'bodies', 'debris', 'collapsed', 'destroyed', 'damage', 'shattered',
        # Fire & Man-made
        'fire', 'explosion', 'blast', 'smoke', 'terrorist', 'shooting', 'hostage',
        # Urgent Alerts
        'alert', 'warning', 'critical', 'urgent', 'disaster', 'catastrophe'
    ]

    # 2. Logic Filters for Fake/OOD Content
    uncertain_words = [
        'think', 'maybe', 'was it', 'perhaps', 'not sure', 'guessing',
        'anyone else', 'did you feel', 'is it true', 'confirm?', 'rumor',
        'heard that', 'could be', 'seems like', 'probably', 'can someone check',
        'is there any news', 'allegedly', 'supposedly', 'vibration?', 'did i dream',
        'checking', 'asking', 'any updates?', 'what happened', 'was that a'
    ]
   alarmist_indicators = [
        'announced', 'nasa', 'who', 'must share', 'toxic gas', 'leak',
        'aliens', 'mystery', 'secret', 'hiding', 'conspiracy', 'official warning',
        'breaking news!!', 'spread this', 'warning!', 'unbelievable', 'shocking',
        'forwarded as received', 'emergency alert!', 'broadcast', 'click here',
        'save your family', 'urgent notice', 'government hiding', 'viral',
        'exclusive', 'don\'t drink', 'gas mask', 'radiation'
    ]
    exaggeration_words = [
        'end of world', 'entire continent', 'no one survived', '10.', '11.', '12.',
        'billion dead', 'everything destroyed', 'wiped out', 'total annihilation',
        'doomsday', 'armageddon', 'all died', 'millions trapped', 'city gone',
        'history\'s biggest', 'wiped off map', 'apocalypse', 'extinction',
        'billions affected', 'complete chaos', 'no hope', 'deadly wave'
    ]
    user_input = input("\nEnter text to analyze: ")

    if not user_input.strip():
        return

    # 3. Feature Extraction
    input_lower = user_input.lower()
    has_context = any(word in input_lower for word in disaster_keywords)
    is_uncertain = any(word in input_lower for word in uncertain_words)
    is_alarmist = any(word in input_lower for word in alarmist_indicators)
    is_exaggerated = any(word in input_lower for word in exaggeration_words)

    result = classifier(user_input)[0]
    score = result['score']
    label = result['label']

    # 4. Decision Logic (Informative vs Non-Informative vs OOD)
    # -----------------------------------------------------------------
    # GATE 1: OOD Check
    if not has_context:
        status = "OOD / IRRELEVANT"
        emoji = "⚪"
        color_code = "\033[94m" # Blue

    # GATE 2: NON-INFORMATIVE (Fake/Panic/Uncertain)
    elif is_uncertain or is_alarmist or is_exaggerated or (label == 'POSITIVE'):
        status = "NON-INFORMATIVE"
        emoji = "❌"
        color_code = "\033[91m" # Red

    # GATE 3: INFORMATIVE (Real/Serious Crisis)
    elif has_context and label == 'NEGATIVE' and score > 0.85:
        status = "INFORMATIVE"
        emoji = "✅"
        color_code = "\033[92m" # Green

    else:
        status = "NON-INFORMATIVE"
        emoji = "❌"
        color_code = "\033[91m" # Red

    # 5. Output Report
    print("\n" + "—"*55)
    print(f"VERDICT    : {color_code}{emoji} {status}\033[0m")
    print(f"CONFIDENCE : {score:.2%}")
    print(f"CONTEXT    : {'Crisis Detected' if has_context else 'General Data'}")
    print("—"*55 + "\n")

if __name__ == "__main__":
    run_integrity_audit()

Device set to use cuda:0



⚡ CRISIS INTEGRITY ENGINE : FULL SPECTRUM ANALYSIS

Enter text to analyze: scientists say a massive flood will hit the world

———————————————————————————————————————————————————————
VERDICT    : [91m❌ NON-INFORMATIVE[0m
CONFIDENCE : 99.90%
CONTEXT    : Crisis Detected
———————————————————————————————————————————————————————



In [23]:
import shutil
import os

# 1. Create a dedicated folder for the final version
save_path = "./disaster_model_final"
if not os.path.exists(save_path):
    os.makedirs(save_path)

# 2. Save the Model and Tokenizer
print("Saving model weights and tokenizer...")
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

# 3. Create a Zip file of the folder
zip_name = "disaster_verification_model"
shutil.make_archive(zip_name, 'zip', save_path)

print(f"\n✅ SUCCESS!")
print(f"Model saved to: {save_path}")
print(f"Zip file created: {zip_name}.zip")
print("Check the 'Files' folder in the left sidebar to download it.")

Saving model weights and tokenizer...

✅ SUCCESS!
Model saved to: ./disaster_model_final
Zip file created: disaster_verification_model.zip
Check the 'Files' folder in the left sidebar to download it.
