In [None]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

# Path to your processed text data
processed_csv_path = '/content/drive/MyDrive/processed_text_data_flair_extended.csv'

try:
    df = pd.read_csv(processed_csv_path)
    print("DataFrame loaded successfully.")
except FileNotFoundError:
    print(f"Error: CSV file not found at {processed_csv_path}")
    df = None
except Exception as e:
    print(f"Error loading CSV: {e}")
    df = None

Mounted at /content/drive
DataFrame loaded successfully.


In [None]:
if df is not None:
    print("Column names in your DataFrame:")
    print(df.columns)
else:
    print("DataFrame was not loaded. Please check the loading step.")

Column names in your DataFrame:
Index(['tweet_id', 'image_id', 'text_info', 'text_info_conf', 'image_info',
       'image_info_conf', 'text_human', 'text_human_conf', 'image_human',
       'image_human_conf', 'tweet_text', 'image_url', 'image_path',
       'crisis_type', 'is_california_fire', 'has_image', 'cleaned_text',
       'hashtags', 'entities', 'metadata', 'text_embedding'],
      dtype='object')


In [None]:
if df is not None and 'text_human' in df.columns:
    print("Unique values in 'text_human' column:")
    print(df['text_human'].unique())
else:
    print("DataFrame not loaded or 'text_human' column not found.")

Unique values in 'text_human' column:
['infrastructure_and_utility_damage' 'other_relevant_information'
 'not_humanitarian' 'injured_or_dead_people'
 'rescue_volunteering_or_donation_effort' 'affected_individuals'
 'vehicle_damage' 'missing_or_found_people']


In [None]:
if df is not None and 'text_info' in df.columns:
    print("Unique values in 'text_info' column:")
    print(df['text_info'].unique())
else:
    print("DataFrame not loaded or 'text_info' column not found.")

Unique values in 'text_info' column:
['informative' 'not_informative']


In [None]:
if df is not None:
    # Select the text and labels
    text_column = 'cleaned_text'
    label_column = 'text_info'  # Using 'text_info' as the label column

    # Check if the label column exists
    if label_column not in df.columns:
        print(f"Error: Label column '{label_column}' not found in DataFrame.")
        df = None
    else:
        # Convert labels to numerical (assuming 'informative' is string 'informative'/'not informative')
        df['labels'] = df[label_column].apply(lambda x: 1 if str(x).lower() == 'informative' else 0)

        # Select only the necessary columns
        df = df[[text_column, 'labels']].dropna()

        # Split data into training and validation sets
        from sklearn.model_selection import train_test_split

        train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

        print(f"Training set size: {len(train_df)}")
        print(f"Validation set size: {len(val_df)}")
else:
    print("DataFrame not loaded, cannot prepare data.")

Training set size: 14465
Validation set size: 3617


In [None]:
if train_df is not None and val_df is not None:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    model_name = 'distilbert-base-uncased'  # A smaller, faster model for demonstration
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) # 2 classes: informative and not informative
else:
    print("Data not prepared, cannot load model.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
if model is not None:
    from datasets import Dataset

    # Convert pandas DataFrames to Hugging Face Datasets
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)

    def tokenize_function(examples):
        return tokenizer(examples['cleaned_text'], truncation=True, padding='max_length', max_length=128) # Example: limit to 128 tokens

    tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
    tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

    # Remove the original text column as the model will use the tokenized inputs
    tokenized_train_dataset = tokenized_train_dataset.remove_columns(["cleaned_text", "__index_level_0__"])
    tokenized_val_dataset = tokenized_val_dataset.remove_columns(["cleaned_text", "__index_level_0__"])

    # Rename the label column to 'labels' as expected by the Trainer
    tokenized_train_dataset = tokenized_train_dataset.rename_column("labels", "label")
    tokenized_val_dataset = tokenized_val_dataset.rename_column("labels", "label")

    print("Datasets tokenized.")
else:
    print("Model not loaded, cannot tokenize datasets.")

Map:   0%|          | 0/14465 [00:00<?, ? examples/s]

Map:   0%|          | 0/3617 [00:00<?, ? examples/s]

Datasets tokenized.


In [None]:
if train_dataset is not None and val_dataset is not None:
    from transformers import TrainingArguments

    training_args = TrainingArguments(
      output_dir='./results_informative',
      learning_rate=2e-5,
      per_device_train_batch_size=16,  # Reduced batch size
      per_device_eval_batch_size=16,
      gradient_accumulation_steps=2,
      num_train_epochs=3,
      weight_decay=0.01,
      evaluation_strategy='epoch',
      save_strategy='epoch',
      logging_dir='./logs_informative',
      report_to="none"
)

else:
    print("Datasets not ready, cannot define training arguments.")





In [None]:
if model is not None and tokenized_train_dataset is not None and tokenized_val_dataset is not None and training_args is not None:
    from transformers import Trainer
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    def compute_metrics(pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        precision = precision_score(labels, preds, average='binary')
        recall = recall_score(labels, preds, average='binary')
        f1 = f1_score(labels, preds, average='binary')
        acc = accuracy_score(labels, preds)
        return {
            'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    print("Starting training...")
    trainer.train()

    print("Training finished. Evaluating...")
    evaluation_results = trainer.evaluate()
    print(evaluation_results)
else:
    print("Trainer not set up, cannot start training.")

  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.375412,0.842964,0.892179,0.870048,0.915466
2,0.287300,0.424813,0.841028,0.89214,0.860347,0.926373


Training finished. Evaluating...


{'eval_loss': 0.42481330037117004, 'eval_accuracy': 0.8410284766380979, 'eval_f1': 0.8921403113862315, 'eval_precision': 0.8603473227206947, 'eval_recall': 0.9263731982859369, 'eval_runtime': 769.3811, 'eval_samples_per_second': 4.701, 'eval_steps_per_second': 0.295, 'epoch': 2.994475138121547}


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Load tokenizer and (ideally) the fine-tuned model ---
model_name = 'distilbert-base-uncased' # Use this if you can't reload fine-tuned
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) # Ensure num_labels matches your task

# Forcefully add a pad token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# --- Load your DataFrame ---
processed_csv_path = '/content/drive/MyDrive/processed_text_data_flair_extended.csv'
try:
    df = pd.read_csv(processed_csv_path)
    print("DataFrame loaded successfully.")
except FileNotFoundError:
    print(f"Error: CSV file not found at {processed_csv_path}")
    df = None
except Exception as e:
    print(f"Error loading CSV: {e}")
    df = None

# --- Prepare labels (assuming 'text_info' column) ---
if df is not None and 'text_info' in df.columns:
    df['labels'] = df['text_info'].apply(lambda x: 1 if str(x).lower() == 'informative' else 0)
    df_processed = df[['cleaned_text', 'labels']].dropna()
else:
    print("Error: DataFrame not loaded or 'text_info' column not found.")
    df_processed = None

# --- Prepare validation data ---
if df_processed is not None:
    train_df, val_df = train_test_split(df_processed, test_size=0.2, random_state=42)
else:
    val_df = None

if val_df is not None:
    prompt_template = "Classify the following tweet as either informative or not informative regarding a disaster: [TEXT]"

    def create_prompted_text(text):
        return prompt_template.replace("[TEXT]", text)

    prompted_val_texts = [create_prompted_text(text) for text in val_df['cleaned_text'].tolist()]

    # Tokenize the prompted texts
    tokenizer.pad_token = '[PAD]'  # Ensure pad_token is explicitly set
    prompted_val_tokens = tokenizer(prompted_val_texts, truncation=True, padding=True, return_tensors="pt", max_length=128).to(model.device)

    # Get predictions from the (ideally fine-tuned) model
    model.eval()
    with torch.no_grad():
        prompted_predictions = model(**prompted_val_tokens)
        prompted_predicted_labels = torch.argmax(prompted_predictions.logits, dim=-1).cpu().numpy()

    # Get true labels from the validation set
    true_labels = val_df['labels'].tolist()

    # Generate the classification report
    report_with_prompts = classification_report(true_labels, prompted_predicted_labels, target_names=['not_informative', 'informative'])
    print("\nClassification Report on Validation Set (with Prompt):")
    print(report_with_prompts)

else:
    print("Validation DataFrame not available.")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DataFrame loaded successfully.

Classification Report on Validation Set (with Prompt):
                 precision    recall  f1-score   support

not_informative       0.00      0.00      0.00      1050
    informative       0.71      1.00      0.83      2567

       accuracy                           0.71      3617
      macro avg       0.35      0.50      0.42      3617
   weighted avg       0.50      0.71      0.59      3617



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Load tokenizer and base model ---
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Ensure pad_token is set
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# --- Load your DataFrame and prepare validation data (as before) ---
processed_csv_path = '/content/drive/MyDrive/processed_text_data_flair_extended.csv'
try:
    df = pd.read_csv(processed_csv_path)
except FileNotFoundError:
    df = None

if df is not None and 'text_info' in df.columns:
    df['labels'] = df['text_info'].apply(lambda x: 1 if str(x).lower() == 'informative' else 0)
    df_processed = df[['cleaned_text', 'labels']].dropna()
    train_df, val_df = train_test_split(df_processed, test_size=0.2, random_state=42)
else:
    val_df = None

if val_df is not None:
    # Tokenize the raw validation texts (without prompt)
    val_texts = val_df['cleaned_text'].tolist()
    val_tokens = tokenizer(val_texts, truncation=True, padding=True, return_tensors="pt", max_length=128).to(model.device)

    # Get predictions from the base model
    model.eval()
    with torch.no_grad():
        base_predictions = model(**val_tokens)
        base_predicted_labels = torch.argmax(base_predictions.logits, dim=-1).cpu().numpy()

    true_labels = val_df['labels'].tolist()

    # Generate the classification report for the base model (no prompt)
    report_base = classification_report(true_labels, base_predicted_labels, target_names=['not_informative', 'informative'])
    print("\nClassification Report on Validation Set (Base Model, No Prompt):")
    print(report_base)

else:
    print("Validation DataFrame not available.")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Classification Report on Validation Set (Base Model, No Prompt):
                 precision    recall  f1-score   support

not_informative       0.19      0.28      0.22      1050
    informative       0.63      0.49      0.55      2567

       accuracy                           0.43      3617
      macro avg       0.41      0.39      0.39      3617
   weighted avg       0.50      0.43      0.45      3617



In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Load tokenizer and (ideally) the fine-tuned model ---
# Replace 'distilbert-base-uncased' with the path to your saved fine-tuned model if available
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Ensure pad_token is set
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# --- Load your DataFrame and prepare validation data (as before) ---
processed_csv_path = '/content/drive/MyDrive/processed_text_data_flair_extended.csv'
try:
    df = pd.read_csv(processed_csv_path)
except FileNotFoundError:
    df = None

if df is not None and 'text_info' in df.columns:
    df['labels'] = df['text_info'].apply(lambda x: 1 if str(x).lower() == 'informative' else 0)
    df_processed = df[['cleaned_text', 'labels']].dropna()
    train_df, val_df = train_test_split(df_processed, test_size=0.2, random_state=42)
else:
    val_df = None

if val_df is not None:
    # Tokenize the raw validation texts (without prompt)
    val_texts = val_df['cleaned_text'].tolist()
    val_tokens = tokenizer(val_texts, truncation=True, padding=True, return_tensors="pt", max_length=128).to(model.device)

    # Get predictions from the base model
    model.eval()
    with torch.no_grad():
        base_predictions = model(**val_tokens)
        base_predicted_labels = torch.argmax(base_predictions.logits, dim=-1).cpu().numpy()

    true_labels = val_df['labels'].tolist()

    # Generate the classification report for the base model (no prompt)
    report_base = classification_report(true_labels, base_predicted_labels, target_names=['not_informative', 'informative'])
    print("\nClassification Report on Validation Set (Base Model, No Prompt):")
    print(report_base)

else:
    print("Validation DataFrame not available.")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Classification Report on Validation Set (Base Model, No Prompt):
                 precision    recall  f1-score   support

not_informative       0.26      0.71      0.38      1050
    informative       0.57      0.16      0.25      2567

       accuracy                           0.32      3617
      macro avg       0.41      0.43      0.31      3617
   weighted avg       0.48      0.32      0.29      3617



# **RAG**

In [None]:
import pandas as pd

# --- Load your DataFrame ---
processed_csv_path = '/content/drive/MyDrive/processed_text_data_flair_extended.csv'
try:
    df = pd.read_csv(processed_csv_path)
    print("DataFrame loaded successfully for RAG preparation.")
except FileNotFoundError:
    print(f"Error: CSV file not found at {processed_csv_path}")
    df = None
except Exception as e:
    print(f"Error loading CSV: {e}")
    df = None

if df is not None and 'cleaned_text' in df.columns and 'text_info' in df.columns:
    # Filter for informative tweets (using ground truth)
    informative_tweets_df = df[df['text_info'].str.lower() == 'informative'][['cleaned_text']]
    print(f"{len(informative_tweets_df)} informative tweets identified for RAG.")
else:
    informative_tweets_df = None
    print("Error: DataFrame or required columns not found for RAG preparation.")

# Now, informative_tweets_df contains the text of your informative tweets.
# This is your basic knowledge base for retrieval.

DataFrame loaded successfully for RAG preparation.
12855 informative tweets identified for RAG.


 The below retrieved tweets don't seem directly related to the query about "power outages." This highlights the limitations of a simple keyword-based search. The keywords "hurricane," "irma," "weather," and "advisory" were likely present in the informative tweets, but they don't specifically address power outages.

In [None]:
def retrieve_relevant_tweets(user_query, knowledge_base_df, top_n=3):
    """
    Retrieves the top_n most relevant tweets from the knowledge base
    based on keyword matching with the user query.

    Args:
        user_query (str): The user's question or request.
        knowledge_base_df (pd.DataFrame): DataFrame containing informative tweets
                                           with a 'cleaned_text' column.
        top_n (int): The maximum number of relevant tweets to retrieve.

    Returns:
        list: A list of the top_n most relevant tweet texts.
    """
    if knowledge_base_df is None or knowledge_base_df.empty:
        return []

    query_keywords = user_query.lower().split()
    relevant_tweets = []

    for index, row in knowledge_base_df.iterrows():
        tweet_text = row['cleaned_text'].lower()
        for keyword in query_keywords:
            if keyword in tweet_text:
                relevant_tweets.append(row['cleaned_text'])
                break  # Move to the next tweet once a keyword is found

    return relevant_tweets[:top_n]

# Example usage:
user_question = "What is the current situation with power outages?"
relevant_info = retrieve_relevant_tweets(user_question, informative_tweets_df)

if relevant_info:
    print(f"Retrieved relevant information for query: '{user_question}'")
    for i, tweet in enumerate(relevant_info):
        print(f"[{i+1}] {tweet}")
else:
    print(f"No relevant information found for query: '{user_question}'")

Retrieved relevant information for query: 'What is the current situation with power outages?'
[1] rt island barbuda literally water hurricane irma
[2] pm hurricane irma update weather
[3] rt pm advisory hurricane irma firstalertwx


still not very relevant results

In [None]:
def retrieve_relevant_tweets_improved(user_query, knowledge_base_df, top_n=3):
    """
    Retrieves the top_n most relevant tweets based on the number of
    matching keywords.

    Args:
        user_query (str): The user's question or request.
        knowledge_base_df (pd.DataFrame): DataFrame containing informative tweets
                                           with a 'cleaned_text' column.
        top_n (int): The maximum number of relevant tweets to retrieve.

    Returns:
        list: A list of the top_n most relevant tweet texts, ordered by
              the number of matching keywords (descending).
    """
    if knowledge_base_df is None or knowledge_base_df.empty:
        return []

    query_keywords = user_query.lower().split()
    tweet_scores = {}

    for index, row in knowledge_base_df.iterrows():
        tweet_text = row['cleaned_text'].lower()
        score = 0
        for keyword in query_keywords:
            if keyword in tweet_text:
                score += 1
        if score > 0:
            tweet_scores[row['cleaned_text']] = score

    # Sort tweets by score (highest score first) and get the top_n
    sorted_tweets = sorted(tweet_scores.items(), key=lambda item: item[1], reverse=True)
    return [tweet for tweet, score in sorted_tweets[:top_n]]

# Example usage with the improved function:
user_question = "What is the current situation with power outages?"
relevant_info = retrieve_relevant_tweets_improved(user_question, informative_tweets_df)

if relevant_info:
    print(f"Retrieved relevant information for query: '{user_question}' (Improved)")
    for i, tweet in enumerate(relevant_info):
        print(f"[{i+1}] {tweet}")
else:
    print(f"No relevant information found for query: '{user_question}'")

Retrieved relevant information for query: 'What is the current situation with power outages?' (Improved)
[1] theguardianukcrisis grows puerto rico town without water power phone service
[2] powerhouse chef thepalmbeaches partner fundraising dinner sept aid florida key post
[3] powerful hurricane irma could next weather disaster


simulating the generation -ignore
The language model identifies the word "power" in the first tweet.
It understands that a lack of power is a common problem during crises.
It likely disregards the irrelevant context of the other two tweets ("powerhouse chef," "powerful hurricane" in a future context).
It uses its general knowledge to infer that power outages are a relevant concern during disasters.

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# --- Step 1: Prepare Your Informative Data for Retrieval (Ground Truth) ---
processed_csv_path = '/content/drive/MyDrive/processed_text_data_flair_extended.csv'
try:
    df = pd.read_csv(processed_csv_path)
    print("DataFrame loaded successfully for RAG preparation.")
except FileNotFoundError:
    print(f"Error: CSV file not found at {processed_csv_path}")
    df = None
except Exception as e:
    print(f"Error loading CSV: {e}")
    df = None

informative_tweets_df = None
if df is not None and 'cleaned_text' in df.columns and 'text_info' in df.columns:
    informative_tweets_df = df[df['text_info'].str.lower() == 'informative'][['cleaned_text']].reset_index(drop=True)
    print(f"{len(informative_tweets_df)} informative tweets identified for RAG.")
else:
    print("Error: DataFrame or required columns not found for RAG preparation.")

# --- Step 2: Implement the Retrieval Mechanism (Improved Keyword Matching) ---
def retrieve_relevant_tweets_improved(user_query, knowledge_base_df, top_n=3):
    if knowledge_base_df is None or knowledge_base_df.empty:
        return []

    query_keywords = user_query.lower().split()
    tweet_scores = {}

    for index, row in knowledge_base_df.iterrows():
        tweet_text = row['cleaned_text'].lower()
        score = 0
        for keyword in query_keywords:
            if keyword in tweet_text:
                score += 1
        if score > 0:
            tweet_scores[row['cleaned_text']] = score

    sorted_tweets = sorted(tweet_scores.items(), key=lambda item: item[1], reverse=True)
    return [tweet for tweet, score in sorted_tweets[:top_n]]

# --- Step 3: Simulate the Generation Step (Illustrative - No actual LLM API call) ---
def generate_answer(user_question, retrieved_tweets):
    if not retrieved_tweets:
        return "No relevant information found to answer your question."

    context = "\n".join([f"Tweet {i+1}: {tweet}" for i, tweet in enumerate(retrieved_tweets)])
    prompt = f"""Answer the user's question based on the following retrieved information:
{context}
User Question: {user_question}"""

    print("\n--- Simulated Prompt ---")
    print(prompt)
    print("\n--- Simulated Answer (Illustrative) ---")
    # In a real application, you would send this prompt to a language model API
    # and get the actual generated answer.
    return "Based on the retrieved information, and general knowledge, here's a potential answer..."

# --- Example RAG Workflow ---
if informative_tweets_df is not None:
    user_question = "What is the current situation with power outages?"
    relevant_info = retrieve_relevant_tweets_improved(user_question, informative_tweets_df)

    print(f"\nRetrieved relevant information for query: '{user_question}' (Improved)")
    for i, tweet in enumerate(relevant_info):
        print(f"[{i+1}] {tweet}")

    answer = generate_answer(user_question, relevant_info)
    print("\n--- RAG Simulated Answer ---")
    print(answer)

else:
    print("Knowledge base not available, cannot run RAG example.")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-package

DataFrame loaded successfully for RAG preparation.
12855 informative tweets identified for RAG.

Retrieved relevant information for query: 'What is the current situation with power outages?' (Improved)
[1] theguardianukcrisis grows puerto rico town without water power phone service
[2] powerhouse chef thepalmbeaches partner fundraising dinner sept aid florida key post
[3] powerful hurricane irma could next weather disaster

--- Simulated Prompt ---
Answer the user's question based on the following retrieved information:
Tweet 1: theguardianukcrisis grows puerto rico town without water power phone service
Tweet 2: powerhouse chef thepalmbeaches partner fundraising dinner sept aid florida key post
Tweet 3: powerful hurricane irma could next weather disaster
User Question: What is the current situation with power outages?

--- Simulated Answer (Illustrative) ---

--- RAG Simulated Answer ---
Based on the retrieved information, and general knowledge, here's a potential answer...


real generation with llm-ignore

In [None]:
pip install transformers



In [None]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
INFO: pip is looking at multiple versions of torch to determine which version is compatible with other requirements. This could take a while.
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.0%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading https://download.pytorch.org/whl/sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.8.89 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_nvrtc_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (23.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.2/23.2 MB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.8.89 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (875 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [None]:
import torch
print(torch.version.cuda)

11.8


In [None]:
pip uninstall -y torchvision

Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124


In [None]:
pip install torchvision==0.16.0+cu118 -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html


In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: read)

In [None]:
# --- Ensure no large variables are taking up RAM ---
if 'df' in locals():
    del df

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# --- Load Llama 3 Model and Tokenizer ---
model_name_llama = "meta-llama/Meta-Llama-3-8b-instruct"
your_token = "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"  # Replace with your actual token

# Load the tokenizer
tokenizer_llama = AutoTokenizer.from_pretrained(model_name_llama, token=your_token)

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model_llama = AutoModelForCausalLM.from_pretrained(model_name_llama, token=your_token, torch_dtype=torch.float16 if device == "cuda" else torch.float32).to(device)

print(f"Model and tokenizer for {model_name_llama} loaded on {device}.")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-package

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Mistral-ignore

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd

# --- Load Mistral 7B Instruct v0.2 Model and Tokenizer ---
model_name_mistral = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer_mistral = AutoTokenizer.from_pretrained(model_name_mistral)
device = "cuda" if torch.cuda.is_available() else "cpu"
model_mistral = AutoModelForCausalLM.from_pretrained(model_name_mistral, torch_dtype=torch.float16 if device == "cuda" else torch.float32).to(device)

print(f"Model and tokenizer for {model_name_mistral} loaded on {device}.")

# --- Load your DataFrame and prepare informative tweets (if not already done) ---
processed_csv_path = '/content/drive/MyDrive/processed_text_data_flair_extended.csv'
try:
    df = pd.read_csv(processed_csv_path)
    informative_tweets_df = df[df['text_info'].str.lower() == 'informative'][['cleaned_text']].reset_index(drop=True)
    print("Informative tweets DataFrame is ready.")
except FileNotFoundError:
    print(f"Error: CSV file not found at {processed_csv_path}")
    informative_tweets_df = None
except Exception as e:
    print(f"Error loading CSV: {e}")
    informative_tweets_df = None

# --- Improved Retrieval Function (if not already defined) ---
def retrieve_relevant_tweets_improved(user_query, knowledge_base_df, top_n=3):
    if knowledge_base_df is None or knowledge_base_df.empty:
        return []

    query_keywords = user_query.lower().split()
    tweet_scores = {}

    for index, row in knowledge_base_df.iterrows():
        tweet_text = row['cleaned_text'].lower()
        score = 0
        for keyword in query_keywords:
            if keyword in tweet_text:
                score += 1
        if score > 0:
            tweet_scores[row['cleaned_text']] = score

    sorted_tweets = sorted(tweet_scores.items(), key=lambda item: item[1], reverse=True)
    return [tweet for tweet, score in sorted_tweets[:top_n]]

# --- Generate Answer with Mistral ---
def generate_answer_with_mistral(user_question, retrieved_tweets, tokenizer, model):
    if not retrieved_tweets:
        return "No relevant information found to answer your question."

    context = "\n".join([f"Tweet {i+1}: {tweet}" for i, tweet in enumerate(retrieved_tweets)])
    prompt = f"""<s>[INST] Answer the user's question based on the following retrieved information:
{context}
User Question: {user_question} [/INST]"""

    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=500,  # Adjust as needed
            num_return_sequences=1,
            temperature=0.2,  # Adjust for creativity
            pad_token_id=tokenizer.eos_token_id,
        )

    generated_answer = tokenizer.decode(output[0], skip_special_tokens=True)
    assistant_start = generated_answer.find("[/INST]")
    if assistant_start != -1:
        return generated_answer[assistant_start + len("[/INST]") :].strip()
    else:
        return generated_answer.strip()

# --- Example RAG Workflow ---
if informative_tweets_df is not None:
    user_question = "What is the current situation with power outages?"
    relevant_info = retrieve_relevant_tweets_improved(user_question, informative_tweets_df)

    print(f"\nRetrieved relevant information for query: '{user_question}' (Improved)")
    for i, tweet in enumerate(relevant_info):
        print(f"[{i+1}] {tweet}")

    answer = generate_answer_with_mistral(user_question, relevant_info, tokenizer_mistral, model_mistral)
    print("\n--- RAG Answer from Mistral 7B Instruct v0.2 ---")
    print(answer)

else:
    print("Knowledge base not available, cannot run RAG example.")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-package

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd

# --- Load your DataFrame and prepare informative tweets ---
processed_csv_path = '/content/drive/MyDrive/processed_text_data_flair_extended.csv'
try:
    df = pd.read_csv(processed_csv_path)
    informative_tweets_df = df[df['text_info'].str.lower() == 'informative'][['cleaned_text']].reset_index(drop=True)
    print(f"Original informative tweets DataFrame loaded with {len(informative_tweets_df)} entries.")
    # --- Create a small sample ---
    informative_tweets_df_sample = informative_tweets_df.head(100)
    print(f"Working with a sample of {len(informative_tweets_df_sample)} informative tweets.")
except FileNotFoundError:
    print(f"Error: CSV file not found at {processed_csv_path}")
    informative_tweets_df_sample = None
except Exception as e:
    print(f"Error loading CSV: {e}")
    informative_tweets_df_sample = None

# --- Load Mistral 7B Instruct v0.2 Model and Tokenizer ---
model_name_mistral = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer_mistral = AutoTokenizer.from_pretrained(model_name_mistral)
device = "cuda" if torch.cuda.is_available() else "cpu"
model_mistral = AutoModelForCausalLM.from_pretrained(model_name_mistral, torch_dtype=torch.float16 if device == "cuda" else torch.float32).to(device)

print(f"Model and tokenizer for {model_name_mistral} loaded on {device}.")

# --- Improved Retrieval Function (using the sample) ---
def retrieve_relevant_tweets_improved(user_query, knowledge_base_df, top_n=3):
    if knowledge_base_df is None or knowledge_base_df.empty:
        return []

    query_keywords = user_query.lower().split()
    tweet_scores = {}

    for index, row in knowledge_base_df.iterrows():
        tweet_text = row['cleaned_text'].lower()
        score = 0
        for keyword in query_keywords:
            if keyword in tweet_text:
                score += 1
        if score > 0:
            tweet_scores[row['cleaned_text']] = score

    sorted_tweets = sorted(tweet_scores.items(), key=lambda item: item[1], reverse=True)
    return [tweet for tweet, score in sorted_tweets[:top_n]]

# --- Generate Answer with Mistral ---
def generate_answer_with_mistral(user_question, retrieved_tweets, tokenizer, model):
    if not retrieved_tweets:
        return "No relevant information found to answer your question."

    context = "\n".join([f"Tweet {i+1}: {tweet}" for i, tweet in enumerate(retrieved_tweets)])
    prompt = f"""<s>[INST] Answer the user's question based on the following retrieved information:
{context}
User Question: {user_question} [/INST]"""

    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=500,  # Adjust as needed
            num_return_sequences=1,
            temperature=0.2,  # Adjust for creativity
            pad_token_id=tokenizer.eos_token_id,
        )

    generated_answer = tokenizer.decode(output[0], skip_special_tokens=True)
    assistant_start = generated_answer.find("[/INST]")
    if assistant_start != -1:
        return generated_answer[assistant_start + len("[/INST]") :].strip()
    else:
        return generated_answer.strip()

# --- Example RAG Workflow (using the sample DataFrame) ---
if informative_tweets_df_sample is not None:
    user_question = "What is the current situation with power outages?"
    relevant_info = retrieve_relevant_tweets_improved(user_question, informative_tweets_df_sample, top_n=3)

    print(f"\nRetrieved relevant information for query: '{user_question}' (Improved - Sample)")
    for i, tweet in enumerate(relevant_info):
        print(f"[{i+1}] {tweet}")

    answer = generate_answer_with_mistral(user_question, relevant_info, tokenizer_mistral, model_mistral)
    print("\n--- RAG Answer from Mistral 7B Instruct v0.2 (Sample) ---")
    print(answer)

else:
    print("Informative tweets DataFrame sample is not available.")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-package

Original informative tweets DataFrame loaded with 12855 entries.
Working with a sample of 100 informative tweets.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  warn(


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

ground truth csv

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
csv_path = '/content/drive/MyDrive/ground_truth_dataset.csv'

In [None]:
import pandas as pd

# --- Replace with the actual path to your friend's CSV file ---
csv_path = '/content/drive/MyDrive/ground_truth_dataset.csv'

try:
    # Load the CSV file into a pandas DataFrame named 'ground_truth_df'
    ground_truth_df = pd.read_csv(csv_path)
    print("CSV file loaded successfully.")
    print(ground_truth_df.head()) # Display the first few rows
    print(ground_truth_df.columns) # Display the column names

    # --- Create the 'Wildfire' label based on the 'state' column ---
    ground_truth_df['Wildfire'] = 'No'
    ground_truth_df.loc[ground_truth_df['state'] == 'California', 'Wildfire'] = 'Yes'

    print("\n'Wildfire' label created.")
    print(ground_truth_df[['state', 'Wildfire']].head()) # Display first few rows with 'Wildfire' label
    print("\nDistribution of 'Wildfire' label:")
    print(ground_truth_df['Wildfire'].value_counts()) # Show counts of 'Yes' and 'No'

except FileNotFoundError:
    print(f"Error: CSV file not found at {csv_path}")
except Exception as e:
    print(f"An error occurred: {e}")

CSV file loaded successfully.
       tweet_id                  image_id  \
0  9.177910e+17  917791044158185473_0.jpg   
1  9.177911e+17  917791130590183424_0.jpg   
2  9.177913e+17  917791291823591425_0.jpg   
3  9.177913e+17  917791291823591425_1.jpg   
4  9.177921e+17  917792092100988929_0.jpg   

                                      raw_tweet_text  \
0  RT @Gizmodo: Wildfires raging through Northern...   
1  PHOTOS: Deadly wildfires rage in California ht...   
2  RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...   
3  RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...   
4  RT @TIME: California's raging wildfires as you...   

                                          tweet_text tweet_hashtags  \
0  wildfires raging through northern california a...            NaN   
1         photos deadly wildfires rage in california            NaN   
2  pls share were capturing wildfire response rec...            NaN   
3  pls share were capturing wildfire response rec...            NaN   
4  cali

In [None]:
# Save the DataFrame to a new CSV file
output_csv_path = '/content/ground_truth_dataset_with_wildfire.csv'
ground_truth_df.to_csv(output_csv_path, index=False)

print(f"\nDataFrame with 'Wildfire' label saved to: {output_csv_path}")


DataFrame with 'Wildfire' label saved to: /content/ground_truth_dataset_with_wildfire.csv


Start with the state as the primary location. Since your dataset is focused on California wildfires, we know the state is relevant.

If sub_location is available (not NaN), append it to the state with a separator (e.g., ", "). This will provide more specific location details when they exist.

If sub_location is missing (NaN), just use "California" as the location.

In [None]:
# Create a new 'Location' column
ground_truth_df['Location'] = ground_truth_df.apply(
    lambda row: f"{row['state']}, {row['sub_location']}"
    if pd.notna(row['sub_location']) and row['state'] == 'California'
    else row['state'] if row['state'] == 'California'
    else 'No location mentioned',  # Handle cases where state might be missing (though unlikely here)
    axis=1
)

print("\n'Location' column created.")
print(ground_truth_df[['state', 'sub_location', 'Location']].head(10)) # Display first 10 rows
print("\nValue counts for 'Location':")
print(ground_truth_df['Location'].value_counts().head(20)) # Show top 20 locations


'Location' column created.
        state sub_location               Location
0  California     northern   California, northern
1  California          NaN             California
2         NaN          NaN  No location mentioned
3         NaN          NaN  No location mentioned
4  California          NaN             California
5  California          NaN             California
6  California    wildfires  California, wildfires
7  California          NaN             California
8  California          NaN             California
9  California          NaN             California

Value counts for 'Location':
Location
No location mentioned                            16730
California                                         966
California, northern                                88
California, southern                                15
California, santa rosa                               9
California, napa                                     8
California, wildfires                                

In [None]:
responder_mapping = {
    'evacuate': 'Fire Department, Emergency Management',
    'shelter': 'Red Cross, Emergency Management',
    'rescue': 'Search and Rescue Teams, Fire Department',
    'search': 'Search and Rescue Teams, Law Enforcement',
    'missing person': 'Search and Rescue Teams, Law Enforcement',
    'medical': 'Emergency Medical Services',
    'aid': 'Various Aid Organizations',
    'help': 'General Emergency Services',
    'fire': 'Fire Department',
    'burn': 'Fire Department',
    'monitor': 'Local Authorities, Emergency Services'
    # Add more keywords and responders as you analyze your 'take_action' data
}

In [None]:
def suggest_responders(row):
    if row['distress'] == 1:
        action = str(row['take_action']).lower()  # Convert to string and lowercase for matching
        responders = set()
        for keyword, suggested_responder in responder_mapping.items():
            if keyword in action:
                responders.add(suggested_responder)
        if responders:
            return ", ".join(responders)
        else:
            return "Responders unclear"
    else:
        return "Not applicable"

ground_truth_df['Responders (Suggested)'] = ground_truth_df.apply(suggest_responders, axis=1)

print("\n'Responders (Suggested)' column created.")
print(ground_truth_df[['distress', 'take_action', 'Responders (Suggested)']].head(20))
print("\nValue counts for 'Responders (Suggested)':")
print(ground_truth_df['Responders (Suggested)'].value_counts().head(20))


'Responders (Suggested)' column created.
    distress                          take_action  \
0          0                                  NaN   
1          0                                  NaN   
2          0                                  NaN   
3          0                                  NaN   
4          0                                  NaN   
5          0                                  NaN   
6          1  send evacuation and shelter support   
7          0                                  NaN   
8          0                                  NaN   
9          0                                  NaN   
10         0                                  NaN   
11         0                                  NaN   
12         0                                  NaN   
13         0                                  NaN   
14         1          start missing person search   
15         0                                  NaN   
16         1          start missing person search   
17  

In [None]:
inspection_sample = ground_truth_df.sample(frac=0.15, random_state=42) # Adjust fraction as needed
print(f"Generated a sample of {len(inspection_sample)} rows for manual inspection.")

Generated a sample of 2712 rows for manual inspection.


In [None]:
inspection_subset = inspection_sample[['raw_tweet_text', 'state', 'sub_location', 'Wildfire', 'distress', 'Location', 'take_action', 'Responders (Suggested)']]
print(inspection_subset.head(20)) # Display the first 20 rows of the sample

                                          raw_tweet_text       state  \
11482  . #Maria is now a weak and ragged looking Cat-...         NaN   
13112  Puerto Rico governor: I answered Trump... http...         NaN   
2501   RT @MPrendergastTX: Buffalo Bayou in Houston. ...         NaN   
322    Company Helps Coordinate Air Attack On Califor...  California   
8422   4th hr's back &amp; louder than Irma #whatifIr...         NaN   
15453  Turkish Red Crescent cooperates with Iraqi Red...         NaN   
11923  #PuertoRico has suffered immense devastation f...         NaN   
7695   BuzzFeed : This Florida county used an interpr...     Florida   
15472  President Dr. Kerem Kinik in Darbendixan distr...         NaN   
7945   .@SecretarySonny Perdue, @marcorubio and @TomR...     Florida   
290    Fire chief: We got outrun by the fires https:/...         NaN   
12303  Hurricane Maria Not Getting Same Amount of Cov...         NaN   
11326  This morning's update on #hurricanemaria - rem...        

*dataset* split

In [None]:
import pandas as pd
csv_path = '/content/drive/MyDrive/ground_truth_dataset_with_wildfire.csv'  # Or the path where you saved it
ground_truth_df = pd.read_csv(csv_path)
print("Ground truth dataset loaded.")

Ground truth dataset loaded.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Load the CSV file from the root directory ---
csv_path = '/content/drive/MyDrive/ground_truth_dataset_with_wildfire.csv'

try:
    ground_truth_df = pd.read_csv(csv_path)
    print("Ground truth dataset loaded.")

    # --- Determine the counts of each combination of 'Wildfire' and 'distress' ---
    label_counts = ground_truth_df.groupby(['Wildfire', 'distress']).size().reset_index(name='counts')
    print("Counts of each label combination:")
    print(label_counts)

    # --- Aim for roughly equal samples (up to 25) from each combination for the 100-sample split ---
    sample_size_per_group = 25
    sample_split = pd.DataFrame()
    sampled_indices = []

    for index, row in label_counts.iterrows():
        wildfire_label = row['Wildfire']
        distress_label = row['distress']
        count = row['counts']

        n_samples = min(count, sample_size_per_group)  # Take up to 25, or fewer if the group is smaller
        group_sample = ground_truth_df[
            (ground_truth_df['Wildfire'] == wildfire_label) & (ground_truth_df['distress'] == distress_label)
        ].sample(n=n_samples, random_state=42)

        sample_split = pd.concat([sample_split, group_sample])
        sampled_indices.extend(group_sample.index)

    print(f"\nSize of the Sample Split: {len(sample_split)}")
    print("\nDistribution of labels in the Sample Split:")
    print(sample_split.groupby(['Wildfire', 'distress']).size())

    # --- Create the remaining DataFrame by removing the sampled rows ---
    remaining_df = ground_truth_df.drop(sampled_indices)
    print(f"\nSize of the Remaining DataFrame: {len(remaining_df)}")

    # Now 'sample_split' contains your 100-sample (or close to it, balanced) held-out set
    # and 'remaining_df' contains the data for the 80/10/10 split.

except FileNotFoundError:
    print(f"Error: File not found at /content/ground-truth_dataset_with_wildfire.csv")
except Exception as e:
    print(f"An error occurred: {e}")

Ground truth dataset loaded.
Counts of each label combination:
  Wildfire  distress  counts
0       No         0   14896
1       No         1    1834
2      Yes         0    1204
3      Yes         1     148

Size of the Sample Split: 100

Distribution of labels in the Sample Split:
Wildfire  distress
No        0           25
          1           25
Yes       0           25
          1           25
dtype: int64

Size of the Remaining DataFrame: 17982


In [None]:
from sklearn.model_selection import train_test_split

# --- Split remaining_df into training (80%) and a temporary set (20%) ---
train_df, temp_df = train_test_split(
    remaining_df,
    test_size=0.2,
    stratify=remaining_df[['Wildfire', 'distress']],
    random_state=42
)

# --- Split the temporary set (20%) into validation (10%) and testing (10%) ---
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,  # 50% of the temp_df is 10% of the original
    stratify=temp_df[['Wildfire', 'distress']],
    random_state=42
)

print(f"Size of Training Set: {len(train_df)}")
print(f"Size of Validation Set: {len(val_df)}")
print(f"Size of Testing Set: {len(test_df)}")

print("\nDistribution of labels in Training Set:")
print(train_df.groupby(['Wildfire', 'distress']).size() / len(train_df))

print("\nDistribution of labels in Validation Set:")
print(val_df.groupby(['Wildfire', 'distress']).size() / len(val_df))

print("\nDistribution of labels in Testing Set:")
print(test_df.groupby(['Wildfire', 'distress']).size() / len(test_df))

Size of Training Set: 14385
Size of Validation Set: 1798
Size of Testing Set: 1799

Distribution of labels in Training Set:
Wildfire  distress
No        0           0.826973
          1           0.100591
Yes       0           0.065554
          1           0.006882
dtype: float64

Distribution of labels in Validation Set:
Wildfire  distress
No        0           0.827030
          1           0.100667
Yes       0           0.065628
          1           0.006674
dtype: float64

Distribution of labels in Testing Set:
Wildfire  distress
No        0           0.827126
          1           0.100611
Yes       0           0.065592
          1           0.006670
dtype: float64


t5-small

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# --- Load base T5 small model and tokenizer ---
model_name_base = "t5-small"
tokenizer_base = AutoTokenizer.from_pretrained(model_name_base)
device = "cuda" if torch.cuda.is_available() else "cpu"
model_base = AutoModelForSeq2SeqLM.from_pretrained(model_name_base).to(device)

print(f"Base model and tokenizer for {model_name_base} loaded on {device}.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Base model and tokenizer for t5-small loaded on cuda.


In [None]:
# --- Prepare prompts for the sample split ---
prompts = []
sample_tweets = sample_split['tweet_text'].tolist()

for tweet in sample_tweets:
    prompts.append(f"Is this tweet about a California wildfire? Tweet: {tweet}")
    prompts.append(f"Does this tweet indicate distress or emergency? Tweet: {tweet}")
    prompts.append(f"What location is mentioned in this tweet? Tweet: {tweet}")
    prompts.append(f"What action and responders are needed based on this tweet? Tweet: {tweet}")

# --- Tokenize the prompts ---
inputs = tokenizer_base.batch_encode_plus(prompts, return_tensors="pt", padding=True, truncation=True).to(model_base.device)

# --- Generate predictions ---
with torch.no_grad():
    outputs = model_base.generate(**inputs, max_length=50, num_return_sequences=1)

# --- Decode the predictions ---
predictions = tokenizer_base.batch_decode(outputs, skip_special_tokens=True)

# --- Display the prompts and predictions for the first few examples ---
num_examples = 5
for i in range(num_examples):
    tweet_index = i // 4
    question_index = i % 4
    question = ["Wildfire?", "Distress?", "Location?", "Action/Responders?"][question_index]
    print(f"Tweet: {sample_tweets[tweet_index][:50]}...")
    print(f"Question: {question}")
    print(f"Prediction: {predictions[i]}")
    print("-" * 30)

Tweet: chamillionaire starts the robins heart foundation ...
Question: Wildfire?
Prediction: Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.
------------------------------
Tweet: chamillionaire starts the robins heart foundation ...
Question: Distress?
Prediction: Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery? Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.
------------------------------
Tweet: chamillionaire starts the robins heart foundation ...
Question: Location?
Prediction: Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.
------------------------------
Tweet: chamillionaire starts the robins heart foundation ...
Question: Action/Responders?
Prediction: Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.
------------------------------
Tweet: hurricane maria moves north lee still far f

In [None]:
# --- Prepare prompts and generate predictions for a small sample of tweets ---
num_tweets_to_examine = 3  # You can change this number
sample_tweets = sample_split['tweet_text'].tolist()[:num_tweets_to_examine] # Take the first N tweets

all_predictions = []
all_prompts = []
original_tweets = []

for tweet in sample_tweets:
    original_tweets.append(tweet)
    prompts = [
        f"Is this tweet about a California wildfire? Tweet: {tweet}",
        f"Does this tweet indicate distress or emergency? Tweet: {tweet}",
        f"What location is mentioned in this tweet? Tweet: {tweet}",
        f"What action and responders are needed based on this tweet? Tweet: {tweet}"
    ]
    all_prompts.extend(prompts)

    inputs = tokenizer_base.batch_encode_plus(prompts, return_tensors="pt", padding=True, truncation=True).to(model_base.device)

    with torch.no_grad():
        outputs = model_base.generate(**inputs, max_length=50, num_return_sequences=1)

    predictions = tokenizer_base.batch_decode(outputs, skip_special_tokens=True)
    all_predictions.extend(predictions)

# --- Display the prompts and predictions ---
for i in range(len(original_tweets)):
    tweet = original_tweets[i]
    print(f"Tweet: {tweet[:50]}...")
    for j in range(4):
        question = ["Wildfire?", "Distress?", "Location?", "Action/Responders?"][j]
        prediction = all_predictions[i * 4 + j]
        print(f"  Question: {question}")
        print(f"  Prediction: {prediction}")
    print("-" * 30)

Tweet: chamillionaire starts the robins heart foundation ...
  Question: Wildfire?
  Prediction: Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.
  Question: Distress?
  Prediction: Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery? Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.
  Question: Location?
  Prediction: Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.
  Question: Action/Responders?
  Prediction: Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.
------------------------------
Tweet: hurricane maria moves north lee still far from lan...
  Question: Wildfire?
  Prediction: Tweet: hurricane maria moves north lee still far from land.
  Question: Distress?
  Prediction: Tweet:
  Question: Location?
  Prediction: Hurricane maria moves north lee still far from land
  Question: Action/Res

In [None]:
# Create a new 'Location' column
ground_truth_df['Location'] = ground_truth_df.apply(
    lambda row: f"{row['state']}, {row['sub_location']}"
    if pd.notna(row['sub_location']) and row['state'] == 'California'
    else row['state'] if row['state'] == 'California'
    else 'No location mentioned',  # Handle cases where state might be missing (though unlikely here)
    axis=1
)

print("\n'Location' column created.")
print(ground_truth_df[['state', 'sub_location', 'Location']].head(10))
print("\nValue counts for 'Location':")
print(ground_truth_df['Location'].value_counts().head(20))


'Location' column created.
        state sub_location               Location
0  California     northern   California, northern
1  California          NaN             California
2         NaN          NaN  No location mentioned
3         NaN          NaN  No location mentioned
4  California          NaN             California
5  California          NaN             California
6  California    wildfires  California, wildfires
7  California          NaN             California
8  California          NaN             California
9  California          NaN             California

Value counts for 'Location':
Location
No location mentioned                            16730
California                                         966
California, northern                                88
California, southern                                15
California, santa rosa                               9
California, napa                                     8
California, wildfires                                

In [None]:
print("Columns in ground_truth_df after creating 'Location':")
print(ground_truth_df.columns)
print("\nColumns in sample_split:")
print(sample_split.columns)

Columns in ground_truth_df after creating 'Location':
Index(['tweet_id', 'image_id', 'raw_tweet_text', 'tweet_text',
       'tweet_hashtags', 'image_caption', 'distress', 'take_action', 'state',
       'sub_location', 'Wildfire', 'Location'],
      dtype='object')

Columns in sample_split:
Index(['tweet_id', 'image_id', 'raw_tweet_text', 'tweet_text',
       'tweet_hashtags', 'image_caption', 'distress', 'take_action', 'state',
       'sub_location', 'Wildfire'],
      dtype='object')


In [None]:
# --- Aim for roughly equal samples (up to 25) from each combination for the 100-sample split ---
sample_size_per_group = 25
sample_split = pd.DataFrame()
sampled_indices = []

for index, row in label_counts.iterrows():
    wildfire_label = row['Wildfire']
    distress_label = row['distress']
    count = row['counts']

    n_samples = min(count, sample_size_per_group)  # Take up to 25, or fewer if the group is smaller
    group_sample = ground_truth_df[
        (ground_truth_df['Wildfire'] == wildfire_label) & (ground_truth_df['distress'] == distress_label)
    ].sample(n=n_samples, random_state=42)

    sample_split = pd.concat([sample_split, group_sample])
    sampled_indices.extend(group_sample.index)

print(f"\nSize of the Sample Split: {len(sample_split)}")
print("\nDistribution of labels in the Sample Split:")
print(sample_split.groupby(['Wildfire', 'distress']).size())

# --- Create the remaining DataFrame by removing the sampled rows ---
remaining_df = ground_truth_df.drop(sampled_indices)
print(f"\nSize of the Remaining DataFrame: {len(remaining_df)}")


Size of the Sample Split: 100

Distribution of labels in the Sample Split:
Wildfire  distress
No        0           25
          1           25
Yes       0           25
          1           25
dtype: int64

Size of the Remaining DataFrame: 17982


In [None]:
responder_mapping = {
    'evacuate': 'Fire Department, Emergency Management',
    'shelter': 'Red Cross, Emergency Management',
    'rescue': 'Search and Rescue Teams, Fire Department',
    'search': 'Search and Rescue Teams, Law Enforcement',
    'missing person': 'Search and Rescue Teams, Law Enforcement',
    'medical': 'Emergency Medical Services',
    'aid': 'Various Aid Organizations',
    'help': 'General Emergency Services',
    'fire': 'Fire Department',
    'burn': 'Fire Department',
    'monitor': 'Local Authorities, Emergency Services'
    # Add more keywords and responders as you analyze your 'take_action' data
}

def suggest_responders(row):
    if row['distress'] == 1:
        action = str(row['take_action']).lower()  # Convert to string and lowercase for matching
        responders = set()
        for keyword, suggested_responder in responder_mapping.items():
            if keyword in action:
                responders.add(suggested_responder)
        if responders:
            return ", ".join(responders)
        else:
            return "Responders unclear"
    else:
        return "Not applicable"

ground_truth_df['Responders (Suggested)'] = ground_truth_df.apply(suggest_responders, axis=1)

print("\n'Responders (Suggested)' column created.")
print(ground_truth_df[['distress', 'take_action', 'Responders (Suggested)']].head(20))
print("\nValue counts for 'Responders (Suggested)':")
print(ground_truth_df['Responders (Suggested)'].value_counts().head(20))


'Responders (Suggested)' column created.
    distress                          take_action  \
0          0                                  NaN   
1          0                                  NaN   
2          0                                  NaN   
3          0                                  NaN   
4          0                                  NaN   
5          0                                  NaN   
6          1  send evacuation and shelter support   
7          0                                  NaN   
8          0                                  NaN   
9          0                                  NaN   
10         0                                  NaN   
11         0                                  NaN   
12         0                                  NaN   
13         0                                  NaN   
14         1          start missing person search   
15         0                                  NaN   
16         1          start missing person search   
17  

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Step 1: Load Data ---
csv_path = '/content/drive/MyDrive/ground_truth_dataset_with_wildfire.csv'
ground_truth_df = pd.read_csv(csv_path)
print("Data loaded.")

# --- Step 2: Create 'Wildfire' Column ---
ground_truth_df['Wildfire'] = 'No'
ground_truth_df.loc[ground_truth_df['state'] == 'California', 'Wildfire'] = 'Yes'
print("'Wildfire' column created.")

# --- Step 3: Create 'Location' Column ---
ground_truth_df['Location'] = ground_truth_df.apply(
    lambda row: f"{row['state']}, {row['sub_location']}"
    if pd.notna(row['sub_location']) and row['state'] == 'California'
    else row['state'] if row['state'] == 'California'
    else 'No location mentioned',
    axis=1
)
print("'Location' column created.")

# --- Step 4: Create 'Responders (Suggested)' Column ---
responder_mapping = {
    'evacuate': 'Fire Department, Emergency Management',
    'shelter': 'Red Cross, Emergency Management',
    'rescue': 'Search and Rescue Teams, Fire Department',
    'search': 'Search and Rescue Teams, Law Enforcement',
    'missing person': 'Search and Rescue Teams, Law Enforcement',
    'medical': 'Emergency Medical Services',
    'aid': 'Various Aid Organizations',
    'help': 'General Emergency Services',
    'fire': 'Fire Department',
    'burn': 'Fire Department',
    'monitor': 'Local Authorities, Emergency Services'
}

def suggest_responders(row):
    if row['distress'] == 1:
        action = str(row['take_action']).lower()
        responders = set()
        for keyword, suggested_responder in responder_mapping.items():
            if keyword in action:
                responders.add(suggested_responder)
        if responders:
            return ", ".join(responders)
        else:
            return "Responders unclear"
    else:
        return "Not applicable"

ground_truth_df['Responders (Suggested)'] = ground_truth_df.apply(suggest_responders, axis=1)
print("'Responders (Suggested)' column created.")

# --- Step 5: Create sample_split ---
label_counts = ground_truth_df.groupby(['Wildfire', 'distress']).size().reset_index(name='counts')

sample_size_per_group = 25
sample_split = pd.DataFrame()
sampled_indices = []

for index, row in label_counts.iterrows():
    wildfire_label = row['Wildfire']
    distress_label = row['distress']
    count = row['counts']

    n_samples = min(count, sample_size_per_group)
    group_sample = ground_truth_df[
        (ground_truth_df['Wildfire'] == wildfire_label) & (ground_truth_df['distress'] == distress_label)
    ].sample(n=n_samples, random_state=42)

    sample_split = pd.concat([sample_split, group_sample])
    sampled_indices.extend(group_sample.index)

print("sample_split created.")

# --- Step 6: Check Columns of sample_split ---
print("Columns in sample_split after creation:")
print(sample_split.columns)

# --- Step 7: Prepare Ground Truth for Evaluation ---
ground_truth_wildfire = sample_split['Wildfire'].tolist()
ground_truth_distress = sample_split['distress'].tolist()
ground_truth_location = sample_split['Location'].tolist()
ground_truth_action = sample_split['take_action'].tolist()
ground_truth_responders = sample_split['Responders (Suggested)'].tolist()
print("Ground truth prepared for evaluation.")

Data loaded.
'Wildfire' column created.
'Location' column created.
'Responders (Suggested)' column created.
sample_split created.
Columns in sample_split after creation:
Index(['tweet_id', 'image_id', 'raw_tweet_text', 'tweet_text',
       'tweet_hashtags', 'image_caption', 'distress', 'take_action', 'state',
       'sub_location', 'Wildfire', 'Location', 'Responders (Suggested)'],
      dtype='object')
Ground truth prepared for evaluation.


In [None]:
# --- Prepare prompts for the entire sample split ---
prompts = []
sample_tweets = sample_split['tweet_text'].tolist()

for tweet in sample_tweets:
    prompts.append(f"Is this tweet about a California wildfire? Tweet: {tweet}")
    prompts.append(f"Does this tweet indicate distress or emergency? Tweet: {tweet}")
    prompts.append(f"What location is mentioned in this tweet? Tweet: {tweet}")
    prompts.append(f"What action and responders are needed based on this tweet? Tweet: {tweet}")

# --- Tokenize and generate predictions for the entire sample split ---
inputs = tokenizer_base.batch_encode_plus(prompts, return_tensors="pt", padding=True, truncation=True).to(model_base.device)

with torch.no_grad():
    outputs = model_base.generate(**inputs, max_length=50, num_return_sequences=1)

predictions = tokenizer_base.batch_decode(outputs, skip_special_tokens=True)

# --- Reshape predictions to align with the four questions per tweet ---
reshaped_predictions = [predictions[i:i + 4] for i in range(0, len(predictions), 4)]

print("Predictions generated for the entire Sample Split.")
print(f"Number of tweets in Sample Split: {len(sample_tweets)}")
print(f"Number of sets of predictions: {len(reshaped_predictions)}")
print("First example:")
print(f"Tweet: {sample_tweets[0][:50]}...")
print(f"Predictions: {reshaped_predictions[0]}")

Predictions generated for the entire Sample Split.
Number of tweets in Sample Split: 100
Number of sets of predictions: 100
First example:
Tweet: chamillionaire starts the robins heart foundation ...
Predictions: ['Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.', 'Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery? Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.', 'Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.', 'Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.']


In [None]:
from sklearn.metrics import accuracy_score, f1_score

# --- Map T5 predictions to Yes/No for Wildfire ---
predicted_wildfire = []
for prediction in reshaped_predictions:
    if any(keyword in prediction[0].lower() for keyword in ["yes", "it is", "wildfire", "fire", "burn"]):
        predicted_wildfire.append("Yes")
    else:
        predicted_wildfire.append("No")

# --- Map T5 predictions to Yes/No for Distress ---
predicted_distress = []
for prediction in reshaped_predictions:
    if any(keyword in prediction[1].lower() for keyword in ["help", "urgent", "emergency", "need", "assistance", "critical", "danger"]):
        predicted_distress.append("Yes")
    else:
        predicted_distress.append("No")

# --- Evaluate Wildfire detection ---
wildfire_accuracy = accuracy_score(ground_truth_wildfire, predicted_wildfire)
wildfire_f1 = f1_score(
    [1 if label == "Yes" else 0 for label in ground_truth_wildfire],
    [1 if label == "Yes" else 0 for label in predicted_wildfire]
)

print(f"Wildfire Detection Accuracy: {wildfire_accuracy:.4f}")
print(f"Wildfire Detection F1 Score: {wildfire_f1:.4f}")

# --- Evaluate Distress detection ---
# Note: ground_truth_distress is 0 or 1, so we map predicted_distress accordingly
distress_accuracy = accuracy_score(
    ground_truth_distress, [1 if label == "Yes" else 0 for label in predicted_distress]
)
distress_f1 = f1_score(ground_truth_distress, [1 if label == "Yes" else 0 for label in predicted_distress])

print(f"Distress Detection Accuracy: {distress_accuracy:.4f}")
print(f"Distress Detection F1 Score: {distress_f1:.4f}")

Wildfire Detection Accuracy: 0.9000
Wildfire Detection F1 Score: 0.9057
Distress Detection Accuracy: 0.6400
Distress Detection F1 Score: 0.5000


In [None]:
import nltk
import inspect

print("Contents of nltk.metrics:")
print(inspect.getmembers(nltk.metrics))

Contents of nltk.metrics:
All Rights Reserved.

Copyright (c) 2000 BeOpen.com.
All Rights Reserved.

Copyright (c) 1995-2001 Corporation for National Research Initiatives.
All Rights Reserved.

Copyright (c) 1991-1995 Stichting Mathematisch Centrum, Amsterdam.
All Rights Reserved., 'credits':     Thanks to CWI, CNRI, BeOpen.com, Zope Corporation and a cast of thousands
    for supporting Python development.  See www.python.org for more information., 'license': Type license() to see the full license text, 'help': Type help() for interactive help, or help(object) for help about object., 'execfile': <function execfile at 0x7be6332b72e0>, 'runfile': <function runfile at 0x7be633160f40>, '__IPYTHON__': True, 'display': <function display at 0x7be6341dde40>, '__pybind11_internals_v4_gcc_libstdcpp_cxxabi1014__': <capsule object NULL at 0x7be62ccaa880>, '__pybind11_internals_v4_gcc_libstdcpp_cxxabi1011__': <capsule object NULL at 0x7be5da242ca0>, '__pybind11_internals_v4_clang_libstdcpp_cxxabi1

In [None]:
import nltk
import inspect

print("Contents of nltk.translate.metrics:")
print(inspect.getmembers(nltk.translate.metrics))

Contents of nltk.translate.metrics:
All Rights Reserved.

Copyright (c) 2000 BeOpen.com.
All Rights Reserved.

Copyright (c) 1995-2001 Corporation for National Research Initiatives.
All Rights Reserved.

Copyright (c) 1991-1995 Stichting Mathematisch Centrum, Amsterdam.
All Rights Reserved., 'credits':     Thanks to CWI, CNRI, BeOpen.com, Zope Corporation and a cast of thousands
    for supporting Python development.  See www.python.org for more information., 'license': Type license() to see the full license text, 'help': Type help() for interactive help, or help(object) for help about object., 'execfile': <function execfile at 0x7be6332b72e0>, 'runfile': <function runfile at 0x7be633160f40>, '__IPYTHON__': True, 'display': <function display at 0x7be6341dde40>, '__pybind11_internals_v4_gcc_libstdcpp_cxxabi1014__': <capsule object NULL at 0x7be62ccaa880>, '__pybind11_internals_v4_gcc_libstdcpp_cxxabi1011__': <capsule object NULL at 0x7be5da242ca0>, '__pybind11_internals_v4_clang_libstdc

In [None]:
!pip install rouge



In [None]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

rouge = Rouge()

# --- Evaluate Location Prediction ---
location_bleu_scores = []
location_rouge_scores = []

for i in range(len(sample_tweets)):
    reference = ground_truth_location[i].lower()
    prediction = reshaped_predictions[i][2].lower()
    if reference and prediction:  # Avoid errors with empty strings
        reference_list = [reference.split()]
        prediction_list = prediction.split()
        bleu_score = sentence_bleu(reference_list, prediction_list)
        scores = rouge.get_scores(prediction, reference)
        location_bleu_scores.append(bleu_score)
        if scores:
            location_rouge_scores.append(scores[0])
        else:
            location_rouge_scores.append({'rouge-1': {'f': 0}, 'rouge-l': {'f': 0}}) # Handle cases with no scores

avg_location_bleu = sum(location_bleu_scores) / len(location_bleu_scores) if location_bleu_scores else 0
avg_location_rouge_1 = sum(score['rouge-1']['f'] for score in location_rouge_scores) / len(location_rouge_scores) if location_rouge_scores else 0
avg_location_rouge_l = sum(score['rouge-l']['f'] for score in location_rouge_scores) / len(location_rouge_scores) if location_rouge_scores else 0

print(f"\nAverage BLEU Score (Location): {avg_location_bleu:.4f}")
print(f"Average ROUGE-1 F1 Score (Location): {avg_location_rouge_1:.4f}")
print(f"Average ROUGE-L F1 Score (Location): {avg_location_rouge_l:.4f}")

# --- Evaluate Action/Responders Prediction ---
action_bleu_scores = []
action_rouge_scores = []

for i in range(len(sample_tweets)):
    reference = (str(ground_truth_action[i]) + " " + str(ground_truth_responders[i])).lower()
    prediction = reshaped_predictions[i][3].lower()
    if reference and prediction:  # Avoid errors with empty strings
        reference_list = [reference.split()]
        prediction_list = prediction.split()
        bleu_score = sentence_bleu(reference_list, prediction_list)
        scores = rouge.get_scores(prediction, reference)
        action_bleu_scores.append(bleu_score)
        if scores:
            action_rouge_scores.append(scores[0])
        else:
            action_rouge_scores.append({'rouge-1': {'f': 0}, 'rouge-l': {'f': 0}}) # Handle cases with no scores

avg_action_bleu = sum(action_bleu_scores) / len(action_bleu_scores) if action_bleu_scores else 0
avg_action_rouge_1 = sum(score['rouge-1']['f'] for score in action_rouge_scores) / len(action_rouge_scores) if action_rouge_scores else 0
avg_action_rouge_l = sum(score['rouge-l']['f'] for score in action_rouge_scores) / len(action_rouge_scores) if action_rouge_scores else 0

print(f"\nAverage BLEU Score (Action/Responders): {avg_action_bleu:.4f}")
print(f"Average ROUGE-1 F1 Score (Action/Responders): {avg_action_rouge_1:.4f}")
print(f"Average ROUGE-L F1 Score (Action/Responders): {avg_action_rouge_l:.4f}")


Average BLEU Score (Location): 0.0000
Average ROUGE-1 F1 Score (Location): 0.0732
Average ROUGE-L F1 Score (Location): 0.0732

Average BLEU Score (Action/Responders): 0.0000
Average ROUGE-1 F1 Score (Action/Responders): 0.0130
Average ROUGE-L F1 Score (Action/Responders): 0.0130


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()




```
# This is formatted as code
```

Phase 4: Instruction Fine-Tuning.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Step 1: Load Data ---
csv_path = '/content/drive/MyDrive/ground_truth_dataset_with_wildfire.csv'
ground_truth_df = pd.read_csv(csv_path)
print("First few rows of ground_truth_df after loading:")
print(ground_truth_df.head())
print("Data loaded.")

# --- Step 2: Create 'Wildfire' Column ---
ground_truth_df['Wildfire'] = 'No'
ground_truth_df.loc[ground_truth_df['state'] == 'California', 'Wildfire'] = 'Yes'
print("'Wildfire' column created.")

# --- Step 3: Create 'Location' Column ---
ground_truth_df['Location'] = ground_truth_df.apply(
    lambda row: f"{row['state']}, {row['sub_location']}"
    if pd.notna(row['sub_location']) and row['state'] == 'California'
    else row['state'] if row['state'] == 'California'
    else 'No location mentioned',
    axis=1
)
print("'Location' column created.")

# --- Step 4: Create 'Responders (Suggested)' Column ---
responder_mapping = {
    'evacuate': 'Fire Department, Emergency Management',
    'shelter': 'Red Cross, Emergency Management',
    'rescue': 'Search and Rescue Teams, Fire Department',
    'search': 'Search and Rescue Teams, Law Enforcement',
    'missing person': 'Search and Rescue Teams, Law Enforcement',
    'medical': 'Emergency Medical Services',
    'aid': 'Various Aid Organizations',
    'help': 'General Emergency Services',
    'fire': 'Fire Department',
    'burn': 'Fire Department',
    'monitor': 'Local Authorities, Emergency Services'
}

def suggest_responders(row):
    if row['distress'] == 1:
        action = str(row['take_action']).lower()
        responders = set()
        for keyword, suggested_responder in responder_mapping.items():
            if keyword in action:
                responders.add(suggested_responder)
        if responders:
            return ", ".join(responders)
        else:
            return "Responders unclear"
    else:
        return "Not applicable"

ground_truth_df['Responders (Suggested)'] = ground_truth_df.apply(suggest_responders, axis=1)
print("'Responders (Suggested)' column created.")

# --- Step 5: Create sample_split ---
label_counts = ground_truth_df.groupby(['Wildfire', 'distress']).size().reset_index(name='counts')

sample_size_per_group = 25
sample_split = pd.DataFrame()
sampled_indices = []

for index, row in label_counts.iterrows():
    wildfire_label = row['Wildfire']
    distress_label = row['distress']
    count = row['counts']

    n_samples = min(count, sample_size_per_group)
    group_sample = ground_truth_df[
        (ground_truth_df['Wildfire'] == wildfire_label) & (ground_truth_df['distress'] == distress_label)
    ].sample(n=n_samples, random_state=42)

    sample_split = pd.concat([sample_split, group_sample])
    sampled_indices.extend(group_sample.index)

print("sample_split created.")

# --- Step 6: Check Columns of sample_split ---
print("Columns in sample_split after creation:")
print(sample_split.columns)

# --- Step 7: Prepare Ground Truth for Evaluation ---
ground_truth_wildfire = sample_split['Wildfire'].tolist()
ground_truth_distress = sample_split['distress'].tolist()
ground_truth_location = sample_split['Location'].tolist()
ground_truth_action = sample_split['take_action'].tolist()
ground_truth_responders = sample_split['Responders (Suggested)'].tolist()
print("Ground truth prepared for evaluation.")

First few rows of ground_truth_df after loading:
       tweet_id                  image_id  \
0  9.177910e+17  917791044158185473_0.jpg   
1  9.177911e+17  917791130590183424_0.jpg   
2  9.177913e+17  917791291823591425_0.jpg   
3  9.177913e+17  917791291823591425_1.jpg   
4  9.177921e+17  917792092100988929_0.jpg   

                                      raw_tweet_text  \
0  RT @Gizmodo: Wildfires raging through Northern...   
1  PHOTOS: Deadly wildfires rage in California ht...   
2  RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...   
3  RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...   
4  RT @TIME: California's raging wildfires as you...   

                                          tweet_text tweet_hashtags  \
0  wildfires raging through northern california a...            NaN   
1         photos deadly wildfires rage in california            NaN   
2  pls share were capturing wildfire response rec...            NaN   
3  pls share were capturing wildfire response rec...       

In [None]:
# --- Split the data into training, validation, and test sets ---
train_df, temp_df = train_test_split(ground_truth_df, test_size=0.2, random_state=42, stratify=ground_truth_df[['Wildfire', 'distress']])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df[['Wildfire', 'distress']])

print(f"Size of training set: {len(train_df)}")
print(f"Size of validation set: {len(val_df)}")
print(f"Size of test set: {len(test_df)}")

Size of training set: 14465
Size of validation set: 1808
Size of test set: 1809


In [None]:
# --- Step 8: Prepare the training data from train_df ---
train_data = []
for index, row in train_df.iterrows():
    tweet = row['tweet_text']
    wildfire_answer = row['Wildfire']
    distress_answer = 1 if row['distress'] == 1 else 0  # Keep as 0/1 for consistency
    location_answer = row['Location']
    action_responders_answer = f"{row['take_action']} {row['Responders (Suggested)']}"

    train_data.append({
        'prompt': f"Is this tweet about a California wildfire? Tweet: {tweet}",
        'target': wildfire_answer
    })
    train_data.append({
        'prompt': f"Does this tweet indicate distress or emergency? Tweet: {tweet}",
        'target': str(distress_answer)  # Convert to string for text generation
    })
    train_data.append({
        'prompt': f"What location is mentioned in this tweet? Tweet: {tweet}",
        'target': location_answer
    })
    train_data.append({
        'prompt': f"What action and responders are needed based on this tweet? Tweet: {tweet}",
        'target': action_responders_answer
    })

print(f"Number of training examples: {len(train_data)}")
print("First training example:")
print(train_data[0])

Number of training examples: 57860
First training example:
{'prompt': 'Is this tweet about a California wildfire? Tweet: irma victims need our help they cant recover on their own #irmarecovery #irmavictims 9donate medical suppliesb', 'target': 'No'}


# Llama

In [None]:
!pip install transformers torch peft accelerate



In [None]:
from huggingface_hub import login

login(token="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")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "meta-llama/Llama-2-7b-hf"

try:
    tokenizer_llama = AutoTokenizer.from_pretrained(model_name)
    model_llama = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",  # Automatically put the model on available GPU(s)
    )
    print("LLaMA-2-7b loaded successfully!")
except Exception as e:
    print(f"Error loading LLaMA-2-7b: {e}")
    print("Please ensure you have accepted the terms on Hugging Face and have a valid access token if required.")
    print("We might need to consider a different model or approach.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



LLaMA-2-7b loaded successfully!


In [None]:
tokenizer_llama.pad_token = tokenizer_llama.eos_token
print(f"Padding token set to: {tokenizer_llama.pad_token}")

Padding token set to: </s>


In [None]:
def prepare_llama_data(data, tokenizer, max_length=512):
    tokenized_inputs = []
    for item in data:
        prompt = item['prompt']
        target = item['target']

        prompt_encodings = tokenizer(
            prompt,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )
        target_encodings = tokenizer(
            target,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )

        tokenized_inputs.append({
            'input_ids': prompt_encodings['input_ids'][0],
            'attention_mask': prompt_encodings['attention_mask'][0],
            'labels': target_encodings['input_ids'][0],
        })
    return tokenized_inputs

# Prepare the training data
processed_train_data_llama = prepare_llama_data(train_data, tokenizer_llama)

print(f"Number of processed training examples: {len(processed_train_data_llama)}")
print("First processed training example:")
print(processed_train_data_llama[0])

Number of processed training examples: 57860
First processed training example:
{'input_ids': tensor([    1,  1317,   445,  7780,   300,  1048,   263,  8046,  8775,  8696,
        29973,   323, 16668, 29901,  3805,   655,  6879,  9893,   817,  1749,
         1371,   896,  5107,  9792,   373,  1009,  1914,   396,  3568,   598,
        11911, 29891,   396,  3568,   485,   919,  9893, 29871, 29929,  9176,
          403, 16083, 28075, 29890,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,

Given the size of the LLaMA 2 7B model, it's highly likely that we'll run into memory issues if we try to fine-tune the entire model on a standard Colab GPU. To address this, we'll use LoRA (Low-Rank Adaptation).

What is LoRA?

LoRA is a Parameter-Efficient Fine-Tuning (PEFT) technique that freezes the pre-trained model weights and adds a small number of new trainable layers (called "adapters"). These adapters are low-rank matrices, which means they have far fewer parameters than the original model. During fine-tuning, only these adapter weights are updated, significantly reducing the memory footprint and training time.

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,  # Rank of the LoRA matrices
    lora_alpha=32,  # Scaling factor for LoRA weights
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "up_proj",
        "down_proj",
        "gate_proj",
    ]
)

# Get the LoRA model
model_lora = get_peft_model(model_llama, lora_config)
model_lora.print_trainable_parameters()

trainable params: 19,988,480 || all params: 6,758,404,096 || trainable%: 0.2958


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
import torch

model_name = "meta-llama/Llama-2-7b-hf"

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

try:
    tokenizer_llama = AutoTokenizer.from_pretrained(model_name)
    tokenizer_llama.pad_token = tokenizer_llama.eos_token

    # Configure 4-bit quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        llm_int8_enable_fp32_cpu_offload=False,
    )

    # Load the base model directly onto the GPU with quantization
    model_llama = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map={"": device},  # Load directly to GPU
    )
    print("LLaMA-2-7b loaded with 4-bit quantization onto:", device)

    # Configure LoRA
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "up_proj",
            "down_proj",
            "gate_proj",
        ]
    )

    # Get the LoRA model
    model_lora = get_peft_model(model_llama, lora_config)
    model_lora.print_trainable_parameters()

    from transformers import TrainingArguments, Trainer

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./llama-2-7b-lora-fine-tune",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=16,
        learning_rate=2e-4,
        num_train_epochs=3,
        fp16=True if device == "cuda" else False,
        logging_dir="./logs",
        logging_strategy="steps",
        logging_steps=10,
        save_strategy="epoch",
        save_total_limit=2,
        report_to="tensorboard"
    )

    # Create the Trainer instance
    trainer = Trainer(
        model=model_lora,
        train_dataset=processed_train_data_llama,
        eval_dataset=None,
        args=training_args,
        data_collator=lambda data: {k: torch.stack([f[k] for f in data]) for k in data[0]},
    )

    # Start training
    trainer.train()

except Exception as e:
    print(f"An error occurred: {e}")
    print("Please check the error message for more details.")

Using device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

An error occurred: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 16.12 MiB is free. Process 416338 has 14.72 GiB memory in use. Of the allocated memory 14.41 GiB is allocated by PyTorch, and 189.06 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Please check the error message for more details.


Excellent! The output confirms that LoRA has been successfully applied to the LLaMA 2 model.

As you can see:

Trainable parameters: 19,988,480
Total parameters: 6,758,404,096
Trainable percentage: 0.2958%
This is a dramatic reduction in the number of parameters that will be updated during training. Only about 0.3% of the model's total parameters will be trained, which will significantly reduce memory usage and speed up the fine-tuning process, making it feasible to run on a Colab GPU.

Now that we have our LoRA-adapted LLaMA 2 model and our processed training data, the next step is to set up the training using the Hugging Face Trainer API.

In [None]:
!pip uninstall -y bitsandbytes

Found existing installation: bitsandbytes 0.45.5
Uninstalling bitsandbytes-0.45.5:
  Successfully uninstalled bitsandbytes-0.45.5


In [None]:
!pip install bitsandbytes

Collecting bitsandbytes
  Using cached bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Using cached bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.5


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
import torch

model_name = "meta-llama/Llama-2-7b-hf"

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

try:
    tokenizer_llama = AutoTokenizer.from_pretrained(model_name)
    tokenizer_llama.pad_token = tokenizer_llama.eos_token

    # Configure 4-bit quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        llm_int8_enable_fp32_cpu_offload=False,
    )

    model_llama = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map="auto",
    )
    print("LLaMA-2-7b loaded with 4-bit quantization.")

    # Configure LoRA
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "up_proj",
            "down_proj",
            "gate_proj",
        ]
    )

    # Get the LoRA model
    model_lora = get_peft_model(model_llama, lora_config)
    model_lora.print_trainable_parameters()

    from transformers import TrainingArguments, Trainer

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./llama-2-7b-lora-fine-tune",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=16,
        learning_rate=2e-4,
        num_train_epochs=3,
        fp16=True if device == "cuda" else False,
        logging_dir="./logs",
        logging_strategy="steps",
        logging_steps=10,
        save_strategy="epoch",
        save_total_limit=2,
        report_to="tensorboard"
    )

    # Create the Trainer instance
    trainer = Trainer(
        model=model_lora,
        train_dataset=processed_train_data_llama,
        eval_dataset=None,
        args=training_args,
        data_collator=lambda data: {k: torch.stack([f[k] for f in data]) for k in data[0]},
    )

    # Start training
    trainer.train()

except Exception as e:
    print(f"An error occurred: {e}")
    print("Please check the error message for more details.")

Using device: cuda
An error occurred: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 
Please check the error message for more details.
