In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
csv_path = '/content/drive/MyDrive/ground_truth_dataset.csv'

In [None]:
import pandas as pd

# --- Replace with the actual path to your friend's CSV file ---
csv_path = '/content/drive/MyDrive/ground_truth_dataset.csv'

try:
    # Load the CSV file into a pandas DataFrame named 'ground_truth_df'
    ground_truth_df = pd.read_csv(csv_path)
    print("CSV file loaded successfully.")
    print(ground_truth_df.head()) # Display the first few rows
    print(ground_truth_df.columns) # Display the column names

    # --- Create the 'Wildfire' label based on the 'state' column ---
    ground_truth_df['Wildfire'] = 'No'
    ground_truth_df.loc[ground_truth_df['state'] == 'California', 'Wildfire'] = 'Yes'

    print("\n'Wildfire' label created.")
    print(ground_truth_df[['state', 'Wildfire']].head()) # Display first few rows with 'Wildfire' label
    print("\nDistribution of 'Wildfire' label:")
    print(ground_truth_df['Wildfire'].value_counts()) # Show counts of 'Yes' and 'No'

except FileNotFoundError:
    print(f"Error: CSV file not found at {csv_path}")
except Exception as e:
    print(f"An error occurred: {e}")

CSV file loaded successfully.
       tweet_id                  image_id  \
0  9.177910e+17  917791044158185473_0.jpg   
1  9.177911e+17  917791130590183424_0.jpg   
2  9.177913e+17  917791291823591425_0.jpg   
3  9.177913e+17  917791291823591425_1.jpg   
4  9.177921e+17  917792092100988929_0.jpg   

                                      raw_tweet_text  \
0  RT @Gizmodo: Wildfires raging through Northern...   
1  PHOTOS: Deadly wildfires rage in California ht...   
2  RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...   
3  RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...   
4  RT @TIME: California's raging wildfires as you...   

                                          tweet_text tweet_hashtags  \
0  wildfires raging through northern california a...            NaN   
1         photos deadly wildfires rage in california            NaN   
2  pls share were capturing wildfire response rec...            NaN   
3  pls share were capturing wildfire response rec...            NaN   
4  cali

In [None]:
# Save the DataFrame to a new CSV file
output_csv_path = '/content/ground_truth_dataset_with_wildfire.csv'
ground_truth_df.to_csv(output_csv_path, index=False)

print(f"\nDataFrame with 'Wildfire' label saved to: {output_csv_path}")


DataFrame with 'Wildfire' label saved to: /content/ground_truth_dataset_with_wildfire.csv


Start with the state as the primary location. Since your dataset is focused on California wildfires, we know the state is relevant.

If sub_location is available (not NaN), append it to the state with a separator (e.g., ", "). This will provide more specific location details when they exist.

If sub_location is missing (NaN), just use "California" as the location.

In [None]:
# Create a new 'Location' column
ground_truth_df['Location'] = ground_truth_df.apply(
    lambda row: f"{row['state']}, {row['sub_location']}"
    if pd.notna(row['sub_location']) and row['state'] == 'California'
    else row['state'] if row['state'] == 'California'
    else 'No location mentioned',  # Handle cases where state might be missing (though unlikely here)
    axis=1
)

print("\n'Location' column created.")
print(ground_truth_df[['state', 'sub_location', 'Location']].head(10)) # Display first 10 rows
print("\nValue counts for 'Location':")
print(ground_truth_df['Location'].value_counts().head(20)) # Show top 20 locations


'Location' column created.
        state sub_location               Location
0  California     northern   California, northern
1  California          NaN             California
2         NaN          NaN  No location mentioned
3         NaN          NaN  No location mentioned
4  California          NaN             California
5  California          NaN             California
6  California    wildfires  California, wildfires
7  California          NaN             California
8  California          NaN             California
9  California          NaN             California

Value counts for 'Location':
Location
No location mentioned                            16730
California                                         966
California, northern                                88
California, southern                                15
California, santa rosa                               9
California, napa                                     8
California, wildfires                                

In [None]:
  responder_mapping = {
    'evacuate': 'Fire Department, Emergency Management',
    'shelter': 'Red Cross, Emergency Management',
    'rescue': 'Search and Rescue Teams, Fire Department',
    'search': 'Search and Rescue Teams, Law Enforcement',
    'missing person': 'Search and Rescue Teams, Law Enforcement',
    'medical': 'Emergency Medical Services',
    'aid': 'Various Aid Organizations',
    'help': 'General Emergency Services',
    'fire': 'Fire Department',
    'burn': 'Fire Department',
    'monitor': 'Local Authorities, Emergency Services'
    # Add more keywords and responders as you analyze your 'take_action' data
}

In [None]:
def suggest_responders(row):
    if row['distress'] == 1:
        action = str(row['take_action']).lower()  # Convert to string and lowercase for matching
        responders = set()
        for keyword, suggested_responder in responder_mapping.items():
            if keyword in action:
                responders.add(suggested_responder)
        if responders:
            return ", ".join(responders)
        else:
            return "Responders unclear"
    else:
        return "Not applicable"

ground_truth_df['Responders (Suggested)'] = ground_truth_df.apply(suggest_responders, axis=1)

print("\n'Responders (Suggested)' column created.")
print(ground_truth_df[['distress', 'take_action', 'Responders (Suggested)']].head(20))
print("\nValue counts for 'Responders (Suggested)':")
print(ground_truth_df['Responders (Suggested)'].value_counts().head(20))


'Responders (Suggested)' column created.
    distress                          take_action  \
0          0                                  NaN   
1          0                                  NaN   
2          0                                  NaN   
3          0                                  NaN   
4          0                                  NaN   
5          0                                  NaN   
6          1  send evacuation and shelter support   
7          0                                  NaN   
8          0                                  NaN   
9          0                                  NaN   
10         0                                  NaN   
11         0                                  NaN   
12         0                                  NaN   
13         0                                  NaN   
14         1          start missing person search   
15         0                                  NaN   
16         1          start missing person search   
17  

In [None]:
inspection_sample = ground_truth_df.sample(frac=0.15, random_state=42) # Adjust fraction as needed
print(f"Generated a sample of {len(inspection_sample)} rows for manual inspection.")

Generated a sample of 2712 rows for manual inspection.


In [None]:
inspection_subset = inspection_sample[['raw_tweet_text', 'state', 'sub_location', 'Wildfire', 'distress', 'Location', 'take_action', 'Responders (Suggested)']]
print(inspection_subset.head(20)) # Display the first 20 rows of the sample

                                          raw_tweet_text       state  \
11482  . #Maria is now a weak and ragged looking Cat-...         NaN   
13112  Puerto Rico governor: I answered Trump... http...         NaN   
2501   RT @MPrendergastTX: Buffalo Bayou in Houston. ...         NaN   
322    Company Helps Coordinate Air Attack On Califor...  California   
8422   4th hr's back &amp; louder than Irma #whatifIr...         NaN   
15453  Turkish Red Crescent cooperates with Iraqi Red...         NaN   
11923  #PuertoRico has suffered immense devastation f...         NaN   
7695   BuzzFeed : This Florida county used an interpr...     Florida   
15472  President Dr. Kerem Kinik in Darbendixan distr...         NaN   
7945   .@SecretarySonny Perdue, @marcorubio and @TomR...     Florida   
290    Fire chief: We got outrun by the fires https:/...         NaN   
12303  Hurricane Maria Not Getting Same Amount of Cov...         NaN   
11326  This morning's update on #hurricanemaria - rem...        

dataset split

In [None]:
import pandas as pd
csv_path = '/content/drive/MyDrive/ground_truth_dataset_with_wildfire.csv'  # Or the path where you saved it
ground_truth_df = pd.read_csv(csv_path)
print("Ground truth dataset loaded.")

Ground truth dataset loaded.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Load the CSV file from the root directory ---
csv_path = '/content/drive/MyDrive/ground_truth_dataset_with_wildfire.csv'

try:
    ground_truth_df = pd.read_csv(csv_path)
    print("Ground truth dataset loaded.")

    # --- Determine the counts of each combination of 'Wildfire' and 'distress' ---
    label_counts = ground_truth_df.groupby(['Wildfire', 'distress']).size().reset_index(name='counts')
    print("Counts of each label combination:")
    print(label_counts)

    # --- Aim for roughly equal samples (up to 25) from each combination for the 100-sample split ---
    sample_size_per_group = 25
    sample_split = pd.DataFrame()
    sampled_indices = []

    for index, row in label_counts.iterrows():
        wildfire_label = row['Wildfire']
        distress_label = row['distress']
        count = row['counts']

        n_samples = min(count, sample_size_per_group)  # Take up to 25, or fewer if the group is smaller
        group_sample = ground_truth_df[
            (ground_truth_df['Wildfire'] == wildfire_label) & (ground_truth_df['distress'] == distress_label)
        ].sample(n=n_samples, random_state=42)

        sample_split = pd.concat([sample_split, group_sample])
        sampled_indices.extend(group_sample.index)

    print(f"\nSize of the Sample Split: {len(sample_split)}")
    print("\nDistribution of labels in the Sample Split:")
    print(sample_split.groupby(['Wildfire', 'distress']).size())

    # --- Create the remaining DataFrame by removing the sampled rows ---
    remaining_df = ground_truth_df.drop(sampled_indices)
    print(f"\nSize of the Remaining DataFrame: {len(remaining_df)}")

    # Now 'sample_split' contains your 100-sample (or close to it, balanced) held-out set
    # and 'remaining_df' contains the data for the 80/10/10 split.

except FileNotFoundError:
    print(f"Error: File not found at /content/ground-truth_dataset_with_wildfire.csv")
except Exception as e:
    print(f"An error occurred: {e}")

Ground truth dataset loaded.
Counts of each label combination:
  Wildfire  distress  counts
0       No         0   14896
1       No         1    1834
2      Yes         0    1204
3      Yes         1     148

Size of the Sample Split: 100

Distribution of labels in the Sample Split:
Wildfire  distress
No        0           25
          1           25
Yes       0           25
          1           25
dtype: int64

Size of the Remaining DataFrame: 17982


In [None]:
from sklearn.model_selection import train_test_split

# --- Split remaining_df into training (80%) and a temporary set (20%) ---
train_df, temp_df = train_test_split(
    remaining_df,
    test_size=0.2,
    stratify=remaining_df[['Wildfire', 'distress']],
    random_state=42
)

# --- Split the temporary set (20%) into validation (10%) and testing (10%) ---
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,  # 50% of the temp_df is 10% of the original
    stratify=temp_df[['Wildfire', 'distress']],
    random_state=42
)

print(f"Size of Training Set: {len(train_df)}")
print(f"Size of Validation Set: {len(val_df)}")
print(f"Size of Testing Set: {len(test_df)}")

print("\nDistribution of labels in Training Set:")
print(train_df.groupby(['Wildfire', 'distress']).size() / len(train_df))

print("\nDistribution of labels in Validation Set:")
print(val_df.groupby(['Wildfire', 'distress']).size() / len(val_df))

print("\nDistribution of labels in Testing Set:")
print(test_df.groupby(['Wildfire', 'distress']).size() / len(test_df))

Size of Training Set: 14385
Size of Validation Set: 1798
Size of Testing Set: 1799

Distribution of labels in Training Set:
Wildfire  distress
No        0           0.826973
          1           0.100591
Yes       0           0.065554
          1           0.006882
dtype: float64

Distribution of labels in Validation Set:
Wildfire  distress
No        0           0.827030
          1           0.100667
Yes       0           0.065628
          1           0.006674
dtype: float64

Distribution of labels in Testing Set:
Wildfire  distress
No        0           0.827126
          1           0.100611
Yes       0           0.065592
          1           0.006670
dtype: float64


t5-small

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# --- Load base T5 small model and tokenizer ---
model_name_base = "t5-small"
tokenizer_base = AutoTokenizer.from_pretrained(model_name_base)
device = "cuda" if torch.cuda.is_available() else "cpu"
model_base = AutoModelForSeq2SeqLM.from_pretrained(model_name_base).to(device)

print(f"Base model and tokenizer for {model_name_base} loaded on {device}.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Base model and tokenizer for t5-small loaded on cuda.


In [None]:
# --- Prepare prompts for the sample split ---
prompts = []
sample_tweets = sample_split['tweet_text'].tolist()

for tweet in sample_tweets:
    prompts.append(f"Is this tweet about a California wildfire? Tweet: {tweet}")
    prompts.append(f"Does this tweet indicate distress or emergency? Tweet: {tweet}")
    prompts.append(f"What location is mentioned in this tweet? Tweet: {tweet}")
    prompts.append(f"What action and responders are needed based on this tweet? Tweet: {tweet}")

# --- Tokenize the prompts ---
inputs = tokenizer_base.batch_encode_plus(prompts, return_tensors="pt", padding=True, truncation=True).to(model_base.device)

# --- Generate predictions ---
with torch.no_grad():
    outputs = model_base.generate(**inputs, max_length=50, num_return_sequences=1)

# --- Decode the predictions ---
predictions = tokenizer_base.batch_decode(outputs, skip_special_tokens=True)

# --- Display the prompts and predictions for the first few examples ---
num_examples = 5
for i in range(num_examples):
    tweet_index = i // 4
    question_index = i % 4
    question = ["Wildfire?", "Distress?", "Location?", "Action/Responders?"][question_index]
    print(f"Tweet: {sample_tweets[tweet_index][:50]}...")
    print(f"Question: {question}")
    print(f"Prediction: {predictions[i]}")
    print("-" * 30)

Tweet: chamillionaire starts the robins heart foundation ...
Question: Wildfire?
Prediction: Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.
------------------------------
Tweet: chamillionaire starts the robins heart foundation ...
Question: Distress?
Prediction: Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery? Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.
------------------------------
Tweet: chamillionaire starts the robins heart foundation ...
Question: Location?
Prediction: Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.
------------------------------
Tweet: chamillionaire starts the robins heart foundation ...
Question: Action/Responders?
Prediction: Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.
------------------------------
Tweet: hurricane maria moves north lee still far f

In [None]:
# --- Prepare prompts and generate predictions for a small sample of tweets ---
num_tweets_to_examine = 3  # You can change this number
sample_tweets = sample_split['tweet_text'].tolist()[:num_tweets_to_examine] # Take the first N tweets

all_predictions = []
all_prompts = []
original_tweets = []

for tweet in sample_tweets:
    original_tweets.append(tweet)
    prompts = [
        f"Is this tweet about a California wildfire? Tweet: {tweet}",
        f"Does this tweet indicate distress or emergency? Tweet: {tweet}",
        f"What location is mentioned in this tweet? Tweet: {tweet}",
        f"What action and responders are needed based on this tweet? Tweet: {tweet}"
    ]
    all_prompts.extend(prompts)

    inputs = tokenizer_base.batch_encode_plus(prompts, return_tensors="pt", padding=True, truncation=True).to(model_base.device)

    with torch.no_grad():
        outputs = model_base.generate(**inputs, max_length=50, num_return_sequences=1)

    predictions = tokenizer_base.batch_decode(outputs, skip_special_tokens=True)
    all_predictions.extend(predictions)

# --- Display the prompts and predictions ---
for i in range(len(original_tweets)):
    tweet = original_tweets[i]
    print(f"Tweet: {tweet[:50]}...")
    for j in range(4):
        question = ["Wildfire?", "Distress?", "Location?", "Action/Responders?"][j]
        prediction = all_predictions[i * 4 + j]
        print(f"  Question: {question}")
        print(f"  Prediction: {prediction}")
    print("-" * 30)

Tweet: chamillionaire starts the robins heart foundation ...
  Question: Wildfire?
  Prediction: Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.
  Question: Distress?
  Prediction: Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery? Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.
  Question: Location?
  Prediction: Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.
  Question: Action/Responders?
  Prediction: Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.
------------------------------
Tweet: hurricane maria moves north lee still far from lan...
  Question: Wildfire?
  Prediction: Tweet: hurricane maria moves north lee still far from land.
  Question: Distress?
  Prediction: Tweet:
  Question: Location?
  Prediction: Hurricane maria moves north lee still far from land
  Question: Action/Res

In [None]:
# Create a new 'Location' column
ground_truth_df['Location'] = ground_truth_df.apply(
    lambda row: f"{row['state']}, {row['sub_location']}"
    if pd.notna(row['sub_location']) and row['state'] == 'California'
    else row['state'] if row['state'] == 'California'
    else 'No location mentioned',  # Handle cases where state might be missing (though unlikely here)
    axis=1
)

print("\n'Location' column created.")
print(ground_truth_df[['state', 'sub_location', 'Location']].head(10))
print("\nValue counts for 'Location':")
print(ground_truth_df['Location'].value_counts().head(20))


'Location' column created.
        state sub_location               Location
0  California     northern   California, northern
1  California          NaN             California
2         NaN          NaN  No location mentioned
3         NaN          NaN  No location mentioned
4  California          NaN             California
5  California          NaN             California
6  California    wildfires  California, wildfires
7  California          NaN             California
8  California          NaN             California
9  California          NaN             California

Value counts for 'Location':
Location
No location mentioned                            16730
California                                         966
California, northern                                88
California, southern                                15
California, santa rosa                               9
California, napa                                     8
California, wildfires                                

In [None]:
print("Columns in ground_truth_df after creating 'Location':")
print(ground_truth_df.columns)
print("\nColumns in sample_split:")
print(sample_split.columns)

Columns in ground_truth_df after creating 'Location':
Index(['tweet_id', 'image_id', 'raw_tweet_text', 'tweet_text',
       'tweet_hashtags', 'image_caption', 'distress', 'take_action', 'state',
       'sub_location', 'Wildfire', 'Location'],
      dtype='object')

Columns in sample_split:
Index(['tweet_id', 'image_id', 'raw_tweet_text', 'tweet_text',
       'tweet_hashtags', 'image_caption', 'distress', 'take_action', 'state',
       'sub_location', 'Wildfire'],
      dtype='object')


In [None]:
# --- Aim for roughly equal samples (up to 25) from each combination for the 100-sample split ---
sample_size_per_group = 25
sample_split = pd.DataFrame()
sampled_indices = []

for index, row in label_counts.iterrows():
    wildfire_label = row['Wildfire']
    distress_label = row['distress']
    count = row['counts']

    n_samples = min(count, sample_size_per_group)  # Take up to 25, or fewer if the group is smaller
    group_sample = ground_truth_df[
        (ground_truth_df['Wildfire'] == wildfire_label) & (ground_truth_df['distress'] == distress_label)
    ].sample(n=n_samples, random_state=42)

    sample_split = pd.concat([sample_split, group_sample])
    sampled_indices.extend(group_sample.index)

print(f"\nSize of the Sample Split: {len(sample_split)}")
print("\nDistribution of labels in the Sample Split:")
print(sample_split.groupby(['Wildfire', 'distress']).size())

# --- Create the remaining DataFrame by removing the sampled rows ---
remaining_df = ground_truth_df.drop(sampled_indices)
print(f"\nSize of the Remaining DataFrame: {len(remaining_df)}")


Size of the Sample Split: 100

Distribution of labels in the Sample Split:
Wildfire  distress
No        0           25
          1           25
Yes       0           25
          1           25
dtype: int64

Size of the Remaining DataFrame: 17982


In [None]:
responder_mapping = {
    'evacuate': 'Fire Department, Emergency Management',
    'shelter': 'Red Cross, Emergency Management',
    'rescue': 'Search and Rescue Teams, Fire Department',
    'search': 'Search and Rescue Teams, Law Enforcement',
    'missing person': 'Search and Rescue Teams, Law Enforcement',
    'medical': 'Emergency Medical Services',
    'aid': 'Various Aid Organizations',
    'help': 'General Emergency Services',
    'fire': 'Fire Department',
    'burn': 'Fire Department',
    'monitor': 'Local Authorities, Emergency Services'
    # Add more keywords and responders as you analyze your 'take_action' data
}

def suggest_responders(row):
    if row['distress'] == 1:
        action = str(row['take_action']).lower()  # Convert to string and lowercase for matching
        responders = set()
        for keyword, suggested_responder in responder_mapping.items():
            if keyword in action:
                responders.add(suggested_responder)
        if responders:
            return ", ".join(responders)
        else:
            return "Responders unclear"
    else:
        return "Not applicable"

ground_truth_df['Responders (Suggested)'] = ground_truth_df.apply(suggest_responders, axis=1)

print("\n'Responders (Suggested)' column created.")
print(ground_truth_df[['distress', 'take_action', 'Responders (Suggested)']].head(20))
print("\nValue counts for 'Responders (Suggested)':")
print(ground_truth_df['Responders (Suggested)'].value_counts().head(20))


'Responders (Suggested)' column created.
    distress                          take_action  \
0          0                                  NaN   
1          0                                  NaN   
2          0                                  NaN   
3          0                                  NaN   
4          0                                  NaN   
5          0                                  NaN   
6          1  send evacuation and shelter support   
7          0                                  NaN   
8          0                                  NaN   
9          0                                  NaN   
10         0                                  NaN   
11         0                                  NaN   
12         0                                  NaN   
13         0                                  NaN   
14         1          start missing person search   
15         0                                  NaN   
16         1          start missing person search   
17  

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Step 1: Load Data ---
csv_path = '/content/drive/MyDrive/ground_truth_dataset_with_wildfire.csv'
ground_truth_df = pd.read_csv(csv_path)
print("Data loaded.")

# --- Step 2: Create 'Wildfire' Column ---
ground_truth_df['Wildfire'] = 'No'
ground_truth_df.loc[ground_truth_df['state'] == 'California', 'Wildfire'] = 'Yes'
print("'Wildfire' column created.")

# --- Step 3: Create 'Location' Column ---
ground_truth_df['Location'] = ground_truth_df.apply(
    lambda row: f"{row['state']}, {row['sub_location']}"
    if pd.notna(row['sub_location']) and row['state'] == 'California'
    else row['state'] if row['state'] == 'California'
    else 'No location mentioned',
    axis=1
)
print("'Location' column created.")

# --- Step 4: Create 'Responders (Suggested)' Column ---
responder_mapping = {
    'evacuate': 'Fire Department, Emergency Management',
    'shelter': 'Red Cross, Emergency Management',
    'rescue': 'Search and Rescue Teams, Fire Department',
    'search': 'Search and Rescue Teams, Law Enforcement',
    'missing person': 'Search and Rescue Teams, Law Enforcement',
    'medical': 'Emergency Medical Services',
    'aid': 'Various Aid Organizations',
    'help': 'General Emergency Services',
    'fire': 'Fire Department',
    'burn': 'Fire Department',
    'monitor': 'Local Authorities, Emergency Services'
}

def suggest_responders(row):
    if row['distress'] == 1:
        action = str(row['take_action']).lower()
        responders = set()
        for keyword, suggested_responder in responder_mapping.items():
            if keyword in action:
                responders.add(suggested_responder)
        if responders:
            return ", ".join(responders)
        else:
            return "Responders unclear"
    else:
        return "Not applicable"

ground_truth_df['Responders (Suggested)'] = ground_truth_df.apply(suggest_responders, axis=1)
print("'Responders (Suggested)' column created.")

# --- Step 5: Create sample_split ---
label_counts = ground_truth_df.groupby(['Wildfire', 'distress']).size().reset_index(name='counts')

sample_size_per_group = 25
sample_split = pd.DataFrame()
sampled_indices = []

for index, row in label_counts.iterrows():
    wildfire_label = row['Wildfire']
    distress_label = row['distress']
    count = row['counts']

    n_samples = min(count, sample_size_per_group)
    group_sample = ground_truth_df[
        (ground_truth_df['Wildfire'] == wildfire_label) & (ground_truth_df['distress'] == distress_label)
    ].sample(n=n_samples, random_state=42)

    sample_split = pd.concat([sample_split, group_sample])
    sampled_indices.extend(group_sample.index)

print("sample_split created.")

# --- Step 6: Check Columns of sample_split ---
print("Columns in sample_split after creation:")
print(sample_split.columns)

# --- Step 7: Prepare Ground Truth for Evaluation ---
ground_truth_wildfire = sample_split['Wildfire'].tolist()
ground_truth_distress = sample_split['distress'].tolist()
ground_truth_location = sample_split['Location'].tolist()
ground_truth_action = sample_split['take_action'].tolist()
ground_truth_responders = sample_split['Responders (Suggested)'].tolist()
print("Ground truth prepared for evaluation.")

Data loaded.
'Wildfire' column created.
'Location' column created.
'Responders (Suggested)' column created.
sample_split created.
Columns in sample_split after creation:
Index(['tweet_id', 'image_id', 'raw_tweet_text', 'tweet_text',
       'tweet_hashtags', 'image_caption', 'distress', 'take_action', 'state',
       'sub_location', 'Wildfire', 'Location', 'Responders (Suggested)'],
      dtype='object')
Ground truth prepared for evaluation.


In [None]:
# --- Prepare prompts for the entire sample split ---
prompts = []
sample_tweets = sample_split['tweet_text'].tolist()

for tweet in sample_tweets:
    prompts.append(f"Is this tweet about a California wildfire? Tweet: {tweet}")
    prompts.append(f"Does this tweet indicate distress or emergency? Tweet: {tweet}")
    prompts.append(f"What location is mentioned in this tweet? Tweet: {tweet}")
    prompts.append(f"What action and responders are needed based on this tweet? Tweet: {tweet}")

# --- Tokenize and generate predictions for the entire sample split ---
inputs = tokenizer_base.batch_encode_plus(prompts, return_tensors="pt", padding=True, truncation=True).to(model_base.device)

with torch.no_grad():
    outputs = model_base.generate(**inputs, max_length=50, num_return_sequences=1)

predictions = tokenizer_base.batch_decode(outputs, skip_special_tokens=True)

# --- Reshape predictions to align with the four questions per tweet ---
reshaped_predictions = [predictions[i:i + 4] for i in range(0, len(predictions), 4)]

print("Predictions generated for the entire Sample Split.")
print(f"Number of tweets in Sample Split: {len(sample_tweets)}")
print(f"Number of sets of predictions: {len(reshaped_predictions)}")
print("First example:")
print(f"Tweet: {sample_tweets[0][:50]}...")
print(f"Predictions: {reshaped_predictions[0]}")

Predictions generated for the entire Sample Split.
Number of tweets in Sample Split: 100
Number of sets of predictions: 100
First example:
Tweet: chamillionaire starts the robins heart foundation ...
Predictions: ['Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.', 'Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery? Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.', 'Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.', 'Tweet: chamillionaire starts the robins heart foundation to assist with harvey recovery.']


In [None]:
from sklearn.metrics import accuracy_score, f1_score

# --- Map T5 predictions to Yes/No for Wildfire ---
predicted_wildfire = []
for prediction in reshaped_predictions:
    if any(keyword in prediction[0].lower() for keyword in ["yes", "it is", "wildfire", "fire", "burn"]):
        predicted_wildfire.append("Yes")
    else:
        predicted_wildfire.append("No")

# --- Map T5 predictions to Yes/No for Distress ---
predicted_distress = []
for prediction in reshaped_predictions:
    if any(keyword in prediction[1].lower() for keyword in ["help", "urgent", "emergency", "need", "assistance", "critical", "danger"]):
        predicted_distress.append("Yes")
    else:
        predicted_distress.append("No")

# --- Evaluate Wildfire detection ---
wildfire_accuracy = accuracy_score(ground_truth_wildfire, predicted_wildfire)
wildfire_f1 = f1_score(
    [1 if label == "Yes" else 0 for label in ground_truth_wildfire],
    [1 if label == "Yes" else 0 for label in predicted_wildfire]
)

print(f"Wildfire Detection Accuracy: {wildfire_accuracy:.4f}")
print(f"Wildfire Detection F1 Score: {wildfire_f1:.4f}")

# --- Evaluate Distress detection ---
# Note: ground_truth_distress is 0 or 1, so we map predicted_distress accordingly
distress_accuracy = accuracy_score(
    ground_truth_distress, [1 if label == "Yes" else 0 for label in predicted_distress]
)
distress_f1 = f1_score(ground_truth_distress, [1 if label == "Yes" else 0 for label in predicted_distress])

print(f"Distress Detection Accuracy: {distress_accuracy:.4f}")
print(f"Distress Detection F1 Score: {distress_f1:.4f}")

Wildfire Detection Accuracy: 0.9000
Wildfire Detection F1 Score: 0.9057
Distress Detection Accuracy: 0.6400
Distress Detection F1 Score: 0.5000


In [None]:
import nltk
import inspect

print("Contents of nltk.metrics:")
print(inspect.getmembers(nltk.metrics))

Contents of nltk.metrics:
All Rights Reserved.

Copyright (c) 2000 BeOpen.com.
All Rights Reserved.

Copyright (c) 1995-2001 Corporation for National Research Initiatives.
All Rights Reserved.

Copyright (c) 1991-1995 Stichting Mathematisch Centrum, Amsterdam.
All Rights Reserved., 'credits':     Thanks to CWI, CNRI, BeOpen.com, Zope Corporation and a cast of thousands
    for supporting Python development.  See www.python.org for more information., 'license': Type license() to see the full license text, 'help': Type help() for interactive help, or help(object) for help about object., 'execfile': <function execfile at 0x79a2254ef2e0>, 'runfile': <function runfile at 0x79a225398f40>, '__IPYTHON__': True, 'display': <function display at 0x79a2263f1e40>, '__pybind11_internals_v4_gcc_libstdcpp_cxxabi1014__': <capsule object NULL at 0x79a205ec2880>, '__pybind11_internals_v4_gcc_libstdcpp_cxxabi1011__': <capsule object NULL at 0x79a19679d230>, '__pybind11_internals_v4_clang_libstdcpp_cxxabi1

In [None]:
import nltk
import inspect

print("Contents of nltk.translate.metrics:")
print(inspect.getmembers(nltk.translate.metrics))

Contents of nltk.translate.metrics:
All Rights Reserved.

Copyright (c) 2000 BeOpen.com.
All Rights Reserved.

Copyright (c) 1995-2001 Corporation for National Research Initiatives.
All Rights Reserved.

Copyright (c) 1991-1995 Stichting Mathematisch Centrum, Amsterdam.
All Rights Reserved., 'credits':     Thanks to CWI, CNRI, BeOpen.com, Zope Corporation and a cast of thousands
    for supporting Python development.  See www.python.org for more information., 'license': Type license() to see the full license text, 'help': Type help() for interactive help, or help(object) for help about object., 'execfile': <function execfile at 0x79a2254ef2e0>, 'runfile': <function runfile at 0x79a225398f40>, '__IPYTHON__': True, 'display': <function display at 0x79a2263f1e40>, '__pybind11_internals_v4_gcc_libstdcpp_cxxabi1014__': <capsule object NULL at 0x79a205ec2880>, '__pybind11_internals_v4_gcc_libstdcpp_cxxabi1011__': <capsule object NULL at 0x79a19679d230>, '__pybind11_internals_v4_clang_libstdc

In [None]:
!pip install rouge



In [None]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

rouge = Rouge()

# --- Evaluate Location Prediction ---
location_bleu_scores = []
location_rouge_scores = []

for i in range(len(sample_tweets)):
    reference = ground_truth_location[i].lower()
    prediction = reshaped_predictions[i][2].lower()
    if reference and prediction:  # Avoid errors with empty strings
        reference_list = [reference.split()]
        prediction_list = prediction.split()
        bleu_score = sentence_bleu(reference_list, prediction_list)
        scores = rouge.get_scores(prediction, reference)
        location_bleu_scores.append(bleu_score)
        if scores:
            location_rouge_scores.append(scores[0])
        else:
            location_rouge_scores.append({'rouge-1': {'f': 0}, 'rouge-l': {'f': 0}}) # Handle cases with no scores

avg_location_bleu = sum(location_bleu_scores) / len(location_bleu_scores) if location_bleu_scores else 0
avg_location_rouge_1 = sum(score['rouge-1']['f'] for score in location_rouge_scores) / len(location_rouge_scores) if location_rouge_scores else 0
avg_location_rouge_l = sum(score['rouge-l']['f'] for score in location_rouge_scores) / len(location_rouge_scores) if location_rouge_scores else 0

print(f"\nAverage BLEU Score (Location): {avg_location_bleu:.4f}")
print(f"Average ROUGE-1 F1 Score (Location): {avg_location_rouge_1:.4f}")
print(f"Average ROUGE-L F1 Score (Location): {avg_location_rouge_l:.4f}")

# --- Evaluate Action/Responders Prediction ---
action_bleu_scores = []
action_rouge_scores = []

for i in range(len(sample_tweets)):
    reference = (str(ground_truth_action[i]) + " " + str(ground_truth_responders[i])).lower()
    prediction = reshaped_predictions[i][3].lower()
    if reference and prediction:  # Avoid errors with empty strings
        reference_list = [reference.split()]
        prediction_list = prediction.split()
        bleu_score = sentence_bleu(reference_list, prediction_list)
        scores = rouge.get_scores(prediction, reference)
        action_bleu_scores.append(bleu_score)
        if scores:
            action_rouge_scores.append(scores[0])
        else:
            action_rouge_scores.append({'rouge-1': {'f': 0}, 'rouge-l': {'f': 0}}) # Handle cases with no scores

avg_action_bleu = sum(action_bleu_scores) / len(action_bleu_scores) if action_bleu_scores else 0
avg_action_rouge_1 = sum(score['rouge-1']['f'] for score in action_rouge_scores) / len(action_rouge_scores) if action_rouge_scores else 0
avg_action_rouge_l = sum(score['rouge-l']['f'] for score in action_rouge_scores) / len(action_rouge_scores) if action_rouge_scores else 0

print(f"\nAverage BLEU Score (Action/Responders): {avg_action_bleu:.4f}")
print(f"Average ROUGE-1 F1 Score (Action/Responders): {avg_action_rouge_1:.4f}")
print(f"Average ROUGE-L F1 Score (Action/Responders): {avg_action_rouge_l:.4f}")


Average BLEU Score (Location): 0.0000
Average ROUGE-1 F1 Score (Location): 0.0732
Average ROUGE-L F1 Score (Location): 0.0732

Average BLEU Score (Action/Responders): 0.0000
Average ROUGE-1 F1 Score (Action/Responders): 0.0130
Average ROUGE-L F1 Score (Action/Responders): 0.0130


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Phase 4: Instruction Fine-Tuning.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Step 1: Load Data ---
csv_path = '/content/drive/MyDrive/ground_truth_dataset_with_wildfire.csv'
ground_truth_df = pd.read_csv(csv_path)
print("First few rows of ground_truth_df after loading:")
print(ground_truth_df.head())
print("Data loaded.")

# --- Step 2: Create 'Wildfire' Column ---
ground_truth_df['Wildfire'] = 'No'
ground_truth_df.loc[ground_truth_df['state'] == 'California', 'Wildfire'] = 'Yes'
print("'Wildfire' column created.")

# --- Step 3: Create 'Location' Column ---
ground_truth_df['Location'] = ground_truth_df.apply(
    lambda row: f"{row['state']}, {row['sub_location']}"
    if pd.notna(row['sub_location']) and row['state'] == 'California'
    else row['state'] if row['state'] == 'California'
    else 'No location mentioned',
    axis=1
)
print("'Location' column created.")

# --- Step 4: Create 'Responders (Suggested)' Column ---
responder_mapping = {
    'evacuate': 'Fire Department, Emergency Management',
    'shelter': 'Red Cross, Emergency Management',
    'rescue': 'Search and Rescue Teams, Fire Department',
    'search': 'Search and Rescue Teams, Law Enforcement',
    'missing person': 'Search and Rescue Teams, Law Enforcement',
    'medical': 'Emergency Medical Services',
    'aid': 'Various Aid Organizations',
    'help': 'General Emergency Services',
    'fire': 'Fire Department',
    'burn': 'Fire Department',
    'monitor': 'Local Authorities, Emergency Services'
}

def suggest_responders(row):
    if row['distress'] == 1:
        action = str(row['take_action']).lower()
        responders = set()
        for keyword, suggested_responder in responder_mapping.items():
            if keyword in action:
                responders.add(suggested_responder)
        if responders:
            return ", ".join(responders)
        else:
            return "Responders unclear"
    else:
        return "Not applicable"

ground_truth_df['Responders (Suggested)'] = ground_truth_df.apply(suggest_responders, axis=1)
print("'Responders (Suggested)' column created.")

# --- Step 5: Create sample_split ---
label_counts = ground_truth_df.groupby(['Wildfire', 'distress']).size().reset_index(name='counts')

sample_size_per_group = 25
sample_split = pd.DataFrame()
sampled_indices = []

for index, row in label_counts.iterrows():
    wildfire_label = row['Wildfire']
    distress_label = row['distress']
    count = row['counts']

    n_samples = min(count, sample_size_per_group)
    group_sample = ground_truth_df[
        (ground_truth_df['Wildfire'] == wildfire_label) & (ground_truth_df['distress'] == distress_label)
    ].sample(n=n_samples, random_state=42)

    sample_split = pd.concat([sample_split, group_sample])
    sampled_indices.extend(group_sample.index)

print("sample_split created.")

# --- Step 6: Check Columns of sample_split ---
print("Columns in sample_split after creation:")
print(sample_split.columns)

# --- Step 7: Prepare Ground Truth for Evaluation ---
ground_truth_wildfire = sample_split['Wildfire'].tolist()
ground_truth_distress = sample_split['distress'].tolist()
ground_truth_location = sample_split['Location'].tolist()
ground_truth_action = sample_split['take_action'].tolist()
ground_truth_responders = sample_split['Responders (Suggested)'].tolist()
print("Ground truth prepared for evaluation.")

First few rows of ground_truth_df after loading:
       tweet_id                  image_id  \
0  9.177910e+17  917791044158185473_0.jpg   
1  9.177911e+17  917791130590183424_0.jpg   
2  9.177913e+17  917791291823591425_0.jpg   
3  9.177913e+17  917791291823591425_1.jpg   
4  9.177921e+17  917792092100988929_0.jpg   

                                      raw_tweet_text  \
0  RT @Gizmodo: Wildfires raging through Northern...   
1  PHOTOS: Deadly wildfires rage in California ht...   
2  RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...   
3  RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...   
4  RT @TIME: California's raging wildfires as you...   

                                          tweet_text tweet_hashtags  \
0  wildfires raging through northern california a...            NaN   
1         photos deadly wildfires rage in california            NaN   
2  pls share were capturing wildfire response rec...            NaN   
3  pls share were capturing wildfire response rec...       

In [None]:
# --- Split the data into training, validation, and test sets ---
train_df, temp_df = train_test_split(ground_truth_df, test_size=0.2, random_state=42, stratify=ground_truth_df[['Wildfire', 'distress']])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df[['Wildfire', 'distress']])

print(f"Size of training set: {len(train_df)}")
print(f"Size of validation set: {len(val_df)}")
print(f"Size of test set: {len(test_df)}")

Size of training set: 14465
Size of validation set: 1808
Size of test set: 1809


In [None]:
# --- Step 8: Prepare the training data from train_df (handling NaN locations) ---
train_data = []
for index, row in train_df.iterrows():
    tweet = row['tweet_text']
    wildfire_answer = 'yes' if row['Wildfire'] == 1 else 'no'
    distress_answer = 'distress' if row['distress'] == 1 else 'not distress'
    location_answer = row['Location']
    action_responders_answer = f"{row['take_action']} {row['Responders (Suggested)']}"

    # Wildfire Classifier with Instruction
    train_data.append({
        'prompt': f"You are a wildfire classifier. Determine if the tweet is explicitly about a wildfire happening in California, not wildfires in general or other locations. Only respond with one word: 'yes' or 'no'. Do not explain. If the tweet references smoke, flames, evacuations, or fire-related events and mentions a California city or region (e.g., LA, San Francisco, Bay Area), respond with 'yes'. Otherwise, respond 'no'. Tweet: {tweet}",
        'target': wildfire_answer
    })

    # Emergency Detection System with Instruction
    train_data.append({
        'prompt': f"You are an emergency detection system. Determine if the tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: {tweet}",
        'target': distress_answer
    })

    # Location Extraction with Instruction (Handle NaN)
    train_data.append({
        'prompt': f"Extract the most specific real-world geographic location mentioned in the tweet that refers to where the California wildfire is happening. This can be a city, neighborhood, street, highway, or region. If no valid place is mentioned, respond with 'unknown'. Respond with only the location name, no extra words. Tweet: {tweet}",
        'target': location_answer if pd.notna(location_answer) else 'unknown'
    })

    # Disaster Response Coordinator with Instruction
    train_data.append({
        'prompt': f"You are a disaster response coordinator. Based on the content of the tweet, recommend the most urgent emergency action that responders should take. Choose only one from the following types: evacuation, medical aid, fire suppression, rescue, or resource delivery. If none of these apply or there's no clear threat, respond with 'monitor only'. Respond with only one action. Tweet: {tweet}",
        'target': row['take_action'] if pd.notna(row['take_action']) else 'monitor only' # Handle potential NaN actions as well (though less likely)
    })

print(f"Number of training examples: {len(train_data)}")
print("First training example:")
print(train_data[0])

Number of training examples: 57860
First training example:
{'prompt': "You are a wildfire classifier. Determine if the tweet is explicitly about a wildfire happening in California, not wildfires in general or other locations. Only respond with one word: 'yes' or 'no'. Do not explain. If the tweet references smoke, flames, evacuations, or fire-related events and mentions a California city or region (e.g., LA, San Francisco, Bay Area), respond with 'yes'. Otherwise, respond 'no'. Tweet: irma victims need our help they cant recover on their own #irmarecovery #irmavictims 9donate medical suppliesb", 'target': 'no'}


In [None]:
# --- Step 9: Prepare the training data from train_df with instructions ---
train_data = []
for index, row in train_df.iterrows():
    tweet = row['tweet_text']
    wildfire_answer = 'yes' if row['Wildfire'] == 1 else 'no'
    distress_answer = 'distress' if row['distress'] == 1 else 'not distress'
    location_answer = row['Location']
    action_responders_answer = f"{row['take_action']} {row['Responders (Suggested)']}"

    # Wildfire Classifier with Instruction
    train_data.append({
        'prompt': f"You are a wildfire classifier. Determine if the tweet is explicitly about a wildfire happening in California, not wildfires in general or other locations. Only respond with one word: 'yes' or 'no'. Do not explain. If the tweet references smoke, flames, evacuations, or fire-related events and mentions a California city or region (e.g., LA, San Francisco, Bay Area), respond with 'yes'. Otherwise, respond 'no'. Tweet: {tweet}",
        'target': wildfire_answer
    })

    # Emergency Detection System with Instruction
    train_data.append({
        'prompt': f"You are an emergency detection system. Determine if the tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: {tweet}",
        'target': distress_answer
    })

    # Location Extraction with Instruction (Handle NaN)
    train_data.append({
        'prompt': f"Extract the most specific real-world geographic location mentioned in the tweet that refers to where the California wildfire is happening. This can be a city, neighborhood, street, highway, or region. If no valid place is mentioned, respond with 'unknown'. Respond with only the location name, no extra words. Tweet: {tweet}",
        'target': location_answer if pd.notna(location_answer) else 'unknown'
    })

    # Disaster Response Coordinator with Instruction
    train_data.append({
        'prompt': f"You are a disaster response coordinator. Based on the content of the tweet, recommend the most urgent emergency action that responders should take. Choose only one from the following types: evacuation, medical aid, fire suppression, rescue, or resource delivery. If none of these apply or there's no clear threat, respond with 'monitor only'. Respond with only one action. Tweet: {tweet}",
        'target': row['take_action'] if pd.notna(row['take_action']) else 'monitor only'
    })

print(f"Number of training examples: {len(train_data)}")
print("First training example:")
print(train_data[0])

Number of training examples: 57860
First training example:
{'prompt': "You are a wildfire classifier. Determine if the tweet is explicitly about a wildfire happening in California, not wildfires in general or other locations. Only respond with one word: 'yes' or 'no'. Do not explain. If the tweet references smoke, flames, evacuations, or fire-related events and mentions a California city or region (e.g., LA, San Francisco, Bay Area), respond with 'yes'. Otherwise, respond 'no'. Tweet: irma victims need our help they cant recover on their own #irmarecovery #irmavictims 9donate medical suppliesb", 'target': 'no'}


 Llama

In [None]:
!pip install transformers torch peft accelerate



In [None]:
from huggingface_hub import login

login(token="***********************")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "meta-llama/Llama-2-7b-hf"

try:
    tokenizer_llama = AutoTokenizer.from_pretrained(model_name)
    model_llama = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",  # Automatically put the model on available GPU(s)
    )
    print("LLaMA-2-7b loaded successfully!")
except Exception as e:
    print(f"Error loading LLaMA-2-7b: {e}")
    print("Please ensure you have accepted the terms on Hugging Face and have a valid access token if required.")
    print("We might need to consider a different model or approach.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LLaMA-2-7b loaded successfully!


In [None]:
tokenizer_llama.pad_token = tokenizer_llama.eos_token
print(f"Padding token set to: {tokenizer_llama.pad_token}")

Padding token set to: </s>


In [None]:
def prepare_llama_data(data, tokenizer, max_length=512):
    tokenized_inputs = []
    for i, item in enumerate(data):
        if not isinstance(item, dict) or 'prompt' not in item or 'target' not in item:
            print(f"Error at index {i}: Invalid data item - {item}")
            continue  # Skip this item

        prompt = item['prompt']
        target = item['target']

        if not isinstance(prompt, str):
            print(f"Error at index {i}: Prompt is not a string - {prompt}")
            continue

        if not isinstance(target, str):
            print(f"Error at index {i}: Target is not a string - {target}")
            continue

        prompt_encodings = tokenizer(
            prompt,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )
        target_encodings = tokenizer(
            target,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )

        tokenized_inputs.append({
            'input_ids': prompt_encodings['input_ids'][0],
            'attention_mask': prompt_encodings['attention_mask'][0],
            'labels': target_encodings['input_ids'][0],
        })
    return tokenized_inputs

# Prepare the training data
processed_train_data_llama = prepare_llama_data(train_data, tokenizer_llama)

print(f"Number of processed training examples: {len(processed_train_data_llama)}")
if processed_train_data_llama:
    print("First processed training example:")
    print(processed_train_data_llama[0])

Number of processed training examples: 57860
First processed training example:
{'input_ids': tensor([    1,   887,   526,   263,  8775,  8696,   770,  3709, 29889,  5953,
          837,   457,   565,   278,  7780,   300,   338,  9479,  1048,   263,
         8775,  8696, 10464,   297,  8046, 29892,   451,  8775, 29888,  2658,
          297,  2498,   470,   916, 14354, 29889,  9333, 10049,   411,   697,
         1734, 29901,   525,  3582, 29915,   470,   525,  1217,  4286,  1938,
          451,  5649, 29889,   960,   278,  7780,   300,  9282, 25158, 29892,
         1652,  1280, 29892,  3415, 22061,   800, 29892,   470,  3974, 29899,
        12817,  4959,   322, 26649,   263,  8046,  4272,   470,  5120,   313,
        29872, 29889, 29887,  1696, 17900, 29892,  3087,  8970, 29892,  6211,
        18320,   511, 10049,   411,   525,  3582,  4286, 13466, 29892, 10049,
          525,  1217,  4286,   323, 16668, 29901,  3805,   655,  6879,  9893,
          817,  1749,  1371,   896,  5107,  9792,

Given the size of the LLaMA 2 7B model, it's highly likely that we'll run into memory issues if we try to fine-tune the entire model on a standard Colab GPU. To address this, we'll use LoRA (Low-Rank Adaptation).

What is LoRA?

LoRA is a Parameter-Efficient Fine-Tuning (PEFT) technique that freezes the pre-trained model weights and adds a small number of new trainable layers (called "adapters"). These adapters are low-rank matrices, which means they have far fewer parameters than the original model. During fine-tuning, only these adapter weights are updated, significantly reducing the memory footprint and training time.

model loading and LoRA Configuration and Application:



In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
import torch

model_name = "meta-llama/Llama-2-7b-hf"

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

try:
    tokenizer_llama = AutoTokenizer.from_pretrained(model_name)
    tokenizer_llama.pad_token = tokenizer_llama.eos_token

    # Configure 4-bit quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        llm_int8_enable_fp32_cpu_offload=False,
    )

    # Load the base model directly onto the GPU with quantization
    model_llama = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map={"": device},  # Load directly to GPU
    )
    print("LLaMA-2-7b loaded with 4-bit quantization onto:", device)

    # Configure LoRA
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "up_proj",
            "down_proj",
            "gate_proj",
        ]
    )

    # Get the LoRA model
    model_lora = get_peft_model(model_llama, lora_config)
    model_lora.print_trainable_parameters()

    from transformers import TrainingArguments, Trainer

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./llama-2-7b-lora-fine-tune",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=16,
        learning_rate=2e-4,
        num_train_epochs=3,
        fp16=True if device == "cuda" else False,
        logging_dir="./logs",
        logging_strategy="steps",
        logging_steps=10,
        save_strategy="epoch",
        save_total_limit=2,
        report_to="tensorboard"
    )

    # Create the Trainer instance
    trainer = Trainer(
        model=model_lora,
        train_dataset=processed_train_data_llama,
        eval_dataset=None,
        args=training_args,
        data_collator=lambda data: {k: torch.stack([f[k] for f in data]) for k in data[0]},
    )

    # Start training
    trainer.train()

except Exception as e:
    print(f"An error occurred: {e}")
    print("Please check the error message for more details.")

Using device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LLaMA-2-7b loaded with 4-bit quantization onto: cuda
trainable params: 19,988,480 || all params: 6,758,404,096 || trainable%: 0.2958


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


An error occurred: device() received an invalid combination of arguments - got (NoneType), but expected one of:
 * (torch.device device)
      didn't match because some of the arguments have invalid types: (!NoneType!)
 * (str type, int index = -1)

Please check the error message for more details.


The output confirms that LoRA has been successfully applied to the LLaMA 2 model.

As you can see:

Trainable parameters: 19,988,480
Total parameters: 6,758,404,096
Trainable percentage: 0.2958%
This is a dramatic reduction in the number of parameters that will be updated during training. Only about 0.3% of the model's total parameters will be trained, which will significantly reduce memory usage and speed up the fine-tuning process, making it feasible to run on a Colab GPU.

Now that we have our LoRA-adapted LLaMA 2 model and our processed training data, the next step is to set up the training using the Hugging Face Trainer API.

In [None]:
!pip install -U bitsandbytes



In [None]:
!pip show bitsandbytes

Name: bitsandbytes
Version: 0.45.5
Summary: k-bit optimizers and matrix multiplication routines.
Home-page: https://github.com/bitsandbytes-foundation/bitsandbytes
Author: 
Author-email: Tim Dettmers <dettmers@cs.washington.edu>
License: MIT License

Copyright (c) Facebook, Inc. and its affiliates.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHA

In [None]:
!pip install --upgrade bitsandbytes



In [None]:
# Prepare the validation data
val_data = []
for index, row in val_df.iterrows():
    tweet = row['tweet_text']
    wildfire_answer = 'yes' if row['Wildfire'] == 1 else 'no'
    distress_answer = 'distress' if row['distress'] == 1 else 'not distress'
    location_answer = row['Location']
    action_responders_answer = f"{row['take_action']} {row['Responders (Suggested)']}"

    # Wildfire Classifier with Instruction
    val_data.append({
        'prompt': f"You are a wildfire classifier. Determine if the tweet is explicitly about a wildfire happening in California, not wildfires in general or other locations. Only respond with one word: 'yes' or 'no'. Do not explain. If the tweet references smoke, flames, evacuations, or fire-related events and mentions a California city or region (e.g., LA, San Francisco, Bay Area), respond with 'yes'. Otherwise, respond 'no'. Tweet: {tweet}",
        'target': wildfire_answer
    })

    # Emergency Detection System with Instruction
    val_data.append({
        'prompt': f"You are an emergency detection system. Determine if the tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: {tweet}",
        'target': distress_answer
    })

    # Location Extraction with Instruction (Handle NaN)
    val_data.append({
        'prompt': f"Extract the most specific real-world geographic location mentioned in the tweet that refers to where the California wildfire is happening. This can be a city, neighborhood, street, highway, or region. If no valid place is mentioned, respond with 'unknown'. Respond with only the location name, no extra words. Tweet: {tweet}",
        'target': location_answer if pd.notna(location_answer) else 'unknown'
    })

    # Disaster Response Coordinator with Instruction
    val_data.append({
        'prompt': f"You are a disaster response coordinator. Based on the content of the tweet, recommend the most urgent emergency action that responders should take. Choose only one from the following types: evacuation, medical aid, fire suppression, rescue, or resource delivery. If none of these apply or there's no clear threat, respond with 'monitor only'. Respond with only one action. Tweet: {tweet}",
        'target': row['take_action'] if pd.notna(row['take_action']) else 'monitor only'
    })

processed_val_data_llama = prepare_llama_data(val_data, tokenizer_llama)

print(f"Number of processed validation examples: {len(processed_val_data_llama)}")
if processed_val_data_llama:
    print("First processed validation example:")
    print(processed_val_data_llama[0])

Number of processed validation examples: 7232
First processed validation example:
{'input_ids': tensor([    1,   887,   526,   263,  8775,  8696,   770,  3709, 29889,  5953,
          837,   457,   565,   278,  7780,   300,   338,  9479,  1048,   263,
         8775,  8696, 10464,   297,  8046, 29892,   451,  8775, 29888,  2658,
          297,  2498,   470,   916, 14354, 29889,  9333, 10049,   411,   697,
         1734, 29901,   525,  3582, 29915,   470,   525,  1217,  4286,  1938,
          451,  5649, 29889,   960,   278,  7780,   300,  9282, 25158, 29892,
         1652,  1280, 29892,  3415, 22061,   800, 29892,   470,  3974, 29899,
        12817,  4959,   322, 26649,   263,  8046,  4272,   470,  5120,   313,
        29872, 29889, 29887,  1696, 17900, 29892,  3087,  8970, 29892,  6211,
        18320,   511, 10049,   411,   525,  3582,  4286, 13466, 29892, 10049,
          525,  1217,  4286,   323, 16668, 29901,   864,   304,  1073,   920,
          396, 29876,  4378, 29883,  6911,  79

In [None]:
!pip show accelerate

Name: accelerate
Version: 1.6.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /usr/local/lib/python3.11/dist-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: peft


In [None]:
!pip install -U accelerate>=0.26.0

model loading

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
import torch

# Define the model name
model_name = "meta-llama/Llama-2-7b-hf"

# Determine the device to use
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer
tokenizer_llama = AutoTokenizer.from_pretrained(model_name)
tokenizer_llama.pad_token = tokenizer_llama.eos_token

# Configure 4-bit quantization (optional, but recommended for memory efficiency)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_enable_fp32_cpu_offload=False,
)

# Load the pre-trained model with the quantization configuration (without device_map initially)
model_llama = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
print("LLaMA-2-7b loaded with 4-bit quantization (initially on CPU).")

# Define the LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,             # Rank of the LoRA matrices
    lora_alpha=32,       # Scaling factor for the LoRA matrices
    lora_dropout=0.05,   # Dropout probability for LoRA layers
    bias="none",
    target_modules=[    # The names of the modules to apply LoRA to
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "up_proj",
        "down_proj",
        "gate_proj",
    ]
)

# Apply LoRA to the base model
model_lora = get_peft_model(model_llama, lora_config)

# Move the LoRA model to the GPU
model_lora.to(device)

model_lora.print_trainable_parameters()

print(f"\nLoRA adapters applied and model moved to {device}.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LLaMA-2-7b loaded with 4-bit quantization (initially on CPU).
trainable params: 19,988,480 || all params: 6,758,404,096 || trainable%: 0.2958

LoRA adapters applied and model moved to cuda.


creating sample dataset

In [None]:
!pip install -U transformers datasets peft accelerate

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting peft
  Using cached peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m86.7 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached peft-0.15.2-py3-none-any.whl (411 kB)
Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m83.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers, peft
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.13.3
    Uninstalling tokenizers-0.13.3:
      Successfully uninstalled t

In [None]:
!pip install transformers==4.35.0 accelerate>=0.26.0 peft

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 3.5.0 requires huggingface-hub>=0.24.0, but you have huggingface-hub 0.17.3 which is incompatible.
diffusers 0.32.2 requires huggingface-hub>=0.23.2, but you have huggingface-hub 0.17.3 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
sentence-transformers 3.4.1 requires huggingface-hub>=0.20.0, but you have huggingface-hub 0.17.3 which is incompatible.
sentence-transformers 3.4.1 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.35.0 which is incompatible.[0m[31m
[0m

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Load your dataset ---
csv_path = '/content/drive/MyDrive/ground_truth_dataset_with_wildfire.csv'
ground_truth_df = pd.read_csv(csv_path)

# --- Check class balance of 'distress' ---
print("Class balance of 'distress' before sampling:")
print(ground_truth_df['distress'].value_counts(normalize=True))

# --- Handle underrepresented classes (if needed - you'll need to analyze the balance) ---
# This might involve techniques like oversampling or undersampling.
# For now, we'll proceed with the split, but keep this in mind.

# --- 1. Create a stratified and balanced sample of 50 based on 'distress' ---
sample_size = 50
balanced_sample = ground_truth_df.groupby('distress', group_keys=False).apply(lambda x: x.sample(min(len(x), sample_size // ground_truth_df['distress'].nunique()), random_state=42))
if len(balanced_sample) < sample_size:
    remaining_needed = sample_size - len(balanced_sample)
    remaining_sample = ground_truth_df[~ground_truth_df.index.isin(balanced_sample.index)].sample(remaining_needed, random_state=42)
    balanced_sample = pd.concat([balanced_sample, remaining_sample])

print(f"\nSize of the balanced sample: {len(balanced_sample)}")
print("Class balance of 'distress' in the balanced sample:")
print(balanced_sample['distress'].value_counts(normalize=True))

# --- 2. Remaining data for training, validation, and testing ---
remaining_df = ground_truth_df[~ground_truth_df.index.isin(balanced_sample.index)]

# --- 3. Split remaining into training (80%), validation (10%), and testing (10%) ---
train_df, temp_df = train_test_split(remaining_df, test_size=0.2, stratify=remaining_df['distress'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['distress'], random_state=42)

print(f"\nSize of Training Set: {len(train_df)}")
print("Class balance of 'distress' in Training Set:")
print(train_df['distress'].value_counts(normalize=True))

print(f"\nSize of Validation Set: {len(val_df)}")
print("Class balance of 'distress' in Validation Set:")
print(val_df['distress'].value_counts(normalize=True))

print(f"\nSize of Testing Set: {len(test_df)}")
print("Class balance of 'distress' in Testing Set:")
print(test_df['distress'].value_counts(normalize=True))

# --- Save the splits to CSV files ---
balanced_sample.to_csv('/content/drive/MyDrive/llama_sample_50.csv', index=False)
train_df.to_csv('/content/drive/MyDrive/llama_train.csv', index=False)
val_df.to_csv('/content/drive/MyDrive/llama_val.csv', index=False)
test_df.to_csv('/content/drive/MyDrive/llama_test.csv', index=False)

print("\nData splits saved to Google Drive.")

Class balance of 'distress' before sampling:
distress
0    0.890388
1    0.109612
Name: proportion, dtype: float64

Size of the balanced sample: 50
Class balance of 'distress' in the balanced sample:
distress
0    0.5
1    0.5
Name: proportion, dtype: float64

Size of Training Set: 14425
Class balance of 'distress' in Training Set:
distress
0    0.891438
1    0.108562
Name: proportion, dtype: float64

Size of Validation Set: 1803
Class balance of 'distress' in Validation Set:
distress
0    0.891847
1    0.108153
Name: proportion, dtype: float64

Size of Testing Set: 1804
Class balance of 'distress' in Testing Set:
distress
0    0.891353
1    0.108647
Name: proportion, dtype: float64


  balanced_sample = ground_truth_df.groupby('distress', group_keys=False).apply(lambda x: x.sample(min(len(x), sample_size // ground_truth_df['distress'].nunique()), random_state=42))



Data splits saved to Google Drive.


In [None]:
# --- Load the saved splits ---
train_df = pd.read_csv('/content/drive/MyDrive/llama_train.csv')
val_df = pd.read_csv('/content/drive/MyDrive/llama_val.csv')

from transformers import AutoTokenizer

model_name = "meta-llama/Llama-2-7b-hf"
tokenizer_llama = AutoTokenizer.from_pretrained(model_name)
tokenizer_llama.pad_token = tokenizer_llama.eos_token

def prepare_llama_data(data, tokenizer, max_length=512):
    tokenized_inputs = []
    for index, row in data.iterrows():
        tweet = row['tweet_text']
        distress_answer = 'distress' if row['distress'] == 1 else 'not distress'
        tokenized_inputs.append({
            'prompt': f"You are an emergency detection system... Tweet: {tweet}",
            'target': distress_answer
        })
    processed_data = tokenizer(
        [item['prompt'] for item in tokenized_inputs],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )
    labels = tokenizer([item['target'] for item in tokenized_inputs], truncation=True, padding="max_length", max_length=max_length, return_tensors="pt").input_ids
    labels[labels == tokenizer.pad_token_id] = -100 # Ignore padding tokens for loss

    return {
        'input_ids': processed_data['input_ids'],
        'attention_mask': processed_data['attention_mask'],
        'labels': labels
    }

processed_train_data_llama = prepare_llama_data(train_df, tokenizer_llama)
processed_val_data_llama = prepare_llama_data(val_df, tokenizer_llama)

print(f"Number of processed training examples: {len(train_df)}")
print(f"Number of processed validation examples: {len(val_df)}")
print("First processed training example keys:", processed_train_data_llama.keys())



Number of processed training examples: 14425
Number of processed validation examples: 1803
First processed training example keys: dict_keys(['input_ids', 'attention_mask', 'labels'])


In [None]:
!pip install -U huggingface_hub

Collecting huggingface_hub
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.4/481.4 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.17.3
    Uninstalling huggingface-hub-0.17.3:
      Successfully uninstalled huggingface-hub-0.17.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tokenizers 0.14.1 requires huggingface_hub<0.18,>=0.16.4, but you have huggingface-hub 0.30.2 which is incompatible.
sentence-transformers 3.4.1 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.35.0 which is incompatible.[0m[31m
[0mSuccessfully installed huggingface_hub-0.30.2

In [None]:
from transformers import AutoTokenizer

model_name = "meta-llama/Llama-2-7b-hf"
tokenizer_llama = AutoTokenizer.from_pretrained(model_name)
tokenizer_llama.pad_token = tokenizer_llama.eos_token

def prepare_llama_data(data, tokenizer, max_length=512):
    tokenized_inputs = []
    for index, row in data.iterrows():
        tweet = row['tweet_text']
        distress_answer = 'distress' if row['distress'] == 1 else 'not distress'
        tokenized_inputs.append({
            'prompt': f"You are an emergency detection system... Tweet: {tweet}",
            'target': distress_answer
        })
    processed_data = tokenizer(
        [item['prompt'] for item in tokenized_inputs],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )
    labels = tokenizer([item['target'] for item in tokenized_inputs], truncation=True, padding="max_length", max_length=max_length, return_tensors="pt").input_ids
    labels[labels == tokenizer.pad_token_id] = -100 # Ignore padding tokens for loss

    return {
        'input_ids': processed_data['input_ids'],
        'attention_mask': processed_data['attention_mask'],
        'labels': labels
    }

processed_train_data_llama = prepare_llama_data(train_df, tokenizer_llama)
processed_val_data_llama = prepare_llama_data(val_df, tokenizer_llama)

print(f"Number of processed training examples: {len(train_df)}")
print(f"Number of processed validation examples: {len(val_df)}")
print("First processed training example keys:", processed_train_data_llama.keys())

Number of processed training examples: 14425
Number of processed validation examples: 1803
First processed training example keys: dict_keys(['input_ids', 'attention_mask', 'labels'])


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
import torch

# Define the model name
model_name = "meta-llama/Llama-2-7b-hf"

# Determine the device to use
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer
tokenizer_llama = AutoTokenizer.from_pretrained(model_name)
tokenizer_llama.pad_token = tokenizer_llama.eos_token

# Configure 4-bit quantization (optional, but recommended for memory efficiency)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_enable_fp32_cpu_offload=False,
)

# Load the pre-trained model with the quantization configuration
model_llama = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto",
)
print("LLaMA-2-7b loaded with 4-bit quantization.")

# Define the LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,             # Rank of the LoRA matrices
    lora_alpha=32,       # Scaling factor for the LoRA matrices
    lora_dropout=0.1,   # Dropout probability for LoRA layers
    bias="none",
    target_modules=[    # The names of the modules to apply LoRA to
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "up_proj",
        "down_proj",
        "gate_proj",
    ]
)

# Apply LoRA to the base model
model_lora = get_peft_model(model_llama, lora_config)

model_lora.print_trainable_parameters()

print("\nLoRA adapters applied to the model.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LLaMA-2-7b loaded with 4-bit quantization.
trainable params: 19,988,480 || all params: 6,758,404,096 || trainable%: 0.2958

LoRA adapters applied to the model.


In [None]:
from transformers import TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset

class LlamaDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.n_samples = len(data['input_ids'])

    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        return {key: value[idx] for key, value in self.data.items()}

processed_train_dataset = LlamaDataset(processed_train_data_llama)
processed_val_dataset = LlamaDataset(processed_val_data_llama)

output_dir = "./llama-2-7b-lora-distress"
device = "cuda" if torch.cuda.is_available() else "cpu"

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4, # Adjust as needed
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    fp16=True if device == "cuda" else False,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps", # Match save strategy to eval strategy
    eval_steps=500,      # Evaluate every N steps
    evaluation_strategy="steps", # Set evaluation strategy to "steps"
    save_total_limit=2,
    report_to="tensorboard",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    weight_decay=0.01,
)

trainer = Trainer(
    model=model_lora,
    train_dataset=processed_train_dataset, # Use the new Dataset object
    eval_dataset=processed_val_dataset,   # Use the new Dataset object
    args=training_args,
    data_collator=lambda data: {k: torch.stack([f[k] for f in data]) for k in data[0]},
)

trainer.train()

# --- Save the trained model ---
trainer.save_model("./llama-2-7b-lora-distress-trained")
print("Trained model saved.")

# --- Evaluate on the validation set ---
eval_results = trainer.evaluate()
print("\nEvaluation results on validation set:")
print(eval_results)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss
500,0.2616,0.276083
1000,0.2494,0.26999
1500,0.2529,0.244797
2000,0.2416,0.245865
2500,0.267,0.244845




Trained model saved.



Evaluation results on validation set:
{'eval_loss': 0.24479655921459198, 'eval_runtime': 105.6066, 'eval_samples_per_second': 17.073, 'eval_steps_per_second': 2.14, 'epoch': 3.0}


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# Define the model name
model_name = "meta-llama/Llama-2-7b-hf"
lora_model_path = "./llama-2-7b-lora-distress-trained" # Or the path to your best checkpoint

# Determine the device to use
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the base LLaMA model (with quantization if you used it)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_enable_fp32_cpu_offload=False,
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto",
)

# Load the LoRA adapters
trained_model_lora = PeftModel.from_pretrained(base_model, lora_model_path)
trained_model_lora = trained_model_lora.to(device) # Move to GPU if available

# Load the tokenizer (you'll need this for evaluation and inference)
tokenizer_llama = AutoTokenizer.from_pretrained(model_name)
tokenizer_llama.pad_token = tokenizer_llama.eos_token

print("Trained LoRA model loaded successfully.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Trained LoRA model loaded successfully.


In [None]:
print(f"Length of processed training data: {len(processed_train_data_llama['input_ids'])}")
print(f"Length of processed validation data: {len(processed_val_data_llama['input_ids'])}")

Length of processed training data: 14425
Length of processed validation data: 1803


In [None]:
output_dir = "/content/drive/MyDrive/checkpointfolder"

In [None]:
import shutil
import os

# --- Define the source and destination paths ---
colab_checkpoint_1500_path = "./llama-2-7b-lora-distress/checkpoint-1500"
colab_checkpoint_2500_path = "./llama-2-7b-lora-distress/checkpoint-2500"
drive_checkpoint_root_path = "/content/drive/MyDrive/llama-lora-checkpoints-saved" # Choose a root folder in your Drive

# --- Create the destination root folder if it doesn't exist ---
os.makedirs(drive_checkpoint_root_path, exist_ok=True)

drive_checkpoint_1500_path = os.path.join(drive_checkpoint_root_path, "checkpoint-1500")
drive_checkpoint_2500_path = os.path.join(drive_checkpoint_root_path, "checkpoint-2500")

# --- Copy checkpoint-1500 ---
try:
    if os.path.exists(colab_checkpoint_1500_path):
        shutil.copytree(colab_checkpoint_1500_path, drive_checkpoint_1500_path)
        print(f"Folder '{colab_checkpoint_1500_path}' copied to '{drive_checkpoint_1500_path}' in your Google Drive.")
    else:
        print(f"Warning: Folder '{colab_checkpoint_1500_path}' not found in the current runtime.")
except FileExistsError:
    print(f"Warning: Folder '{drive_checkpoint_1500_path}' already exists in your Google Drive. Skipping copy.")
except Exception as e:
    print(f"Error copying '{colab_checkpoint_1500_path}': {e}")

# --- Copy checkpoint-2500 ---
try:
    if os.path.exists(colab_checkpoint_2500_path):
        shutil.copytree(colab_checkpoint_2500_path, drive_checkpoint_2500_path)
        print(f"Folder '{colab_checkpoint_2500_path}' copied to '{drive_checkpoint_2500_path}' in your Google Drive.")
    else:
        print(f"Warning: Folder '{colab_checkpoint_2500_path}' not found in the current runtime.")
except FileExistsError:
    print(f"Warning: Folder '{drive_checkpoint_2500_path}' already exists in your Google Drive. Skipping copy.")
except Exception as e:
    print(f"Error copying '{colab_checkpoint_2500_path}': {e}")

print("\nAttempted to save checkpoint folders to your Google Drive.")

Folder './llama-2-7b-lora-distress/checkpoint-1500' copied to '/content/drive/MyDrive/llama-lora-checkpoints-saved/checkpoint-1500' in your Google Drive.
Folder './llama-2-7b-lora-distress/checkpoint-2500' copied to '/content/drive/MyDrive/llama-lora-checkpoints-saved/checkpoint-2500' in your Google Drive.

Attempted to save checkpoint folders to your Google Drive.


In [None]:
import shutil
import os

colab_training_args_path = "./llama-2-7b-lora-distress-trained/training_args.bin"
drive_output_root_path = "/content/drive/MyDrive/llama-lora-output-saved" # The root directory
drive_training_args_path = os.path.join(drive_output_root_path, "training_args.bin") # Full path to the file

# Create the destination directory if it doesn't exist
os.makedirs(drive_output_root_path, exist_ok=True)

try:
    if os.path.exists(colab_training_args_path):
        shutil.copy(colab_training_args_path, drive_training_args_path)
        print(f"Training arguments saved to '{drive_training_args_path}' in your Google Drive.")
    else:
        print(f"Warning: Training arguments file '{colab_training_args_path}' not found.")
except Exception as e:
    print(f"Error copying training arguments: {e}")

Training arguments saved to '/content/drive/MyDrive/llama-lora-output-saved/training_args.bin' in your Google Drive.


In [None]:
import shutil
import os

source_dir = "./llama-2-7b-lora-distress-trained"
destination_dir = "/content/drive/MyDrive/llama-lora-output-saved"

# Create the destination directory if it doesn't exist
os.makedirs(destination_dir, exist_ok=True)

files_to_save = ["README.md", "adapter_config.json", "adapter_model.safetensors"]

for filename in files_to_save:
    source_path = os.path.join(source_dir, filename)
    destination_path = os.path.join(destination_dir, filename)
    try:
        if os.path.exists(source_path):
            shutil.copy(source_path, destination_path)
            print(f"File '{filename}' saved to '{destination_path}' in your Google Drive.")
        else:
            print(f"Warning: File '{filename}' not found in the source directory.")
    except Exception as e:
        print(f"Error copying '{filename}': {e}")

print("\nAttempted to save additional training files to your Google Drive.")

File 'README.md' saved to '/content/drive/MyDrive/llama-lora-output-saved/README.md' in your Google Drive.
File 'adapter_config.json' saved to '/content/drive/MyDrive/llama-lora-output-saved/adapter_config.json' in your Google Drive.
File 'adapter_model.safetensors' saved to '/content/drive/MyDrive/llama-lora-output-saved/adapter_model.safetensors' in your Google Drive.

Attempted to save additional training files to your Google Drive.


In [None]:
print("First training example:")
for key, value in processed_train_data_llama.items():
    print(f"{key}: {value[0].shape}")

First training example:
input_ids: torch.Size([512])
attention_mask: torch.Size([512])
labels: torch.Size([512])


In [None]:
data_collator=lambda data: {k: torch.stack([f[k] for f in data]) for k in data[0]}

In [None]:
from transformers import pipeline
from torch.utils.data import Dataset
import torch

# Assuming processed_val_data_llama is a dictionary with 'input_ids'
if isinstance(processed_val_data_llama, dict) and 'input_ids' in processed_val_data_llama:
    # Select a few sample indices from the validation set
    sample_indices = [0, 10, 20, 30, 40]  # Adjust indices as needed

    sample_tweets_from_val = []
    for index in sample_indices:
        input_ids = processed_val_data_llama['input_ids'][index]
        # Decode the input_ids back to text
        tweet_text = tokenizer_llama.decode(input_ids, skip_special_tokens=True)
        sample_tweets_from_val.append(tweet_text)
else:
    print("Error: processed_val_data_llama is not in the expected dictionary format.")
    sample_tweets_from_val = []

if sample_tweets_from_val:
    # Create a pipeline with our fine-tuned model and tokenizer
    pipe = pipeline("text-generation", model=trained_model_lora, tokenizer=tokenizer_llama) # Removed device argument

    generated_responses = {}

    # Instruction 1: Wildfire classifier
    instruction1_responses = []
    instruction1_prompt = "You are a wildfire classifier. Determine if the tweet is explicitly about a wildfire happening in California, not wildfires in general or other locations. Only respond with one word: 'yes' or 'no'. Do not explain. If the tweet references smoke, flames, evacuations, or fire-related events and mentions a California city or region (e.g., LA, San Francisco, Bay Area), respond with 'yes'. Otherwise, respond 'no'. Tweet:"
    for tweet in sample_tweets_from_val:
        prompt = f"{instruction1_prompt} {tweet} Respond with one word: 'yes' or 'no'."
        output = pipe(prompt, max_length=10, num_return_sequences=1)[0]['generated_text']
        instruction1_responses.append(output.split()[-1].lower() if output.split() else "no")
    generated_responses['wildfire_classifier'] = list(zip(sample_tweets_from_val, instruction1_responses))

    # Instruction 2: Emergency detection system (our fine-tuned task)
    instruction2_responses = []
    instruction2_prompt = "You are an emergency detection system. Determine if the tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet:"
    for tweet in sample_tweets_from_val:
        prompt = f"{instruction2_prompt} {tweet} Respond with one word: 'distress' or 'not distress'."
        output = pipe(prompt, max_length=10, num_return_sequences=1)[0]['generated_text']
        instruction2_responses.append(output.split()[-1].lower() if output.split() else "not distress")
    generated_responses['emergency_detection'] = list(zip(sample_tweets_from_val, instruction2_responses))

    # Instruction 3: Geographic location extraction
    instruction3_responses = []
    instruction3_prompt = "Extract the most specific real-world geographic location mentioned in the tweet that refers to where the California wildfire is happening. This can be a city, neighborhood, street, highway, or region. If no valid place is mentioned, respond with 'unknown'. Respond with only the location name, no extra words. Tweet:"
    for tweet in sample_tweets_from_val:
        prompt = f"{instruction3_prompt} {tweet} Respond with only the location name, no extra words."
        output = pipe(prompt, max_length=20, num_return_sequences=1)[0]['generated_text']
        instruction3_responses.append(output.strip() if output else "unknown")
    generated_responses['location_extraction'] = list(zip(sample_tweets_from_val, instruction3_responses))

    # Instruction 4: Disaster response coordinator
    instruction4_responses = []
    instruction4_prompt = "You are a disaster response coordinator. Based on the content of the tweet, recommend the most urgent emergency action that responders should take. Choose only one from the following types: evacuation, medical aid, fire suppression, rescue, or resource delivery. If none of these apply or there's no clear threat, respond with 'monitor only'. Respond with only one action. Tweet:"
    possible_actions = ["evacuation", "medical aid", "fire suppression", "rescue", "resource delivery", "monitor only"]
    for tweet in sample_tweets_from_val:
        prompt = f"{instruction4_prompt} {tweet} Respond with only one action."
        output = pipe(prompt, max_length=20, num_return_sequences=1)[0]['generated_text']
        predicted_action = "monitor only"
        for action in possible_actions:
            if action in output.lower():
                predicted_action = action
                break
        instruction4_responses.append(predicted_action)
    generated_responses['disaster_response'] = list(zip(sample_tweets_from_val, instruction4_responses))

    # Print the generated responses
    for task, results in generated_responses.items():
        print(f"\n--- {task} ---")
        for tweet, response in results:
            print(f"Tweet: '{tweet}' -> Response: '{response}'")

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonForCausalLM', 'PLBartFo


--- wildfire_classifier ---
Tweet: 'You are an emergency detection system... Tweet: premium times buhari commiserates with iran iraq over earthquake' -> Response: 'dist'
Tweet: 'You are an emergency detection system... Tweet: pets orphaned by hurricane harvey headed to san diego the san diegouniontribune' -> Response: 'dist'
Tweet: 'You are an emergency detection system... Tweet: fearful cedants return to market as harvey and irma losses rise' -> Response: 'dist'
Tweet: 'You are an emergency detection system... Tweet: life after maria teachers going to save the day' -> Response: 'dist'
Tweet: 'You are an emergency detection system... Tweet: looks like a garage fire in allston at california' -> Response: 'dist'

--- emergency_detection ---
Tweet: 'You are an emergency detection system... Tweet: premium times buhari commiserates with iran iraq over earthquake' -> Response: 'dist'
Tweet: 'You are an emergency detection system... Tweet: pets orphaned by hurricane harvey headed to san dieg



In [None]:
import pandas as pd

# Read your CSV and force tweet_id to be string
df = pd.read_csv('/content/ground_truth_dataset_with_wildfire.csv', dtype={'tweet_id': str})

# Check
print(df.head())


               tweet_id                  image_id  \
0  9.17791044158185e+17  917791044158185473_0.jpg   
1  9.17791130590183e+17  917791130590183424_0.jpg   
2  9.17791291823591e+17  917791291823591425_0.jpg   
3  9.17791291823591e+17  917791291823591425_1.jpg   
4  9.17792092100988e+17  917792092100988929_0.jpg   

                                      raw_tweet_text  \
0  RT @Gizmodo: Wildfires raging through Northern...   
1  PHOTOS: Deadly wildfires rage in California ht...   
2  RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...   
3  RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...   
4  RT @TIME: California's raging wildfires as you...   

                                          tweet_text tweet_hashtags  \
0  wildfires raging through northern california a...            NaN   
1         photos deadly wildfires rage in california            NaN   
2  pls share were capturing wildfire response rec...            NaN   
3  pls share were capturing wildfire response rec...        

In [None]:
import pandas as pd

# Read the file normally
df = pd.read_csv('/content/ground_truth_dataset_with_wildfire.csv')

# Extract the correct tweet_id from the image_id
df['tweet_id'] = df['image_id'].apply(lambda x: x.split('_')[0])

# Check the first few rows
print(df[['tweet_id', 'image_id']].head())

# Save the fixed version
df.to_csv('/content/ground_truth_dataset_fixed.csv', index=False)


             tweet_id                  image_id
0  917791044158185473  917791044158185473_0.jpg
1  917791130590183424  917791130590183424_0.jpg
2  917791291823591425  917791291823591425_0.jpg
3  917791291823591425  917791291823591425_1.jpg
4  917792092100988929  917792092100988929_0.jpg


In [None]:
import pandas as pd

# Assuming val_df is still loaded and accessible
if 'val_df' in locals():
    sample_indices = [50, 60, 70, 80, 90]
    try:
        sample_ground_truth = val_df.iloc[sample_indices][['tweet_text', 'distress']]
        print("\n--- Ground Truth for Sample Tweets ---")
        for index, row in sample_ground_truth.iterrows():
            print(f"Tweet: '{row['tweet_text']}' -> Distress: {row['distress']}")

        print("\n--- Model Responses ---")
        model_responses = ['dist', 'dist', 'dist', 'dist', 'dist'] # Based on the previous output
        for i in range(len(sample_ground_truth)):
            print(f"Tweet: '{sample_ground_truth.iloc[i]['tweet_text']}' -> Model Response: '{model_responses[i]}'")

    except KeyError:
        print("Error: 'tweet_text' or 'distress' column not found in val_df.")
    except IndexError:
        print("Error: One or more of the sample indices are out of bounds for val_df.")
else:
    print("Error: val_df DataFrame is not currently loaded in the environment.")


--- Ground Truth for Sample Tweets ---
Tweet: 'walleye hold jersey raffle to benefit hurricane victims in puerto rico' -> Distress: 0
Tweet: '#money #irma foxbusiness northrop grumman to buy missile maker orbital for 78b fox' -> Distress: 0
Tweet: 'what its like to ride out ferocious hurricane maria' -> Distress: 0
Tweet: 'can the lessons of harvey save us #equal #time #news' -> Distress: 1
Tweet: 'see alert #9 on hurricane maria issued by the bahamas department of meteorology' -> Distress: 0

--- Model Responses ---
Tweet: 'walleye hold jersey raffle to benefit hurricane victims in puerto rico' -> Model Response: 'dist'
Tweet: '#money #irma foxbusiness northrop grumman to buy missile maker orbital for 78b fox' -> Model Response: 'dist'
Tweet: 'what its like to ride out ferocious hurricane maria' -> Model Response: 'dist'
Tweet: 'can the lessons of harvey save us #equal #time #news' -> Model Response: 'dist'
Tweet: 'see alert #9 on hurricane maria issued by the bahamas department of m

In [None]:
import pandas as pd

# Assuming train_df is loaded
if 'train_df' in locals():
    distress_counts = train_df['distress'].value_counts()
    total_samples = len(train_df)
    distress_percentage = (distress_counts[1] / total_samples) * 100 if 1 in distress_counts else 0
    not_distress_percentage = (distress_counts[0] / total_samples) * 100 if 0 in distress_counts else 0

    print("\n--- Training Data Class Distribution ---")
    print(f"Total Samples: {total_samples}")
    print(f"Distress (1) Count: {distress_counts.get(1, 0)}")
    print(f"Not Distress (0) Count: {distress_counts.get(0, 0)}")
    print(f"Distress Percentage: {distress_percentage:.2f}%")
    print(f"Not Distress Percentage: {not_distress_percentage:.2f}%")
else:
    print("Error: train_df DataFrame is not currently loaded in the environment.")
    print("Please run the code that loads your training data (llama_train.csv) first.")


--- Training Data Class Distribution ---
Total Samples: 14425
Distress (1) Count: 1566
Not Distress (0) Count: 12859
Distress Percentage: 10.86%
Not Distress Percentage: 89.14%


In [None]:
import pandas as pd
import torch

# Load the sample dataset
try:
    sample_df = pd.read_csv("/content/drive/MyDrive/llama_sample_50.csv") # Adjust path if necessary
    print("\nSample dataset loaded successfully.")
except FileNotFoundError:
    print("\nError: llama_sample_50.csv not found. Please check the path.")
    sample_df = None

if sample_df is not None:
    def prepare_llama_data_inference(data, tokenizer, max_length=512):
        tokenized_inputs = []
        for index, row in data.iterrows():
            tweet = row['tweet_text']
            tokenized_inputs.append({
                'prompt': f"You are an emergency detection system. Tweet: {tweet}"
            })
        processed_data = tokenizer(
            [item['prompt'] for item in tokenized_inputs],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )
        return {
            'input_ids': processed_data['input_ids'],
            'attention_mask': processed_data['attention_mask']
        }

    processed_sample_data_llama = prepare_llama_data_inference(sample_df, tokenizer_llama)

    # Generate predictions directly using the model
    predictions = []
    for i in range(len(processed_sample_data_llama['input_ids'])):
        input_ids = processed_sample_data_llama['input_ids'][i].unsqueeze(0).to(trained_model_lora.device)
        attention_mask = processed_sample_data_llama['attention_mask'][i].unsqueeze(0).to(trained_model_lora.device)

        with torch.no_grad():
            output = trained_model_lora.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=10,
                num_return_sequences=1
            )

        generated_text = tokenizer_llama.decode(output[0], skip_special_tokens=True)
        predictions.append(generated_text.split()[-1].lower() if generated_text.split() else "not distress")

    # Display predictions
    print("\n--- Predictions on Sample Dataset ---")
    for i, tweet in enumerate(sample_df['tweet_text']):
        print(f"Tweet: '{tweet}' -> Prediction: '{predictions[i]}'")

    # If the sample_df has a 'distress' column, we can evaluate accuracy here
    if 'distress' in sample_df.columns:
        from sklearn.metrics import accuracy_score
        ground_truth = sample_df['distress'].tolist()
        # Convert 'distress'/'not distress' predictions to 1/0 for comparison
        numerical_predictions = [1 if p == 'distress' or p == 'dist' else 0 for p in predictions]
        accuracy = accuracy_score(ground_truth, numerical_predictions)
        print(f"\nAccuracy on Sample Dataset: {accuracy:.2f}")
    else:
        print("\nNote: 'distress' column not found in sample dataset, so accuracy cannot be calculated.")


Sample dataset loaded successfully.





--- Predictions on Sample Dataset ---
Tweet: 'thieves loot houston home with body of harvey victim still inside' -> Prediction: 'dist'
Tweet: 'pay con mora 3' -> Prediction: 'dist'
Tweet: 'as hurricane irma strengthens puerto rico could face life threatening flash floods' -> Prediction: 'dist'
Tweet: 'dozens of dogs relocated here after irma' -> Prediction: 'dist'
Tweet: 'miami real estate mostly spared from irmas fury industry players say #cre' -> Prediction: 'dist'
Tweet: 'news telsas solar panels going live in puerto rico #funny #hilarious #lol #pics #fun #meme' -> Prediction: 'dist'
Tweet: 'the family by drops at all harvey nichols stores and online at 8am tomorrow #fentybeautyxhn' -> Prediction: 'dist'
Tweet: 'no injuries at this plant city mobile home fire home was being rented out by folks who were out of town because of' -> Prediction: 'dist'
Tweet: '1 scene of sentinel ingested to hdds for event 201709_earthquake_mex #mexicoearthquake' -> Prediction: 'dist'
Tweet: 'november 1

Data Re-evaluation and Splitting.

In [None]:
import pandas as pd

if 'train_df' in locals():
    distress_tweets = train_df[train_df['distress'] == 1]['tweet_text'].sample(n=10, random_state=42)
    print("\n--- Sample of 'Distress' Tweets from Training Data ---")
    for tweet in distress_tweets:
        print(tweet)
else:
    print("Error: train_df not loaded. Please load your training data.")


--- Sample of 'Distress' Tweets from Training Data ---
how the local hispanic community is helping puerto ricorecover
here are some ways you can help harvey relief and irma relief
army north is proud to be a crucial part of the total army effort to help those affected by #hurricanemaria
florida church team who helped in houston returns to help their own city after
prayers for all the people affected n hurting due to #hurricaneharvey f
crews have restored 1818933 #fl customers who lost power from #irma 99465 outages remain county list
back in florida will head home in a couple days missing puerto rico already
do good today stop hogan street by from 11am6pm to donate supplies to help puerto rico after hurricane maria
florida residents receive mixed messages about hurricane irma evacuations
joes can you help us with a rt mexicos earthquake


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

if 'train_df' in locals():
    # Separate distress and not distress tweets
    distress_df = train_df[train_df['distress'] == 1]
    not_distress_df = train_df[train_df['distress'] == 0]

    # Sample 25 from each class
    distress_sample = distress_df.sample(n=25, random_state=42)
    not_distress_sample = not_distress_df.sample(n=25, random_state=42)

    # Combine the balanced sample
    balanced_sample_df = pd.concat([distress_sample, not_distress_sample]).sample(frac=1, random_state=42).reset_index(drop=True)

    print("\n--- Balanced Sample (50 count) ---")
    print(balanced_sample_df['distress'].value_counts())

    # Save the balanced sample (optional)
    balanced_sample_df.to_csv("/content/drive/MyDrive/llama_balanced_sample_50.csv", index=False)
    print("\nBalanced sample saved to /content/drive/MyDrive/llama_balanced_sample_50.csv")

    # Remaining data for splits
    remaining_df = train_df[~train_df.index.isin(balanced_sample_df.index)]

else:
    print("Error: train_df not loaded. Please load your training data.")


--- Balanced Sample (50 count) ---
distress
1    25
0    25
Name: count, dtype: int64

Balanced sample saved to /content/drive/MyDrive/llama_balanced_sample_50.csv


Create a Balanced Sample (50 count)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

if 'train_df' in locals():
    # Separate distress and not distress tweets
    distress_df = train_df[train_df['distress'] == 1]
    not_distress_df = train_df[train_df['distress'] == 0]

    # Sample 25 from each class
    distress_sample = distress_df.sample(n=25, random_state=42)
    not_distress_sample = not_distress_df.sample(n=25, random_state=42)

    # Combine the balanced sample
    balanced_sample_df = pd.concat([distress_sample, not_distress_sample]).sample(frac=1, random_state=42).reset_index(drop=True)

    print("\n--- Balanced Sample (50 count) ---")
    print(balanced_sample_df['distress'].value_counts())

    # Save the balanced sample (optional)
    balanced_sample_df.to_csv("/content/drive/MyDrive/llama_balanced_sample_50.csv", index=False)
    print("\nBalanced sample saved to /content/drive/MyDrive/llama_balanced_sample_50.csv")

    # Remaining data for splits
    remaining_df = train_df[~train_df.index.isin(balanced_sample_df.index)]

else:
    print("Error: train_df not loaded. Please load your training data.")


--- Balanced Sample (50 count) ---
distress
1    25
0    25
Name: count, dtype: int64

Balanced sample saved to /content/drive/MyDrive/llama_balanced_sample_50.csv


 split the remaining training data (which excludes these 50 samples) into training (80%), validation (10%), and testing (10%) sets, ensuring that each split maintains a balanced class distribution as much as possible using stratified sampling.

In [None]:
from sklearn.model_selection import train_test_split

if 'remaining_df' in locals():
    # Perform stratified split for training and a temporary test/val set
    train_df_split, temp_test_val_df = train_test_split(
        remaining_df,
        test_size=0.2,  # 20% for temp test/val
        stratify=remaining_df['distress'],
        random_state=42
    )

    # Split the temporary test/val set into validation and test sets
    val_df_split, test_df_split = train_test_split(
        temp_test_val_df,
        test_size=0.5,  # 50% of temp is 10% of total
        stratify=temp_test_val_df['distress'],
        random_state=42
    )

    print("\n--- Training Set Split ---")
    print(train_df_split['distress'].value_counts(normalize=True))

    print("\n--- Validation Set Split ---")
    print(val_df_split['distress'].value_counts(normalize=True))

    print("\n--- Testing Set Split ---")
    print(test_df_split['distress'].value_counts(normalize=True))

    # Save the splits (optional)
    train_df_split.to_csv("/content/drive/MyDrive/llama_train_balanced.csv", index=False)
    val_df_split.to_csv("/content/drive/MyDrive/llama_val_balanced.csv", index=False)
    test_df_split.to_csv("/content/drive/MyDrive/llama_test_balanced.csv", index=False)
    print("\nBalanced train, validation, and test sets saved to /content/drive/MyDrive/")

else:
    print("Error: remaining_df not found. Please run the previous step.")


--- Training Set Split ---
distress
0    0.891565
1    0.108435
Name: proportion, dtype: float64

--- Validation Set Split ---
distress
0    0.891441
1    0.108559
Name: proportion, dtype: float64

--- Testing Set Split ---
distress
0    0.891516
1    0.108484
Name: proportion, dtype: float64

Balanced train, validation, and test sets saved to /content/drive/MyDrive/


Given that the 'distress' class has significantly fewer samples, oversampling (or a combination) is generally preferred to avoid losing potentially valuable information from the majority class.

Let's proceed with oversampling the 'distress' class in our train_df_split to create a balanced training set for the re-training phase.

In [None]:
import pandas as pd
from sklearn.utils import resample

# Load the balanced training split
try:
    train_df_balanced = pd.read_csv("/content/drive/MyDrive/llama_train_balanced.csv")
except FileNotFoundError:
    print("Error: llama_train_balanced.csv not found. Please ensure the previous step was run.")
    train_df_balanced = None

if train_df_balanced is not None:
    # Separate majority and minority classes
    distress_minority = train_df_balanced[train_df_balanced['distress'] == 1]
    not_distress_majority = train_df_balanced[train_df_balanced['distress'] == 0]

    # Oversample minority class
    distress_oversampled = resample(
        distress_minority,
        replace=True,  # sample with replacement
        n_samples=len(not_distress_majority),  # match majority class
        random_state=42
    )

    # Combine majority class with oversampled minority class
    train_df_oversampled = pd.concat([not_distress_majority, distress_oversampled])

    # Shuffle the oversampled training data
    train_df_oversampled = train_df_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)

    print("\n--- Oversampled Training Set Distribution ---")
    print(train_df_oversampled['distress'].value_counts())

    # Save the oversampled training set (optional)
    train_df_oversampled.to_csv("/content/drive/MyDrive/llama_train_oversampled.csv", index=False)
    print("\nOversampled training set saved to /content/drive/MyDrive/llama_train_oversampled.csv")

else:
    print("Error loading balanced training data.")


--- Oversampled Training Set Distribution ---
distress
1    10253
0    10253
Name: count, dtype: int64

Oversampled training set saved to /content/drive/MyDrive/llama_train_oversampled.csv


Phase 2: Model Re-training.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch
import pandas as pd

# --- Define paths and model names ---
model_name = "meta-llama/Llama-2-7b-hf"
output_dir = "/content/drive/MyDrive/llama-2-7b-lora-distress-balanced"

# --- Determine device ---
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- Load tokenizer ---
tokenizer_llama = AutoTokenizer.from_pretrained(model_name)
tokenizer_llama.pad_token = tokenizer_llama.eos_token

# --- Load base LLaMA model with quantization ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_enable_fp32_cpu_offload=False,
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto",
)

# --- Configure LoRA with dropout ---
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,  # Added dropout
    bias="none",
    task_type="CAUSAL_LM"
)

# --- Add LoRA adapters to the base model ---
model_lora_balanced = get_peft_model(base_model, lora_config)
model_lora_balanced.print_trainable_parameters()

# --- Freeze most base model layers ---
for name, param in model_lora_balanced.named_parameters():
    if "lora_" not in name:
        param.requires_grad = False



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.0622


In [None]:
if 'train_df_oversampled' in locals() and 'tokenizer_llama' in locals():
    def prepare_llama_data_debug(data, tokenizer, max_length=512):
        tokenized_inputs = []
        labels_list = []
        for index, row in data.iterrows():
            tweet = row['tweet_text']
            distress_answer = 1 if row['distress'] == 1 else 0
            tokenized_inputs.append({
                'prompt': f"You are an emergency detection system. Tweet: {tweet}"
            })
            labels_list.append(distress_answer)

        processed_data = tokenizer(
            [item['prompt'] for item in tokenized_inputs],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )
        labels = torch.tensor(labels_list)

        return {
            'input_ids': processed_data['input_ids'],
            'attention_mask': processed_data['attention_mask'],
            'labels': labels
        }

    processed_train_data_debug = prepare_llama_data_debug(train_df_oversampled.head(), tokenizer_llama)
    print("\n--- Debug Output of processed_train_data_debug (First 5 samples) ---")
    for key, value in processed_train_data_debug.items():
        print(f"{key}: {value.shape} \n{value[:2]}\n")

    if 'processed_val_data_llama' in locals():
        print("\n--- Debug Output of processed_val_data_llama (First 2 samples) ---")
        for key, value in processed_val_data_llama.items():
            print(f"{key}: {value.shape} \n{value[:2]}\n")

else:
    print("Error: train_df_oversampled or tokenizer_llama not found.")


--- Debug Output of processed_train_data_debug (First 5 samples) ---
input_ids: torch.Size([5, 512]) 
tensor([[  1, 887, 526,  ...,   2,   2,   2],
        [  1, 887, 526,  ...,   2,   2,   2]])

attention_mask: torch.Size([5, 512]) 
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

labels: torch.Size([5]) 
tensor([1, 0])


--- Debug Output of processed_val_data_llama (First 2 samples) ---
input_ids: torch.Size([1437, 512]) 
tensor([[  1, 887, 526,  ...,   2,   2,   2],
        [  1, 887, 526,  ...,   2,   2,   2]])

attention_mask: torch.Size([1437, 512]) 
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

labels: torch.Size([0]) 
tensor([])



In [None]:
if 'val_df_split' in locals():
    print("\n--- First 5 rows of val_df_split ---")
    print(val_df_split.head())
    print("\n--- 'distress' column value counts in val_df_split ---")
    print(val_df_split['distress'].value_counts())
else:
    print("Error: val_df_split not found.")


--- First 5 rows of val_df_split ---
       tweet_id                  image_id  \
0  9.077799e+17  907779904254734341_0.jpg   
1  9.234752e+17  923475169800200193_0.jpg   
2  9.202494e+17  920249445979181056_0.jpg   
3  9.117325e+17  911732538950873088_0.jpg   
4  9.098030e+17  909802988532912132_0.jpg   

                                      raw_tweet_text  \
0  Possible tornado destroys condo buildings in C...   
1  How St. Joseph And The Blessed Virgin Mary Sav...   
2  California wildfires hit home for Bills rookie...   
3  Magnitude 6.1 aftershock hits Mexico as search...   
4  RT @CNTraveler: During Hurricane Irma, This Fl...   

                                          tweet_text tweet_hashtags  \
0  possible tornado destroys condo buildings in c...            NaN   
1  how st joseph and the blessed virgin mary save...            NaN   
2  california wildfires hit home for bills rookie...          bills   
3  magnitude 61 aftershock hits mexico as search ...            NaN   

 load the oversampled training data and the stratified validation data, and then configure the Trainer with early stopping and model saving.

training on 50 samples balanced dataset

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset

# --- Define paths and model names ---
model_name = "gpt2"
output_dir = "/content/drive/MyDrive/gpt2-lora-distress-sample"
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- Load tokenizer ---
tokenizer_gpt2 = AutoTokenizer.from_pretrained(model_name)
tokenizer_gpt2.pad_token = tokenizer_gpt2.eos_token

# --- Load base GPT-2 model ---
base_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# --- Prepare data for the 50-tweet sample ---
try:
    balanced_sample_df = pd.read_csv("/content/drive/MyDrive/llama_balanced_sample_50.csv")
except FileNotFoundError:
    print("Error: llama_balanced_sample_50.csv not found.")
    balanced_sample_df = None

if balanced_sample_df is not None:
    def prepare_data_for_dataloader_sample(data, tokenizer, max_length=128): # Reduced max_length
        inputs = []
        labels = []
        for index, row in data.iterrows():
            tweet = row['tweet_text']
            distress_answer = 'distress' if row['distress'] == 1 else 'not distress'
            prompt = f"You are an emergency detection system. Tweet: {tweet}"
            tokenized_prompt = tokenizer(prompt, truncation=True, padding="max_length", max_length=max_length, return_tensors="pt")
            tokenized_target = tokenizer(distress_answer, truncation=True, padding="max_length", max_length=max_length, return_tensors="pt")

            inputs.append({
                'input_ids': tokenized_prompt['input_ids'].squeeze(0),
                'attention_mask': tokenized_prompt['attention_mask'].squeeze(0)
            })
            labels.append(tokenized_target['input_ids'].squeeze(0))
        return inputs, labels

    sample_inputs, sample_labels = prepare_data_for_dataloader_sample(balanced_sample_df, tokenizer_gpt2)

    # --- Create DataLoader for the sample ---
    sample_dataset = TensorDataset(torch.stack([item['input_ids'] for item in sample_inputs]), torch.stack([item['attention_mask'] for item in sample_inputs]), torch.stack(sample_labels))
    sample_dataloader = DataLoader(sample_dataset, batch_size=2, shuffle=True)

    # --- Set up Optimizer and Loss ---
    optimizer = torch.optim.AdamW(base_model.parameters(), lr=1e-4)
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100)

    # --- Manual Training Loop on the sample ---
    num_epochs = 5  # Train for a few epochs on the small sample
    base_model.train()
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}")
        for batch in sample_dataloader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            optimizer.zero_grad()
            outputs = base_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            print(f"Training Loss: {loss.item():.4f}", end='\r')

    # --- Save the (sample-trained) model ---
    base_model.save_pretrained(output_dir)
    print(f"\n--- Trained (on sample) model saved to {output_dir} ---")

else:
    print("Error loading the balanced sample data.")

  _torch_pytree._register_pytree_node(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(



Epoch 1
Training Loss: 0.0100
Epoch 2
Training Loss: 0.0381
Epoch 3
Training Loss: 0.0061
Epoch 4
Training Loss: 0.0061
Epoch 5

--- Trained (on sample) model saved to /content/drive/MyDrive/gpt2-lora-distress-sample ---


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report, accuracy_score, f1_score

# --- Define paths and model names ---
model_name = "gpt2"
output_dir = "/content/drive/MyDrive/gpt2-sequence-classifier-manual"
device = "cuda" if torch.cuda.is_available() else "cpu"
num_labels = 2 # 'distress' or 'not distress'

# --- Load tokenizer ---
tokenizer_gpt2 = AutoTokenizer.from_pretrained(model_name)
tokenizer_gpt2.pad_token = tokenizer_gpt2.eos_token
tokenizer_gpt2.padding_side = "right"

# --- Load GPT-2 model for sequence classification ---
model_classifier = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)
model_classifier.config.pad_token_id = tokenizer_gpt2.pad_token_id # Explicitly set pad_token_id in config
model_classifier.train()

# --- Load the training data ---
try:
    train_df = pd.read_csv("/content/drive/MyDrive/llama_train.csv") # Replace with your actual path if different
except FileNotFoundError:
    print("Error: llama_train.csv not found.")
    train_df = None

if train_df is not None:
    # --- Create a balanced sample of 200 tweets for initial training ---
    distress_df = train_df[train_df['distress'] == 1]
    not_distress_df = train_df[train_df['distress'] == 0]
    distress_sample_200 = resample(distress_df, replace=False, n_samples=100, random_state=42)
    not_distress_sample_200 = resample(not_distress_df, replace=False, n_samples=100, random_state=42)
    balanced_sample_200_df = pd.concat([distress_sample_200, not_distress_sample_200]).sample(frac=1, random_state=42).reset_index(drop=True)

    # --- Prepare dataset for DataLoader ---
    class SimpleDatasetManual(torch.utils.data.Dataset):
        def __init__(self, dataframe, tokenizer, max_length=128):
            self.data = dataframe
            self.tokenizer = tokenizer
            self.max_length = max_length

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            row = self.data.iloc[idx]
            tweet = row['tweet_text']
            label = int(row['distress'])
            inputs = self.tokenizer(tweet, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
            return {
                'input_ids': inputs['input_ids'].squeeze(),
                'attention_mask': inputs['attention_mask'].squeeze(),
                'labels': torch.tensor(label)
            }

    train_dataset_manual = SimpleDatasetManual(balanced_sample_200_df, tokenizer_gpt2, max_length=128)
    train_dataloader_manual = DataLoader(train_dataset_manual, batch_size=4, shuffle=True)

    # --- Set up Optimizer and Loss ---
    optimizer = torch.optim.AdamW(model_classifier.parameters(), lr=2e-5)
    loss_fn = torch.nn.CrossEntropyLoss()

    # --- Manual Training Loop ---
    num_epochs = 5
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}")
        for batch in train_dataloader_manual:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            optimizer.zero_grad()
            outputs = model_classifier(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            print(f"Training Loss: {loss.item():.4f}", end='\r')

    print("\n--- Manual Training Loop Finished ---")

else:
    print("Error loading training data.")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1
Training Loss: 0.6255
Epoch 2
Training Loss: 0.3232
Epoch 3
Training Loss: 0.3809
Epoch 4
Training Loss: 0.2059
Epoch 5
Training Loss: 0.0426
--- Manual Training Loop Finished ---


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.utils import resample

# --- Define paths and model names ---
model_name = "gpt2"
output_dir = "/content/drive/MyDrive/gpt2-sequence-classifier-manual"
device = "cuda" if torch.cuda.is_available() else "cpu"
num_labels = 2 # 'distress' or 'not distress'

# --- Load tokenizer ---
tokenizer_gpt2 = AutoTokenizer.from_pretrained(model_name)
tokenizer_gpt2.pad_token = tokenizer_gpt2.eos_token
tokenizer_gpt2.padding_side = "right"

# --- Load GPT-2 model for sequence classification ---
model_classifier = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)
model_classifier.config.pad_token_id = tokenizer_gpt2.pad_token_id
model_classifier.train()

# --- Load the training data ---
try:
    train_df = pd.read_csv("/content/drive/MyDrive/llama_train.csv") # Replace with your actual path if different
except FileNotFoundError:
    print("Error: llama_train.csv not found.")
    train_df = None

if train_df is not None:
    # --- Create a balanced sample of 200 tweets for training and evaluation ---
    distress_df = train_df[train_df['distress'] == 1]
    not_distress_df = train_df[train_df['distress'] == 0]
    distress_sample_200 = resample(distress_df, replace=False, n_samples=100, random_state=42)
    not_distress_sample_200 = resample(not_distress_df, replace=False, n_samples=100, random_state=42)
    balanced_sample_200_df = pd.concat([distress_sample_200, not_distress_sample_200]).sample(frac=1, random_state=42).reset_index(drop=True)

    # --- Prepare dataset for DataLoader ---
    class SimpleDatasetManual(torch.utils.data.Dataset):
        def __init__(self, dataframe, tokenizer, max_length=128):
            self.data = dataframe
            self.tokenizer = tokenizer
            self.max_length = max_length

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            row = self.data.iloc[idx]
            tweet = row['tweet_text']
            label = int(row['distress'])
            inputs = self.tokenizer(tweet, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
            return {
                'input_ids': inputs['input_ids'].squeeze(),
                'attention_mask': inputs['attention_mask'].squeeze(),
                'labels': torch.tensor(label)
            }

    train_dataset_manual = SimpleDatasetManual(balanced_sample_200_df, tokenizer_gpt2, max_length=128)
    train_dataloader_manual = DataLoader(train_dataset_manual, batch_size=4, shuffle=True)

    # --- Set up Optimizer and Loss ---
    optimizer = torch.optim.AdamW(model_classifier.parameters(), lr=2e-5)
    loss_fn = torch.nn.CrossEntropyLoss()

    # --- Manual Training Loop ---
    num_epochs = 5
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}")
        for batch in train_dataloader_manual:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            optimizer.zero_grad()
            outputs = model_classifier(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            print(f"Training Loss: {loss.item():.4f}", end='\r')

    print("\n--- Manual Training Loop Finished ---")

    # --- Save the trained model and tokenizer ---
    model_classifier.save_pretrained(output_dir)
    tokenizer_gpt2.save_pretrained(output_dir)
    print(f"\n--- Trained model and tokenizer saved to {output_dir} ---")

else:
    print("Error loading training data.")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1
Training Loss: 0.5498
Epoch 2
Training Loss: 0.4416
Epoch 3
Training Loss: 0.2102
Epoch 4
Training Loss: 0.5181
Epoch 5
Training Loss: 0.0983
--- Manual Training Loop Finished ---

--- Trained model and tokenizer saved to /content/drive/MyDrive/gpt2-sequence-classifier-manual ---


In [None]:
from sklearn.metrics import classification_report, accuracy_score, f1_score
from torch.utils.data import DataLoader

# --- Prepare the 200-tweet balanced sample for evaluation ---
class EvalTrainDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        tweet = row['tweet_text']
        label = int(row['distress'])
        inputs = self.tokenizer(tweet, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
        return {
            'input_ids': inputs['input_ids'].squeeze().to(device),
            'attention_mask': inputs['attention_mask'].squeeze().to(device),
            'labels': torch.tensor(label).to(device)
        }

eval_train_dataset = EvalTrainDataset(balanced_sample_200_df, tokenizer_gpt2, max_length=128)
eval_train_dataloader = DataLoader(eval_train_dataset, batch_size=4)

# --- Evaluate the model on the training sample ---
model_classifier.eval()
train_predictions = []
train_actual_labels = []

with torch.no_grad():
    for batch in eval_train_dataloader:
        outputs = model_classifier(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=-1)
        train_predictions.extend(predicted_labels.cpu().numpy())
        train_actual_labels.extend(batch['labels'].cpu().numpy())

print("\n--- Evaluation on 200-Tweet Training Sample ---")
print(classification_report(train_actual_labels, train_predictions))
print(f"Accuracy: {accuracy_score(train_actual_labels, train_predictions)}")
print(f"F1-Score: {f1_score(train_actual_labels, train_predictions)}")


--- Evaluation on 200-Tweet Training Sample ---
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       100
           1       0.97      1.00      0.99       100

    accuracy                           0.98       200
   macro avg       0.99      0.98      0.98       200
weighted avg       0.99      0.98      0.98       200

Accuracy: 0.985
F1-Score: 0.9852216748768473


evaluating on test set of 200 balanced samples

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import torch
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report, accuracy_score, f1_score

# --- Define paths and model names ---
model_name = "gpt2"
output_dir = "/content/drive/MyDrive/gpt2-sequence-classifier-manual"
device = "cuda" if torch.cuda.is_available() else "cpu"
num_labels = 2

# --- Load tokenizer ---
tokenizer_gpt2 = AutoTokenizer.from_pretrained(model_name)
tokenizer_gpt2.pad_token = tokenizer_gpt2.eos_token
tokenizer_gpt2.padding_side = "right"

# --- Load the fine-tuned GPT-2 model ---
config = AutoConfig.from_pretrained(output_dir)
model_eval = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=output_dir, config=config).to(device)
model_eval.eval()

# --- Load the test data ---
try:
    test_df = pd.read_csv("/content/drive/MyDrive/llama_test_balanced.csv")
except FileNotFoundError:
    print("Error: llama_test_balanced.csv not found.")
    test_df = None


if test_df is not None:
    # --- Prepare dataset for evaluation ---
    class EvalDataset(torch.utils.data.Dataset):
        def __init__(self, dataframe, tokenizer, max_length=128):
            self.data = dataframe
            self.tokenizer = tokenizer
            self.max_length = max_length

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            row = self.data.iloc[idx]
            tweet = row['tweet_text']
            label = int(row['distress'])
            inputs = self.tokenizer(tweet, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
            return {
                'input_ids': inputs['input_ids'].squeeze(),
                'attention_mask': inputs['attention_mask'].squeeze(),
                'labels': torch.tensor(label)
            }

    eval_dataset = EvalDataset(test_df, tokenizer_gpt2, max_length=128)
    eval_dataloader = DataLoader(eval_dataset, batch_size=4)

    # --- Evaluate the model ---
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in eval_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model_eval(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predicted_labels = torch.argmax(logits, dim=-1)
            predictions.extend(predicted_labels.cpu().numpy())
            actual_labels.extend(labels.cpu().numpy())

    print("\n--- Evaluation on Held-Out Test Set ---")
    print(classification_report(actual_labels, predictions))
    print(f"Accuracy: {accuracy_score(actual_labels, predictions)}")
    print(f"F1-Score: {f1_score(actual_labels, predictions)}")

else:
    print("Error loading test data.")




--- Evaluation on Held-Out Test Set ---
              precision    recall  f1-score   support

           0       0.96      0.73      0.83      1282
           1       0.26      0.78      0.39       156

    accuracy                           0.73      1438
   macro avg       0.61      0.75      0.61      1438
weighted avg       0.89      0.73      0.78      1438

Accuracy: 0.7336578581363005
F1-Score: 0.38915470494417864


In [None]:
# --- Save the trained model and tokenizer (again, for safety) ---
model_eval.save_pretrained("/content/drive/MyDrive/gpt2-sequence-classifier-manual")
tokenizer_gpt2.save_pretrained("/content/drive/MyDrive/gpt2-sequence-classifier-manual")

print("\n--- Model and tokenizer saved to /content/drive/MyDrive/gpt2-sequence-classifier-manual ---")


--- Model and tokenizer saved to /content/drive/MyDrive/gpt2-sequence-classifier-manual ---


In [None]:
evaluation_summary = """
--- Evaluation on Held-Out Test Set (GPT-2 on 200-tweet training sample) ---

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.73      0.83      1282
           1       0.26      0.78      0.39       156

Accuracy: 0.7337
F1-Score (Class 1 - Distress): 0.3892
"""

output_file_path = "/content/drive/MyDrive/gpt2_distress_classification_results.txt"

with open(output_file_path, "w") as f:
    f.write(evaluation_summary)

print(f"\n--- Evaluation results saved to {output_file_path} ---")


--- Evaluation results saved to /content/drive/MyDrive/gpt2_distress_classification_results.txt ---


In [None]:
instruction = "You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: "

instruction_response_pairs = []
for index, row in balanced_sample_200_df.iterrows():
    tweet_text = row['tweet_text']
    distress_label = row['distress']
    instruction_text = instruction + tweet_text
    response = "distress" if distress_label == 1 else "not distress"
    instruction_response_pairs.append({"instruction": instruction_text, "response": response})

# Print the first few examples to verify
for i in range(5):
    print(f"Example {i+1}:")
    print(f"Instruction: {instruction_response_pairs[i]['instruction']}")
    print(f"Response: {instruction_response_pairs[i]['response']}")
    print("-" * 20)

Example 1:
Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: elementary schools and food pantry collaborate to help puerto rico
Response: distress
--------------------
Example 2:
Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: jewis

In [None]:
from transformers import AutoTokenizer

model_name = "gpt2"
tokenizer_gpt2 = AutoTokenizer.from_pretrained(model_name)
tokenizer_gpt2.pad_token = tokenizer_gpt2.eos_token

def tokenize_instruction_response(pair):
    instruction = pair['instruction']
    response = pair['response']
    tokenized_instruction = tokenizer_gpt2(instruction, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
    tokenized_response = tokenizer_gpt2(response, truncation=True, padding='max_length', max_length=16, return_tensors='pt') # Shorter max length for response
    return {
        'input_ids': tokenized_instruction['input_ids'].squeeze(),
        'attention_mask': tokenized_instruction['attention_mask'].squeeze(),
        'labels': tokenized_response['input_ids'].squeeze()
    }

tokenized_data = [tokenize_instruction_response(pair) for pair in instruction_response_pairs]

# Print the first tokenized example to verify
print("First Tokenized Example:")
print("Input IDs:", tokenized_data[0]['input_ids'])
print("Attention Mask:", tokenized_data[0]['attention_mask'])
print("Labels:", tokenized_data[0]['labels'])



First Tokenized Example:
Input IDs: tensor([ 1639,   389,   281,  6334, 13326,  1080,    13, 45559,  3810,   611,
          262,  1708,  6126,  4084,  9217,   326,  2130,   318,   287,  3514,
           11, 13640,    11,  6686,    11, 20623,  1037,    11,   393,  6476,
          257,  1204,    12, 26159,  6334,    13,  5514,  3031,   351,   530,
         1573,    25,   705, 17080,   601,     6,   393,   705,  1662, 17087,
         4458,  2141,   407,  4727,    13, 24205,  1039,   351,  7016,  8216,
          389,   407,  1576,   532,   691,  3853,   705, 17080,   601,     6,
          611,   612,   338,   257,  1598,    11, 18039,   761,   329,  1037,
           13,  1002, 22147,    11,  4277,   284,   705,  1662, 17087,  4458,
        18752,    25, 19823,  4266,   290,  2057, 15857,   563, 30081,   284,
         1037, 47574, 13806,   374,  3713, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256,

In [None]:
import torch

class InstructionResponseDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_data):
        self.tokenized_data = tokenized_data

    def __len__(self):
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        return self.tokenized_data[idx]

instruction_response_dataset = InstructionResponseDataset(tokenized_data)
train_dataloader = torch.utils.data.DataLoader(instruction_response_dataset, batch_size=4, shuffle=True)

# Verify the dataloader
first_batch = next(iter(train_dataloader))
print("\nFirst Batch from DataLoader:")
print("Input IDs shape:", first_batch['input_ids'].shape)
print("Attention Mask shape:", first_batch['attention_mask'].shape)
print("Labels shape:", first_batch['labels'].shape)


First Batch from DataLoader:
Input IDs shape: torch.Size([4, 128])
Attention Mask shape: torch.Size([4, 128])
Labels shape: torch.Size([4, 16])


In [None]:
from transformers import AutoModelForCausalLM
import torch

model_name = "gpt2"
model_causal = AutoModelForCausalLM.from_pretrained(model_name).to(device)
model_causal.resize_token_embeddings(len(tokenizer_gpt2)) # Ensure proper embedding size

optimizer = torch.optim.AdamW(model_causal.parameters(), lr=5e-5) # You might need to experiment with the learning rate
loss_fn = torch.nn.CrossEntropyLoss()

# Freeze the base layers (optional, but can save memory and speed up training initially)
for name, param in model_causal.named_parameters():
    if "ln_" not in name and "wpe" not in name and "wte" not in name and "score" not in name:
        param.requires_grad = False

In [None]:
from transformers import AutoTokenizer
import torch

model_name = "gpt2"
tokenizer_gpt2 = AutoTokenizer.from_pretrained(model_name)
tokenizer_gpt2.pad_token = tokenizer_gpt2.eos_token

def tokenize_instruction_response_for_causal_lm(pair):
    instruction = pair['instruction']
    response = pair['response']
    combined_text = instruction + " " + response + tokenizer_gpt2.eos_token
    tokenized_input = tokenizer_gpt2(combined_text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
    input_ids = tokenized_input['input_ids'].squeeze()
    attention_mask = tokenized_input['attention_mask'].squeeze()
    labels = input_ids.clone() # Initially, labels are the same as input IDs

    # Mask the instruction part of the labels
    instruction_len = len(tokenizer_gpt2(instruction, truncation=True)['input_ids'])
    labels[:instruction_len] = -100

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

tokenized_data_causal = [tokenize_instruction_response_for_causal_lm(pair) for pair in instruction_response_pairs]

class InstructionResponseCausalLMDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_data):
        self.tokenized_data = tokenized_data

    def __len__(self):
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        return self.tokenized_data[idx]

instruction_response_dataset_causal = InstructionResponseCausalLMDataset(tokenized_data_causal)
train_dataloader_causal = torch.utils.data.DataLoader(instruction_response_dataset_causal, batch_size=4, shuffle=True)

# --- Training Loop (Revised) ---
num_epochs = 3
logging_interval = 50

model_causal.train()

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    total_loss = 0
    for step, batch in enumerate(train_dataloader_causal):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model_causal(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        if (step + 1) % logging_interval == 0:
            avg_loss = total_loss / logging_interval
            print(f"Step {step+1}/{len(train_dataloader_causal)}, Average Loss: {avg_loss:.4f}")
            total_loss = 0

print("\n--- Instruction Fine-tuning Finished (Revised) ---")

# Save the fine-tuned model (Revised path to avoid overwriting)
output_dir_instruction_tuned_causal = "/content/drive/MyDrive/gpt2-instruction-tuned-causal"
model_causal.save_pretrained(output_dir_instruction_tuned_causal)
tokenizer_gpt2.save_pretrained(output_dir_instruction_tuned_causal)
print(f"\n--- Instruction-tuned model and tokenizer saved to {output_dir_instruction_tuned_causal} ---")


Epoch 1/3
Step 50/50, Average Loss: 7.9315

Epoch 2/3
Step 50/50, Average Loss: 1.1594

Epoch 3/3
Step 50/50, Average Loss: 0.3012

--- Instruction Fine-tuning Finished (Revised) ---

--- Instruction-tuned model and tokenizer saved to /content/drive/MyDrive/gpt2-instruction-tuned-causal ---


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import torch

# --- Load the fine-tuned model and tokenizer ---
output_dir_instruction_tuned_causal = "/content/drive/MyDrive/gpt2-instruction-tuned-causal"
model_eval_instruction_tuned = AutoModelForCausalLM.from_pretrained(output_dir_instruction_tuned_causal).to(device)
tokenizer_eval_instruction_tuned = AutoTokenizer.from_pretrained(output_dir_instruction_tuned_causal)
tokenizer_eval_instruction_tuned.pad_token = tokenizer_eval_instruction_tuned.eos_token

# --- Load the test data ---
try:
    test_df = pd.read_csv("/content/drive/MyDrive/llama_test_balanced.csv")
except FileNotFoundError:
    print("Error: llama_test_balanced.csv not found.")
    test_df = None

if test_df is not None:
    # --- Prepare evaluation instructions ---
    evaluation_instructions = []
    for index, row in test_df.iterrows():
        tweet_text = row['tweet_text']
        instruction = "You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: "
        instruction_text = instruction + tweet_text
        evaluation_instructions.append(instruction_text)

    # --- Prepare actual labels ---
    actual_labels = test_df['distress'].tolist()

    print(f"Prepared {len(evaluation_instructions)} evaluation instructions.")
else:
    print("Error loading test data, cannot prepare evaluation.")

Prepared 1438 evaluation instructions.


In [None]:
if test_df is not None:
    predicted_responses = []
    model_eval_instruction_tuned.eval() # Set the model to evaluation mode
    with torch.no_grad():
        for instruction in evaluation_instructions:
            input_ids = tokenizer_eval_instruction_tuned(instruction, return_tensors="pt", truncation=True, max_length=128).to(device)
            output = model_eval_instruction_tuned.generate(
                input_ids=input_ids['input_ids'],
                attention_mask=input_ids['attention_mask'],
                max_new_tokens=10, # Generate up to 10 new tokens for the response
                num_beams=4,
                early_stopping=True
            )
            generated_response = tokenizer_eval_instruction_tuned.decode(output[:, input_ids['input_ids'].shape[1]:][0], skip_special_tokens=True)
            predicted_responses.append(generated_response.strip().lower())

    print(f"Generated {len(predicted_responses)} responses.")

    # --- Evaluate the generated responses ---
    predicted_labels = []
    for response in predicted_responses:
        if "distress" in response:
            predicted_labels.append(1)
        elif "not distress" in response:
            predicted_labels.append(0)
        else:
            # Handle cases where the response is not as expected
            predicted_labels.append(-1) # Or some other indicator

    from sklearn.metrics import classification_report, accuracy_score, f1_score

    # Filter out any invalid predictions
    valid_predictions = [p for i, p in enumerate(predicted_labels) if p != -1]
    valid_actual_labels = [actual_labels[i] for i, p in enumerate(predicted_labels) if p != -1]

    if valid_predictions:
        print("\n--- Evaluation of Instruction-Tuned GPT-2 ---")
        print(classification_report(valid_actual_labels, valid_predictions))
        print(f"Accuracy: {accuracy_score(valid_actual_labels, valid_predictions)}")
        print(f"F1-Score: {f1_score(valid_actual_labels, valid_predictions)}")
    else:
        print("\n--- No valid predictions generated for evaluation ---")

else:
    print("Test data not loaded, cannot evaluate.")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Generated 1438 responses.

--- Evaluation of Instruction-Tuned GPT-2 ---
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        30
           1       0.06      1.00      0.12         2

    accuracy                           0.06        32
   macro avg       0.03      0.50      0.06        32
weighted avg       0.00      0.06      0.01        32

Accuracy: 0.0625
F1-Score: 0.11764705882352941


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


500 samples

In [None]:
# --- Create a balanced sample of 500 tweets for training ---
if train_df is not None:
    distress_df = train_df[train_df['distress'] == 1]
    not_distress_df = train_df[train_df['distress'] == 0]
    distress_sample_500 = resample(distress_df, replace=False, n_samples=250, random_state=42)
    not_distress_sample_500 = resample(not_distress_df, replace=False, n_samples=250, random_state=42)
    balanced_sample_500_df = pd.concat([distress_sample_500, not_distress_sample_500]).sample(frac=1, random_state=42).reset_index(drop=True)

    print(f"Created a balanced training sample of {len(balanced_sample_500_df)} tweets.")

    # --- Prepare instruction-response pairs for the 500-tweet sample ---
    instruction = "You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: "
    instruction_response_pairs_500 = []
    for index, row in balanced_sample_500_df.iterrows():
        tweet_text = row['tweet_text']
        distress_label = row['distress']
        instruction_text = instruction + tweet_text
        response = "distress" if distress_label == 1 else "not distress"
        instruction_response_pairs_500.append({"instruction": instruction_text, "response": response})

    print(f"Prepared {len(instruction_response_pairs_500)} instruction-response pairs.")

    # --- Tokenize the 500-tweet data ---
    def tokenize_instruction_response_for_causal_lm(pair):
        instruction = pair['instruction']
        response = pair['response']
        combined_text = instruction + " " + response + tokenizer_gpt2.eos_token
        tokenized_input = tokenizer_gpt2(combined_text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
        input_ids = tokenized_input['input_ids'].squeeze()
        attention_mask = tokenized_input['attention_mask'].squeeze()
        labels = input_ids.clone()
        instruction_len = len(tokenizer_gpt2(instruction, truncation=True)['input_ids'])
        labels[:instruction_len] = -100
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

    tokenized_data_500 = [tokenize_instruction_response_for_causal_lm(pair) for pair in instruction_response_pairs_500]

    class InstructionResponseCausalLMDataset(torch.utils.data.Dataset):
        def __init__(self, tokenized_data):
            self.tokenized_data = tokenized_data

        def __len__(self):
            return len(self.tokenized_data)

        def __getitem__(self, idx):
            return self.tokenized_data[idx]

    instruction_response_dataset_500 = InstructionResponseCausalLMDataset(tokenized_data_500)
    train_dataloader_500 = torch.utils.data.DataLoader(instruction_response_dataset_500, batch_size=4, shuffle=True)

    print("Prepared DataLoader for the 500-tweet training sample.")

else:
    print("Training data not loaded, cannot prepare 500-tweet sample.")

Created a balanced training sample of 500 tweets.
Prepared 500 instruction-response pairs.
Prepared DataLoader for the 500-tweet training sample.


In [None]:
num_epochs = 50
logging_interval = 50

model_causal.train()

print(f"\n--- Starting Instruction Fine-tuning on 500 Tweets for {num_epochs} Epochs ---")

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    total_loss = 0
    for step, batch in enumerate(train_dataloader_500):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model_causal(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        if (step + 1) % logging_interval == 0:
            avg_loss = total_loss / logging_interval
            print(f"Step {step+1}/{len(train_dataloader_500)}, Average Loss: {avg_loss:.4f}")
            total_loss = 0

print("\n--- Instruction Fine-tuning Finished (50 Epochs on 500 Tweets) ---")

# Save the fine-tuned model
output_dir_instruction_tuned_500 = "/content/drive/MyDrive/gpt2-instruction-tuned-500"
model_causal.save_pretrained(output_dir_instruction_tuned_500)
tokenizer_gpt2.save_pretrained(output_dir_instruction_tuned_500)
print(f"\n--- Instruction-tuned model and tokenizer saved to {output_dir_instruction_tuned_500} ---")


--- Starting Instruction Fine-tuning on 500 Tweets for 50 Epochs ---

Epoch 1/50
Step 50/125, Average Loss: 0.2385
Step 100/125, Average Loss: 0.1746

Epoch 2/50
Step 50/125, Average Loss: 0.1383
Step 100/125, Average Loss: 0.1173

Epoch 3/50
Step 50/125, Average Loss: 0.1017
Step 100/125, Average Loss: 0.0938

Epoch 4/50
Step 50/125, Average Loss: 0.0752
Step 100/125, Average Loss: 0.0794

Epoch 5/50
Step 50/125, Average Loss: 0.0701
Step 100/125, Average Loss: 0.0687

Epoch 6/50
Step 50/125, Average Loss: 0.0641
Step 100/125, Average Loss: 0.0607

Epoch 7/50
Step 50/125, Average Loss: 0.0548
Step 100/125, Average Loss: 0.0623

Epoch 8/50
Step 50/125, Average Loss: 0.0567
Step 100/125, Average Loss: 0.0488

Epoch 9/50
Step 50/125, Average Loss: 0.0518
Step 100/125, Average Loss: 0.0473

Epoch 10/50
Step 50/125, Average Loss: 0.0500
Step 100/125, Average Loss: 0.0481

Epoch 11/50
Step 50/125, Average Loss: 0.0407
Step 100/125, Average Loss: 0.0469

Epoch 12/50
Step 50/125, Average Los

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import torch
from sklearn.metrics import classification_report, accuracy_score, f1_score

# --- Load the fine-tuned model and tokenizer (500 tweets) ---
output_dir_instruction_tuned_500 = "/content/drive/MyDrive/gpt2-instruction-tuned-500"
model_eval_instruction_tuned_500 = AutoModelForCausalLM.from_pretrained(output_dir_instruction_tuned_500).to(device)
tokenizer_eval_instruction_tuned_500 = AutoTokenizer.from_pretrained(output_dir_instruction_tuned_500)
tokenizer_eval_instruction_tuned_500.pad_token = tokenizer_eval_instruction_tuned_500.eos_token

# --- Load the test data (if not already loaded) ---
if 'test_df' not in locals():
    try:
        test_df = pd.read_csv("/content/drive/MyDrive/llama_test_balanced.csv")
    except FileNotFoundError:
        print("Error: llama_test_balanced.csv not found.")
        test_df = None

if test_df is not None:
    # --- Prepare evaluation instructions ---
    evaluation_instructions = []
    for index, row in test_df.iterrows():
        tweet_text = row['tweet_text']
        instruction = "You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: "
        instruction_text = instruction + tweet_text
        evaluation_instructions.append(instruction_text)

    # --- Prepare actual labels ---
    actual_labels = test_df['distress'].tolist()

    # --- Generate responses ---
    predicted_responses = []
    model_eval_instruction_tuned_500.eval()
    with torch.no_grad():
        for instruction in evaluation_instructions:
            input_ids = tokenizer_eval_instruction_tuned_500(instruction, return_tensors="pt", truncation=True, max_length=128).to(device)
            output = model_eval_instruction_tuned_500.generate(
                input_ids=input_ids['input_ids'],
                attention_mask=input_ids['attention_mask'],
                max_new_tokens=10,
                num_beams=4,
                early_stopping=True
            )
            generated_response = tokenizer_eval_instruction_tuned_500.decode(output[:, input_ids['input_ids'].shape[1]:][0], skip_special_tokens=True)
            predicted_responses.append(generated_response.strip().lower())

    # --- Evaluate the generated responses ---
    predicted_labels = []
    for response in predicted_responses:
        if "distress" in response:
            predicted_labels.append(1)
        elif "not distress" in response:
            predicted_labels.append(0)
        else:
            predicted_labels.append(-1)

    valid_predictions = [p for i, p in enumerate(predicted_labels) if p != -1]
    valid_actual_labels = [actual_labels[i] for i, p in enumerate(predicted_labels) if p != -1]

    if valid_predictions:
        print("\n--- Evaluation of Instruction-Tuned GPT-2 (500 Tweets, 50 Epochs) ---")
        print(classification_report(valid_actual_labels, valid_predictions))
        print(f"Accuracy: {accuracy_score(valid_actual_labels, valid_predictions)}")
        print(f"F1-Score: {f1_score(valid_actual_labels, valid_predictions)}")
    else:
        print("\n--- No valid predictions generated for evaluation ---")

else:
    print("Test data not loaded, cannot evaluate.")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene


--- Evaluation of Instruction-Tuned GPT-2 (500 Tweets, 50 Epochs) ---
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1273
           1       0.11      1.00      0.20       156

    accuracy                           0.11      1429
   macro avg       0.05      0.50      0.10      1429
weighted avg       0.01      0.11      0.02      1429

Accuracy: 0.10916724982505248
F1-Score: 0.1968454258675079


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# --- Check the class distribution in the 500-tweet training sample ---
if 'balanced_sample_500_df' in locals():
    print("\n--- Class Distribution in balanced_sample_500_df (Training Data) ---")
    print(balanced_sample_500_df['distress'].value_counts())
else:
    print("\n--- balanced_sample_500_df not found ---")

# --- Check the class distribution in the test data ---
if 'test_df' in locals():
    print("\n--- Class Distribution in test_df (Evaluation Data) ---")
    print(test_df['distress'].value_counts())
else:
    print("\n--- test_df not found ---")


--- Class Distribution in balanced_sample_500_df (Training Data) ---
distress
0    250
1    250
Name: count, dtype: int64

--- Class Distribution in test_df (Evaluation Data) ---
distress
0    1282
1     156
Name: count, dtype: int64


In [None]:
if 'balanced_test_df' in locals():
    for i in range(10):
        instruction = evaluation_instructions_balanced[i]
        input_ids = tokenizer_eval_instruction_tuned_500(instruction, return_tensors="pt", truncation=True, max_length=128).to(device)
        output = model_eval_instruction_tuned_500.generate(
            input_ids=input_ids['input_ids'],
            attention_mask=input_ids['attention_mask'],
            max_new_tokens=10,
            num_beams=4,
            early_stopping=True
        )
        generated_response = tokenizer_eval_instruction_tuned_500.decode(output[:, input_ids['input_ids'].shape[1]:][0], skip_special_tokens=True)
        actual_label = "distress" if actual_labels_balanced[i] == 1 else "not distress"
        print(f"Instruction: {instruction}")
        print(f"Generated Response: {generated_response.strip().lower()}")
        print(f"Actual Label: {actual_label}")
        print("-" * 30)
else:
    print("Balanced test data not found.")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: #caneel bay before #irma july 2017 view from our room which was destroyed
Generated Response: not distress
Actual Label: not distress
------------------------------
Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, def

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: how to help the victims of hurricane maria has been published on beautiful life
Generated Response: distress
Actual Label: distress
------------------------------
Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, defau

In [None]:
if 'balanced_test_df' in locals():
    for i in range(10):
        instruction = evaluation_instructions_balanced[i]
        input_ids = tokenizer_eval_instruction_tuned_500(instruction, return_tensors="pt", truncation=True, max_length=128).to(device)
        output = model_eval_instruction_tuned_500.generate(
            input_ids=input_ids['input_ids'],
            attention_mask=input_ids['attention_mask'],
            max_new_tokens=10,
            num_beams=4,
            early_stopping=True
        )
        generated_response = tokenizer_eval_instruction_tuned_500.decode(output[:, input_ids['input_ids'].shape[1]:][0], skip_special_tokens=True)
        actual_label = "distress" if actual_labels_balanced[i] == 1 else "not distress"
        print(f"Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'.")
        print(f"Tweet: {balanced_test_df['tweet_text'].iloc[i]}")
        print(f"Generated Response: {generated_response.strip().lower()}")
        print(f"Actual Label: {actual_label}")
        print("-" * 30)
else:
    print("Balanced test data not found.")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'.
Tweet: #caneel bay before #irma july 2017 view from our room which was destroyed
Generated Response: not distress
Actual Label: not distress
------------------------------
Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, def

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'.
Tweet: how to help the victims of hurricane maria has been published on beautiful life
Generated Response: distress
Actual Label: distress
------------------------------
Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, defau

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import torch
from sklearn.metrics import classification_report, accuracy_score, f1_score

# --- Load the fine-tuned model and tokenizer (500 tweets) ---
output_dir_instruction_tuned_500 = "/content/drive/MyDrive/gpt2-instruction-tuned-500"
model_eval_instruction_tuned_500 = AutoModelForCausalLM.from_pretrained(output_dir_instruction_tuned_500).to(device)
tokenizer_eval_instruction_tuned_500 = AutoTokenizer.from_pretrained(output_dir_instruction_tuned_500)
tokenizer_eval_instruction_tuned_500.pad_token = tokenizer_eval_instruction_tuned_500.eos_token

# --- Load the balanced test data (if not already created) ---
if 'balanced_test_df' not in locals():
    if 'test_df' in locals():
        distress_test_df = test_df[test_df['distress'] == 1]
        not_distress_test_df = test_df[test_df['distress'] == 0]
        not_distress_sample_test_df = resample(not_distress_test_df, replace=False, n_samples=len(distress_test_df), random_state=42)
        balanced_test_df = pd.concat([distress_test_df, not_distress_sample_test_df]).sample(frac=1, random_state=42).reset_index(drop=True)

        evaluation_instructions_balanced = []
        actual_labels_balanced = balanced_test_df['distress'].tolist()

        instruction = "You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: "
        for index, row in balanced_test_df.iterrows():
            tweet_text = row['tweet_text']
            instruction_text = instruction + tweet_text
            evaluation_instructions_balanced.append(instruction_text)
    else:
        print("Test data not loaded, cannot perform balanced evaluation.")

if 'balanced_test_df' in locals():
    # --- Generate responses on the balanced test set ---
    predicted_responses_balanced = []
    model_eval_instruction_tuned_500.eval()
    with torch.no_grad():
        for instruction in evaluation_instructions_balanced:
            input_ids = tokenizer_eval_instruction_tuned_500(instruction, return_tensors="pt", truncation=True, max_length=128).to(device)
            output = model_eval_instruction_tuned_500.generate(
                input_ids=input_ids['input_ids'],
                attention_mask=input_ids['attention_mask'],
                max_new_tokens=10,
                num_beams=4,
                early_stopping=True
            )
            generated_response = tokenizer_eval_instruction_tuned_500.decode(output[:, input_ids['input_ids'].shape[1]:][0], skip_special_tokens=True)
            predicted_responses_balanced.append(generated_response.strip().lower())

    # --- Evaluate on the balanced test set (revised to take only the first word) ---
    predicted_labels_balanced = []
    for response in predicted_responses_balanced:
        first_word = response.split()[0] if response.split() else ""
        if first_word == "distress":
            predicted_labels_balanced.append(1)
        elif first_word == "not":
            predicted_labels_balanced.append(0)
        else:
            predicted_labels_balanced.append(-1)

    valid_predictions_balanced = [p for i, p in enumerate(predicted_labels_balanced) if p != -1]
    valid_actual_labels_balanced = [actual_labels_balanced[i] for i, p in enumerate(predicted_labels_balanced) if p != -1]

    if valid_predictions_balanced:
        print("\n--- Evaluation on Balanced Test Set (Instruction-Tuned GPT-2 - 500 Tweets, 50 Epochs - Revised) ---")
        print(classification_report(valid_actual_labels_balanced, valid_predictions_balanced))
        print(f"Accuracy: {accuracy_score(valid_actual_labels_balanced, valid_predictions_balanced)}")
        print(f"F1-Score: {f1_score(valid_actual_labels_balanced, valid_predictions_balanced)}")
    else:
        print("\n--- No valid predictions generated for balanced evaluation ---")

else:
    print("Balanced test data not found.")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene


--- Evaluation on Balanced Test Set (Instruction-Tuned GPT-2 - 500 Tweets, 50 Epochs - Revised) ---
              precision    recall  f1-score   support

           0       0.90      0.94      0.92       154
           1       0.93      0.90      0.92       156

    accuracy                           0.92       310
   macro avg       0.92      0.92      0.92       310
weighted avg       0.92      0.92      0.92       310

Accuracy: 0.9161290322580645
F1-Score: 0.9150326797385621


In [None]:
output_dir_instruction_tuned_500 = "/content/drive/MyDrive/gpt2-instruction-tuned-500"
model_causal.save_pretrained(output_dir_instruction_tuned_500)
tokenizer_gpt2.save_pretrained(output_dir_instruction_tuned_500)
print(f"\n--- Instruction-tuned model and tokenizer saved to {output_dir_instruction_tuned_500} ---")


--- Instruction-tuned model and tokenizer saved to /content/drive/MyDrive/gpt2-instruction-tuned-500 ---


In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer

# Assuming you have 'balanced_sample_500_df' loaded

if 'balanced_sample_500_df' in locals():
    # --- Perform the train-validation-test split ---
    train_df_temp, test_df_split = train_test_split(balanced_sample_500_df, test_size=0.15, random_state=42)
    train_df, val_df = train_test_split(train_df_temp, test_size=(0.15/0.85), random_state=42) # Adjust test_size to get approx. 15% validation

    print(f"Training set size: {len(train_df)}")
    print(f"Validation set size: {len(val_df)}")
    print(f"Test set size: {len(test_df_split)}")

    # --- Prepare instruction-response pairs ---
    instruction = "You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: "

    def prepare_instruction_response_pairs(df, instruction_text):
        pairs = []
        for index, row in df.iterrows():
            tweet_text = row['tweet_text']
            distress_label = row['distress']
            instruction = instruction_text + tweet_text
            response = "distress" if distress_label == 1 else "not distress"
            pairs.append({"instruction": instruction, "response": response})
        return pairs

    train_pairs = prepare_instruction_response_pairs(train_df, instruction)
    val_pairs = prepare_instruction_response_pairs(val_df, instruction)
    test_pairs = prepare_instruction_response_pairs(test_df_split, instruction)

    # --- Tokenize the data ---
    tokenizer = tokenizer_gpt2 # Assuming tokenizer_gpt2 is already loaded

    def tokenize_instruction_response_for_causal_lm(pair):
        instruction = pair['instruction']
        response = pair['response']
        combined_text = instruction + " " + response + tokenizer.eos_token
        tokenized_input = tokenizer(combined_text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
        input_ids = tokenized_input['input_ids'].squeeze()
        attention_mask = tokenized_input['attention_mask'].squeeze()
        labels = input_ids.clone()
        instruction_len = len(tokenizer(instruction, truncation=True)['input_ids'])
        labels[:instruction_len] = -100
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

    train_tokenized_data = [tokenize_instruction_response_for_causal_lm(pair) for pair in train_pairs]
    val_tokenized_data = [tokenize_instruction_response_for_causal_lm(pair) for pair in val_pairs]
    test_tokenized_data = [tokenize_instruction_response_for_causal_lm(pair) for pair in test_pairs]

    # --- Create Datasets and DataLoaders ---
    class InstructionResponseCausalLMDataset(Dataset):
        def __init__(self, tokenized_data):
            self.tokenized_data = tokenized_data

        def __len__(self):
            return len(self.tokenized_data)

        def __getitem__(self, idx):
            return self.tokenized_data[idx]

    train_dataset = InstructionResponseCausalLMDataset(train_tokenized_data)
    val_dataset = InstructionResponseCausalLMDataset(val_tokenized_data)
    test_dataset = InstructionResponseCausalLMDataset(test_tokenized_data)

    train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

    print("\nPrepared train, validation, and test datasets and dataloaders.")

else:
    print("balanced_sample_500_df not found.")

Training set size: 350
Validation set size: 75
Test set size: 75

Prepared train, validation, and test datasets and dataloaders.


In [None]:
num_epochs = 10 # Let's start with a smaller number of epochs for this smaller dataset and with validation
logging_interval = 50

model_causal.train()

print(f"\n--- Starting Instruction Fine-tuning on Training Set ({len(train_dataset)} samples) for {num_epochs} Epochs ---")

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model_causal(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        if (step + 1) % logging_interval == 0:
            avg_loss = total_loss / logging_interval
            print(f"Step {step+1}/{len(train_dataloader)}, Average Training Loss: {avg_loss:.4f}")
            total_loss = 0

    # --- Evaluate on the validation set at the end of each epoch ---
    model_causal.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model_causal(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1} Validation Loss: {avg_val_loss:.4f}")
    model_causal.train() # Set back to train mode

print("\n--- Instruction Fine-tuning Finished ---")

# Now you can evaluate on the test_dataloader using a similar evaluation loop
# and by generating responses and comparing them to the test labels.


--- Starting Instruction Fine-tuning on Training Set (350 samples) for 10 Epochs ---

Epoch 1/10
Step 50/88, Average Training Loss: 0.0014
Epoch 1 Validation Loss: 0.0002

Epoch 2/10
Step 50/88, Average Training Loss: 0.0009
Epoch 2 Validation Loss: 0.0002

Epoch 3/10
Step 50/88, Average Training Loss: 0.0007
Epoch 3 Validation Loss: 0.0002

Epoch 4/10
Step 50/88, Average Training Loss: 0.0006
Epoch 4 Validation Loss: 0.0001

Epoch 5/10
Step 50/88, Average Training Loss: 0.0020
Epoch 5 Validation Loss: 0.0001

Epoch 6/10
Step 50/88, Average Training Loss: 0.0006
Epoch 6 Validation Loss: 0.0002

Epoch 7/10
Step 50/88, Average Training Loss: 0.0008
Epoch 7 Validation Loss: 0.0002

Epoch 8/10
Step 50/88, Average Training Loss: 0.0003
Epoch 8 Validation Loss: 0.0002

Epoch 9/10
Step 50/88, Average Training Loss: 0.0009
Epoch 9 Validation Loss: 0.0002

Epoch 10/10
Step 50/88, Average Training Loss: 0.0005
Epoch 10 Validation Loss: 0.0002

--- Instruction Fine-tuning Finished ---


In [None]:
from sklearn.metrics import classification_report, accuracy_score, f1_score
import torch


def evaluate_dataloader_debug_direct(model, dataloader, tokenizer, device, dataset_name, original_df):
    model.eval()
    predicted_labels = []
    actual_labels = []
    instruction_results = []
    raw_responses = []

    print(f"\n--- Debugging Evaluation for {dataset_name} Set ---")
    count = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            output = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=10,
                num_beams=4,
                early_stopping=True
            )

            generated_responses = [tokenizer.decode(out[input_ids.shape[1]:], skip_special_tokens=True).strip().lower() for out in output]
            actual_batch_labels = [1 if label > 0 else 0 for label in labels[:, -1]]

            for i, response in enumerate(generated_responses):
                raw_responses.append(response)
                first_word = response.split()[0] if response.split() else ""
                predicted_label = -1
                if first_word == "distress":
                    predicted_label = 1
                elif first_word == "not":
                    predicted_label = 0

                if predicted_label != -1:
                    predicted_labels.append(predicted_label)
                    actual_labels.append(actual_batch_labels[i])
                    instruction_results.append({
                        "instruction": tokenizer.decode(input_ids[i], skip_special_tokens=True),
                        "generated_response": response,
                        "actual_label": "distress" if actual_batch_labels[i] == 1 else "not distress",
                        "predicted_label": "distress" if predicted_label == 1 else "not distress"
                    })

                if count < 5:
                    print(f"\n--- Example {count + 1} ({dataset_name}) ---")
                    print(f"Instruction: {tokenizer.decode(input_ids[i], skip_special_tokens=True)}")
                    print(f"Raw Generated Response: {response}")
                    print(f"Actual Label (from batch): {'distress' if actual_batch_labels[i] == 1 else 'not distress'}")
                    if original_df is not None and len(original_df) > (dataloader.batch_size * (count // dataloader.batch_size) + i):
                        print(f"Actual Label (from DataFrame): {('distress' if original_df.iloc[dataloader.batch_size * (count // dataloader.batch_size) + i]['distress'] == 1 else 'not distress') if 'distress' in original_df.columns else 'DataFrame label not available'}")
                    count += 1
            if count >= 5 and dataset_name == "Training":
                break # Only show a few examples per dataset

    model.train()
    return actual_labels, predicted_labels, instruction_results

# Re-run the debugging evaluation
evaluate_dataloader_debug_direct(model_causal, train_dataloader, tokenizer_gpt2, device, "Training", train_df)
evaluate_dataloader_debug_direct(model_causal, val_dataloader, tokenizer_gpt2, device, "Validation", val_df)
evaluate_dataloader_debug_direct(model_causal, test_dataloader, tokenizer_gpt2, device, "Test", test_df_split)

# Then, run the standard evaluation to see the metrics
train_actual, train_predicted, train_instruction_results = evaluate_dataloader(model_causal, train_dataloader, tokenizer_gpt2, device)
val_actual, val_predicted, val_instruction_results = evaluate_dataloader(model_causal, val_dataloader, tokenizer_gpt2, device)
test_actual, test_predicted, test_instruction_results = evaluate_dataloader(model_causal, test_dataloader, tokenizer_gpt2, device)

print_evaluation_results("Training", train_actual, train_predicted, train_instruction_results)
print_evaluation_results("Validation", val_actual, val_predicted, val_instruction_results)
print_evaluation_results("Test", test_actual, test_predicted, test_instruction_results)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



--- Debugging Evaluation for Training Set ---

--- Example 1 (Training) ---
Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: in the darkest days after #irma the conch republic took care of its own by not distress
Raw Generated Response: 
Actual Label (from batch): distress
Actual Label (from DataFrame): not distress

--- Example 2 (Training) ---
Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do n

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



--- Example 5 (Training) ---
Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: we have the best customersthank you for letting us help #hvac #coldair #swfl #irma distress
Raw Generated Response: 
Actual Label (from batch): distress
Actual Label (from DataFrame): not distress

--- Debugging Evaluation for Validation Set ---

--- Example 1 (Validation) ---
Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distres

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



--- Example 5 (Validation) ---
Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: elementary schools and food pantry collaborate to help puerto rico distress
Raw Generated Response: 
Actual Label (from batch): distress
Actual Label (from DataFrame): distress


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_i


--- Debugging Evaluation for Test Set ---

--- Example 1 (Test) ---
Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: jan man rescue operation underway in mexico after devastating earthquake #newsinvidsindia not distress
Raw Generated Response: 
Actual Label (from batch): distress
Actual Label (from DataFrame): not distress

--- Example 2 (Test) ---
Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. D

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



--- Example 5 (Test) ---
Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: hurricane #irma view of #guadeloupe and #lessaintes from nw tip of #dominica 530 pm not distress
Raw Generated Response: 
Actual Label (from batch): distress
Actual Label (from DataFrame): not distress


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_i


--- Evaluation Results for Training Set ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1

Accuracy: 1.0
F1-Score: 0.0

--- Sample Instruction Results for Training Set ---
Instruction: You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: #cbsthismorning #teresamay #jose #maria #anncoulter #miamibitch #weatherbitch #irma whats with
Generated Response: not distress
Actual Label: not distress
Predicted Lab

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer

# Assuming you have 'balanced_sample_500_df' loaded (we might need to load it again if the reset was extensive)
if 'balanced_sample_500_df' not in locals():
    balanced_sample_500_df = pd.read_csv('/content/drive/MyDrive/balanced_sample_500.csv')
    print("balanced_sample_500_df loaded.")

if 'balanced_sample_500_df' in locals():
    # --- Perform the train-validation-test split ---
    train_df_temp, test_df_split = train_test_split(balanced_sample_500_df, test_size=0.15, random_state=42)
    train_df, val_df = train_test_split(train_df_temp, test_size=(0.15/0.85), random_state=42) # Adjust test_size to get approx. 15% validation

    print(f"Training set size: {len(train_df)}")
    print(f"Validation set size: {len(val_df)}")
    print(f"Test set size: {len(test_df_split)}")

    # --- Prepare instruction-response pairs ---
    instruction = "You are an emergency detection system. Determine if the following tweet clearly indicates that someone is in danger, trapped, injured, requesting help, or facing a life-threatening emergency. Only respond with one word: 'distress' or 'not distress'. Do not explain. Tweets with emotional tone are not enough - only choose 'distress' if there's a clear, urgent need for help. If unsure, default to 'not distress'. Tweet: "

    def prepare_instruction_response_pairs(df, instruction_text):
        pairs = []
        for index, row in df.iterrows():
            tweet_text = row['tweet_text']
            distress_label = row['distress']
            instruction = instruction_text + tweet_text
            response = "distress" if distress_label == 1 else "not distress"
            pairs.append({"instruction": instruction, "response": response})
        return pairs

    train_pairs = prepare_instruction_response_pairs(train_df, instruction)
    val_pairs = prepare_instruction_response_pairs(val_df, instruction)
    test_pairs = prepare_instruction_response_pairs(test_df_split, instruction)

    # --- Tokenize the data ---
    tokenizer = tokenizer_gpt2 # Assuming tokenizer_gpt2 is now loaded

    def tokenize_instruction_response_for_causal_lm(pair):
        instruction = pair['instruction']
        response = pair['response']
        combined_text = instruction + " " + response + tokenizer.eos_token
        tokenized_input = tokenizer(combined_text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
        input_ids = tokenized_input['input_ids'].squeeze()
        attention_mask = tokenized_input['attention_mask'].squeeze()
        labels = input_ids.clone()
        instruction_len = len(tokenizer(instruction, truncation=True)['input_ids'])
        labels[:instruction_len] = -100
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

    train_tokenized_data = [tokenize_instruction_response_for_causal_lm(pair) for pair in train_pairs]
    val_tokenized_data = [tokenize_instruction_response_for_causal_lm(pair) for pair in val_pairs]
    test_tokenized_data = [tokenize_instruction_response_for_causal_lm(pair) for pair in test_pairs]

    # --- Create Datasets and DataLoaders ---
    class InstructionResponseCausalLMDataset(Dataset):
        def __init__(self, tokenized_data):
            self.tokenized_data = tokenized_data

        def __len__(self):
            return len(self.tokenized_data)

        def __getitem__(self, idx):
            return self.tokenized_data[idx]

    train_dataset = InstructionResponseCausalLMDataset(train_tokenized_data)
    val_dataset = InstructionResponseCausalLMDataset(val_tokenized_data)
    test_dataset = InstructionResponseCausalLMDataset(test_tokenized_data)

    train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

    print("\nPrepared train, validation, and test datasets and dataloaders.")

else:
    print("balanced_sample_500_df not found.")

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/balanced_sample_500.csv'

restarting

In [None]:
import pandas as pd

# Replace with the actual path to your dataset
file_path = '/content/drive/MyDrive/ground_truth_dataset_with_wildfire.csv'

try:
    df = pd.read_csv(file_path)
    print("First 20 rows of the dataset:")
    print(df.head(20))
except FileNotFoundError:
    print(f"Error: File not found at '{file_path}'. Please check the path.")

First 20 rows of the dataset:
        tweet_id                  image_id  \
0   9.177910e+17  917791044158185473_0.jpg   
1   9.177911e+17  917791130590183424_0.jpg   
2   9.177913e+17  917791291823591425_0.jpg   
3   9.177913e+17  917791291823591425_1.jpg   
4   9.177921e+17  917792092100988929_0.jpg   
5   9.177921e+17  917792147700465664_0.jpg   
6   9.177929e+17  917792930315821057_0.jpg   
7   9.177931e+17  917793137925459968_0.jpg   
8   9.177931e+17  917793137925459968_1.jpg   
9   9.177931e+17  917793137925459968_2.jpg   
10  9.177932e+17  917793158251077632_0.jpg   
11  9.177937e+17  917793736918216706_0.jpg   
12  9.177939e+17  917793881533571073_0.jpg   
13  9.177940e+17  917794024173563904_0.jpg   
14  9.177942e+17  917794232160661505_0.jpg   
15  9.177944e+17  917794360581869569_0.jpg   
16  9.177946e+17  917794580728295424_0.jpg   
17  9.177949e+17  917794892113498113_0.jpg   
18  9.177951e+17  917795098523512962_0.jpg   
19  9.177952e+17  917795236595863552_0.jpg   

   

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Load your wildfire dataset
try:
    df = pd.read_csv('/content/drive/MyDrive/ground_truth_dataset_with_wildfire.csv') # Replace with your actual path
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: Dataset file not found. Please check the path.")
    df = None

if df is not None and 'distress' in df.columns:
    # Check class balance for 'distress'
    distress_counts = df['distress'].value_counts()
    print("\nDistress Class Distribution:\n", distress_counts)

    # --- Balance the dataset if needed ---
    if distress_counts.iloc[0] != distress_counts.iloc[1]:
        major_class = df[df['distress'] == distress_counts.idxmax()]
        minor_class = df[df['distress'] == distress_counts.idxmin()]
        minor_upsampled = resample(minor_class, replace=True, n_samples=len(major_class), random_state=42)
        balanced_df = pd.concat([major_class, minor_upsampled])
        balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True) # Shuffle
        print("\nDataset balanced based on 'distress' column.")
        df_to_split = balanced_df
    else:
        print("\nDataset is already balanced based on 'distress' column.")
        df_to_split = df

    # --- Stratified and Balanced Data Splitting ---
    # 1. Sample (50 count) - stratified and balanced
    sample_df = df_to_split.groupby('distress', group_keys=False).apply(lambda x: x.sample(min(len(x), 25), random_state=42))
    remaining_df = df_to_split.drop(sample_df.index)

    # 2. Training (80% of remaining) - stratified and balanced
    train_df, temp_df = train_test_split(remaining_df, test_size=0.2, stratify=remaining_df['distress'], random_state=42)

    # 3. Validation (10% of remaining) - stratified
    val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['distress'], random_state=42)

    print("\nData Split Sizes:")
    print(f"Sample set size: {len(sample_df)}")
    print(f"Training set size: {len(train_df)}")
    print(f"Validation set size: {len(val_df)}")
    print(f"Testing set size: {len(test_df)}")

    # Save the splits
    sample_df.to_csv('/content/drive/MyDrive/wildfire_sample_50.csv', index=False)
    train_df.to_csv('/content/drive/MyDrive/wildfire_train.csv', index=False)
    val_df.to_csv('/content/drive/MyDrive/wildfire_val.csv', index=False)
    test_df.to_csv('/content/drive/MyDrive/wildfire_test.csv', index=False)
    print("\nData splits saved to Google Drive.")

else:
    print("Data loading failed or 'distress' column not found.")

Dataset loaded successfully.

Distress Class Distribution:
 distress
0    16100
1     1982
Name: count, dtype: int64

Dataset balanced based on 'distress' column.

Data Split Sizes:
Sample set size: 50
Training set size: 25720
Validation set size: 3215
Testing set size: 3215


  sample_df = df_to_split.groupby('distress', group_keys=False).apply(lambda x: x.sample(min(len(x), 25), random_state=42))



Data splits saved to Google Drive.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5TokenizerFast

# Load the training dataset
try:
    train_df = pd.read_csv('/content/drive/MyDrive/wildfire_train.csv')
    print("Training dataset loaded successfully.")
except FileNotFoundError:
    print("Error: Training dataset file not found. Please check the path.")
    train_df = None

if train_df is not None:
    # Drop rows with NaN in 'take_action' as T5 needs a target
    t5_train_df = train_df.dropna(subset=['take_action']).copy()
    t5_train_df['input_text'] = "What action should be taken for this tweet? " + t5_train_df['tweet_text']
    t5_train_df['target_text'] = t5_train_df['take_action']
    t5_train_df = t5_train_df[['input_text', 'target_text']]

    # Load the T5 tokenizer
    tokenizer_t5 = T5TokenizerFast.from_pretrained('t5-small')

    print("\nPrepared training data for T5.")
    print(f"Number of training examples for T5: {len(t5_train_df)}")

    # We'll do the same for the validation set to evaluate T5 later
    try:
        val_df = pd.read_csv('/content/drive/MyDrive/wildfire_val.csv')
        t5_val_df = val_df.dropna(subset=['take_action']).copy()
        t5_val_df['input_text'] = "What action should be taken for this tweet? " + t5_val_df['tweet_text']
        t5_val_df['target_text'] = t5_val_df['take_action']
        t5_val_df = t5_val_df[['input_text', 'target_text']]
        print(f"Number of validation examples for T5: {len(t5_val_df)}")
    except FileNotFoundError:
        print("Error: Validation dataset file not found.")
        t5_val_df = None

else:
    print("Training data not available, cannot prepare for T5.")

Training dataset loaded successfully.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]


Prepared training data for T5.
Number of training examples for T5: 12860
Number of validation examples for T5: 1608


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, get_scheduler
from torch.optim import AdamW  # Correct import for AdamW
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

# Load the T5 model
model_t5 = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

# Define a T5 Dataset class
class T5WildfireActionDataset(Dataset):
    def __init__(self, dataframe, tokenizer, source_len=128, target_len=32):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.source_len = source_len
        self.target_len = target_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data.iloc[idx]['input_text']
        target_text = self.data.iloc[idx]['target_text']

        source = self.tokenizer(
            [input_text],
            max_length=self.source_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        target = self.tokenizer(
            [target_text],
            max_length=self.target_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': source['input_ids'].flatten(),
            'attention_mask': source['attention_mask'].flatten(),
            'labels': target['input_ids'].flatten()
        }

# Create T5 Datasets and DataLoaders
train_t5_dataset = T5WildfireActionDataset(t5_train_df, tokenizer_t5)
val_t5_dataset = T5WildfireActionDataset(t5_val_df, tokenizer_t5)

train_t5_dataloader = DataLoader(train_t5_dataset, batch_size=16, shuffle=True)
val_t5_dataloader = DataLoader(val_t5_dataset, batch_size=16)

# Optimizer and Scheduler
optimizer_t5 = AdamW(model_t5.parameters(), lr=3e-5, weight_decay=0.01) # Use torch.optim.AdamW
num_epochs_t5 = 5
num_training_steps_t5 = len(train_t5_dataloader) * num_epochs_t5
lr_scheduler_t5 = get_scheduler("linear", optimizer=optimizer_t5, num_warmup_steps=0, num_training_steps=num_training_steps_t5)

# Callback for Early Stopping
class EarlyStoppingCallback:
    def __init__(self, patience=3, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')
        self.early_stop = False

    def __call__(self, val_loss):
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

early_stopping = EarlyStoppingCallback(patience=3)

# Training Loop for T5
print("\n--- T5 Action Response Fine-tuning Started ---")
for epoch in range(num_epochs_t5):
    model_t5.train()
    total_loss = 0
    for batch in tqdm(train_t5_dataloader, desc=f"Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_t5(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer_t5.step()
        lr_scheduler_t5.step()
        optimizer_t5.zero_grad()

    avg_train_loss = total_loss / len(train_t5_dataloader)
    print(f"Epoch {epoch+1}, Average Training Loss: {avg_train_loss:.4f}")

    # Evaluate on Validation Set
    model_t5.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_t5_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model_t5(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_t5_dataloader)
    print(f"Epoch {epoch+1}, Average Validation Loss: {avg_val_loss:.4f}")

    # Early Stopping Check
    early_stopping(avg_val_loss)
    if early_stopping.early_stop:
        print(f"Early stopping triggered at epoch {epoch+1}")
        break

print("\n--- T5 Action Response Fine-tuning Finished ---")

# Save the trained T5 model
model_t5.save_pretrained('/content/drive/MyDrive/t5_wildfire_actions')
tokenizer_t5.save_pretrained('/content/drive/MyDrive/t5_wildfire_actions')
print("\nTrained T5 model saved to Google Drive.")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


--- T5 Action Response Fine-tuning Started ---


Epoch 1:   0%|          | 0/804 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Epoch 1: 100%|██████████| 804/804 [00:54<00:00, 14.73it/s]


Epoch 1, Average Training Loss: 0.9641
Epoch 1, Average Validation Loss: 0.0819


Epoch 2: 100%|██████████| 804/804 [00:52<00:00, 15.19it/s]


Epoch 2, Average Training Loss: 0.0584
Epoch 2, Average Validation Loss: 0.0265


Epoch 3: 100%|██████████| 804/804 [00:52<00:00, 15.27it/s]


Epoch 3, Average Training Loss: 0.0349
Epoch 3, Average Validation Loss: 0.0225


Epoch 4: 100%|██████████| 804/804 [00:53<00:00, 15.14it/s]


Epoch 4, Average Training Loss: 0.0299
Epoch 4, Average Validation Loss: 0.0215


Epoch 5: 100%|██████████| 804/804 [00:52<00:00, 15.24it/s]


Epoch 5, Average Training Loss: 0.0280
Epoch 5, Average Validation Loss: 0.0205

--- T5 Action Response Fine-tuning Finished ---

Trained T5 model saved to Google Drive.


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5TokenizerFast
from tqdm import tqdm

# Load the trained T5 model and tokenizer
tokenizer_t5 = T5TokenizerFast.from_pretrained('/content/drive/MyDrive/t5_wildfire_actions')
model_t5 = T5ForConditionalGeneration.from_pretrained('/content/drive/MyDrive/t5_wildfire_actions').to(device)
model_t5.eval()

def generate_action(tweet_text):
    input_text = f"What action should be taken for this tweet? {tweet_text}"
    input_ids = tokenizer_t5.encode(input_text, return_tensors="pt", max_length=128, truncation=True).to(device)
    outputs = model_t5.generate(input_ids=input_ids, max_length=32, num_beams=4, early_stopping=True)
    predicted_action = tokenizer_t5.decode(outputs[0], skip_special_tokens=True)
    return predicted_action

# Define the T5 Dataset for the test set (moved here)
class T5WildfireActionTestDataset(Dataset):
    def __init__(self, dataframe, tokenizer, source_len=128, target_len=32):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.source_len = source_len
        self.target_len = target_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data.iloc[idx]['input_text']
        target_text = self.data.iloc[idx]['target_text']

        source = self.tokenizer(
            [input_text],
            max_length=self.source_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        target = self.tokenizer(
            [target_text],
            max_length=self.target_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': source['input_ids'].flatten(),
            'attention_mask': source['attention_mask'].flatten(),
            'labels': target['input_ids'].flatten()
        }

# Load the test dataset and create the test dataset
try:
    test_df = pd.read_csv('/content/drive/MyDrive/wildfire_test.csv')
    t5_test_df = test_df.dropna(subset=['take_action']).copy()
    t5_test_df['input_text'] = "What action should be taken for this tweet? " + t5_test_df['tweet_text']
    t5_test_df['target_text'] = t5_test_df['take_action']
    t5_test_df = t5_test_df[['input_text', 'target_text']]
    test_t5_dataset = T5WildfireActionTestDataset(t5_test_df, tokenizer_t5)
    print("Test dataset loaded and prepared.")
except FileNotFoundError:
    print("Error: Test dataset file not found. Please check the path.")
    test_t5_dataset = None

def evaluate_and_print_examples(dataloader, dataset_name, num_examples=5):
    print(f"\n--- Examples from {dataset_name} Set ---")
    examples_printed = 0
    for batch in dataloader:
        input_texts = [tokenizer_t5.decode(ids, skip_special_tokens=True) for ids in batch['input_ids']]
        actual_actions = [tokenizer_t5.decode(ids, skip_special_tokens=True) for ids in batch['labels']]

        for i in range(len(input_texts)):
            tweet_text = input_texts[i].replace("What action should be taken for this tweet? ", "")
            predicted_action = generate_action(tweet_text)
            print(f"\nExample {examples_printed + 1}:")
            print(f"  Tweet: {tweet_text}")
            print(f"  Actual Action: {actual_actions[i]}")
            print(f"  Predicted Action: {predicted_action}")
            examples_printed += 1
            if examples_printed >= num_examples:
                break
        if examples_printed >= num_examples:
            break

# Create DataLoaders for train, val, and test sets
train_t5_dataloader_eval = DataLoader(train_t5_dataset, batch_size=4, shuffle=False)
val_t5_dataloader_eval = DataLoader(val_t5_dataset, batch_size=4, shuffle=False)
test_t5_dataloader_eval = DataLoader(test_t5_dataset, batch_size=4, shuffle=False)

# Print average losses (from previous runs)
print("\n--- Average Losses ---")
print(f"Average Training Loss (from epoch 5): 0.0280")
print(f"Average Validation Loss (from epoch 5): 0.0205")
print(f"Average Test Loss: 0.0205") # Using the value from the previous test run

# Print examples from each set
evaluate_and_print_examples(train_t5_dataloader_eval, "Training")
evaluate_and_print_examples(val_t5_dataloader_eval, "Validation")
evaluate_and_print_examples(test_t5_dataloader_eval, "Test")

Test dataset loaded and prepared.

--- Average Losses ---
Average Training Loss (from epoch 5): 0.0280
Average Validation Loss (from epoch 5): 0.0205
Average Test Loss: 0.0205

--- Examples from Training Set ---

Example 1:
  Tweet: first pics from # dominica 92since they lost contact due to hurricane maria
  Actual Action: start missing person search
  Predicted Action: monitor situation

Example 2:
  Tweet: videos #dod assets help #caribbean region ravaged by #hurricanemaria
  Actual Action: monitor situation
  Predicted Action: monitor situation

Example 3:
  Tweet: its starting to make more sense why harvey and irma were an organized test run planned evacuations
  Actual Action: send evacuation and shelter support
  Predicted Action: monitor situation

Example 4:
  Tweet: halifax researchers using hurricane harvey irma as they work to understand massevacuations
  Actual Action: send evacuation and shelter support
  Predicted Action: monitor situation

Example 5:
  Tweet: statesman 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5TokenizerFast

# Load the original training dataset
try:
    train_df_original = pd.read_csv('/content/drive/MyDrive/wildfire_train.csv')
    print("Original training dataset loaded successfully.")
except FileNotFoundError:
    print("Error: Original training dataset file not found. Please check the path.")
    train_df_original = None

if train_df_original is not None:
    # Filter out rows where 'take_action' is NaN for the T5 task
    t5_train_df_filtered = train_df_original.dropna(subset=['take_action']).copy()
    t5_train_df_filtered['input_text'] = "What action should be taken for this tweet? " + t5_train_df_filtered['tweet_text']
    t5_train_df_filtered['target_text'] = t5_train_df_filtered['take_action']
    t5_train_df_filtered = t5_train_df_filtered[['input_text', 'target_text']]

    # Load the T5 tokenizer
    tokenizer_t5 = T5TokenizerFast.from_pretrained('t5-small')

    print("\nPrepared filtered training data for T5 (excluding NaN actions).")
    print(f"Number of filtered training examples for T5: {len(t5_train_df_filtered)}")

    # Do the same for the validation set
    try:
        val_df_original = pd.read_csv('/content/drive/MyDrive/wildfire_val.csv')
        t5_val_df_filtered = val_df_original.dropna(subset=['take_action']).copy()
        t5_val_df_filtered['input_text'] = "What action should be taken for this tweet? " + t5_val_df_filtered['tweet_text']
        t5_val_df_filtered['target_text'] = t5_val_df_filtered['take_action']
        t5_val_df_filtered = t5_val_df_filtered[['input_text', 'target_text']]
        print(f"Number of filtered validation examples for T5: {len(t5_val_df_filtered)}")
    except FileNotFoundError:
        print("Error: Validation dataset file not found.")
        t5_val_df_filtered = None

else:
    print("Original training data not available.")

Original training dataset loaded successfully.

Prepared filtered training data for T5 (excluding NaN actions).
Number of filtered training examples for T5: 12860
Number of filtered validation examples for T5: 1608


In [None]:
# Calculate the value counts of 'target_text' (which is 'take_action') in the filtered training data
action_counts = t5_train_df_filtered['target_text'].value_counts()
print("\nAction Value Counts in Filtered Training Data:")
print(action_counts)

# Calculate weights for each action
total_samples = len(t5_train_df_filtered)
class_weights = {}
for action, count in action_counts.items():
    weight = total_samples / (len(action_counts) * count)
    class_weights[action] = weight

print("\nCalculated Class Weights:")
print(class_weights)


Action Value Counts in Filtered Training Data:
target_text
monitor situation                      10064
start missing person search              961
send evacuation and shelter support      779
send rescue team                         377
send security and trauma support         356
send medical team                        203
send immediate help                      120
Name: count, dtype: int64

Calculated Class Weights:
{'monitor situation': 0.1825459913695208, 'start missing person search': 1.9116991229374163, 'send evacuation and shelter support': 2.358334861544104, 'send rescue team': 4.873057976506252, 'send security and trauma support': 5.160513643659711, 'send medical team': 9.049964813511611, 'send immediate help': 15.30952380952381}


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, get_scheduler
from torch.optim import AdamW  # Correct import for AdamW
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

# Load the T5 model
model_t5 = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

# Optimizer and Scheduler
optimizer_t5 = AdamW(model_t5.parameters(), lr=3e-5, weight_decay=0.01) # Added weight decay for regularization
num_epochs_t5 = 5
num_training_steps_t5 = len(train_t5_dataloader_upsampled) * num_epochs_t5 # Use the upsampled dataloader length
lr_scheduler_t5 = get_scheduler("linear", optimizer=optimizer_t5, num_warmup_steps=0, num_training_steps=num_training_steps_t5)

# Callback for Early Stopping
class EarlyStoppingCallback:
    def __init__(self, patience=3, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')
        self.early_stop = False

    def __call__(self, val_loss):
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

early_stopping = EarlyStoppingCallback(patience=3)

# Training Loop for T5 (using the upsampled dataloader)
print("\n--- T5 Action Response Fine-tuning Started (with Upsampled Data) ---")
for epoch in range(num_epochs_t5):
    model_t5.train()
    total_loss = 0
    for batch in tqdm(train_t5_dataloader_upsampled, desc=f"Epoch {epoch+1}"): # Use train_t5_dataloader_upsampled
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_t5(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer_t5.step()
        lr_scheduler_t5.step()
        optimizer_t5.zero_grad()

    avg_train_loss = total_loss / len(train_t5_dataloader_upsampled)
    print(f"Epoch {epoch+1}, Average Training Loss: {avg_train_loss:.4f}")

    # Evaluate on Validation Set
    model_t5.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_t5_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model_t5(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_t5_dataloader)
    print(f"Epoch {epoch+1}, Average Validation Loss: {avg_val_loss:.4f}")

    # Early Stopping Check
    early_stopping(avg_val_loss)
    if early_stopping.early_stop:
        print(f"Early stopping triggered at epoch {epoch+1}")
        break

print("\n--- T5 Action Response Fine-tuning Finished (with Upsampled Data) ---")

# Save the trained T5 model (you might want to save it with a new name to distinguish it)
model_t5.save_pretrained('/content/drive/MyDrive/t5_wildfire_actions_upsampled')
tokenizer_t5.save_pretrained('/content/drive/MyDrive/t5_wildfire_actions_upsampled')
print("\nTrained T5 model (with upsampled data) saved to Google Drive.")


--- T5 Action Response Fine-tuning Started (with Upsampled Data) ---


Epoch 1: 100%|██████████| 1258/1258 [01:24<00:00, 14.83it/s]


Epoch 1, Average Training Loss: 0.7033
Epoch 1, Average Validation Loss: 0.0241


Epoch 2: 100%|██████████| 1258/1258 [01:24<00:00, 14.92it/s]


Epoch 2, Average Training Loss: 0.0459
Epoch 2, Average Validation Loss: 0.0198


Epoch 3: 100%|██████████| 1258/1258 [01:24<00:00, 14.94it/s]


Epoch 3, Average Training Loss: 0.0337
Epoch 3, Average Validation Loss: 0.0131


Epoch 4: 100%|██████████| 1258/1258 [01:23<00:00, 15.00it/s]


Epoch 4, Average Training Loss: 0.0212
Epoch 4, Average Validation Loss: 0.0080


Epoch 5: 100%|██████████| 1258/1258 [01:24<00:00, 14.97it/s]


Epoch 5, Average Training Loss: 0.0163
Epoch 5, Average Validation Loss: 0.0071

--- T5 Action Response Fine-tuning Finished (with Upsampled Data) ---

Trained T5 model (with upsampled data) saved to Google Drive.


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5TokenizerFast
from tqdm import tqdm

# Load the test dataset
try:
    test_df = pd.read_csv('/content/drive/MyDrive/wildfire_test.csv')
    print("Test dataset loaded successfully.")
except FileNotFoundError:
    print("Error: Test dataset file not found. Please check the path.")
    test_df = None

if test_df is not None:
    # Prepare test data for T5
    t5_test_df = test_df.dropna(subset=['take_action']).copy()
    t5_test_df['input_text'] = "What action should be taken for this tweet? " + t5_test_df['tweet_text']
    t5_test_df['target_text'] = t5_test_df['take_action']
    t5_test_df = t5_test_df[['input_text', 'target_text']]

    # Load the T5 tokenizer (upsampled model)
    tokenizer_t5 = T5TokenizerFast.from_pretrained('/content/drive/MyDrive/t5_wildfire_actions_upsampled')

    # Define the T5 Dataset for the test set
    class T5WildfireActionTestDataset(Dataset):
        def __init__(self, dataframe, tokenizer, source_len=128, target_len=32):
            self.data = dataframe
            self.tokenizer = tokenizer
            self.source_len = source_len
            self.target_len = target_len

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            input_text = self.data.iloc[idx]['input_text']
            target_text = self.data.iloc[idx]['target_text']

            source = self.tokenizer(
                [input_text],
                max_length=self.source_len,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            target = self.tokenizer(
                [target_text],
                max_length=self.target_len,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )

            return {
                'input_ids': source['input_ids'].flatten(),
                'attention_mask': source['attention_mask'].flatten(),
                'labels': target['input_ids'].flatten()
            }

    # Create T5 Test DataLoader
    test_t5_dataset = T5WildfireActionTestDataset(t5_test_df, tokenizer_t5)
    test_t5_dataloader = DataLoader(test_t5_dataset, batch_size=16)

    # Load the trained T5 model (upsampled model)
    model_t5_upsampled = T5ForConditionalGeneration.from_pretrained('/content/drive/MyDrive/t5_wildfire_actions_upsampled').to(device)
    model_t5_upsampled.eval()
    test_loss = 0
    with torch.no_grad():
        for batch in tqdm(test_t5_dataloader, desc="Evaluating on Test Set (Upsampled Model)"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model_t5_upsampled(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            test_loss += loss.item()

    avg_test_loss = test_loss / len(test_t5_dataloader)
    print(f"\nAverage Test Loss (Upsampled Model): {avg_test_loss:.4f}")

else:
    print("Test data not available, cannot evaluate.")

Test dataset loaded successfully.


Evaluating on Test Set (Upsampled Model): 100%|██████████| 101/101 [00:03<00:00, 33.41it/s]


Average Test Loss (Upsampled Model): 0.0068





In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5TokenizerFast
from tqdm import tqdm

# Load the trained T5 model and tokenizer (upsampled)
tokenizer_t5 = T5TokenizerFast.from_pretrained('/content/drive/MyDrive/t5_wildfire_actions_upsampled')
model_t5_upsampled = T5ForConditionalGeneration.from_pretrained('/content/drive/MyDrive/t5_wildfire_actions_upsampled').to(device)
model_t5_upsampled.eval()

def generate_action(tweet_text):
    input_text = f"What action should be taken for this tweet? {tweet_text}"
    input_ids = tokenizer_t5.encode(input_text, return_tensors="pt", max_length=128, truncation=True).to(device)
    outputs = model_t5_upsampled.generate(input_ids=input_ids, max_length=32, num_beams=4, early_stopping=True)
    predicted_action = tokenizer_t5.decode(outputs[0], skip_special_tokens=True)
    return predicted_action

# Load the datasets
try:
    train_df = pd.read_csv('/content/drive/MyDrive/wildfire_train.csv').dropna(subset=['take_action'])
    val_df = pd.read_csv('/content/drive/MyDrive/wildfire_val.csv').dropna(subset=['take_action'])
    test_df = pd.read_csv('/content/drive/MyDrive/wildfire_test.csv').dropna(subset=['take_action'])
except FileNotFoundError:
    print("Error: One or more dataset files not found.")
    train_df = val_df = test_df = None

if train_df is not None and val_df is not None and test_df is not None:
    def evaluate_and_print_examples(dataframe, dataset_name, num_examples=5):
        print(f"\n--- Examples from {dataset_name} Set (Upsampled Model) ---")
        for index, row in dataframe.sample(num_examples, random_state=42).iterrows():
            tweet_text = row['tweet_text']
            actual_action = row['take_action']
            predicted_action = generate_action(tweet_text)
            print(f"\nExample:")
            print(f"  Tweet: {tweet_text}")
            print(f"  Actual Action: {actual_action}")
            print(f"  Predicted Action: {predicted_action}")

    evaluate_and_print_examples(train_df, "Training")
    evaluate_and_print_examples(val_df, "Validation")
    evaluate_and_print_examples(test_df, "Test")

else:
    print("Could not load datasets for generating examples.")


--- Examples from Training Set (Upsampled Model) ---

Example:
  Tweet: tesla shows off renewable energy project at childrens hospital in puerto rico #mashabletech
  Actual Action: monitor situation
  Predicted Action: monitor situation

Example:
  Tweet: irans deadly earthquake toll rises to 600 killed 9388 injured in #kermanshah amp #serpolrzehab kurdistan
  Actual Action: send medical team
  Predicted Action: send medical team

Example:
  Tweet: mt #scihelptx database is available to scientists affected by #irma amp #jose
  Actual Action: monitor situation
  Predicted Action: monitor situation

Example:
  Tweet: mayor turner asks volunteers to track hours helping in harvey relief efforts
  Actual Action: monitor situation
  Predicted Action: monitor situation

Example:
  Tweet: helping return 500 beaumont area harvey evacuees from dallas back to southeast texas today
  Actual Action: monitor situation
  Predicted Action: send evacuation and shelter support

--- Examples from Valida

In [None]:
import pandas as pd

# Load the training dataset (with non-NaN 'take_action')
try:
    train_df_filtered = pd.read_csv('/content/drive/MyDrive/wildfire_train.csv').dropna(subset=['take_action'])
    print("Filtered training dataset loaded successfully.")
except FileNotFoundError:
    print("Error: Filtered training dataset not found.")
    train_df_filtered = None

if train_df_filtered is not None:
    unique_actions = train_df_filtered['take_action'].unique()
    print("\nUnique Actions in Training Data:")
    for action in unique_actions:
        print(f"- {action}")
else:
    print("Training data not available.")

Filtered training dataset loaded successfully.

Unique Actions in Training Data:
- start missing person search
- monitor situation
- send evacuation and shelter support
- send rescue team
- send security and trauma support
- send medical team
- send immediate help


In [None]:
import torch
import torch.nn as nn
from transformers import DistilBertModel
from torch.optim import AdamW  # Correct import for AdamW
from tqdm import tqdm

# Define the action categories
action_categories = ['start missing person search', 'monitor situation', 'send evacuation and shelter support',
                     'send rescue team', 'send security and trauma support', 'send medical team',
                     'send immediate help']
num_labels = len(action_categories)

# Load the pre-trained DistilBERT model
model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)

# Unfreeze all layers for fine-tuning
for param in model.parameters():
    param.requires_grad = True

# Add a multi-label classification layer on top
class MultiLabelClassifier(nn.Module):
    def __init__(self, base_model, num_labels, dropout=0.1):
        super(MultiLabelClassifier, self).__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(base_model.config.hidden_size, num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        probabilities = self.sigmoid(logits)
        return probabilities

model_classifier = MultiLabelClassifier(model, num_labels).to(device)

# Optimizer with a smaller learning rate
optimizer = AdamW(model_classifier.parameters(), lr=1e-5, weight_decay=0.01)

# Loss function (Binary Cross-Entropy)
criterion = nn.BCELoss()

# Training loop (assuming train_dataloader and val_dataloader are already defined)
num_epochs = 5
total_steps = len(train_dataloader) * num_epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1) # Example scheduler

print("\n--- Multi-Label Classification Training Started (Unfrozen Layers, Smaller LR) ---")
for epoch in range(num_epochs):
    model_classifier.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_classifier(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}, Average Training Loss: {avg_train_loss:.4f}")

    # Evaluation on validation set
    model_classifier.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model_classifier(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Average Validation Loss: {avg_val_loss:.4f}")

print("\n--- Multi-Label Classification Training Finished (Unfrozen Layers, Smaller LR) ---")

# Save the trained model
torch.save(model_classifier.state_dict(), '/content/drive/MyDrive/distilbert_multi_label_unfrozen.pth')
print("\nTrained multi-label model (unfrozen, smaller LR) saved to Google Drive.")


--- Multi-Label Classification Training Started (Unfrozen Layers, Smaller LR) ---


Epoch 1: 100%|██████████| 804/804 [00:17<00:00, 47.19it/s]


Epoch 1, Average Training Loss: 0.6828
Epoch 1, Average Validation Loss: 0.6813


Epoch 2: 100%|██████████| 804/804 [00:16<00:00, 47.39it/s]


Epoch 2, Average Training Loss: 0.6828
Epoch 2, Average Validation Loss: 0.6813


Epoch 3: 100%|██████████| 804/804 [00:17<00:00, 47.28it/s]


Epoch 3, Average Training Loss: 0.6825
Epoch 3, Average Validation Loss: 0.6813


Epoch 4: 100%|██████████| 804/804 [00:17<00:00, 47.17it/s]


Epoch 4, Average Training Loss: 0.6826
Epoch 4, Average Validation Loss: 0.6813


Epoch 5: 100%|██████████| 804/804 [00:16<00:00, 47.49it/s]


Epoch 5, Average Training Loss: 0.6826
Epoch 5, Average Validation Loss: 0.6813

--- Multi-Label Classification Training Finished (Unfrozen Layers, Smaller LR) ---

Trained multi-label model (unfrozen, smaller LR) saved to Google Drive.


In [None]:
import pandas as pd
from transformers import DistilBertTokenizerFast
import torch
from torch.utils.data import Dataset, DataLoader

# Define the action categories, including the combined one
action_categories = ['start missing person search', 'monitor situation', 'send evacuation and shelter support',
                     'send rescue team', 'send security and trauma support', 'send medical team',
                     'send immediate help']

def create_multi_label(df, categories):
    labels = []
    for index, row in df.iterrows():
        row_labels = [0.0] * len(categories)
        actions = [act.strip() for act in row['take_action'].split(' and ')]
        for i, cat in enumerate(categories):
            if cat in actions:
                row_labels[i] = 1.0
        labels.append(row_labels)
    return labels

# Load the datasets (with non-NaN 'take_action')
try:
    train_df = pd.read_csv('/content/drive/MyDrive/wildfire_train.csv').dropna(subset=['take_action']).reset_index(drop=True)
    val_df = pd.read_csv('/content/drive/MyDrive/wildfire_val.csv').dropna(subset=['take_action']).reset_index(drop=True)
    test_df = pd.read_csv('/content/drive/MyDrive/wildfire_test.csv').dropna(subset=['take_action']).reset_index(drop=True)
    print("Datasets loaded successfully.")
except FileNotFoundError:
    print("Error: One or more dataset files not found.")
    train_df = val_df = test_df = None

if train_df is not None:
    # Create multi-hot labels
    train_labels = create_multi_label(train_df, action_categories)
    val_labels = create_multi_label(val_df, action_categories)
    test_labels = create_multi_label(test_df, action_categories)

    # Tokenize the text using DistilBERT
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    train_encodings = tokenizer(train_df['tweet_text'].tolist(), truncation=True, padding=True)
    val_encodings = tokenizer(val_df['tweet_text'].tolist(), truncation=True, padding=True)
    test_encodings = tokenizer(test_df['tweet_text'].tolist(), truncation=True, padding=True)

    # Create PyTorch Datasets
    class WildfireActionDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
            return item

        def __len__(self):
            return len(self.labels)

    train_dataset = WildfireActionDataset(train_encodings, train_labels)
    val_dataset = WildfireActionDataset(val_encodings, val_labels)
    test_dataset = WildfireActionDataset(test_encodings, test_labels)

    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=16)
    test_dataloader = DataLoader(test_dataset, batch_size=16)

    print("\nData preprocessed for multi-label classification (handling combined actions).")
    print(f"Number of training examples: {len(train_dataset)}")
    print(f"Number of validation examples: {len(val_dataset)}")
    print(f"Number of testing examples: {len(test_dataset)}")

else:
    print("Could not proceed with multi-label preprocessing.")

Datasets loaded successfully.

Data preprocessed for multi-label classification (handling combined actions).
Number of training examples: 12860
Number of validation examples: 1608
Number of testing examples: 1607


In [None]:
import pandas as pd

# Define the action categories
action_categories = ['start missing person search', 'monitor situation', 'send evacuation and shelter support',
                     'send rescue team', 'send security and trauma support', 'send medical team',
                     'send immediate help']

def create_multi_label(df, categories):
    labels = []
    for index, row in df.iterrows():
        row_labels = [0.0] * len(categories)
        actions = [act.strip() for act in row['take_action'].split(' and ')]
        for i, cat in enumerate(categories):
            if cat in actions:
                row_labels[i] = 1.0
        labels.append(row_labels)
    return labels

# Load the training dataset (with non-NaN 'take_action')
try:
    train_df = pd.read_csv('/content/drive/MyDrive/wildfire_train.csv').dropna(subset=['take_action']).reset_index(drop=True)
    print("Filtered training dataset loaded successfully.")
except FileNotFoundError:
    print("Error: Filtered training dataset not found.")
    train_df = None

if train_df is not None:
    train_labels = create_multi_label(train_df, action_categories)

    print("\n--- Sample of More Labels ---")
    for i in range(10):
        print(f"Original Take Action: {train_df['take_action'][i]}")
        print(f"Labels: {train_labels[i]}")
        print("-" * 20)
else:
    print("Training data not available.")

Filtered training dataset loaded successfully.

--- Sample of More Labels ---
Original Take Action: start missing person search
Labels: [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
--------------------
Original Take Action: monitor situation
Labels: [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
--------------------
Original Take Action: send evacuation and shelter support
Labels: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
--------------------
Original Take Action: send evacuation and shelter support
Labels: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
--------------------
Original Take Action: monitor situation
Labels: [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
--------------------
Original Take Action: monitor situation
Labels: [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
--------------------
Original Take Action: monitor situation
Labels: [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
--------------------
Original Take Action: monitor situation
Labels: [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
--------------------
Original Take Action: monitor situat

In [None]:
import pandas as pd
from transformers import DistilBertTokenizerFast
import torch
from torch.utils.data import Dataset, DataLoader

# Revised action categories
action_categories = ['start missing person search', 'monitor situation', 'send evacuation',
                     'shelter support', 'send rescue team', 'send security',
                     'trauma support', 'send medical team', 'send immediate help']

def create_multi_label(df, categories):
    labels = []
    for index, row in df.iterrows():
        row_labels = [0.0] * len(categories)
        take_action_text = row['take_action']
        if isinstance(take_action_text, str):
            actions = [act.strip() for act in take_action_text.split(' and ')]
            for i, cat in enumerate(categories):
                if cat.strip() in actions:
                    row_labels[i] = 1.0
        labels.append(row_labels)
    return labels

# Load the datasets (with non-NaN 'take_action')
try:
    train_df = pd.read_csv('/content/drive/MyDrive/wildfire_train.csv').dropna(subset=['take_action']).reset_index(drop=True)
    val_df = pd.read_csv('/content/drive/MyDrive/wildfire_val.csv').dropna(subset=['take_action']).reset_index(drop=True)
    test_df = pd.read_csv('/content/drive/MyDrive/wildfire_test.csv').dropna(subset=['take_action']).reset_index(drop=True)
    print("Datasets loaded successfully.")
except FileNotFoundError:
    print("Error: One or more dataset files not found.")
    train_df = val_df = test_df = None

if train_df is not None:
    # Create multi-hot labels
    train_labels = create_multi_label(train_df, action_categories)
    val_labels = create_multi_label(val_df, action_categories)
    test_labels = create_multi_label(test_df, action_categories)

    # Tokenize the text using DistilBERT
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    train_encodings = tokenizer(train_df['tweet_text'].tolist(), truncation=True, padding=True)
    val_encodings = tokenizer(val_df['tweet_text'].tolist(), truncation=True, padding=True)
    test_encodings = tokenizer(test_df['tweet_text'].tolist(), truncation=True, padding=True)

    # Create PyTorch Datasets
    class WildfireActionDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
            return item

        def __len__(self):
            return len(self.labels)

    train_dataset = WildfireActionDataset(train_encodings, train_labels)
    val_dataset = WildfireActionDataset(val_encodings, val_labels)
    test_dataset = WildfireActionDataset(test_encodings, test_labels)

    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=16)
    test_dataloader = DataLoader(test_dataset, batch_size=16)

    print("\nData preprocessed for multi-label classification (revised categories).")
    print(f"Number of training examples: {len(train_dataset)}")
    print(f"Number of validation examples: {len(val_dataset)}")
    print(f"Number of testing examples: {len(test_dataset)}")

else:
    print("Could not proceed with multi-label preprocessing.")

Datasets loaded successfully.

Data preprocessed for multi-label classification (revised categories).
Number of training examples: 12860
Number of validation examples: 1608
Number of testing examples: 1607


In [None]:
import pandas as pd
from collections import Counter

# Load the training dataset (with non-NaN 'take_action')
try:
    train_df = pd.read_csv('/content/drive/MyDrive/wildfire_train.csv').dropna(subset=['take_action']).reset_index(drop=True)
    print("Filtered training dataset loaded successfully.")
except FileNotFoundError:
    print("Error: Filtered training dataset not found.")
    train_df = None

if train_df is not None:
    # Combine all 'take_action' entries into a single list
    all_actions = []
    for actions_str in train_df['take_action']:
        if isinstance(actions_str, str):
            split_actions = [act.strip() for act in actions_str.lower().split(' and ')]
            all_actions.extend(split_actions)

    # Count the frequency of each action phrase
    action_counts = Counter(all_actions)

    print("\n--- Frequency of Individual Action Phrases (Lowercase) ---")
    for action, count in action_counts.most_common(50):  # Display the top 50
        print(f"{action}: {count}")

    print("\n--- Examples of 'take_action' Entries ---")
    print(train_df['take_action'].head(20))

    print("\n--- Identifying Potential Variations (Manual Inspection of a Few Examples) ---")
    for index, row in train_df.head(20).iterrows():
        print(f"\nOriginal: {row['take_action']}")
        print(f"Lowercase & Split: {[act.strip() for act in row['take_action'].lower().split(' and ')]}")

else:
    print("Training data not available.")

Filtered training dataset loaded successfully.

--- Frequency of Individual Action Phrases (Lowercase) ---
monitor situation: 10064
start missing person search: 961
send evacuation: 779
shelter support: 779
send rescue team: 377
send security: 356
trauma support: 356
send medical team: 203
send immediate help: 120

--- Examples of 'take_action' Entries ---
0             start missing person search
1                       monitor situation
2     send evacuation and shelter support
3     send evacuation and shelter support
4                       monitor situation
5                       monitor situation
6                       monitor situation
7                       monitor situation
8                       monitor situation
9                       monitor situation
10                      monitor situation
11            start missing person search
12                      monitor situation
13                      monitor situation
14                      monitor situation
15         

In [None]:
import pandas as pd
import torch
from collections import Counter

# Load the training dataset (with non-NaN 'take_action')
try:
    train_df = pd.read_csv('/content/drive/MyDrive/wildfire_train.csv').dropna(subset=['take_action']).reset_index(drop=True)
    print("Filtered training dataset loaded successfully.")
except FileNotFoundError:
    print("Error: Filtered training dataset not found.")
    train_df = None

if train_df is not None:
    # Combine all 'take_action' entries into a single list (lowercase and split)
    all_actions = []
    for actions_str in train_df['take_action']:
        if isinstance(actions_str, str):
            split_actions = [act.strip() for act in actions_str.lower().split(' and ')]
            all_actions.extend(split_actions)

    # Count the frequency of each action phrase
    action_counts = Counter(all_actions)

    # Revised action categories
    action_categories = ['start missing person search', 'monitor situation', 'send evacuation',
                         'shelter support', 'send rescue team', 'send security',
                         'trauma support', 'send medical team', 'send immediate help']

    # Calculate weights for each class (inverse frequency)
    total_samples = len(all_actions)
    class_weights = []
    for category in action_categories:
        count = action_counts.get(category, 1)  # Avoid division by zero
        weight = total_samples / (count + 1e-6)  # Adding a small epsilon for stability
        class_weights.append(weight)

    # Convert to a PyTorch tensor
    class_weights_tensor = torch.tensor(class_weights).to(device)

    print("\n--- Class Weights ---")
    for category, weight in zip(action_categories, class_weights_tensor.cpu().numpy()):
        print(f"{category}: {weight:.2f}")

else:
    print("Training data not available.")

Filtered training dataset loaded successfully.

--- Class Weights ---
start missing person search: 14.56
monitor situation: 1.39
send evacuation: 17.97
shelter support: 17.97
send rescue team: 37.12
send security: 39.31
trauma support: 39.31
send medical team: 68.94
send immediate help: 116.62


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import DistilBertModel
from torch.optim import AdamW  # Correct import for AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

# Revised action categories
action_categories = ['start missing person search', 'monitor situation', 'send evacuation',
                     'shelter support', 'send rescue team', 'send security',
                     'trauma support', 'send medical team', 'send immediate help']
num_labels = len(action_categories)

# Calculate class weights (same as before)
class_weights_dict = {'start missing person search': 14.56, 'monitor situation': 1.39, 'send evacuation': 17.97,
                      'shelter support': 17.97, 'send rescue team': 37.12, 'send security': 39.31,
                      'trauma support': 39.31, 'send medical team': 68.94, 'send immediate help': 116.62}
class_weights_tensor = torch.tensor([class_weights_dict[cat] for cat in action_categories]).to(device)

# Define Focal Loss
class FocalLoss(nn.Module):
    def __init__(self, gamma=2, alpha=None, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(self, inputs, targets):
        bce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-bce_loss)
        focal_loss = (1 - pt) ** self.gamma * bce_loss

        if self.alpha is not None:
            focal_loss = self.alpha * targets * focal_loss + (1 - self.alpha) * (1 - targets) * focal_loss

        if self.reduction == 'mean':
            return torch.mean(focal_loss)
        elif self.reduction == 'sum':
            return torch.sum(focal_loss)
        else:
            return focal_loss

# Load the pre-trained DistilBERT model
model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)

# Unfreeze all layers for fine-tuning
for param in model.parameters():
    param.requires_grad = True

# Update the multi-label classification layer
class MultiLabelClassifier(nn.Module):
    def __init__(self, base_model, num_labels, dropout=0.1):
        super(MultiLabelClassifier, self).__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(base_model.config.hidden_size, num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits  # Return logits for Focal Loss

model_classifier = MultiLabelClassifier(model, num_labels).to(device)

# Optimizer with a learning rate of 2e-5
optimizer = AdamW(model_classifier.parameters(), lr=2e-5, weight_decay=0.01)

# Loss function: Focal Loss (we can also try combining with class weights)
criterion = FocalLoss(gamma=2, reduction='mean') # You can experiment with alpha

# Learning rate scheduler
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)

# Training loop (increased epochs)
num_epochs = 15
total_steps = len(train_dataloader) * num_epochs

print(f"\n--- Multi-Label Classification Training Started (Focal Loss, {num_labels} labels, Unfrozen Layers, LR=2e-5, 15 Epochs, ReduceLROnPlateau) ---")
for epoch in range(num_epochs):
    model_classifier.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_classifier(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}, Average Training Loss: {avg_train_loss:.4f}")

    # Evaluation on validation set
    model_classifier.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model_classifier(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Average Validation Loss: {avg_val_loss:.4f}")

    # Step the scheduler based on validation loss
    scheduler.step(avg_val_loss)

print(f"\n--- Multi-Label Classification Training Finished (Focal Loss, {num_labels} labels, Unfrozen Layers, LR=2e-5, 15 Epochs, ReduceLROnPlateau) ---")

# Save the trained model
torch.save(model_classifier.state_dict(), f'/content/drive/MyDrive/distilbert_multi_label_focal_lr2e5_{num_labels}labels_15epochs.pth')
print(f"\nTrained multi-label model (with Focal Loss, {num_labels} labels, unfrozen, LR=2e-5, 15 epochs) saved to Google Drive.")


--- Multi-Label Classification Training Started (Focal Loss, 9 labels, Unfrozen Layers, LR=2e-5, 15 Epochs, ReduceLROnPlateau) ---


Epoch 1: 100%|██████████| 804/804 [00:17<00:00, 46.86it/s]


Epoch 1, Average Training Loss: 0.0106
Epoch 1, Average Validation Loss: 0.0002


Epoch 2: 100%|██████████| 804/804 [00:17<00:00, 47.12it/s]


Epoch 2, Average Training Loss: 0.0002
Epoch 2, Average Validation Loss: 0.0001


Epoch 3: 100%|██████████| 804/804 [00:17<00:00, 47.05it/s]


Epoch 3, Average Training Loss: 0.0006
Epoch 3, Average Validation Loss: 0.0000


Epoch 4: 100%|██████████| 804/804 [00:17<00:00, 47.07it/s]


Epoch 4, Average Training Loss: 0.0001
Epoch 4, Average Validation Loss: 0.0000


Epoch 5: 100%|██████████| 804/804 [00:17<00:00, 47.12it/s]


Epoch 5, Average Training Loss: 0.0000
Epoch 5, Average Validation Loss: 0.0000


Epoch 6: 100%|██████████| 804/804 [00:17<00:00, 47.14it/s]


Epoch 6, Average Training Loss: 0.0000
Epoch 6, Average Validation Loss: 0.0000


Epoch 7: 100%|██████████| 804/804 [00:17<00:00, 47.13it/s]


Epoch 7, Average Training Loss: 0.0005
Epoch 7, Average Validation Loss: 0.0001


Epoch 8: 100%|██████████| 804/804 [00:17<00:00, 47.20it/s]


Epoch 8, Average Training Loss: 0.0001
Epoch 8, Average Validation Loss: 0.0000


Epoch 9: 100%|██████████| 804/804 [00:17<00:00, 47.22it/s]


Epoch 9, Average Training Loss: 0.0000
Epoch 9, Average Validation Loss: 0.0000


Epoch 10: 100%|██████████| 804/804 [00:17<00:00, 47.04it/s]


Epoch 10, Average Training Loss: 0.0002
Epoch 10, Average Validation Loss: 0.0000


Epoch 11: 100%|██████████| 804/804 [00:17<00:00, 47.09it/s]


Epoch 11, Average Training Loss: 0.0000
Epoch 11, Average Validation Loss: 0.0000


Epoch 12: 100%|██████████| 804/804 [00:17<00:00, 47.17it/s]


Epoch 12, Average Training Loss: 0.0000
Epoch 12, Average Validation Loss: 0.0000


Epoch 13: 100%|██████████| 804/804 [00:17<00:00, 47.19it/s]


Epoch 13, Average Training Loss: 0.0000
Epoch 13, Average Validation Loss: 0.0000


Epoch 14: 100%|██████████| 804/804 [00:17<00:00, 47.11it/s]


Epoch 14, Average Training Loss: 0.0000
Epoch 14, Average Validation Loss: 0.0000


Epoch 15: 100%|██████████| 804/804 [00:17<00:00, 47.21it/s]


Epoch 15, Average Training Loss: 0.0000
Epoch 15, Average Validation Loss: 0.0000

--- Multi-Label Classification Training Finished (Focal Loss, 9 labels, Unfrozen Layers, LR=2e-5, 15 Epochs, ReduceLROnPlateau) ---

Trained multi-label model (with Focal Loss, 9 labels, unfrozen, LR=2e-5, 15 epochs) saved to Google Drive.


In [None]:
#keep
import pandas as pd
from transformers import DistilBertTokenizerFast
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Revised action categories
action_categories = ['start missing person search', 'monitor situation', 'send evacuation',
                     'shelter support', 'send rescue team', 'send security',
                     'trauma support', 'send medical team', 'send immediate help']

def create_multi_label_array(df, categories):
    labels = []
    for index, row in df.iterrows():
        row_labels = np.zeros(len(categories))
        actions = [act.strip() for act in row['take_action'].lower().split(' and ')]
        for i, cat in enumerate(categories):
            if cat in actions:
                row_labels[i] = 1.0
        labels.append(row_labels)
    return np.array(labels)

# Load the new datasets
train_df_new = pd.read_csv('/content/drive/MyDrive/wildfire_train_new.csv').dropna(subset=['take_action']).reset_index(drop=True)
val_df_new = pd.read_csv('/content/drive/MyDrive/wildfire_val_new.csv').dropna(subset=['take_action']).reset_index(drop=True)
test_df_new = pd.read_csv('/content/drive/MyDrive/wildfire_test_new.csv').dropna(subset=['take_action']).reset_index(drop=True)

# Create multi-label arrays
train_labels_new = create_multi_label_array(train_df_new, action_categories)
val_labels_new = create_multi_label_array(val_df_new, action_categories)
test_labels_new = create_multi_label_array(test_df_new, action_categories)

# Tokenize the text
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings_new = tokenizer(train_df_new['tweet_text'].tolist(), truncation=True, padding=True)
val_encodings_new = tokenizer(val_df_new['tweet_text'].tolist(), truncation=True, padding=True)
test_encodings_new = tokenizer(test_df_new['tweet_text'].tolist(), truncation=True, padding=True)

# Create PyTorch Datasets
class WildfireActionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset_new = WildfireActionDataset(train_encodings_new, train_labels_new)
val_dataset_new = WildfireActionDataset(val_encodings_new, val_labels_new)
test_dataset_new = WildfireActionDataset(test_encodings_new, test_labels_new)

# Create DataLoaders
train_dataloader_new = DataLoader(train_dataset_new, batch_size=16, shuffle=True)
val_dataloader_new = DataLoader(val_dataset_new, batch_size=16)
test_dataloader_new = DataLoader(test_dataset_new, batch_size=16)

print("\nNew datasets preprocessed and DataLoaders created.")
print(f"Number of training examples (new): {len(train_dataset_new)}")
print(f"Number of validation examples (new): {len(val_dataset_new)}")
print(f"Number of testing examples (new): {len(test_dataset_new)}")


New datasets preprocessed and DataLoaders created.
Number of training examples (new): 8994
Number of validation examples (new): 1824
Number of testing examples (new): 2042


In [None]:
#keep
import torch
import torch.nn as nn
from transformers import DistilBertModel
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

# Revised action categories
action_categories = ['start missing person search', 'monitor situation', 'send evacuation',
                     'shelter support', 'send rescue team', 'send security',
                     'trauma support', 'send medical team', 'send immediate help']
num_labels = len(action_categories)

# Calculate class weights based on the new training data
train_labels_new_df = pd.DataFrame(train_labels_new, columns=action_categories)
class_counts_new = train_labels_new_df.sum(axis=0)
total_samples_new = len(train_labels_new)
class_weights_dict_new = {cat: total_samples_new / (count + 1e-6) for cat, count in class_counts_new.items()}
class_weights_tensor_new = torch.tensor([class_weights_dict_new[cat] for cat in action_categories]).to(device)

# Load the pre-trained DistilBERT model
model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)

# Unfreeze all layers for fine-tuning
for param in model.parameters():
    param.requires_grad = True

# Update the multi-label classification layer
class MultiLabelClassifier(nn.Module):
    def __init__(self, base_model, num_labels, dropout=0.1):
        super(MultiLabelClassifier, self).__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(base_model.config.hidden_size, num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        probabilities = self.sigmoid(logits)
        return probabilities

model_classifier = MultiLabelClassifier(model, num_labels).to(device)

# Optimizer with a learning rate of 2e-5
optimizer = AdamW(model_classifier.parameters(), lr=2e-5, weight_decay=0.01)

# Loss function with class weights (using weights from the new training data)
criterion = nn.BCELoss(weight=class_weights_tensor_new)

# Learning rate scheduler
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)

# Training loop (increased epochs)
num_epochs = 15
total_steps = len(train_dataloader_new) * num_epochs

print(f"\n--- Multi-Label Classification Training Started (New Split, With Class Weights, {num_labels} labels, Unfrozen Layers, LR=2e-5, 15 Epochs, ReduceLROnPlateau) ---")
for epoch in range(num_epochs):
    model_classifier.train()
    total_loss = 0
    for batch in tqdm(train_dataloader_new, desc=f"Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_classifier(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader_new)
    print(f"Epoch {epoch+1}, Average Training Loss: {avg_train_loss:.4f}")

    # Evaluation on validation set (new)
    model_classifier.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader_new:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model_classifier(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader_new)
    print(f"Epoch {epoch+1}, Average Validation Loss: {avg_val_loss:.4f}")

    # Step the scheduler based on validation loss
    scheduler.step(avg_val_loss)

print(f"\n--- Multi-Label Classification Training Finished (New Split, With Class Weights, {num_labels} labels, Unfrozen Layers, LR=2e-5, 15 Epochs, ReduceLROnPlateau) ---")

# Save the trained model
torch.save(model_classifier.state_dict(), f'/content/drive/MyDrive/distilbert_multi_label_newsplit_weighted_lr2e5_{num_labels}labels_15epochs.pth')
print(f"\nTrained multi-label model (new split, with class weights, {num_labels} labels, unfrozen, LR=2e-5, 15 epochs) saved to Google Drive.")


--- Multi-Label Classification Training Started (New Split, With Class Weights, 9 labels, Unfrozen Layers, LR=2e-5, 15 Epochs, ReduceLROnPlateau) ---


Epoch 1: 100%|██████████| 563/563 [00:11<00:00, 49.80it/s]


Epoch 1, Average Training Loss: 2.1285
Epoch 1, Average Validation Loss: 0.6238


Epoch 2: 100%|██████████| 563/563 [00:11<00:00, 50.14it/s]


Epoch 2, Average Training Loss: 0.2934
Epoch 2, Average Validation Loss: 0.3621


Epoch 3: 100%|██████████| 563/563 [00:11<00:00, 50.24it/s]


Epoch 3, Average Training Loss: 0.1529
Epoch 3, Average Validation Loss: 0.3356


Epoch 4: 100%|██████████| 563/563 [00:11<00:00, 50.43it/s]


Epoch 4, Average Training Loss: 0.0983
Epoch 4, Average Validation Loss: 0.3178


Epoch 5: 100%|██████████| 563/563 [00:11<00:00, 50.39it/s]


Epoch 5, Average Training Loss: 0.0682
Epoch 5, Average Validation Loss: 0.3188


Epoch 6: 100%|██████████| 563/563 [00:11<00:00, 50.21it/s]


Epoch 6, Average Training Loss: 0.0489
Epoch 6, Average Validation Loss: 0.3104


Epoch 7: 100%|██████████| 563/563 [00:11<00:00, 50.27it/s]


Epoch 7, Average Training Loss: 0.0360
Epoch 7, Average Validation Loss: 0.0704


Epoch 8: 100%|██████████| 563/563 [00:11<00:00, 50.41it/s]


Epoch 8, Average Training Loss: 0.0266
Epoch 8, Average Validation Loss: 0.0674


Epoch 9: 100%|██████████| 563/563 [00:11<00:00, 50.31it/s]


Epoch 9, Average Training Loss: 0.0197
Epoch 9, Average Validation Loss: 0.0652


Epoch 10: 100%|██████████| 563/563 [00:11<00:00, 50.33it/s]


Epoch 10, Average Training Loss: 0.0148
Epoch 10, Average Validation Loss: 0.0661


Epoch 11: 100%|██████████| 563/563 [00:11<00:00, 49.94it/s]


Epoch 11, Average Training Loss: 0.0110
Epoch 11, Average Validation Loss: 0.0731


Epoch 12: 100%|██████████| 563/563 [00:11<00:00, 50.35it/s]


Epoch 12, Average Training Loss: 0.0083
Epoch 12, Average Validation Loss: 0.0742


Epoch 13: 100%|██████████| 563/563 [00:11<00:00, 50.28it/s]


Epoch 13, Average Training Loss: 0.0070
Epoch 13, Average Validation Loss: 0.0742


Epoch 14: 100%|██████████| 563/563 [00:11<00:00, 50.39it/s]


Epoch 14, Average Training Loss: 0.0068
Epoch 14, Average Validation Loss: 0.0750


Epoch 15: 100%|██████████| 563/563 [00:11<00:00, 50.47it/s]


Epoch 15, Average Training Loss: 0.0065
Epoch 15, Average Validation Loss: 0.0797

--- Multi-Label Classification Training Finished (New Split, With Class Weights, 9 labels, Unfrozen Layers, LR=2e-5, 15 Epochs, ReduceLROnPlateau) ---

Trained multi-label model (new split, with class weights, 9 labels, unfrozen, LR=2e-5, 15 epochs) saved to Google Drive.


In [None]:
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Select the tweet text and distress labels
X_distress = california_wildfire_df['tweet_text'].tolist()
y_distress = california_wildfire_df['distress'].tolist()

# Split the data into training+validation+test and test sets (90/10)
train_val_texts_distress, test_texts_distress, train_val_labels_distress, test_labels_distress = train_test_split(
    X_distress, y_distress, test_size=0.1, random_state=42, stratify=y_distress
)

# Split the training+validation set into training and validation sets (80/10 of the 90%, so 80/10/10 split overall)
train_texts_distress, val_texts_distress, train_labels_distress, val_labels_distress = train_test_split(
    train_val_texts_distress, train_val_labels_distress, test_size=(1/9), random_state=42, stratify=train_val_labels_distress
)

# Tokenize the text
tokenizer_distress = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings_distress = tokenizer_distress(train_texts_distress, truncation=True, padding=True)
val_encodings_distress = tokenizer_distress(val_texts_distress, truncation=True, padding=True)
test_encodings_distress = tokenizer_distress(test_texts_distress, truncation=True, padding=True)

# Create PyTorch Datasets
class DistressDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset_distress = DistressDataset(train_encodings_distress, train_labels_distress)
val_dataset_distress = DistressDataset(val_encodings_distress, val_labels_distress)
test_dataset_distress = DistressDataset(test_encodings_distress, test_labels_distress)

# Create DataLoaders
train_dataloader_distress = DataLoader(train_dataset_distress, batch_size=16, shuffle=True)
val_dataloader_distress = DataLoader(val_dataset_distress, batch_size=16)
test_dataloader_distress = DataLoader(test_dataset_distress, batch_size=16)

print(f"Number of training examples for distress detection: {len(train_dataset_distress)}")
print(f"Number of validation examples for distress detection: {len(val_dataset_distress)}")
print(f"Number of testing examples for distress detection: {len(test_dataset_distress)}")

# Calculate class weights for the distress labels in the training set
train_labels_distress_tensor = torch.tensor(train_labels_distress)
class_counts_distress = torch.bincount(train_labels_distress_tensor)
total_samples_distress = len(train_labels_distress)
weights_distress = total_samples_distress / (class_counts_distress + 1e-6)
class_weights_distress = weights_distress / weights_distress.sum()  # Normalize to sum to 1
class_weights_tensor_distress = class_weights_distress.to(device)

print("\nClass weights for distress detection (class 0: not distressed, class 1: distressed):")
print(class_weights_tensor_distress)

Number of training examples for distress detection: 1145
Number of validation examples for distress detection: 144
Number of testing examples for distress detection: 144

Class weights for distress detection (class 0: not distressed, class 1: distressed):
tensor([0.1074, 0.8926], device='cuda:0')


In [None]:
import torch
import torch.nn as nn
from transformers import DistilBertModel
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

# Load the pre-trained DistilBERT model
distress_model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)

# Define the binary classification layer
class BinaryClassifier(nn.Module):
    def __init__(self, base_model, dropout=0.1):
        super(BinaryClassifier, self).__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(base_model.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        probabilities = self.sigmoid(logits)
        return probabilities

distress_classifier = BinaryClassifier(distress_model).to(device)

# Optimizer with a learning rate of 2e-5
optimizer_distress = AdamW(distress_classifier.parameters(), lr=2e-5, weight_decay=0.01)

# Custom weighted BCE loss
class WeightedBCELoss(nn.Module):
    def __init__(self, weights):
        super(WeightedBCELoss, self).__init__()
        self.weights = weights

    def forward(self, inputs, targets):
        inputs = torch.clamp(inputs, min=1e-7, max=1-1e-7)
        bce_loss = - (targets * torch.log(inputs) + (1 - targets) * torch.log(1 - inputs))
        weights = self.weights[targets.long()].unsqueeze(1)
        weighted_loss = bce_loss * weights
        return torch.mean(weighted_loss)

criterion_distress = WeightedBCELoss(class_weights_tensor_distress)

# Learning rate scheduler
scheduler_distress = ReduceLROnPlateau(optimizer_distress, mode='min', factor=0.1, patience=2)

# Training loop
num_epochs_distress = 10  # You can adjust the number of epochs
print("\n--- Distress Classification Training Started (Custom Weighted Loss) ---")
for epoch in range(num_epochs_distress):
    distress_classifier.train()
    total_loss = 0
    for batch in tqdm(train_dataloader_distress, desc=f"Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].unsqueeze(1).to(device)

        outputs = distress_classifier(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion_distress(outputs, labels)
        total_loss += loss.item()

        optimizer_distress.zero_grad()
        loss.backward()
        optimizer_distress.step()

    avg_train_loss = total_loss / len(train_dataloader_distress)
    print(f"Epoch {epoch+1}, Average Training Loss: {avg_train_loss:.4f}")

    # Evaluation on validation set
    distress_classifier.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader_distress:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].unsqueeze(1).to(device)

            outputs = distress_classifier(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion_distress(outputs, labels)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader_distress)
    print(f"Epoch {epoch+1}, Average Validation Loss: {avg_val_loss:.4f}")

    scheduler_distress.step(avg_val_loss)

print("\n--- Distress Classification Training Finished (Custom Weighted Loss) ---")

# Save the trained model
torch.save(distress_classifier.state_dict(), '/content/drive/MyDrive/distilbert_distress_classifier.pth')
print("Trained distress classification model (with custom weighted loss) saved to Google Drive.")


--- Distress Classification Training Started (Custom Weighted Loss) ---


Epoch 1: 100%|██████████| 72/72 [00:01<00:00, 49.66it/s]


Epoch 1, Average Training Loss: 0.0718
Epoch 1, Average Validation Loss: 0.0409


Epoch 2: 100%|██████████| 72/72 [00:01<00:00, 52.64it/s]


Epoch 2, Average Training Loss: 0.0339
Epoch 2, Average Validation Loss: 0.0083


Epoch 3: 100%|██████████| 72/72 [00:01<00:00, 51.88it/s]


Epoch 3, Average Training Loss: 0.0121
Epoch 3, Average Validation Loss: 0.0087


Epoch 4: 100%|██████████| 72/72 [00:01<00:00, 52.90it/s]


Epoch 4, Average Training Loss: 0.0050
Epoch 4, Average Validation Loss: 0.0008


Epoch 5: 100%|██████████| 72/72 [00:01<00:00, 52.77it/s]


Epoch 5, Average Training Loss: 0.0025
Epoch 5, Average Validation Loss: 0.0004


Epoch 6: 100%|██████████| 72/72 [00:01<00:00, 52.04it/s]


Epoch 6, Average Training Loss: 0.0006
Epoch 6, Average Validation Loss: 0.0003


Epoch 7: 100%|██████████| 72/72 [00:01<00:00, 51.59it/s]


Epoch 7, Average Training Loss: 0.0002
Epoch 7, Average Validation Loss: 0.0002


Epoch 8: 100%|██████████| 72/72 [00:01<00:00, 50.85it/s]


Epoch 8, Average Training Loss: 0.0001
Epoch 8, Average Validation Loss: 0.0002


Epoch 9: 100%|██████████| 72/72 [00:01<00:00, 53.07it/s]


Epoch 9, Average Training Loss: 0.0001
Epoch 9, Average Validation Loss: 0.0001


Epoch 10: 100%|██████████| 72/72 [00:01<00:00, 53.03it/s]


Epoch 10, Average Training Loss: 0.0001
Epoch 10, Average Validation Loss: 0.0001

--- Distress Classification Training Finished (Custom Weighted Loss) ---
Trained distress classification model (with custom weighted loss) saved to Google Drive.


In [None]:
import spacy

# Load the small English spaCy model
try:
    nlp = spacy.load("en_core_web_sm")
    print("spaCy model loaded successfully.")
except OSError:
    print("Error: Could not load the spaCy model. Please ensure you have downloaded it using: python -m spacy download en_core_web_sm")
    nlp = None

spaCy model loaded successfully.


In [None]:
def extract_locations(text):
    if nlp:
        doc = nlp(text)
        locations = [ent.text for ent in doc.ents if ent.label_ in ["GPE", "LOC"]]
        return locations
    else:
        return []

# Apply the location extraction function to the first 100 California wildfire tweets
if california_wildfire_df is not None and nlp is not None:
    sample_tweets = california_wildfire_df['tweet_text'].head(100)
    extracted_locations = sample_tweets.apply(extract_locations)

    print("\nExtracted locations from the first 100 California wildfire tweets:")
    for i, locations in extracted_locations.items():
        if locations:
            print(f"Tweet {i}: {sample_tweets.iloc[i]} -> Locations: {', '.join(locations)}")
        else:
            print(f"Tweet {i}: {sample_tweets.iloc[i]} -> No locations found")ift
else:
    print("Either the California wildfire DataFrame is not loaded or the spaCy model is not available.")


Extracted locations from the first 100 California wildfire tweets:
Tweet 0: wildfires raging through northern california are terrifying -> Locations: northern california
Tweet 1: photos deadly wildfires rage in california -> Locations: california
Tweet 2: californias raging wildfires as youve never seen them before -> No locations found
Tweet 3: wildfires threaten californias first legal cannabis harvest -> No locations found
Tweet 4: mass evacuations in california as wildfires kill at least 10 #californiawildfires -> Locations: california
Tweet 5: california wildfires destroy more than 50 structures #kakenews -> Locations: california
Tweet 6: california wildfires destroy more than 50 structures #kakenews -> Locations: california
Tweet 7: california wildfires destroy more than 50 structures #kakenews -> Locations: california
Tweet 8: southern california fire shrouds disneyland anaheim in dramatic smoky skies -> Locations: california
Tweet 9: california wildfire 4 -> Locations: califor

In [None]:
import torch
import torch.nn as nn
from transformers import DistilBertModel, DistilBertTokenizerFast

# Revised action categories
action_categories = ['start missing person search', 'monitor situation', 'send evacuation',
                     'shelter support', 'send rescue team', 'send security',
                     'trauma support', 'send medical team', 'send immediate help']
num_labels = len(action_categories)

# Load the pre-trained DistilBERT model
action_model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)

# Define the multi-label classifier
class MultiLabelClassifier(nn.Module):
    def __init__(self, base_model, num_labels, dropout=0.1):
        super(MultiLabelClassifier, self).__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(base_model.config.hidden_size, num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        probabilities = self.sigmoid(logits)
        return probabilities

model_classifier_action = MultiLabelClassifier(action_model, num_labels).to(device)
model_classifier_action.load_state_dict(torch.load('/content/drive/MyDrive/distilbert_multi_label_newsplit_weighted_lr2e5_9labels_15epochs.pth'))
model_classifier_action.eval()

# Tokenizer
tokenizer_action = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize the tweet text from the California wildfire DataFrame
california_encodings_action = tokenizer_action(
    california_wildfire_df['tweet_text'].tolist(),
    truncation=True,
    padding=True,
    return_tensors='pt'
).to(device)

print("Action prediction model loaded and California wildfire tweets tokenized.")

Action prediction model loaded and California wildfire tweets tokenized.


In [None]:
# Make predictions
predictions_action = []
with torch.no_grad():
    for i in tqdm(range(0, len(california_wildfire_df), 16), desc="Predicting Actions"):
        batch_input_ids = california_encodings_action['input_ids'][i:i+16]
        batch_attention_mask = california_encodings_action['attention_mask'][i:i+16]
        outputs = model_classifier_action(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        probabilities = outputs.cpu().numpy()
        predicted = (probabilities > 0.5).astype(int)
        predictions_action.extend(predicted)

# Convert predictions to action labels
predicted_actions_list = []
for prediction in predictions_action:
    actions = [action_categories[i] for i, val in enumerate(prediction) if val == 1]
    predicted_actions_list.append(', '.join(actions) if actions else 'monitor situation') # Default if no action predicted

# Add the predicted actions to the DataFrame
california_wildfire_df['predicted_actions'] = predicted_actions_list

# Display the first few rows with predicted actions
print("\nFirst few rows of California wildfire tweets with predicted actions:")
print(california_wildfire_df[['tweet_text', 'state', 'Wildfire', 'distress', 'predicted_actions']].head(20))

Predicting Actions: 100%|██████████| 90/90 [00:00<00:00, 190.74it/s]


First few rows of California wildfire tweets with predicted actions:
                                           tweet_text       state Wildfire  \
0   wildfires raging through northern california a...  California      Yes   
1          photos deadly wildfires rage in california  California      Yes   
2   californias raging wildfires as youve never se...  California      Yes   
3   wildfires threaten californias first legal can...  California      Yes   
4   mass evacuations in california as wildfires ki...  California      Yes   
5   california wildfires destroy more than 50 stru...  California      Yes   
6   california wildfires destroy more than 50 stru...  California      Yes   
7   california wildfires destroy more than 50 stru...  California      Yes   
8   southern california fire shrouds disneyland an...  California      Yes   
9                               california wildfire 4  California      Yes   
10  wildfires still burn in northern california 11...  California      Y




In [None]:
# Apply location extraction to the entire DataFrame
if nlp is not None and 'extracted_locations' not in california_wildfire_df.columns:
    california_wildfire_df['extracted_locations'] = california_wildfire_df['tweet_text'].apply(extract_locations)
    print("Location extraction completed for all California wildfire tweets.")
elif 'extracted_locations' in california_wildfire_df.columns:
    print("Locations already extracted.")
else:
    print("spaCy model not loaded.")

# Now, let's display the information in a clearer format
if california_wildfire_df is not None:
    print("\n--- California Wildfire Tweets with Distress, Locations, and Predicted Actions ---")
    for index, row in california_wildfire_df.head(20).iterrows():
        tweet = row['tweet_text']
        distress = "Yes" if row['distress'] == 1 else "No"
        locations = ", ".join(row['extracted_locations']) if row['extracted_locations'] else "No locations found"
        actions = row['predicted_actions']
        print(f"\nTweet {index}:")
        print(f"  Text: {tweet}")
        print(f"  Distress: {distress}")
        print(f"  Locations: {locations}")
        print(f"  Predicted Actions: {actions}")
else:
    print("California wildfire DataFrame not loaded.")

Location extraction completed for all California wildfire tweets.

--- California Wildfire Tweets with Distress, Locations, and Predicted Actions ---

Tweet 0:
  Text: wildfires raging through northern california are terrifying
  Distress: No
  Locations: northern california
  Predicted Actions: monitor situation

Tweet 1:
  Text: photos deadly wildfires rage in california
  Distress: No
  Locations: california
  Predicted Actions: monitor situation

Tweet 2:
  Text: californias raging wildfires as youve never seen them before
  Distress: No
  Locations: No locations found
  Predicted Actions: monitor situation

Tweet 3:
  Text: wildfires threaten californias first legal cannabis harvest
  Distress: No
  Locations: No locations found
  Predicted Actions: monitor situation

Tweet 4:
  Text: mass evacuations in california as wildfires kill at least 10 #californiawildfires
  Distress: Yes
  Locations: california
  Predicted Actions: send evacuation, shelter support

Tweet 5:
  Text: calif

In [None]:
import torch
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Load the trained distress classification model if not already loaded
if 'distress_classifier' not in locals():
    from transformers import DistilBertModel
    import torch.nn as nn

    distress_model_eval = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)

    class BinaryClassifierEval(nn.Module):
        def __init__(self, base_model, dropout=0.1):
            super(BinaryClassifierEval, self).__init__()
            self.base_model = base_model
            self.dropout = nn.Dropout(dropout)
            self.classifier = nn.Linear(base_model.config.hidden_size, 1)
            self.sigmoid = nn.Sigmoid()

        def forward(self, input_ids, attention_mask):
            outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
            pooled_output = outputs.last_hidden_state[:, 0, :]
            pooled_output = self.dropout(pooled_output)
            logits = self.classifier(pooled_output)
            probabilities = self.sigmoid(logits)
            return probabilities

    distress_classifier = BinaryClassifierEval(distress_model_eval).to(device)
    distress_classifier.load_state_dict(torch.load('/content/drive/MyDrive/distilbert_distress_classifier.pth'))
    distress_classifier.eval()

# Prepare the test DataLoader
if 'test_dataloader_distress' not in locals():
    from transformers import DistilBertTokenizerFast
    from torch.utils.data import Dataset, DataLoader

    tokenizer_distress_eval = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    class DistressDatasetEval(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
            return item

        def __len__(self):
            return len(self.labels)

    X_test_distress = test_texts_distress
    y_test_distress = test_labels_distress
    test_encodings_distress = tokenizer_distress_eval(X_test_distress, truncation=True, padding=True)
    test_dataset_distress = DistressDatasetEval(test_encodings_distress, y_test_distress)
    test_dataloader_distress = DataLoader(test_dataset_distress, batch_size=16)

# Make predictions on the test set
predictions = []
true_labels = []
with torch.no_grad():
    for batch in tqdm(test_dataloader_distress, desc="Evaluating Distress Model"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].cpu().numpy()
        outputs = distress_classifier(input_ids=input_ids, attention_mask=attention_mask)
        probabilities = outputs.squeeze().cpu().numpy()
        predicted_labels = (probabilities > 0.5).astype(int)
        predictions.extend(predicted_labels)
        true_labels.extend(labels)

# Generate classification report and confusion matrix
print("\n--- Distress Classification Model Evaluation ---")
print("\nClassification Report:")
print(classification_report(true_labels, predictions))

print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, predictions))

Evaluating Distress Model: 100%|██████████| 9/9 [00:00<00:00, 165.17it/s]


--- Distress Classification Model Evaluation ---

Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.98       129
         1.0       0.91      0.67      0.77        15

    accuracy                           0.96       144
   macro avg       0.94      0.83      0.87       144
weighted avg       0.96      0.96      0.96       144


Confusion Matrix:
[[128   1]
 [  5  10]]





In [None]:
import torch
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import os

# Define the path to save the distress classification model
distress_model_save_path = '/content/drive/MyDrive/distilbert_distress_classifier_evaluated.pth'

# Save the model state dictionary
torch.save(distress_classifier.state_dict(), distress_model_save_path)
print(f"Evaluated distress classification model saved to: {distress_model_save_path}")

# Prepare the evaluation results as a string
evaluation_report = """
--- Distress Classification Model Evaluation ---

Classification Report:
{}

Confusion Matrix:
{}
""".format(classification_report(true_labels, predictions), confusion_matrix(true_labels, predictions))

# Define the path to save the evaluation results
evaluation_results_path = '/content/drive/MyDrive/distress_classification_evaluation_report.txt'

# Save the evaluation report to a text file
with open(evaluation_results_path, 'w') as f:
    f.write(evaluation_report)

print(f"\nDistress classification evaluation results saved to: {evaluation_results_path}")

Evaluated distress classification model saved to: /content/drive/MyDrive/distilbert_distress_classifier_evaluated.pth

Distress classification evaluation results saved to: /content/drive/MyDrive/distress_classification_evaluation_report.txt


IFT

In [None]:
import pandas as pd
import re
from geopy.geocoders import Nominatim
import spacy
from sklearn.model_selection import train_test_split

# Load the California wildfire DataFrame (REPLACE WITH YOUR ACTUAL PATH)
file_path = '/content/drive/MyDrive/ground_truth_dataset_with_wildfire.csv'
try:
    california_wildfire_df = pd.read_csv(file_path)
    print(f"California wildfire DataFrame loaded successfully from: {file_path}")
except FileNotFoundError:
    print(f"Error: File not found at: {file_path}. Please provide the correct path to your california_wildfire_tweets.csv file.")
    california_wildfire_df = None

if california_wildfire_df is not None:
    # Initialize geolocator for location extraction (we might use this to validate extracted locations)
    geolocator = Nominatim(user_agent="wildfire_locator")

    # Load a small spaCy model for better location extraction
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        print("Error: Could not load the spaCy model. Please ensure you have downloaded it using: python -m spacy download en_core_web_sm")
        nlp = None

    def is_california_wildfire(text):
        text = text.lower()
        california_mentions = "california" in text or "ca" in text or "la" in text or "san francisco" in text or "bay area" in text # Add more California-specific terms if needed
        fire_related = "fire" in text or "wildfire" in text or "smoke" in text or "flames" in text or "evacuat" in text
        return "yes" if california_mentions and fire_related else "no"

    def detect_distress_instruction_final_refined(text):
        text = text.lower()
        distress_keywords = [
            "help", "trapped", "injured", "need rescue", "emergency", "life-threatening", "dying", "burn", "evacuate now", "urgent",
            "terrifying", "scary", "afraid", "worried", "desperate", "panic", "danger", "threatened",
            "lost my home", "lost everything", "can't breathe", "smoke inhalation", "fire is near", "in flames",
            "need medical", "call for help", "stuck", "nowhere to go", "running out of time", "please help us",
            "destruction", "destroy", "suffer", "suffering"
        ]
        if any(keyword in text for keyword in distress_keywords):
            return "distress"
        else:
            return "not distress"

    def extract_location_instruction(text):
        text = text.lower()
        if nlp:
            doc = nlp(text)
            locations = [ent.text for ent in doc.ents if ent.label_ in ["GPE", "LOC"]]
            if locations:
                # Return the first identified location for simplicity
                return locations[0]
        return "unknown"

    def recommend_action_instruction(text, is_distress):
        text = text.lower()
        if is_distress == "distress":
            if "medical" in text or "injured" in text or "burn" in text:
                return "medical aid"
            elif "trapped" in text or "rescue" in text:
                return "rescue"
            elif "evacuate" in text:
                return "evacuation"
            else:
                return "rescue" # Default to rescue if distress is indicated but specific need is unclear
        else:
            if "burn" in text or "fire" in text and ("spread" in text or "threaten" in text):
                return "fire suppression"
            elif "need supplies" in text or "lack food" in text or "water low" in text:
                return "resource delivery"
            elif "evacuat" in text or "shelter" in text:
                return "evacuation" # Could be a report of an ongoing evacuation
            else:
                return "monitor only"

    # Apply the functions to create the label columns
    california_wildfire_df['tweet_text_lower'] = california_wildfire_df['tweet_text'].apply(lambda x: x.lower()) # Lowercase the text once
    california_wildfire_df['is_wildfire_ca'] = california_wildfire_df['tweet_text_lower'].apply(is_california_wildfire)
    california_wildfire_df['distress_label_instruction'] = california_wildfire_df['tweet_text_lower'].apply(detect_distress_instruction_final_refined)
    california_wildfire_df['location_label_instruction'] = california_wildfire_df['tweet_text_lower'].apply(extract_location_instruction)
    california_wildfire_df['action_label_instruction'] = california_wildfire_df.apply(lambda row: recommend_action_instruction(row['tweet_text_lower'], row['distress_label_instruction']), axis=1)

    print("Labeling complete. First few rows with labels:")
    print(california_wildfire_df[['tweet_text', 'is_wildfire_ca', 'distress_label_instruction', 'location_label_instruction', 'action_label_instruction']].head())

    # Check class balance for distress
    distress_balance = california_wildfire_df['distress_label_instruction'].value_counts(normalize=True)
    print("\nClass balance for 'distress_label_instruction':")
    print(distress_balance)

    # --- Data Splitting ---
    if 'distress_label_instruction' in california_wildfire_df.columns:
        # Separate features (tweet text) and the distress label for stratified splitting
        X = california_wildfire_df['tweet_text']
        y_distress = california_wildfire_df['distress_label_instruction']

        # 1. Sample (50 count) - stratified and balanced
        sample_df = california_wildfire_df.groupby('distress_label_instruction', group_keys=False).apply(lambda x: x.sample(min(len(x), 25)))
        print(f"\nSample set size: {len(sample_df)}")

        # 2. Remaining data
        remaining_df = california_wildfire_df.drop(sample_df.index)

        # Split remaining into training, validation, and testing (stratified)
        train_df, temp_df = train_test_split(remaining_df, test_size=0.2, stratify=remaining_df['distress_label_instruction'], random_state=42)
        val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['distress_label_instruction'], random_state=42)

        print(f"Training set size: {len(train_df)}")
        print(f"Validation set size: {len(val_df)}")
        print(f"Testing set size: {len(test_df)}")

        # Save the splits (you might want to save them to CSV files)
        train_df.to_csv('train_instruction_tuning.csv', index=False)
        val_df.to_csv('val_instruction_tuning.csv', index=False)
        test_df.to_csv('test_instruction_tuning.csv', index=False)
        sample_df.to_csv('sample_instruction_tuning.csv', index=False)
        print("\nData splits saved to CSV files.")

    else:
        print("Error: 'distress_label_instruction' column not found, cannot perform stratified split.")

else:
    print("DataFrame not loaded, cannot proceed with labeling and splitting.")

California wildfire DataFrame loaded successfully from: /content/drive/MyDrive/ground_truth_dataset_with_wildfire.csv
Labeling complete. First few rows with labels:
                                          tweet_text is_wildfire_ca  \
0  wildfires raging through northern california a...            yes   
1         photos deadly wildfires rage in california            yes   
2  pls share were capturing wildfire response rec...            yes   
3  pls share were capturing wildfire response rec...            yes   
4  californias raging wildfires as youve never se...            yes   

  distress_label_instruction location_label_instruction  \
0                   distress        northern california   
1               not distress                 california   
2               not distress                    unknown   
3               not distress                    unknown   
4               not distress                    unknown   

  action_label_instruction  
0                   resc

  sample_df = california_wildfire_df.groupby('distress_label_instruction', group_keys=False).apply(lambda x: x.sample(min(len(x), 25)))



Data splits saved to CSV files.


In [None]:
if california_wildfire_df is not None:
    action_balance = california_wildfire_df['action_label_instruction'].value_counts(normalize=True)
    print("\nClass balance for 'action_label_instruction':")
    print(action_balance)
else:
    print("DataFrame not loaded, cannot check action label balance.")


Class balance for 'action_label_instruction':
action_label_instruction
monitor only         0.870479
rescue               0.108284
evacuation           0.013549
medical aid          0.006415
fire suppression     0.001217
resource delivery    0.000055
Name: proportion, dtype: float64


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "meta-llama/Llama-2-7b-hf"  # Or the specific Llama 7b variant you're using
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
import pandas as pd
from datasets import Dataset

train_df = pd.read_csv('train_instruction_tuning.csv')

def format_instruction(row):
    if row['is_wildfire_ca'] in ['yes', 'no']:
        return f"You are a wildfire classifier. Determine if the tweet is explicitly about a wildfire happening in California. Input: {row['tweet_text']}"
    elif row['distress_label_instruction'] in ['distress', 'not distress']:
        return f"You are an emergency detection system. Determine if the tweet clearly indicates distress. Input: {row['tweet_text']}"
    elif row['location_label_instruction'] != 'unknown':
        return f"Extract the most specific real-world geographic location mentioned in the tweet. Input: {row['tweet_text']}"
    elif row['action_label_instruction'] in ['evacuation', 'medical aid', 'fire suppression', 'rescue', 'resource delivery', 'monitor only']:
        return f"You are a disaster response coordinator. Based on the content of the tweet, recommend the most urgent emergency action. Input: {row['tweet_text']}"
    return ""

def format_output(row):
    if row['is_wildfire_ca'] in ['yes', 'no']:
        return row['is_wildfire_ca']
    elif row['distress_label_instruction'] in ['distress', 'not distress']:
        return row['distress_label_instruction']
    elif row['location_label_instruction'] != 'unknown':
        return row['location_label_instruction']
    elif row['action_label_instruction'] in ['evacuation', 'medical aid', 'fire suppression', 'rescue', 'resource delivery', 'monitor only']:
        return row['action_label_instruction']
    return ""

train_df['instruction'] = train_df.apply(format_instruction, axis=1)
train_df['output'] = train_df.apply(format_output, axis=1)
train_df['input'] = train_df['tweet_text']

# Filter out examples with empty instructions
train_df_filtered = train_df[train_df['instruction'] != ""]

train_dataset = Dataset.from_pandas(train_df_filtered[['instruction', 'input', 'output']])

def preprocess_function(examples):
    inputs = [f"{instruction}\n{input_text}" for instruction, input_text in zip(examples['instruction'], examples['input'])]
    targets = [f"{output}{tokenizer.eos_token}" for output in examples['output']]
    model_inputs = tokenizer(inputs, truncation=True, padding='longest', return_tensors="pt")
    labels = tokenizer(targets, truncation=True, padding='longest', return_tensors="pt")
    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in doc] for doc in labels["input_ids"]
    ]
    return {"input_ids": model_inputs["input_ids"], "attention_mask": model_inputs["attention_mask"], "labels": labels["input_ids"]}

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

print(tokenized_train_dataset[0])
print(tokenized_val_dataset[0])

Map:   0%|          | 0/14425 [00:00<?, ? examples/s]

Map:   0%|          | 0/1803 [00:00<?, ? examples/s]

{'instruction': 'You are a wildfire classifier. Determine if the tweet is explicitly about a wildfire happening in California. Input: heres what 51000 pounds of donated goods from houston to puerto rico looks like', 'input': 'heres what 51000 pounds of donated goods from houston to puerto rico looks like', 'output': 'no', 'input_ids': [1, 887, 526, 263, 8775, 8696, 770, 3709, 29889, 5953, 837, 457, 565, 278, 7780, 300, 338, 9479, 1048, 263, 8775, 8696, 10464, 297, 8046, 29889, 10567, 29901, 902, 267, 825, 29871, 29945, 29896, 29900, 29900, 29900, 24261, 310, 1016, 630, 22535, 515, 298, 283, 7352, 304, 2653, 261, 517, 364, 1417, 3430, 763, 13, 2276, 267, 825, 29871, 29945, 29896, 29900, 29900, 29900, 24261, 310, 1016, 630, 22535, 515, 298, 283, 7352, 304, 2653, 261, 517, 364, 1417, 3430, 763, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
for name, param in model.named_parameters():
    if 'model.layers.' in name:
        layer_num = int(name.split('.')[2])
        if layer_num < 26:  # Freeze layers up to layer 25 (adjust as needed)
            param.requires_grad = False

# Verify the number of trainable parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")
print(f"Percentage trainable: {trainable_params / total_params * 100:.2f}%")

Total parameters: 6738415616
Trainable parameters: 1476448256
Percentage trainable: 21.91%


In [None]:
from transformers import TrainingArguments

output_dir = "./llama-7b-wildfire-multi-task"  # Directory to save checkpoints and logs

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,  # Adjust based on your GPU memory
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    num_train_epochs=3,  # Adjust as needed
    fp16=True,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    weight_decay=0.01,
    report_to="tensorboard",  # Or "wandb" if you prefer
)

In [None]:
import pandas as pd
from datasets import Dataset

val_df = pd.read_csv('val_instruction_tuning.csv')

def format_instruction_val(row):
    if row['is_wildfire_ca'] in ['yes', 'no']:
        return f"You are a wildfire classifier. Determine if the tweet is explicitly about a wildfire happening in California. Input: {row['tweet_text']}"
    elif row['distress_label_instruction'] in ['distress', 'not distress']:
        return f"You are an emergency detection system. Determine if the tweet clearly indicates distress. Input: {row['tweet_text']}"
    elif row['location_label_instruction'] != 'unknown':
        return f"Extract the most specific real-world geographic location mentioned in the tweet. Input: {row['tweet_text']}"
    elif row['action_label_instruction'] in ['evacuation', 'medical aid', 'fire suppression', 'rescue', 'resource delivery', 'monitor only']:
        return f"You are a disaster response coordinator. Based on the content of the tweet, recommend the most urgent emergency action. Input: {row['tweet_text']}"
    return ""

def format_output_val(row):
    if row['is_wildfire_ca'] in ['yes', 'no']:
        return row['is_wildfire_ca']
    elif row['distress_label_instruction'] in ['distress', 'not distress']:
        return row['distress_label_instruction']
    elif row['location_label_instruction'] != 'unknown':
        return row['location_label_instruction']
    elif row['action_label_instruction'] in ['evacuation', 'medical aid', 'fire suppression', 'rescue', 'resource delivery', 'monitor only']:
        return row['action_label_instruction']
    return ""

val_df['instruction'] = val_df.apply(format_instruction_val, axis=1)
val_df['output'] = val_df.apply(format_output_val, axis=1)
val_df['input'] = val_df['tweet_text']

# Filter out examples with empty instructions
val_df_filtered = val_df[val_df['instruction'] != ""]

val_dataset = Dataset.from_pandas(val_df_filtered[['instruction', 'input', 'output']])

def preprocess_function_val(examples):
    inputs = [f"{instruction}\n{input_text}" for instruction, input_text in zip(examples['instruction'], examples['input'])]
    targets = [f"{output}{tokenizer.eos_token}" for output in examples['output']]
    model_inputs = tokenizer(inputs, truncation=True, padding="longest", return_tensors="pt")
    labels = tokenizer(targets, truncation=True, padding="longest", return_tensors="pt")
    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in doc] for doc in labels["input_ids"]
    ]
    return {"input_ids": model_inputs["input_ids"], "attention_mask": model_inputs["attention_mask"], "labels": labels["input_ids"]}

tokenized_val_dataset = val_dataset.map(preprocess_function_val, batched=True)

print(tokenized_val_dataset[0])

Map:   0%|          | 0/1803 [00:00<?, ? examples/s]

{'instruction': 'You are a wildfire classifier. Determine if the tweet is explicitly about a wildfire happening in California. Input: central #mexico #earthquake #worksheet #geography #junior cycle #jcgeography', 'input': 'central #mexico #earthquake #worksheet #geography #junior cycle #jcgeography', 'output': 'no', 'input_ids': [1, 887, 526, 263, 8775, 8696, 770, 3709, 29889, 5953, 837, 457, 565, 278, 7780, 300, 338, 9479, 1048, 263, 8775, 8696, 10464, 297, 8046, 29889, 10567, 29901, 6555, 396, 29885, 735, 1417, 396, 799, 386, 339, 1296, 396, 1287, 9855, 396, 479, 5275, 396, 29926, 348, 1611, 11412, 396, 29926, 29883, 479, 5275, 13, 25171, 396, 29885, 735, 1417, 396, 799, 386, 339, 1296, 396, 1287, 9855, 396, 479, 5275, 396, 29926, 348, 1611, 11412, 396, 29926, 29883, 479, 5275, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

print("Trainer initialized.")

Trainer initialized.


In [None]:
!rm -rf /usr/local/lib/python3.11/dist-packages/bitsandbytes
!rm -rf /usr/local/lib/python3.11/dist-packages/triton*

In [None]:
!pip install bitsandbytes

Collecting bitsandbytes
  Using cached bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting triton==3.3.0 (from torch<3,>=2.0->bitsandbytes)
  Using cached triton-3.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.5 kB)
Using cached bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
Using cached triton-3.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (156.5 MB)
Installing collected packages: triton, bitsandbytes
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.7.19 requires torch<2.7,>=1.10, but you have torch 2.7.0 which is incompatible.[0m[31m
[0mSuccessfully installed bitsandbytes-0.45.5 triton-3.3.0


In [None]:
!pip install peft

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import Dataset
import pandas as pd
from peft import LoraConfig, get_peft_model
from accelerate import Accelerator

# Initialize Accelerator
accelerator = Accelerator()
device = accelerator.device

# Load the model and tokenizer (without 4-bit quantization)
model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

# Configure LoRA
lora_config = LoraConfig(
    r=2,  # Even lower rank for this test
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "up_proj",
        "down_proj",
        "gate_proj",
    ],
)

# Get the LoRA model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 4,997,120 || all params: 6,743,412,736 || trainable%: 0.0741


In [None]:
# Load a balanced sample of the datasets for testing
def balanced_sample(df, n=500, label_column='is_wildfire_ca'): # Choose a relevant label column
    if label_column not in df.columns:
        if 'distress_label_instruction' in df.columns:
            label_column = 'distress_label_instruction'
        elif 'location_label_instruction' in df.columns:
            label_column = 'location_label_instruction'
        elif 'action_label_instruction' in df.columns:
             label_column = 'action_label_instruction'
        else:
            return df.head(min(n, len(df)))  # If no suitable label column, take the head

    grouped = df.groupby(label_column)
    sampled_df = grouped.apply(lambda x: x.sample(min(len(x), n // grouped.ngroups), random_state=42))
    sampled_df = sampled_df.reset_index(drop=True)
    return sampled_df

train_df = pd.read_csv('train_instruction_tuning.csv')
val_df = pd.read_csv('val_instruction_tuning.csv')

train_df = balanced_sample(train_df, n=500)
val_df = balanced_sample(val_df, n=500)

def format_instruction(row):
    if row['is_wildfire_ca'] in ['yes', 'no']:
        return f"You are a wildfire classifier. Determine if the tweet is explicitly about a wildfire happening in California. Input: {row['tweet_text']}"
    elif row['distress_label_instruction'] in ['distress', 'not distress']:
        return f"You are an emergency detection system. Determine if the tweet clearly indicates distress. Input: {row['tweet_text']}"
    elif row['location_label_instruction'] != 'unknown':
        return f"Extract the most specific real-world geographic location mentioned in the tweet. Input: {row['tweet_text']}"
    elif row['action_label_instruction'] in ['evacuation', 'medical aid', 'fire suppression', 'rescue', 'resource delivery', 'monitor only']:
        return f"You are a disaster response coordinator. Based on the content of the tweet, recommend the most urgent emergency action. Input: {row['tweet_text']}"
    return ""

def format_output(row):
    if row['is_wildfire_ca'] in ['yes', 'no']:
        return row['is_wildfire_ca']
    elif row['distress_label_instruction'] in ['distress', 'not distress']:
        return row['distress_label_instruction']
    elif row['location_label_instruction'] != 'unknown':
        return row['location_label_instruction']
    elif row['action_label_instruction'] in ['evacuation', 'medical aid', 'fire suppression', 'rescue', 'resource delivery', 'monitor only']:
        return row['action_label_instruction']
    return ""

train_df['instruction'] = train_df.apply(format_instruction, axis=1)
train_df['output'] = train_df.apply(format_output, axis=1)
train_df['input'] = train_df['tweet_text']
train_df_filtered = train_df[train_df['instruction'] != ""]
train_dataset = Dataset.from_pandas(train_df_filtered[['instruction', 'input', 'output']])

val_df['instruction'] = val_df.apply(format_instruction, axis=1)
val_df['output'] = val_df.apply(format_output, axis=1)
val_df['input'] = val_df['tweet_text']
val_df_filtered = val_df[val_df['instruction'] != ""]
val_dataset = Dataset.from_pandas(val_df_filtered[['instruction', 'input', 'output']])

max_length = 128
def preprocess_function(examples):
    inputs = [f"{instruction}\n{input_text}" for instruction, input_text in zip(examples['instruction'], examples['input'])]
    targets = [f"{output}{tokenizer.eos_token}" for output in examples['output']]
    model_inputs = tokenizer(inputs, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
    labels = tokenizer(targets, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
    return {"input_ids": model_inputs["input_ids"], "attention_mask": model_inputs["attention_mask"], "labels": labels["input_ids"]}

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

  sampled_df = grouped.apply(lambda x: x.sample(min(len(x), n // grouped.ngroups), random_state=42))
  sampled_df = grouped.apply(lambda x: x.sample(min(len(x), n // grouped.ngroups), random_state=42))


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/414 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

lora_config = LoraConfig(r=2, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM", target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj", "gate_proj"])
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 4,997,120 || all params: 6,743,412,736 || trainable%: 0.0741


In [None]:
import pandas as pd
from datasets import Dataset

# Load a balanced sample of the datasets for testing
def balanced_sample(df, n=500, label_column='is_wildfire_ca'): # Choose a relevant label column
    if label_column not in df.columns:
        if 'distress_label_instruction' in df.columns:
            label_column = 'distress_label_instruction'
        elif 'location_label_instruction' in df.columns:
            label_column = 'location_label_instruction'
        elif 'action_label_instruction' in df.columns:
             label_column = 'action_label_instruction'
        else:
            return df.head(min(n, len(df)))  # If no suitable label column, take the head

    grouped = df.groupby(label_column)
    sampled_df = grouped.apply(lambda x: x.sample(min(len(x), n // grouped.ngroups), random_state=42))
    sampled_df = sampled_df.reset_index(drop=True)
    return sampled_df

train_df = pd.read_csv('train_instruction_tuning.csv')
val_df = pd.read_csv('val_instruction_tuning.csv')

train_df = balanced_sample(train_df, n=500)
val_df = balanced_sample(val_df, n=500)

def format_instruction(row):
    if row['is_wildfire_ca'] in ['yes', 'no']:
        return f"You are a wildfire classifier. Determine if the tweet is explicitly about a wildfire happening in California. Input: {row['tweet_text']}"
    elif row['distress_label_instruction'] in ['distress', 'not distress']:
        return f"You are an emergency detection system. Determine if the tweet clearly indicates distress. Input: {row['tweet_text']}"
    elif row['location_label_instruction'] != 'unknown':
        return f"Extract the most specific real-world geographic location mentioned in the tweet. Input: {row['tweet_text']}"
    elif row['action_label_instruction'] in ['evacuation', 'medical aid', 'fire suppression', 'rescue', 'resource delivery', 'monitor only']:
        return f"You are a disaster response coordinator. Based on the content of the tweet, recommend the most urgent emergency action. Input: {row['tweet_text']}"
    return ""

def format_output(row):
    if row['is_wildfire_ca'] in ['yes', 'no']:
        return row['is_wildfire_ca']
    elif row['distress_label_instruction'] in ['distress', 'not distress']:
        return row['distress_label_instruction']
    elif row['location_label_instruction'] != 'unknown':
        return row['location_label_instruction']
    elif row['action_label_instruction'] in ['evacuation', 'medical aid', 'fire suppression', 'rescue', 'resource delivery', 'monitor only']:
        return row['action_label_instruction']
    return ""

train_df['instruction'] = train_df.apply(format_instruction, axis=1)
train_df['output'] = train_df.apply(format_output, axis=1)
train_df['input'] = train_df['tweet_text']
train_df_filtered = train_df[train_df['instruction'] != ""]
train_dataset = Dataset.from_pandas(train_df_filtered[['instruction', 'input', 'output']])

val_df['instruction'] = val_df.apply(format_instruction, axis=1)
val_df['output'] = val_df.apply(format_output, axis=1)
val_df['input'] = val_df['tweet_text']
val_df_filtered = val_df[val_df['instruction'] != ""]
val_dataset = Dataset.from_pandas(val_df_filtered[['instruction', 'input', 'output']])

max_length = 128
def preprocess_function(examples):
    inputs = [f"{instruction}\n{input_text}" for instruction, input_text in zip(examples['instruction'], examples['input'])]
    targets = [f"{output}{tokenizer.eos_token}" for output in examples['output']]
    model_inputs = tokenizer(inputs, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
    labels = tokenizer(targets, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
    return {"input_ids": model_inputs["input_ids"], "attention_mask": model_inputs["attention_mask"], "labels": labels["input_ids"]}

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

  sampled_df = grouped.apply(lambda x: x.sample(min(len(x), n // grouped.ngroups), random_state=42))
  sampled_df = grouped.apply(lambda x: x.sample(min(len(x), n // grouped.ngroups), random_state=42))


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/414 [00:00<?, ? examples/s]

In [None]:
import numpy as np
from datasets import Dataset  # We won't try to import load_metric here
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets.metric import Metric  # Import the Metric class

# Explicitly disable use_cache in the model config
model.config.use_cache = False

# Metric to use for evaluation
metric = Metric.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Training arguments (gradient checkpointing disabled)
output_dir = "./llama-7b-wildfire-multi-task-lora-no-gc"
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    num_train_epochs=3,
    fp16=True,
    logging_dir="./logs-lora-no-gc",
    logging_steps=10,
    save_strategy="epoch",
    weight_decay=0.01,
    report_to="tensorboard",
    max_grad_norm=1.0,
    optim="adamw_torch",
    adam_beta1=0.9,
    adam_beta2=0.999,
    gradient_checkpointing=False,  # Gradient checkpointing disabled
    per_device_eval_batch_size=2,  # Reduced evaluation batch size
)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=tokenizer.pad_token_id)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Pass the compute_metrics function
)

# Start training
trainer.train()

# Evaluate the model after training
evaluation_results = trainer.evaluate()
print("\nEvaluation Results:")
print(evaluation_results)

ModuleNotFoundError: No module named 'datasets.metric'

In [None]:
!pip install -U bitsandbytes peft transformers datasets
from huggingface_hub import login
login("**********************")


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu1

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

lora_config = LoraConfig(
    r=2,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "up_proj",
        "down_proj",
        "gate_proj",
    ],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

trainable params: 4,997,120 || all params: 6,743,412,736 || trainable%: 0.0741


In [None]:
import pandas as pd
from datasets import Dataset

# Load a balanced sample of the datasets for testing
def balanced_sample(df, n=500, label_column=None):
    if 'Wildfire' in df.columns:
        label_column = 'Wildfire'
    elif 'distress' in df.columns:
        label_column = 'distress'
    elif 'state' in df.columns:
        label_column = 'state'
    elif 'take_action' in df.columns:
        label_column = 'take_action'
    else:
        return df.head(min(n, len(df)))  # If no suitable label column, take the head

    if label_column:
        grouped = df.groupby(label_column)
        sampled_df = grouped.apply(lambda x: x.sample(min(len(x), n // grouped.ngroups), random_state=42))
        sampled_df = sampled_df.reset_index(drop=True)
        return sampled_df
    else:
        return df.head(min(n, len(df)))

train_df = pd.read_csv('/content/drive/MyDrive/llama_train_balanced.csv')
val_df = pd.read_csv('/content/drive/MyDrive/llama_val_balanced.csv')

train_df = balanced_sample(train_df, n=500)
val_df = balanced_sample(val_df, n=500)

def format_instruction(row):
    if 'Wildfire' in row and row['Wildfire'] in [0, 1, '0', '1', 'yes', 'no', True, False]:
        return f"You are a wildfire classifier. Determine if the tweet is about a wildfire. Input: {row['tweet_text']}"
    elif 'distress' in row and row['distress'] in [0, 1, '0', '1', 'yes', 'no', True, False]:
        return f"You are an emergency detection system. Determine if the tweet indicates distress. Input: {row['tweet_text']}"
    elif 'state' in row and isinstance(row['state'], str) and row['state'].lower() != 'none':
        return f"Identify the U.S. state mentioned in the tweet. Input: {row['tweet_text']}"
    elif 'take_action' in row and isinstance(row['take_action'], str) and row['take_action'].lower() != 'none':
        return f"Recommend the appropriate action based on the tweet. Input: {row['tweet_text']}"
    return ""

def format_output(row):
    if 'Wildfire' in row and row['Wildfire'] in [0, 1, '0', '1', 'yes', 'no', True, False]:
        return str(row['Wildfire'])
    elif 'distress' in row and row['distress'] in [0, 1, '0', '1', 'yes', 'no', True, False]:
        return str(row['distress'])
    elif 'state' in row and isinstance(row['state'], str) and row['state'].lower() != 'none':
        return row['state']
    elif 'take_action' in row and isinstance(row['take_action'], str) and row['take_action'].lower() != 'none':
        return row['take_action']
    return ""

train_df['instruction'] = train_df.apply(format_instruction, axis=1)
train_df['output'] = train_df.apply(format_output, axis=1)
train_df['input'] = train_df['tweet_text']
train_df_filtered = train_df[train_df['instruction'] != ""]
train_dataset = Dataset.from_pandas(train_df_filtered[['instruction', 'input', 'output']])

val_df['instruction'] = val_df.apply(format_instruction, axis=1)
val_df['output'] = val_df.apply(format_output, axis=1)
val_df['input'] = val_df['tweet_text']
val_df_filtered = val_df[val_df['instruction'] != ""]
val_dataset = Dataset.from_pandas(val_df_filtered[['instruction', 'input', 'output']])

max_length = 128
def preprocess_function(examples):
    inputs = [f"{instruction}\n{input_text}" for instruction, input_text in zip(examples['instruction'], examples['input'])]
    targets = [f"{output}{tokenizer.eos_token}" for output in examples['output']]
    model_inputs = tokenizer(inputs, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
    labels = tokenizer(targets, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
    return {"input_ids": model_inputs["input_ids"], "attention_mask": model_inputs["attention_mask"], "labels": labels["input_ids"]}

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

  sampled_df = grouped.apply(lambda x: x.sample(min(len(x), n // grouped.ngroups), random_state=42))
  sampled_df = grouped.apply(lambda x: x.sample(min(len(x), n // grouped.ngroups), random_state=42))


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/358 [00:00<?, ? examples/s]

In [None]:
import numpy as np
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Explicitly disable use_cache in the model config
model.config.use_cache = False

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Remove padding tokens (-100 is often the ignore_index)
    mask = labels != -100
    predictions_masked = predictions[mask]
    labels_masked = labels[mask]
    return {
        'accuracy': accuracy_score(labels_masked, predictions_masked),
        'precision': precision_score(labels_masked, predictions_masked, average='weighted', zero_division=0),
        'recall': recall_score(labels_masked, predictions_masked, average='weighted', zero_division=0),
        'f1': f1_score(labels_masked, predictions_masked, average='weighted', zero_division=0),
    }

# Training arguments (gradient checkpointing disabled)
output_dir = "./llama-7b-wildfire-multi-task-lora-no-gc"
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    num_train_epochs=3,
    fp16=True,
    logging_dir="./logs-lora-no-gc",
    logging_steps=10,
    save_strategy="epoch",
    weight_decay=0.01,
    report_to="tensorboard",
    max_grad_norm=1.0,
    optim="adamw_torch",
    adam_beta1=0.9,
    adam_beta2=0.999,
    gradient_checkpointing=False,  # Gradient checkpointing disabled
    per_device_eval_batch_size=2,  # Reduced evaluation batch size
    eval_steps=100,  # Evaluate every 100 training steps
)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=tokenizer.pad_token_id)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Use the scikit-learn based compute_metrics
)

# Start training
trainer.train()

# Evaluate the model after training (optional, as evaluation will happen during training)
# evaluation_results = trainer.evaluate()
# print("\nEvaluation Results:")
# print(evaluation_results)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,18.0041
20,1.3298
30,0.0557
40,0.0048
50,0.0023
60,0.0027
70,0.0031
80,0.0033
90,0.0021
100,0.0025


TrainOutput(global_step=186, training_loss=1.0447998671261693, metrics={'train_runtime': 423.2597, 'train_samples_per_second': 3.544, 'train_steps_per_second': 0.439, 'total_flos': 7515850974167040.0, 'train_loss': 1.0447998671261693, 'epoch': 2.96})

In [None]:
# Evaluate the model after training (optional, as evaluation will happen during training)
evaluation_results = trainer.evaluate()
print("\nEvaluation Results:")
print(evaluation_results)


Evaluation Results:
{'eval_loss': 0.002721394645050168, 'eval_accuracy': 0.9765625, 'eval_precision': 0.9688120039682541, 'eval_recall': 0.9765625, 'eval_f1': 0.9726718127490039, 'eval_runtime': 36.3999, 'eval_samples_per_second': 9.835, 'eval_steps_per_second': 4.918, 'epoch': 2.96}


In [None]:
import torch

# Take a small sample from the validation dataset
sample_size = 5
sample_indices = [10, 50, 100, 150, 200]  # You can choose different indices
sample_val_dataset = val_dataset.select(sample_indices)

model.eval()  # Set the model to evaluation mode

for example in sample_val_dataset:
    instruction = example['instruction']
    input_text = example['input']
    true_output = example['output']

    input_prompt = f"{instruction}\n{input_text}"
    input_ids = tokenizer.encode(input_prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(input_ids=input_ids, max_new_tokens=50, num_beams=5, early_stopping=True)

    predicted_output = tokenizer.decode(output[0], skip_special_tokens=True).split('\n')[-1].strip()

    print(f"**Instruction:** {instruction}")
    print(f"**Input:** {input_text}")
    print(f"**True Output:** {true_output}")
    print(f"**Predicted Output:** {predicted_output}")
    print("-" * 50)

model.train()  # Set the model back to training mode if you plan to continue training

**Instruction:** You are an emergency detection system. Determine if the tweet indicates distress. Input: photo of the day a boy bathes with mountain spring water in utuado puerto rico
**Input:** photo of the day a boy bathes with mountain spring water in utuado puerto rico
**True Output:** 0
**Predicted Output:** photo of the day a boy bathes with mountain spring water in utuado puerto rico
--------------------------------------------------
**Instruction:** You are an emergency detection system. Determine if the tweet indicates distress. Input: me when irma hit
**Input:** me when irma hit
**True Output:** 0
**Predicted Output:** me when irma hit
--------------------------------------------------
**Instruction:** You are an emergency detection system. Determine if the tweet indicates distress. Input: you know whats more useless than trump in puerto ricopaperfuckingtowels
**Input:** you know whats more useless than trump in puerto ricopaperfuckingtowels
**True Output:** 0
**Predicted Ou

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=2, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=2, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
 

In [None]:
import torch
import random
from collections import defaultdict

# Take a random sample from the validation dataset
sample_size = 50
random_indices = random.sample(range(len(val_dataset)), sample_size)
sample_val_dataset = val_dataset.select(random_indices)

model.eval()  # Set the model to evaluation mode
grouped_results = defaultdict(list)

for example in sample_val_dataset:
    input_text = example['input']
    true_output = example['output']  # This will be the output corresponding to the *first* instruction type

    # Determine the group based on the true output (you might need to adjust this logic)
    group = f"{true_output.upper()} Tweet Group"

    instruction = example['instruction']
    input_prompt = f"{instruction}\n{input_text}"
    input_ids = tokenizer.encode(input_prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(input_ids=input_ids, max_new_tokens=50, num_beams=5, early_stopping=True)

    predicted_output = tokenizer.decode(output[0], skip_special_tokens=True).split('\n')[-1].strip()

    grouped_results[group].append({
        "instruction": instruction,
        "input": input_text,
        "true_output": true_output,
        "predicted_output": predicted_output,
    })

model.train()  # Set the model back to training mode

for group, results in grouped_results.items():
    print(f"============================== {group} ==============================")
    for res in results:
        print(f"Instruction: {res['instruction']}")
        print(f"Input:       {res['input']}")
        print(f"True Output: {res['true_output']}")
        print(f"Response:    {res['predicted_output']}")
        print("-" * 70)
    print("\n")

Instruction: You are an emergency detection system. Determine if the tweet indicates distress. Input: hit hard by california wildfires santa rosa faces housing crisis
Input:       hit hard by california wildfires santa rosa faces housing crisis
True Output: 0
Response:    hit hard by california wildfires santa rosa faces housing crisis
----------------------------------------------------------------------
Instruction: You are an emergency detection system. Determine if the tweet indicates distress. Input: lord we pray you give mercy on california and ease the winds today so firemen can get a handle of this fire
Input:       lord we pray you give mercy on california and ease the winds today so firemen can get a handle of this fire
True Output: 0
Response:    lord we pray you give mercy on california and ease the winds today so firemen can get a handle of this fire
----------------------------------------------------------------------
Instruction: You are an emergency detection system. D

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
import torch

model_name = "meta-llama/Llama-2-7b-hf"

# Check if CUDA is available and get the device index
if torch.cuda.is_available():
    device = 0  # Assuming your primary GPU is index 0
    print(f"CUDA is available. Using GPU {device}")
else:
    device = "cpu"
    print("CUDA not available. Using CPU.")

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={'': device}
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

lora_config = LoraConfig(
    r=8,  # Increased LoRA rank
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "up_proj",
        "down_proj",
        "gate_proj",
    ],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

CUDA is available. Using GPU 0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 19,988,480 || all params: 6,758,404,096 || trainable%: 0.2958


In [None]:
import numpy as np
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Explicitly disable use_cache in the model config
model.config.use_cache = False

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Remove padding tokens (-100 is often the ignore_index)
    mask = labels != -100
    predictions_masked = predictions[mask]
    labels_masked = labels[mask]
    return {
        'accuracy': accuracy_score(labels_masked, predictions_masked),
        'precision': precision_score(labels_masked, predictions_masked, average='weighted', zero_division=0),
        'recall': recall_score(labels_masked, predictions_masked, average='weighted', zero_division=0),
        'f1': f1_score(labels_masked, predictions_masked, average='weighted', zero_division=0),
    }

# Training arguments (gradient checkpointing disabled)
output_dir = "./llama-7b-wildfire-multi-task-lora-no-gc-v2" # Changed output directory
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    num_train_epochs=10,  # Increased number of epochs
    fp16=True,
    logging_dir="./logs-lora-no-gc-v2", # Changed logging directory
    logging_steps=10,
    save_strategy="epoch",
    weight_decay=0.01,
    report_to="tensorboard",
    max_grad_norm=1.0,
    optim="adamw_torch",
    adam_beta1=0.9,
    adam_beta2=0.999,
    gradient_checkpointing=False,  # Gradient checkpointing disabled
    per_device_eval_batch_size=2,  # Reduced evaluation batch size
    eval_steps=100,  # Evaluate every 100 training steps
)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=tokenizer.pad_token_id)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Use the scikit-learn based compute_metrics
)

# Start training
trainer.train()

# Evaluate the model after training
evaluation_results = trainer.evaluate()
print("\nEvaluation Results:")
print(evaluation_results)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,14.0119
20,0.8131
30,0.0255
40,0.0044
50,0.0022
60,0.0027
70,0.0032
80,0.0029
90,0.0021
100,0.0025



Evaluation Results:
{'eval_loss': 0.0027134146075695753, 'eval_accuracy': 0.9765625, 'eval_precision': 0.9688120039682541, 'eval_recall': 0.9765625, 'eval_f1': 0.9726718127490039, 'eval_runtime': 36.7358, 'eval_samples_per_second': 9.745, 'eval_steps_per_second': 4.873, 'epoch': 9.848}


In [None]:
import torch
import random
from collections import defaultdict

# Take a random sample of unique tweet IDs from the validation dataset
sample_size = 5  # Let's look at a smaller number of tweets with all instructions
unique_tweet_ids = val_df['tweet_id'].unique().tolist()
random_tweet_ids = random.sample(unique_tweet_ids, min(sample_size, len(unique_tweet_ids)))
sample_val_df = val_df[val_df['tweet_id'].isin(random_tweet_ids)]

model.eval()  # Set the model to evaluation mode
grouped_results = defaultdict(lambda: defaultdict(dict))

for index, row in sample_val_df.iterrows():
    tweet_id = row['tweet_id']
    instruction = format_instruction(row)
    input_text = row['tweet_text']
    true_output = format_output(row)

    if instruction:  # Only process rows where an instruction was generated
        input_prompt = f"{instruction}\n{input_text}"
        input_ids = tokenizer.encode(input_prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            output = model.generate(input_ids=input_ids, max_new_tokens=50, num_beams=5, early_stopping=True)

        predicted_output = tokenizer.decode(output[0], skip_special_tokens=True).split('\n')[-1].strip()

        grouped_results[tweet_id][instruction]['input'] = input_text
        grouped_results[tweet_id][instruction]['true_output'] = true_output
        grouped_results[tweet_id][instruction]['predicted_output'] = predicted_output

model.train()  # Set the model back to training mode

for tweet_id, instruction_results in grouped_results.items():
    print(f"============================== Tweet ID: {tweet_id} ==============================")
    for instruction, results in instruction_results.items():
        print(f"Instruction: {instruction}")
        print(f"Input:       {results['input']}")
        print(f"Distress?:   {results['true_output']}")
        print(f"Response:    {results['predicted_output']}")
        print("-" * 70)
    print("\n")

Instruction: You are an emergency detection system. Determine if the tweet indicates distress. Input: hospitals in #puertorico are in critical condition #hurricanemaria #publichealth
Input:       hospitals in #puertorico are in critical condition #hurricanemaria #publichealth
Distress?:   1
Response:    hospitals in #puertorico are in critical condition #hurricanemaria #publichealth
----------------------------------------------------------------------


Instruction: You are an emergency detection system. Determine if the tweet indicates distress. Input: as a class apostolic project pk3 pk4 collected items for hurricane harvey evacuees in irving shelters
Input:       as a class apostolic project pk3 pk4 collected items for hurricane harvey evacuees in irving shelters
Distress?:   0
Response:    as a class apostolic project pk3 pk4 collected items for hurricane harvey evacuees in irving shelters
----------------------------------------------------------------------


Instruction: You ar

In [None]:
import numpy as np
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Explicitly disable use_cache in the model config
model.config.use_cache = False

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Remove padding tokens (-100 is often the ignore_index)
    mask = labels != -100
    predictions_masked = predictions[mask]
    labels_masked = labels[mask]
    return {
        'accuracy': accuracy_score(labels_masked, predictions_masked),
        'precision': precision_score(labels_masked, predictions_masked, average='weighted', zero_division=0),
        'recall': recall_score(labels_masked, predictions_masked, average='weighted', zero_division=0),
        'f1': f1_score(labels_masked, predictions_masked, average='weighted', zero_division=0),
    }

# Training arguments (gradient checkpointing disabled)
output_dir = "./llama-7b-wildfire-multi-task-lora-no-gc-v2" # Changed output directory
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    num_train_epochs=5,  # Increased number of epochs
    fp16=True,
    logging_dir="./logs-lora-no-gc-v2", # Changed logging directory
    logging_steps=10,
    save_strategy="epoch",
    weight_decay=0.01,
    report_to="tensorboard",
    max_grad_norm=1.0,
    optim="adamw_torch",
    adam_beta1=0.9,
    adam_beta2=0.999,
    gradient_checkpointing=False,  # Gradient checkpointing disabled
    per_device_eval_batch_size=2,  # Reduced evaluation batch size
    eval_steps=100,  # Evaluate every 100 training steps
)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=tokenizer.pad_token_id)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Use the scikit-learn based compute_metrics
)

# Start training
trainer.train()

# Evaluate the model after training
evaluation_results = trainer.evaluate()
print("\nEvaluation Results:")
print(evaluation_results)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,0.0311
20,0.0074
30,0.0043
40,0.0088
50,0.0063
60,0.0051
70,0.0062
80,0.007
90,0.0053
100,0.0055



Evaluation Results:
{'eval_loss': 0.005448967218399048, 'eval_accuracy': 0.9774135824022346, 'eval_precision': 0.9705013953669789, 'eval_recall': 0.9774135824022346, 'eval_f1': 0.97394522491771, 'eval_runtime': 35.7564, 'eval_samples_per_second': 10.012, 'eval_steps_per_second': 5.006, 'epoch': 4.928}


In [None]:
import torch
import random
from collections import defaultdict

# Take a random sample of unique tweet IDs from the validation dataset
sample_size = 5  # Let's look at a smaller number of tweets with all instructions
unique_tweet_ids = val_df['tweet_id'].unique().tolist()
random_tweet_ids = random.sample(unique_tweet_ids, min(sample_size, len(unique_tweet_ids)))
sample_val_df = val_df[val_df['tweet_id'].isin(random_tweet_ids)]

model.eval()  # Set the model to evaluation mode
grouped_results = defaultdict(lambda: defaultdict(dict))

for index, row in sample_val_df.iterrows():
    tweet_id = row['tweet_id']
    instruction = format_instruction(row)
    input_text = row['tweet_text']
    true_output = format_output(row)

    if instruction:  # Only process rows where an instruction was generated
        input_prompt = f"{instruction}\n{input_text}"
        input_ids = tokenizer.encode(input_prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            output = model.generate(input_ids=input_ids, max_new_tokens=50, num_beams=5, early_stopping=True)

        predicted_output = tokenizer.decode(output[0], skip_special_tokens=True).split('\n')[-1].strip()

        grouped_results[tweet_id][instruction]['input'] = input_text
        grouped_results[tweet_id][instruction]['true_output'] = true_output
        grouped_results[tweet_id][instruction]['predicted_output'] = predicted_output

model.train()  # Set the model back to training mode

for tweet_id, instruction_results in grouped_results.items():
    print(f"============================== Tweet ID: {tweet_id} ==============================")
    for instruction, results in instruction_results.items():
        print(f"Instruction: {instruction}")
        print(f"Input:       {results['input']}")
        if "emergency response system" in instruction.lower():
            print(f"True Action: {results['true_output']}")
            print(f"Predicted:   {results['predicted_output']}")
        else:
            print(f"True Output: {results['true_output']}")
            print(f"Response:    {results['predicted_output']}")
        print("-" * 70)
    print("\n")

Instruction: You are a disaster response coordinator. Based on the content of the tweet, if it indicates distress, recommend the most urgent emergency action from the following: evacuation, medical aid, fire suppression, rescue, resource delivery, monitor only. Input: the #flood extent in kalutara district #srilanka was captured by #terrasarx on 30 may
Input:       the #flood extent in kalutara district #srilanka was captured by #terrasarx on 30 may
True Output: monitor only
Response:    the #flood extent in kalutara district #srilanka was captured by #terrasarx on 30 may
----------------------------------------------------------------------


Instruction: You are a disaster response coordinator. Based on the content of the tweet, if it indicates distress, recommend the most urgent emergency action from the following: evacuation, medical aid, fire suppression, rescue, resource delivery, monitor only. Input: hurricane harveys impact on #houston commercial real estate #cre
Input:       h

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
import torch

model_name = "meta-llama/Llama-2-7b-hf"

# Check if CUDA is available and get the device index
if torch.cuda.is_available():
    device = 0  # Assuming your primary GPU is index 0
    print(f"CUDA is available. Using GPU {device}")
else:
    device = "cpu"
    print("CUDA not available. Using CPU.")

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={'': device}
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

lora_config = LoraConfig(
    r=8,  # Increased LoRA rank
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "up_proj",
        "down_proj",
        "gate_proj",
    ],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

CUDA is available. Using GPU 0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 19,988,480 || all params: 6,758,404,096 || trainable%: 0.2958


In [None]:
import pandas as pd
from datasets import Dataset

# Load a balanced sample of the datasets for testing
def balanced_sample(df, n=500, label_column=None):
    if 'Wildfire' in df.columns:
        label_column = 'Wildfire'
    elif 'distress' in df.columns:
        label_column = 'distress'
    elif 'state' in df.columns:
        label_column = 'state'
    elif 'take_action' in df.columns:
        label_column = 'take_action'
    else:
        return df.head(min(n, len(df)))  # If no suitable label column, take the head

    if label_column:
        grouped = df.groupby(label_column)
        sampled_df = grouped.apply(lambda x: x.sample(min(len(x), n // grouped.ngroups), random_state=42))
        sampled_df = sampled_df.reset_index(drop=True)
        return sampled_df
    else:
        return df.head(min(n, len(df)))

train_df = pd.read_csv('/content/drive/MyDrive/llama_train_balanced.csv')
val_df = pd.read_csv('/content/drive/MyDrive/llama_val_balanced.csv')

train_df = balanced_sample(train_df, n=500)
val_df = balanced_sample(val_df, n=500)

action_labels_9 = ['evacuate', 'shelter', 'rescue', 'medical', 'supply', 'information', 'infrastructure', 'security', 'other'] # Define your 9 action labels

def format_instruction(row):
    if 'Wildfire' in row and row['Wildfire'] in [0, 1, '0', '1', 'yes', 'no', True, False]:
        return f"You are a wildfire classifier. Determine if the tweet is about a wildfire. Input: {row['tweet_text']}"
    elif 'distress' in row and row['distress'] in [0, 1, '0', '1', 'yes', 'no', True, False]:
        return f"Emergency situation detected? If so, what is the most urgent action needed from: {', '.join(action_labels_9)}. Input: {row['tweet_text']}"
    elif 'state' in row and isinstance(row['state'], str) and row['state'].lower() != 'none':
        return f"Identify the U.S. state mentioned in the tweet. Input: {row['tweet_text']}"
    elif 'take_action' in row and row['take_action'] in action_labels_9:
        return f"What is the most urgent action needed from: {', '.join(action_labels_9)}. Input: {row['tweet_text']}"
    return ""

def format_output(row):
    if 'Wildfire' in row and row['Wildfire'] in [0, 1, '0', '1', 'yes', 'no', True, False]:
        return str(row['Wildfire'])
    elif 'distress' in row and row['distress'] in [0, 1, '0', '1', 'yes', 'no', True, False]:
        # If distress, directly try to return the 'take_action' label
        if str(row['distress']) in ['1', 'yes', 'True']:
            if 'take_action' in row and row['take_action'] in action_labels_9:
                return row['take_action']
            else:
                return 'monitor only' # Or a more appropriate default if no action label found
        else:
            return 'monitor only'
    elif 'state' in row and isinstance(row['state'], str) and row['state'].lower() != 'none':
        return row['state']
    elif 'take_action' in row and row['take_action'] in action_labels_9:
        return row['take_action']
    return ""

train_df['instruction'] = train_df.apply(format_instruction, axis=1)
train_df['output'] = train_df.apply(format_output, axis=1)
train_df['input'] = train_df['tweet_text']
train_df_filtered = train_df[train_df['instruction'] != ""]
train_dataset = Dataset.from_pandas(train_df_filtered[['instruction', 'input', 'output']])

val_df['instruction'] = val_df.apply(format_instruction, axis=1)
val_df['output'] = val_df.apply(format_output, axis=1)
val_df['input'] = val_df['tweet_text']
val_df_filtered = val_df[val_df['instruction'] != ""]
val_dataset = Dataset.from_pandas(val_df_filtered[['instruction', 'input', 'output']])

max_length = 128
def preprocess_function(examples):
    inputs = [f"{instruction}\n{input_text}" for instruction, input_text in zip(examples['instruction'], examples['input'])]
    targets = [f"{output}{tokenizer.eos_token}" for output in examples['output']]
    model_inputs = tokenizer(inputs, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
    labels = tokenizer(targets, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
    return {"input_ids": model_inputs["input_ids"], "attention_mask": model_inputs["attention_mask"], "labels": labels["input_ids"]}

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

  sampled_df = grouped.apply(lambda x: x.sample(min(len(x), n // grouped.ngroups), random_state=42))
  sampled_df = grouped.apply(lambda x: x.sample(min(len(x), n // grouped.ngroups), random_state=42))


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/358 [00:00<?, ? examples/s]

In [None]:
import torch
import random
from collections import defaultdict

# Take a random sample of unique tweet IDs from the validation dataset
sample_size = 5  # Let's look at a smaller number of tweets with all instructions
unique_tweet_ids = val_df['tweet_id'].unique().tolist()
random_tweet_ids = random.sample(unique_tweet_ids, min(sample_size, len(unique_tweet_ids)))
sample_val_df = val_df[val_df['tweet_id'].isin(random_tweet_ids)]

model.eval()  # Set the model to evaluation mode
grouped_results = defaultdict(lambda: defaultdict(dict))

for index, row in sample_val_df.iterrows():
    tweet_id = row['tweet_id']
    instruction = format_instruction(row)
    input_text = row['tweet_text']
    true_output = format_output(row)

    if instruction:  # Only process rows where an instruction was generated
        input_prompt = f"{instruction}\n{input_text}"
        input_ids = tokenizer.encode(input_prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            output = model.generate(input_ids=input_ids, max_new_tokens=50, num_beams=5, early_stopping=True)

        predicted_output = tokenizer.decode(output[0], skip_special_tokens=True).split('\n')[-1].strip()

        grouped_results[tweet_id][instruction]['input'] = input_text
        grouped_results[tweet_id][instruction]['true_output'] = true_output
        grouped_results[tweet_id][instruction]['predicted_output'] = predicted_output

model.train()  # Set the model back to training mode

for tweet_id, instruction_results in grouped_results.items():
    print(f"============================== Tweet ID: {tweet_id} ==============================")
    for instruction, results in instruction_results.items():
        print(f"Instruction: {instruction}")
        print(f"Input:       {results['input']}")
        if "emergency response system" in instruction.lower():
            print(f"True Action: {results['true_output']}")
            print(f"Predicted:   {results['predicted_output']}")
        else:
            print(f"True Output: {results['true_output']}")
            print(f"Response:    {results['predicted_output']}")
        print("-" * 70)
    print("\n")

Instruction: Emergency situation detected? If so, what is the most urgent action needed from: evacuate, shelter, rescue, medical, supply, information, infrastructure, security, other. Input: hurricane maria coordination call september 28 400 pm et
Input:       hurricane maria coordination call september 28 400 pm et
True Output: monitor only
Response:    hurricane maria coordination call september 28 400 pm et
----------------------------------------------------------------------


Instruction: Emergency situation detected? If so, what is the most urgent action needed from: evacuate, shelter, rescue, medical, supply, information, infrastructure, security, other. Input: humidity will increase sunday tracking when cooler air returns amp hurricane maria right now on channel 11 news #wpxi
Input:       humidity will increase sunday tracking when cooler air returns amp hurricane maria right now on channel 11 news #wpxi
True Output: monitor only
Response:    humidity will increase sunday trac

In [None]:
import torch
import random
from collections import defaultdict

# Take a random sample of unique tweet IDs from the validation dataset
sample_size = 5
unique_tweet_ids = val_df['tweet_id'].unique().tolist()
random_tweet_ids = random.sample(unique_tweet_ids, min(sample_size, len(unique_tweet_ids)))
sample_val_df = val_df[val_df['tweet_id'].isin(random_tweet_ids)]

model.eval()  # Set the model to evaluation mode
grouped_results = defaultdict(lambda: defaultdict(dict))

print("--- DEBUGGING INSTRUCTIONS AND TRUE OUTPUTS FOR EMERGENCY RESPONSE ---")
for index, row in sample_val_df.iterrows():
    if 'distress' in row and str(row['distress']) in ['1', 'yes', 'True']:
        instruction = format_instruction(row)
        true_output = format_output(row)
        print(f"Instruction: {instruction}")
        print(f"True Action: {true_output}")
        print(f"Tweet: {row['tweet_text']}")
        print("-" * 50)
print("--- END DEBUGGING ---")

for index, row in sample_val_df.iterrows():
    tweet_id = row['tweet_id']
    instruction = format_instruction(row)
    input_text = row['tweet_text']
    true_output = format_output(row)

    if instruction:  # Only process rows where an instruction was generated
        input_prompt = f"{instruction}\n{input_text}"
        input_ids = tokenizer.encode(input_prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            output = model.generate(input_ids=input_ids, max_new_tokens=50, num_beams=5, early_stopping=True)

        predicted_output = tokenizer.decode(output[0], skip_special_tokens=True).split('\n')[-1].strip()

        grouped_results[tweet_id][instruction]['input'] = input_text
        grouped_results[tweet_id][instruction]['true_output'] = true_output
        grouped_results[tweet_id][instruction]['predicted_output'] = predicted_output

model.train()  # Set the model back to training mode

for tweet_id, instruction_results in grouped_results.items():
    print(f"============================== Tweet ID: {tweet_id} ==============================")
    for instruction, results in instruction_results.items():
        print(f"Instruction: {instruction}")
        print(f"Input:       {results['input']}")
        if "emergency situation detected" in instruction.lower():
            print(f"True Action: {results['true_output']}")
            print(f"Predicted:   {results['predicted_output']}")
        else:
            print(f"True Output: {results['true_output']}")
            print(f"Response:    {results['predicted_output']}")
        print("-" * 70)
    print("\n")

--- DEBUGGING INSTRUCTIONS AND TRUE OUTPUTS FOR EMERGENCY RESPONSE ---
Instruction: Emergency situation detected? If so, what is the most urgent action needed from: evacuate, shelter, rescue, medical, supply, information, infrastructure, security, other. Input: help #houstonstrong #redcross #irma #giveblood now and ongoing more #lifesaving stories at
True Action: monitor only
Tweet: help #houstonstrong #redcross #irma #giveblood now and ongoing more #lifesaving stories at
--------------------------------------------------
--- END DEBUGGING ---
Instruction: Emergency situation detected? If so, what is the most urgent action needed from: evacuate, shelter, rescue, medical, supply, information, infrastructure, security, other. Input: how will puerto rico devastated and drowning in debt pay torebuild
Input:       how will puerto rico devastated and drowning in debt pay torebuild
True Action: monitor only
Predicted:   how will puerto rico devastated and drowning in debt pay torebuild
------