In [1]:
!pip install pandas scikit-learn xgboost joblib


Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/150.0 MB 491.6 kB/s eta 0:05:05
   ---------------------------------------- 0.5/150.0 MB 491.6 kB/s eta 0:05:05
   ---------------------------------------- 0.5/150.0 MB 491.6 kB/s eta 0:05:05
   ---------------------------------------- 0.8/150.0 MB 517.2 kB/s eta 0:04:49
   ---------------------------------------- 0.8/150.0 MB 517.2 kB/s eta 0:04:49
   ---------------------------------------- 1.0/150.0 MB 542.4 kB/s eta 0:04:35
   --------------

In [10]:
import pandas as pd

df = pd.read_csv("disaster_weather_augmented.csv")

# Clean column names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Show shape and preview
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("First few rows:")
print(df.head())

# Show class distribution
if 'disaster_subtype' in df.columns:
    print("\nLabel distribution:")
    print(df['disaster_subtype'].value_counts())
else:
    print("Missing 'disaster_subtype' column!")


Shape: (112, 7)
Columns: ['disaster_subtype', 'temp', 'humidity', 'precipitation', 'windgust', 'windspeed', 'pressure']
First few rows:
  disaster_subtype  temp  humidity  precipitation  windgust  windspeed  \
0          drought  22.5      57.7            0.0       0.0       15.8   
1          drought  30.1      45.1            0.0       0.0       18.0   
2          drought  29.1      46.5            0.0       0.0       23.4   
3          drought  30.1      41.4            0.0       0.0       23.0   
4          drought  20.7      59.0            0.4       0.0       14.8   

   pressure  
0    1014.3  
1    1008.4  
2    1010.2  
3    1011.7  
4    1013.4  

Label distribution:
disaster_subtype
drought            40
heat wave          20
flash flood        16
forest fire        12
cold wave          12
storm (general)    12
Name: count, dtype: int64


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import joblib

# Load data
df = pd.read_csv("data.csv")

# Standardize column names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Fix duplicated 'windspeed' column issue by filling NaNs with 0
if 'windspeed' in df.columns:
    df['windspeed'] = df['windspeed'].fillna(0)

# Drop any remaining NaNs in other columns if any
df.dropna(inplace=True)

# Features and label
features = ['temp', 'humidity', 'precipitation', 'windgust', 'windspeed', 'pressure']
X = df[features]
y = df['disaster_subtype']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Save label encoder
joblib.dump(label_encoder, "label_encoder.pkl")

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
joblib.dump(rf_model, "random_forest_model.pkl")

# Train XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)
joblib.dump(xgb_model, "xgboost_model.pkl")

# Evaluate
y_pred_rf = rf_model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)

print("Random Forest:\n", classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_))
print("XGBoost:\n", classification_report(y_test, y_pred_xgb, target_names=label_encoder.classes_))


Random Forest:
               precision    recall  f1-score   support

   cold wave       1.00      1.00      1.00        15
 flash flood       1.00      1.00      1.00        17
 forest fire       1.00      1.00      1.00        16
   heat wave       1.00      1.00      1.00        15
       storm       1.00      1.00      1.00        10

    accuracy                           1.00        73
   macro avg       1.00      1.00      1.00        73
weighted avg       1.00      1.00      1.00        73

XGBoost:
               precision    recall  f1-score   support

   cold wave       1.00      0.93      0.97        15
 flash flood       1.00      1.00      1.00        17
 forest fire       1.00      1.00      1.00        16
   heat wave       0.94      1.00      0.97        15
       storm       1.00      1.00      1.00        10

    accuracy                           0.99        73
   macro avg       0.99      0.99      0.99        73
weighted avg       0.99      0.99      0.99        

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
print("DataFrame shape:", df.shape)

print("Columns:", df.columns)
print("Null values:\n", df.isnull().sum())
print("Label (Disaster Subtype) unique values:", df["Disaster Subtype"].unique())
print("Feature matrix shape:", X.shape)
print("Target vector shape:", y_encoded.shape)


DataFrame shape: (0, 8)
Columns: Index(['Disaster Subtype', 'temp', 'humidity', 'precipitation', 'windgust',
       'windspeed ', 'pressure', 'windspeed'],
      dtype='object')
Null values:
 Disaster Subtype    0
temp                0
humidity            0
precipitation       0
windgust            0
windspeed           0
pressure            0
windspeed           0
dtype: int64
Label (Disaster Subtype) unique values: []
Feature matrix shape: (0, 6)
Target vector shape: (0,)
