In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive/ so the dataset under "My Drive" can be read.
# force_remount=True asks Colab to mount again even if a mount already exists.
drive.mount('/content/drive/', force_remount=True);

Mounted at /content/drive/


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Location of the raw GAIN dataset on the mounted Drive
DATASET_PATH = "/content/drive/My Drive/FoBA - Climate Action (Dataset)/philippines_gain_dataset.csv"

# Load the raw records and preview the first ten rows
df = pd.read_csv(DATASET_PATH)
df.head(10)

Unnamed: 0,Region,Indicator,Sector,Vulnerability,Readiness,GAIN,Trend,Year
0,NCR,Access to Safe Water,Water,0.614,0.692,0.539,↑ +0.02,2022
1,Region XI - Davao,Employment Rate,Economic,0.357,0.537,0.59,↑ +0.02,2019
2,Region IV-B - MIMAROPA,Protected Land Area,Ecosystem,0.547,0.792,0.622,↓ -0.01,2020
3,Region VIII - Eastern Visayas,Communication Coverage,Infrastructure,0.687,0.453,0.383,↓ -0.02,2010
4,CAR,Literacy Rate,Social,0.448,0.446,0.499,↑ +0.01,2012
5,Region IV-B - MIMAROPA,Crop Yield Variability,Food,0.499,0.684,0.592,→ 0.00,2024
6,Region III - Central Luzon,Life Expectancy,Health,0.363,0.625,0.631,↑ +0.01,2011
7,BARMM,Annual Precipitation Change,Water,0.313,0.794,0.74,↓ -0.02,2023
8,BARMM,Disaster Preparedness Index,Governance,0.49,0.662,0.586,→ 0.00,2014
9,Region V - Bicol,Water Resource Availability,Water,0.689,0.488,0.4,↑ +0.02,2020


# **Data Preprocessing**

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# List the column labels of the loaded frame
df.columns

Index(['Region', 'Indicator', 'Sector', 'Vulnerability', 'Readiness', 'GAIN',
       'Trend', 'Year'],
      dtype='object')

In [None]:
# Preview the last ten rows to check the end of the file loaded cleanly
df.tail(10)

Unnamed: 0,Region,Indicator,Sector,Vulnerability,Readiness,GAIN,Trend,Year
1990,CAR,Life Expectancy,Health,0.795,0.611,0.408,↓ -0.02,2016
1991,Region X - Northern Mindanao,Communication Coverage,Infrastructure,0.624,0.515,0.446,↓ -0.01,2018
1992,Region XIII - Caraga,Regional GDP per Capita,Economic,0.404,0.554,0.575,→ 0.00,2022
1993,CAR,Communication Coverage,Infrastructure,0.363,0.689,0.663,↑ +0.01,2013
1994,NCR,Local Gov Efficiency,Governance,0.705,0.346,0.32,↑ +0.02,2010
1995,Region IX - Zamboanga Peninsula,Community Engagement Index,Social,0.642,0.357,0.358,↓ -0.01,2011
1996,Region IV-A - CALABARZON,Protected Land Area,Ecosystem,0.324,0.673,0.674,↑ +0.02,2020
1997,Region IV-B - MIMAROPA,Food Import Dependency,Food,0.458,0.431,0.487,→ 0.00,2010
1998,Region X - Northern Mindanao,Biodiversity Index,Ecosystem,0.757,0.424,0.334,→ 0.00,2011
1999,Region IX - Zamboanga Peninsula,Food Import Dependency,Food,0.644,0.405,0.38,↑ +0.01,2020


In [None]:
# (row count, column count) of the loaded frame
df.shape

(2000, 8)

In [None]:
# Count missing values per column
df.isnull().sum()

Unnamed: 0,0
Region,0
Indicator,0
Sector,0
Vulnerability,0
Readiness,0
GAIN,0
Trend,0
Year,0


In [None]:
# Summarise per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Region         2000 non-null   object 
 1   Indicator      2000 non-null   object 
 2   Sector         2000 non-null   object 
 3   Vulnerability  2000 non-null   float64
 4   Readiness      2000 non-null   float64
 5   GAIN           2000 non-null   float64
 6   Trend          2000 non-null   object 
 7   Year           2000 non-null   int64  
dtypes: float64(3), int64(1), object(4)
memory usage: 125.1+ KB


In [None]:
# dtype of each column (object columns are the categorical ones handled later)
df.dtypes

Unnamed: 0,0
Region,object
Indicator,object
Sector,object
Vulnerability,float64
Readiness,float64
GAIN,float64
Trend,object
Year,int64


In [None]:
# Descriptive statistics for the numeric columns
df.describe()

Unnamed: 0,Vulnerability,Readiness,GAIN,Year
count,2000.0,2000.0,2000.0,2000.0
mean,0.550229,0.552142,0.500955,2016.989
std,0.143185,0.142823,0.100801,4.332697
min,0.3,0.3,0.251,2010.0
25%,0.427,0.427,0.428,2013.0
50%,0.553,0.5535,0.502,2017.0
75%,0.672,0.674,0.574,2021.0
max,0.8,0.8,0.748,2024.0


In [None]:
# Impute missing numeric values with the mean of their own column

num_cols = df.select_dtypes(include=['int64', 'float64']).columns
column_means = df[num_cols].mean()
df[num_cols] = df[num_cols].fillna(column_means)
print(df[num_cols])

      Vulnerability  Readiness   GAIN  Year
0             0.614      0.692  0.539  2022
1             0.357      0.537  0.590  2019
2             0.547      0.792  0.622  2020
3             0.687      0.453  0.383  2010
4             0.448      0.446  0.499  2012
...             ...        ...    ...   ...
1995          0.642      0.357  0.358  2011
1996          0.324      0.673  0.674  2020
1997          0.458      0.431  0.487  2010
1998          0.757      0.424  0.334  2011
1999          0.644      0.405  0.380  2020

[2000 rows x 4 columns]


In [None]:
# Fill categorical columns with their mode (most frequent value).
#
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place form operates on an
# intermediate object, raises a FutureWarning, and stops updating `df`
# altogether in pandas 3.0.

cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
  modes = df[col].mode()
  # mode() is empty when a column holds no non-null values; leave such a column untouched
  if not modes.empty:
    df[col] = df[col].fillna(modes.iloc[0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [None]:
# Remove Duplicates
# Drops rows that are exact duplicates across all columns, modifying `df` in place.
# ignore_index is not passed, so surviving rows keep their original index labels.

df.drop_duplicates(inplace=True)

In [None]:
from google.colab import files
# Snapshot the cleaned frame as an independent copy. The feature-engineering
# cells further down scale and encode `df`, while `cleaned_df` keeps the
# original values that the target definition and the model rely on.
cleaned_df = df.copy()

# Save cleaned dataset for Flask
cleaned_df.to_csv('cleaned_dataset.csv', index=False)
print("✅ Cleaned dataset saved successfully!")

# Trigger a browser download of the exported file from the Colab runtime
files.download('cleaned_dataset.csv');

✅ Cleaned dataset saved successfully!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Feature Engineering**

In [None]:
# Normalize Numerical Features

from sklearn.preprocessing import MinMaxScaler

# Rescale every numeric column of `df` with a min-max scaler
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[num_cols])
df[num_cols] = scaled_values

print(df[num_cols])


      Vulnerability  Readiness      GAIN      Year
0             0.628      0.784  0.579477  0.857143
1             0.114      0.474  0.682093  0.642857
2             0.494      0.984  0.746479  0.714286
3             0.774      0.306  0.265594  0.000000
4             0.296      0.292  0.498994  0.142857
...             ...        ...       ...       ...
1995          0.684      0.114  0.215292  0.071429
1996          0.048      0.746  0.851107  0.714286
1997          0.316      0.262  0.474849  0.000000
1998          0.914      0.248  0.167002  0.071429
1999          0.688      0.210  0.259557  0.714286

[2000 rows x 4 columns]


In [None]:
# Encode Categorical Columns
#
# Each column gets its own LabelEncoder, kept in `label_encoders` keyed by
# column name. Refitting one shared encoder would overwrite its `classes_`
# on every iteration, leaving only the last column's mapping recoverable
# through `inverse_transform`.

label_encoders = {}
for col in cat_cols:
  label_encoder = LabelEncoder()
  df[col] = label_encoder.fit_transform(df[col])
  label_encoders[col] = label_encoder
  print(df[col])

0        2
1       14
2        7
3       12
4        1
        ..
1995     8
1996     6
1997     7
1998    13
1999     8
Name: Region, Length: 2000, dtype: int64
0        0
1       10
2       19
3        4
4       16
        ..
1995     5
1996    19
1997    11
1998     3
1999    11
Name: Indicator, Length: 2000, dtype: int64
0       7
1       0
2       1
3       5
4       6
       ..
1995    6
1996    1
1997    2
1998    1
1999    2
Name: Sector, Length: 2000, dtype: int64
0       1
1       1
2       3
3       4
4       0
       ..
1995    3
1996    1
1997    2
1998    2
1999    0
Name: Trend, Length: 2000, dtype: int64


# **Define The Target**

In [None]:
# Define thresholds: the column means of the cleaned (unscaled) data.
# A score at or above its mean counts as "high" (class 1), below as "low" (class 0).
# NOTE(review): the means are taken over every row, i.e. before the train/test split.
threshold_vulnerability = cleaned_df['Vulnerability'].mean()
threshold_readiness = cleaned_df['Readiness'].mean()
threshold_gain = cleaned_df['GAIN'].mean()

# Boolean masks are computed once with vectorized comparisons and reused for
# the 0/1 classes, the text labels and the combined target.
is_high_vulnerability = cleaned_df['Vulnerability'] >= threshold_vulnerability
is_high_readiness = cleaned_df['Readiness'] >= threshold_readiness
is_high_gain = cleaned_df['GAIN'] >= threshold_gain

# Classify levels
cleaned_df['Vulnerability_Class'] = is_high_vulnerability.astype(int)
cleaned_df['Readiness_Class'] = is_high_readiness.astype(int)
cleaned_df['GAIN_Class'] = is_high_gain.astype(int)

# Add readable text labels
cleaned_df['Vulnerability_Label'] = np.where(
    is_high_vulnerability, 'High Vulnerability', 'Low Vulnerability'
)
cleaned_df['Readiness_Label'] = np.where(
    is_high_readiness, 'High Readiness', 'Low Readiness'
)
cleaned_df['GAIN_Label'] = np.where(
    is_high_gain, 'High GAIN (High Resilience)', 'Low GAIN (Low Resilience)'
)

# Define target variable (regions needing improvement):
# high vulnerability combined with low readiness and low GAIN.
needs_improvement = is_high_vulnerability & ~is_high_readiness & ~is_high_gain
cleaned_df['Needs_Improvement'] = needs_improvement.astype(int)

# Optional human-readable label
cleaned_df['Needs_Improvement_Label'] = np.where(
    needs_improvement, 'Needs Climate Action Improvement', 'Sufficiently Prepared'
)

# Display a formatted preview (first 10 rows) rather than dumping every row
display_cols = [
    'Region', 'Vulnerability', 'Readiness', 'GAIN',
    'Vulnerability_Label', 'Readiness_Label', 'GAIN_Label', 'Needs_Improvement_Label'
]
print(cleaned_df[display_cols].head(10).to_string(index=False))


                         Region  Vulnerability  Readiness  GAIN Vulnerability_Label Readiness_Label                  GAIN_Label          Needs_Improvement_Label
                            NCR          0.614      0.692 0.539  High Vulnerability  High Readiness High GAIN (High Resilience)            Sufficiently Prepared
              Region XI - Davao          0.357      0.537 0.590   Low Vulnerability   Low Readiness High GAIN (High Resilience)            Sufficiently Prepared
         Region IV-B - MIMAROPA          0.547      0.792 0.622   Low Vulnerability  High Readiness High GAIN (High Resilience)            Sufficiently Prepared
  Region VIII - Eastern Visayas          0.687      0.453 0.383  High Vulnerability   Low Readiness   Low GAIN (Low Resilience) Needs Climate Action Improvement
                            CAR          0.448      0.446 0.499   Low Vulnerability   Low Readiness   Low GAIN (Low Resilience)            Sufficiently Prepared
         Region IV-B - MIMAROPA   

# **Split the Dataset**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Two-column feature matrix and the binary vulnerability target
feature_columns = ['Readiness', 'GAIN']
target_column = 'Vulnerability_Class'
X = cleaned_df[feature_columns]
y = cleaned_df[target_column]

# Region names are carried through the split only so results can be shown per region
region = cleaned_df['Region']

In [None]:
# Features, target and region labels go through one call so the same rows
# land in the same partition; the 80/20 split is stratified on the target.
split_parts = train_test_split(
    X, y, region,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test, region_train, region_test = split_parts

In [None]:
# Report (rows, columns) of each partition as a sanity check on the split
train_shape, test_shape = X_train.shape, X_test.shape
print("Training set shape:", train_shape)
print("Testing set shape:", test_shape)

Training set shape: (1600, 2)
Testing set shape: (400, 2)


# **Model Building**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Hyper-parameters gathered in one mapping so they are easy to read and tweak
rf_params = {
    'n_estimators': 100,
    'random_state': 42,   # fixed seed for reproducible forests
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'bootstrap': True,
}
rf_model = RandomForestClassifier(**rf_params)

In [None]:
import pickle
from google.colab import files

# Fit before serializing. This cell runs ahead of the notebook's dedicated
# fit cell, so without this call the pickle would contain an unfitted
# estimator that cannot predict. The estimator was built with a fixed
# random_state, so fitting it again later on the same data is harmless.
rf_model.fit(X_train, y_train)

# Save the trained Random Forest model
with open('rf_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)


print("Model saved as rf_model.pkl")

# Trigger a browser download of the pickle from the Colab runtime
files.download('rf_model.pkl')

Model saved as rf_model.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Train the random forest on the training partition
rf_model.fit(X_train, y_train)

In [None]:
# Predicted vulnerability class (0 or 1) for every row of the hold-out partition
y_pred = rf_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
from google.colab import files
import json

# Hold-out evaluation: accuracy, per-class report and confusion matrix
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Model Accuracy: {accuracy:.2f}")

report_text = classification_report(
    y_test, y_pred, target_names=['Low Vulnerability', 'High Vulnerability']
)
print("\nClassification Report:")
print(report_text)

confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(confusion)

# Store the model's prediction for every row (train and test) on the cleaned frame
cleaned_df['Predicted_Vulnerability'] = rf_model.predict(X)

# Translate 0/1 classes into display text: class 1 is "high", anything else "low"
label_for = {True: 'High Vulnerability', False: 'Low Vulnerability'}
actual_labels = [label_for[bool(val == 1)] for val in y_test]
predicted_labels = [label_for[bool(val == 1)] for val in y_pred]

# One row per hold-out record: region, features, actual and predicted label
results = pd.DataFrame({
    'Region': region_test.values,
    'Readiness': X_test['Readiness'].values,
    'GAIN': X_test['GAIN'].values,
    'Actual_Label': actual_labels,
    'Predicted_Label': predicted_labels,
})

def get_recommendation(pred):
  """Map a predicted vulnerability label to its recommendation text.

  Returns 'Needs Climate Action Improvement' when ``pred`` equals
  'High Vulnerability'; every other value yields 'Sufficiently Prepared'.
  """
  needs_action = pred == 'High Vulnerability'
  return 'Needs Climate Action Improvement' if needs_action else 'Sufficiently Prepared'

# Attach a plain-language recommendation to every hold-out prediction
results['Recommendation'] = results['Predicted_Label'].apply(get_recommendation)

# Calculate the average readiness score (hold-out rows only)
average_readiness = results['Readiness'].mean()
print(f"\nAverage Readiness Score: {average_readiness:.2f}")

# Calculate the average gain score (hold-out rows only)
average_gain = results['GAIN'].mean()
print(f"Average GAIN Score: {average_gain:.2f}")

# The two counts below are numbers of hold-out records (rows), not distinct
# regions: each region contributes many rows, so the messages say "Records".
overall_high_vulnerability = (results['Predicted_Label'] == 'High Vulnerability').sum()
print(f"Number of Records Predicted High Vulnerability: {overall_high_vulnerability}")

overall_low_vulnerability = (results['Predicted_Label'] == 'Low Vulnerability').sum()
print(f"Number of Records Predicted Low Vulnerability: {overall_low_vulnerability}")

# Distinct region names present in the hold-out partition
total_region_analyze = results['Region'].nunique()
print(f"Total Regions Analyzed: {total_region_analyze}")

# Display Sample Predictions
print("\n Sample Predictions:")
print(results.head(10))

# Save metrics as JSON. Values are cast to built-in float/int so json.dump
# never has to serialize NumPy scalar types. Key names are unchanged.
metrics = {
    "accuracy": round(float(accuracy), 2),
    "average_readiness": round(float(average_readiness), 2),
    "average_gain": round(float(average_gain), 2),
    "high_vulnerability_count": int(overall_high_vulnerability),
    "low_vulnerability_count": int(overall_low_vulnerability),
    "total_region_count": int(total_region_analyze)
}

with open("metrics.json", "w") as f:
    json.dump(metrics, f, indent=4)

# Save the per-record results as CSV (the metrics live in metrics.json)
results.to_csv('results.csv', index=False)


# Trigger browser downloads of both artifacts from the Colab runtime
files.download('results.csv')
files.download('metrics.json')



Random Forest Model Accuracy: 0.98

Classification Report:
                    precision    recall  f1-score   support

 Low Vulnerability       0.98      0.99      0.98       198
High Vulnerability       0.99      0.98      0.98       202

          accuracy                           0.98       400
         macro avg       0.98      0.98      0.98       400
      weighted avg       0.98      0.98      0.98       400


Confusion Matrix:
[[196   2]
 [  5 197]]

Average Readiness Score: 0.55
Average GAIN Score: 0.50
Number of Regions with High Vulnerability: 199
Number of Regions with Low Vulnerability: 201
Total Regions Analyzed: 17

 Sample Predictions:
                            Region  Readiness   GAIN        Actual_Label  \
0                              NCR      0.507  0.374  High Vulnerability   
1    Region VIII - Eastern Visayas      0.765  0.484  High Vulnerability   
2     Region VII - Central Visayas      0.781  0.533  High Vulnerability   
3     Region X - Northern Mindanao

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>