# 🌿 VayuSense: Model Building for Carbon Emission Prediction

This notebook builds and trains machine learning models for country-wise carbon emission prediction.

## 📋 Table of Contents
1. Import Libraries and Load Data
2. Data Preparation
3. Model Building
   - 3.1 Random Forest Model
   - 3.2 XGBoost Model
   - 3.3 Model Comparison (evaluation and best-model selection)
4. Future Predictions (2024-2050)
5. Save Models and Preprocessors

## 1. Import Libraries and Load Data

In [None]:
# Import necessary libraries
import json
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
# Blanket filter: no category or module is specified, so every warning raised
# for the rest of the session is hidden (including library deprecation notices).
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# This seeds NumPy's global RNG; the train/test split and both models below
# additionally pass random_state=42 explicitly.
np.random.seed(42)

print("Libraries imported successfully!")

In [None]:
# Load the modelling data.
# The engineered-features file is preferred; if it is missing, the cleaned
# dataset is read instead. Only the read and the status line differ between
# the two paths, so the shared reporting happens once afterwards.
try:
    df = pd.read_csv('../data/features_country_data.csv')
    load_status = f"Data loaded successfully! Shape: {df.shape}"
except FileNotFoundError:
    # If features file doesn't exist, load cleaned data
    df = pd.read_csv('../data/cleaned_country_data.csv')
    load_status = f"Loaded cleaned data. Shape: {df.shape}"

print(load_status)
print("\nColumns:", df.columns.tolist())
display(df.head())

## 2. Data Preparation

In [None]:
# Pick the target column.
# The expected name is tried first; if it is absent, the first column whose
# name contains 'CO2' or 'co2' is used instead. When nothing matches, the
# available columns are printed and target_col keeps the expected name.
target_col = 'CO2 emissions (metric tons per capita)'

if target_col not in df.columns:
    co2_cols = list(filter(lambda name: 'CO2' in name or 'co2' in name, df.columns))
    if not co2_cols:
        print("No CO2 column found. Available columns:")
        print(df.columns.tolist())
    else:
        target_col = co2_cols[0]
        print(f"Using {target_col} as target variable")

In [None]:
# Build the feature list from the numeric columns only
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)

if target_col not in numeric_cols:
    # Target is missing or non-numeric: treat the last numeric column as the
    # target and everything before it as features.
    feature_cols = numeric_cols[:-1]
    target_col = numeric_cols[-1]
else:
    # Normal case: every numeric column except the target is a feature
    feature_cols = [name for name in numeric_cols if name != target_col]

print(f"Target variable: {target_col}")
print(f"Number of features: {len(feature_cols)}")
print(f"Features: {feature_cols[:5]}...")  # Show first 5 features

In [None]:
# Create feature matrix and target vector.
# Rows with a missing target are dropped rather than mean-imputed: a fabricated
# label carries no information and distorts both training and the test metrics.
labelled_rows = df[target_col].notna()
print(f"Dropping {int((~labelled_rows).sum())} rows with a missing target")
X_raw = df.loc[labelled_rows, feature_cols]
y = df.loc[labelled_rows, target_col]

# Split the data BEFORE computing any statistics, so nothing derived from the
# test rows can influence preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)

# Impute missing feature values with means learned from the training split only
train_feature_means = X_train_raw.mean()
X_train = X_train_raw.fillna(train_feature_means)
X_test = X_test_raw.fillna(train_feature_means)
# Full imputed matrix, kept under the name X because later cells index into it
X = X_raw.fillna(train_feature_means)

# Scale the features (the scaler is likewise fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

## 3. Model Building

### 3.1 Random Forest Model

In [None]:
# Random Forest: fit on the scaled training split, then score both splits
print("Training Random Forest Model...")

rf_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train_scaled, y_train)

# Predictions for the training and test splits
rf_pred_train = rf_model.predict(X_train_scaled)
rf_pred_test = rf_model.predict(X_test_scaled)

# Each metric is computed for (train, test) in that order
rf_eval_pairs = ((y_train, rf_pred_train), (y_test, rf_pred_test))
rf_train_rmse, rf_test_rmse = [
    np.sqrt(mean_squared_error(actual, predicted)) for actual, predicted in rf_eval_pairs
]
rf_train_mae, rf_test_mae = [
    mean_absolute_error(actual, predicted) for actual, predicted in rf_eval_pairs
]
rf_train_r2, rf_test_r2 = [
    r2_score(actual, predicted) for actual, predicted in rf_eval_pairs
]

print("\nRandom Forest Results:")
print(f"Train RMSE: {rf_train_rmse:.4f} | Test RMSE: {rf_test_rmse:.4f}")
print(f"Train MAE: {rf_train_mae:.4f} | Test MAE: {rf_test_mae:.4f}")
print(f"Train R²: {rf_train_r2:.4f} | Test R²: {rf_test_r2:.4f}")

In [None]:
# Rank the features by the Random Forest's importance scores (highest first)
feature_importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': rf_model.feature_importances_}
).sort_values('importance', ascending=False)

# Horizontal bar chart of the ten highest-ranked features
top_features = feature_importance.head(10)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features['feature'], top_features['importance'])
ax.set_xlabel('Importance')
ax.set_title('Top 10 Feature Importances (Random Forest)')
ax.invert_yaxis()  # put the most important feature at the top
fig.tight_layout()
plt.show()

### 3.2 XGBoost Model

In [None]:
# XGBoost: fit on the scaled training split, then score both splits
print("Training XGBoost Model...")

xgb_params = dict(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train_scaled, y_train)

# Predictions for the training and test splits
xgb_pred_train = xgb_model.predict(X_train_scaled)
xgb_pred_test = xgb_model.predict(X_test_scaled)

# Each metric is computed for (train, test) in that order
xgb_eval_pairs = ((y_train, xgb_pred_train), (y_test, xgb_pred_test))
xgb_train_rmse, xgb_test_rmse = [
    np.sqrt(mean_squared_error(actual, predicted)) for actual, predicted in xgb_eval_pairs
]
xgb_train_mae, xgb_test_mae = [
    mean_absolute_error(actual, predicted) for actual, predicted in xgb_eval_pairs
]
xgb_train_r2, xgb_test_r2 = [
    r2_score(actual, predicted) for actual, predicted in xgb_eval_pairs
]

print("\nXGBoost Results:")
print(f"Train RMSE: {xgb_train_rmse:.4f} | Test RMSE: {xgb_test_rmse:.4f}")
print(f"Train MAE: {xgb_train_mae:.4f} | Test MAE: {xgb_test_mae:.4f}")
print(f"Train R²: {xgb_train_r2:.4f} | Test R²: {xgb_test_r2:.4f}")

### 3.3 Model Comparison

In [None]:
# Side-by-side metrics table: one row per model, one column per metric
metric_columns = {
    'Train RMSE': [rf_train_rmse, xgb_train_rmse],
    'Test RMSE': [rf_test_rmse, xgb_test_rmse],
    'Train MAE': [rf_train_mae, xgb_train_mae],
    'Test MAE': [rf_test_mae, xgb_test_mae],
    'Train R²': [rf_train_r2, xgb_train_r2],
    'Test R²': [rf_test_r2, xgb_test_r2],
}
results_df = pd.DataFrame({'Model': ['Random Forest', 'XGBoost'], **metric_columns})

print("\nModel Comparison:")
display(results_df)

# The winner is the row with the highest test R²
fitted_models = {'Random Forest': rf_model, 'XGBoost': xgb_model}
best_model_idx = results_df['Test R²'].idxmax()
best_model_name = results_df.at[best_model_idx, 'Model']
best_model = fitted_models[best_model_name]

best_test_r2 = results_df.at[best_model_idx, 'Test R²']
print(f"\nBest model: {best_model_name} with Test R² = {best_test_r2:.4f}")

## 4. Future Predictions (2024-2050)

In [None]:
# Create future prediction function
def predict_future_emissions(model, scaler, country_data, years_ahead=26,
                             start_year=2024, growth_rate=0.01):
    """
    Predict future emissions from 2024 to 2050

    One prediction is produced per year from ``start_year`` through
    ``start_year + years_ahead`` inclusive, so the defaults cover 2024-2050.
    After each prediction every feature is multiplied by ``1 + growth_rate``
    to form the next year's input.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    scaler : object
        Fitted transformer exposing ``transform(X)``.
    country_data : array-like
        1-D feature vector for the starting year (NumPy array, list or
        pandas Series). It is copied and never modified.
    years_ahead : int, default 26
        Number of years to project beyond ``start_year``.
    start_year : int, default 2024
        First forecast year.
    growth_rate : float, default 0.01
        Fractional growth applied to every feature between consecutive years.

    Returns
    -------
    pandas.DataFrame
        Columns ``year`` and ``co2_emissions``, one row per forecast year.

    Raises
    ------
    ValueError
        If ``years_ahead`` is negative.
    """
    # For simplicity, we'll use the latest available data and apply a trend
    # In production, you'd want more sophisticated time series forecasting
    # NOTE(review): the growth factor is applied uniformly to ALL features; if the
    # feature vector contains a calendar/year-like column it is scaled too - confirm.
    if years_ahead < 0:
        raise ValueError("years_ahead must be non-negative")

    # Private float copy shaped (1, n_features): accepts any array-like input
    # and guarantees the caller's data is left untouched.
    current_features = np.array(country_data, dtype=float).reshape(1, -1)

    predictions = []
    for year in range(start_year, start_year + years_ahead + 1):
        # Scale features
        scaled_features = scaler.transform(current_features)

        # Make prediction
        pred = model.predict(scaled_features)[0]
        predictions.append({'year': year, 'co2_emissions': pred})

        # Update features for next year (simple trend)
        # In reality, you'd model how GDP, population etc. change over time
        current_features = current_features * (1 + growth_rate)

    return pd.DataFrame(predictions)

In [None]:
# Example forecast for a single sample.
# The first row of the feature matrix X is used as the starting point.
sample_country_features = X.iloc[0].values  # Take first country as example

# Run the forecast with the best model and the fitted scaler
future_predictions = predict_future_emissions(best_model, scaler, sample_country_features)

# Line chart of the forecast
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    future_predictions['year'],
    future_predictions['co2_emissions'],
    marker='o',
    linewidth=2,
    markersize=4,
)
ax.set_title('CO₂ Emissions Forecast (2024-2050)', fontsize=16)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('CO₂ Emissions (metric tons per capita)', fontsize=12)
ax.grid(True, alpha=0.3)
plt.xticks(range(2024, 2051, 2), rotation=45)
fig.tight_layout()
plt.show()

# Headline values for three milestone years (blank line first, then one line each)
emissions_by_year = future_predictions.set_index('year')['co2_emissions']
print()
for milestone_year in (2030, 2040, 2050):
    print(f"Predicted emissions for {milestone_year}: {emissions_by_year.loc[milestone_year]:.2f}")

## 5. Save Models and Preprocessors

In [None]:
# Persist the selected model, the fitted scaler, the feature list and a small
# metadata file under ../models.

# joblib.dump and open() do not create missing directories, so make sure the
# output folder exists before writing anything into it.
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

# Save the best model
model_path = f'{models_dir}/{best_model_name.lower().replace(" ", "_")}_model.joblib'
joblib.dump(best_model, model_path)
print(f"Model saved to: {model_path}")

# Save the scaler
scaler_path = f'{models_dir}/scaler.joblib'
joblib.dump(scaler, scaler_path)
print(f"Scaler saved to: {scaler_path}")

# Save feature names
feature_names_path = f'{models_dir}/feature_names.joblib'
joblib.dump(feature_cols, feature_names_path)
print(f"Feature names saved to: {feature_names_path}")

# Save model metadata
# float(...) turns NumPy scalars into plain Python floats so the JSON encoder
# accepts them regardless of their NumPy dtype.
metadata = {
    'model_type': best_model_name,
    'target_variable': target_col,
    'feature_count': len(feature_cols),
    'test_r2': float(results_df.loc[best_model_idx, 'Test R²']),
    'test_rmse': float(results_df.loc[best_model_idx, 'Test RMSE']),
    'test_mae': float(results_df.loc[best_model_idx, 'Test MAE'])
}

with open(f'{models_dir}/model_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=4)
print("\nModel metadata saved!")
print("\nAll models and preprocessors saved successfully!")