<a href="https://colab.research.google.com/github/koshaantala/IndustriesClimateChange/blob/main/ClimateIndustries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
!pip install -q plotly pandas numpy scikit-learn kagglehub

In [25]:
import pandas as pd
import numpy as np
import plotly.express as px
import kagglehub
import os
from sklearn.linear_model import LinearRegression
import plotly.graph_objects as go


In [26]:
# Fetch the greenhouse-gas dataset from Kaggle through kagglehub.
# `path` is used by the loading cell as the directory prefix of the CSV file.
DATASET_HANDLE = "alamshihab075/greenhouse-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

Using Colab cache for faster access to the 'greenhouse-dataset' dataset.


In [27]:
# Load the supply-chain emission-factor table and tidy it in one pipeline.

# Short aliases for the verbose source headers (applied after the headers
# have been stripped and had their spaces replaced by underscores).
COLUMN_ALIASES = {
    "2017_NAICS_Code": "NAICS_Code",
    "2017_NAICS_Title": "Industry",
    "Supply_Chain_Emission_Factors_without_Margins": "SEF",
    "Margins_of_Supply_Chain_Emission_Factors": "MEF",
    "Supply_Chain_Emission_Factors_with_Margins": "Total_Emission_Factor",
}
factor_col = "Total_Emission_Factor"

df = (
    pd.read_csv(f"{path}/SupplyChainGHGEmissionFactors_v1.3.0_NAICS_CO2e_USD2022.csv")
    # Normalise headers: trim whitespace, spaces -> underscores.
    .rename(columns=lambda header: header.strip().replace(" ", "_"))
    .rename(columns=COLUMN_ALIASES)
    # Drop rows with a missing factor, coerce the rest to numbers, then drop
    # whatever could not be parsed (coercion turns those into NaN).
    .dropna(subset=[factor_col])
    .assign(**{factor_col: lambda frame: pd.to_numeric(frame[factor_col], errors="coerce")})
    .dropna(subset=[factor_col])
)

print("Cleaned data shape:", df.shape)
df.head()


Cleaned data shape: (1016, 8)


Unnamed: 0,NAICS_Code,Industry,GHG,Unit,SEF,MEF,Total_Emission_Factor,Reference_USEEIO_Code
0,111110,Soybean Farming,All GHGs,"kg CO2e/2022 USD, purchaser price",0.488,0.044,0.532,1111A0
1,111120,Oilseed (except Soybean) Farming,All GHGs,"kg CO2e/2022 USD, purchaser price",0.488,0.044,0.532,1111A0
2,111130,Dry Pea and Bean Farming,All GHGs,"kg CO2e/2022 USD, purchaser price",0.809,0.04,0.848,1111B0
3,111140,Wheat Farming,All GHGs,"kg CO2e/2022 USD, purchaser price",0.809,0.04,0.848,1111B0
4,111150,Corn Farming,All GHGs,"kg CO2e/2022 USD, purchaser price",0.809,0.04,0.848,1111B0


In [28]:
# Heatmap of the current emission factor of every industry, highest first.
# Reads the cleaned frame `df` built by the loading cell (no `df_clean`
# variable exists in this notebook, so referencing it fails on a fresh kernel).
df_sorted = df.sort_values("Total_Emission_Factor", ascending=False)

# px.imshow needs a 2-D array: one row, one column per industry.
heat_values = df_sorted["Total_Emission_Factor"].to_numpy().reshape(1, -1)

# Create the heatmap
fig = px.imshow(
    heat_values,
    labels=dict(x="Industry", color="CO₂e / USD"),
    x=df_sorted["Industry"].tolist(),
    color_continuous_scale="YlOrRd",
    # With a single row, square ("equal") cells would collapse the plot into
    # a hairline; let the strip stretch to the figure height instead.
    aspect="auto",
)

fig.update_layout(
    title="Heatmap of Total Emission Factors by Industry, Current",
    xaxis_title="Industry",
    yaxis_visible=False,
    height=600,
    template="plotly_white"
)

fig.show()


In [29]:
# Simulated growth from 2017 → 2030
# NOTE: the per-industry growth rates are synthetic random draws, not values
# fitted to data. A fixed seed keeps the forecast column (and every chart
# built from it) identical across Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

years = np.arange(2017, 2031)
n_growth_years = len(years) - 1  # 13 compounding steps from 2017 to 2030
growth_rate = rng.uniform(0.01, 0.05, len(df))

# Compound each industry's current factor by its own simulated annual rate.
df["Predicted_2030_Emission_Factor"] = (
    df["Total_Emission_Factor"] * (1 + growth_rate) ** n_growth_years
)

# Basic regression for average trend.
# The three (year, value) points are assumed scenario anchors: the current
# mean, +15 % and +25 %. They are independent of the random draws above.
mean_factor = df["Total_Emission_Factor"].mean()
X = np.array([2017, 2022, 2030]).reshape(-1, 1)
y = mean_factor * np.array([1.0, 1.15, 1.25])

model = LinearRegression()
model.fit(X, y)
pred_2030 = model.predict(np.array([[2030]]))[0]

print(f"Average Emission Factor, Predicted 2030: {pred_2030:.3f} kg CO₂e / USD")


Average Emission Factor, Predicted 2030: 0.356 kg CO₂e / USD


In [30]:
# Bar chart of the 20 industries with the largest simulated 2030 factor.
forecast_col = "Predicted_2030_Emission_Factor"
top_forecast = df.sort_values(forecast_col, ascending=False).head(20)

fig = px.bar(
    top_forecast,
    x="Industry",
    y=forecast_col,
    color=forecast_col,
    color_continuous_scale="YlOrRd",
    title="Forecasted Emission Factors (2030) — Top 20 Industries",
)
# Slant the long industry names so they stay legible.
fig.update_layout(template="plotly_white", xaxis_tickangle=-45)
fig.show()
