In [2]:
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Project layout. "third_project" is resolved to an absolute path and its
# parent directory is used as the root for the data and output folders.
ROOT = Path("third_project").resolve().parent
DATA_DIR = ROOT.joinpath("data")
OUT_DIR = ROOT.joinpath("outputs")
IMG_DIR = OUT_DIR.joinpath("images")

# Create every working directory up front so later cells can write freely.
for folder in [DATA_DIR, OUT_DIR, IMG_DIR]:
    folder.mkdir(parents=True, exist_ok=True)

In [5]:
# Remote source (Our World in Data CO2 CSV, canonical URL) and the path of the
# local copy kept under DATA_DIR.
OWID_CSV_URL = "https://raw.githubusercontent.com/owid/co2-data/master/owid-co2-data.csv"
LOCAL_CSV = DATA_DIR.joinpath("owid-co2-data.csv")


In [6]:
# Helper functions
def savefig(fig, fname, dpi=150):
    """Save ``fig`` under IMG_DIR as ``fname`` and print where it went.

    Args:
        fig: Object exposing ``savefig`` (the notebook passes matplotlib figures).
        fname: File name, joined onto IMG_DIR.
        dpi: Resolution forwarded to ``fig.savefig``.
    """
    target = IMG_DIR / fname
    fig.savefig(target, dpi=dpi, bbox_inches="tight")
    print(f"Saved image: {target}")

def ensure_dataset():
    """Return the path of the local OWID CO2 CSV, downloading it if absent.

    If LOCAL_CSV already exists it is used as-is. Otherwise the CSV at
    OWID_CSV_URL is read with pandas and saved locally. The data is first
    written to a sibling ``.part`` file and only renamed to LOCAL_CSV once the
    write has finished, so an interrupted run cannot leave a truncated
    LOCAL_CSV that a later run would accept as a valid cached copy.

    Returns:
        LOCAL_CSV.

    Raises:
        SystemExit: With exit code 1 if the download or the save fails.
    """
    if LOCAL_CSV.exists():
        print(f"Using local dataset: {LOCAL_CSV}")
        return LOCAL_CSV

    partial_csv = LOCAL_CSV.with_name(LOCAL_CSV.name + ".part")
    try:
        print("Downloading OWID CO2 dataset...")
        # The target folder may not exist if this is called before path setup.
        LOCAL_CSV.parent.mkdir(parents=True, exist_ok=True)
        downloaded = pd.read_csv(OWID_CSV_URL)
        downloaded.to_csv(partial_csv, index=False)
        # Publish the file only after it has been written completely.
        partial_csv.replace(LOCAL_CSV)
        print(f"Downloaded and saved to: {LOCAL_CSV}")
        return LOCAL_CSV
    except Exception as e:
        # Do not leave a half-written temporary file behind.
        if partial_csv.exists():
            partial_csv.unlink()
        print("ERROR: Could not download dataset automatically.", e)
        sys.exit(1)

In [7]:
# Load data
csv_path = ensure_dataset()
df = pd.read_csv(csv_path, low_memory=False)

print("\nColumns available (sample):")
print(df.columns.tolist())

# Identifier columns that every later cell relies on. Fail early with a clear
# message if any is absent, instead of slicing the column list by position
# (which would silently drop a data column when a key column is missing).
key_cols = ["country", "iso_code", "year"]
missing_keys = [c for c in key_cols if c not in df.columns]
if missing_keys:
    raise KeyError(f"Dataset is missing required column(s): {missing_keys}")

# Keep relevant columns; everything after the key columns is optional
cols_keep = key_cols + [
    "co2", "co2_per_capita",
    "total_ghg", "ghg_per_capita",  # optional
    "coal_co2", "oil_co2", "gas_co2",
    "cement_co2", "flaring_co2",
    "population", "gdp"
]
# Filter available columns (order follows cols_keep, key columns first)
cols_present = [c for c in cols_keep if c in df.columns]
print("\nColumns used:", cols_present)

df = df[cols_present].copy()

Downloading OWID CO2 dataset...
Downloaded and saved to: /Users/karimhasabelnaby/Documents/MkAI_intern/third_project/data/owid-co2-data.csv

Columns available (sample):
['country', 'year', 'iso_code', 'population', 'gdp', 'cement_co2', 'cement_co2_per_capita', 'co2', 'co2_growth_abs', 'co2_growth_prct', 'co2_including_luc', 'co2_including_luc_growth_abs', 'co2_including_luc_growth_prct', 'co2_including_luc_per_capita', 'co2_including_luc_per_gdp', 'co2_including_luc_per_unit_energy', 'co2_per_capita', 'co2_per_gdp', 'co2_per_unit_energy', 'coal_co2', 'coal_co2_per_capita', 'consumption_co2', 'consumption_co2_per_capita', 'consumption_co2_per_gdp', 'cumulative_cement_co2', 'cumulative_co2', 'cumulative_co2_including_luc', 'cumulative_coal_co2', 'cumulative_flaring_co2', 'cumulative_gas_co2', 'cumulative_luc_co2', 'cumulative_oil_co2', 'cumulative_other_co2', 'energy_per_capita', 'energy_per_gdp', 'flaring_co2', 'flaring_co2_per_capita', 'gas_co2', 'gas_co2_per_capita', 'ghg_excluding_lu

In [8]:
# Basic cleaning
# The table mixes countries with aggregate entities (e.g. 'World' and regions).
# A row is kept only when its iso_code is a string of exactly 3 characters;
# aggregate rows often have NaN there.
def _is_country_code(code):
    """True when ``code`` is a 3-character string."""
    return isinstance(code, str) and len(code) == 3

country_rows = df["iso_code"].map(_is_country_code)
df_countries = df[country_rows].copy()
print(f"\nRows (countries only): {len(df_countries)}")



Rows (countries only): 42262


In [9]:
# Convert numeric columns: everything except the identifier columns is coerced
# to a numeric dtype, with unparseable entries becoming NaN.
id_cols = ["country", "iso_code", "year"]
numeric_cols = [name for name in df_countries.columns if name not in id_cols]
for col in numeric_cols:
    df_countries[col] = pd.to_numeric(df_countries[col], errors="coerce")

# Basic missing-value report for key columns
report_cols = ["year", "co2", "co2_per_capita", "population"]
missing_counts = df_countries[report_cols].isna().sum()
print("\nMissing values (key cols):")
print(missing_counts)



Missing values (key cols):
year                  0
co2               19069
co2_per_capita    19491
population         4046
dtype: int64


In [10]:
# Derived columns
# Units: this notebook treats co2 (and the per-source columns) as million
# tonnes and co2_per_capita as tonnes per person, so total / population must be
# scaled by 1e6 to land in the same unit as the existing per-capita column.
TONNES_PER_MILLION_TONNES = 1e6

# A population of 0 would produce inf; treat non-positive values as missing.
population_valid = df_countries["population"].where(df_countries["population"] > 0)

# If co2_per_capita missing but co2 and population available, compute it
mask = df_countries["co2_per_capita"].isna() & df_countries["co2"].notna() & population_valid.notna()
if mask.any():
    df_countries.loc[mask, "co2_per_capita"] = (
        df_countries.loc[mask, "co2"] * TONNES_PER_MILLION_TONNES / population_valid[mask]
    )

# Create per-capita columns (tonnes per person) for coal/oil/gas if available
for src in ["coal_co2", "oil_co2", "gas_co2"]:
    if src in df_countries.columns:
        percap_col = f"{src}_per_capita"
        df_countries[percap_col] = df_countries[src] * TONNES_PER_MILLION_TONNES / population_valid


In [11]:
# Analysis: top emitters in recent year and over time
# Baseline (year_a) and comparison (year_b) years; the hint suggested 1990 vs 2020
year_a, year_b = 1990, 2020

# Helper to get top N by total CO2 in a given year
def top_n_emitters(year, n=10):
    """Return the ``n`` rows of df_countries with the largest ``co2`` in ``year``.

    Rows whose ``co2`` is missing are excluded before ranking; the result is
    ordered from the largest value down.
    """
    in_year = df_countries["year"] == year
    has_co2 = df_countries["co2"].notna()
    ranked = df_countries[in_year & has_co2].sort_values("co2", ascending=False)
    return ranked.iloc[:n]

top10_2020 = top_n_emitters(year=year_b, n=10)
top10_view = top10_2020[["country", "co2"]]
print(f"\nTop 10 emitters in {year_b}:\n", top10_view)


Top 10 emitters in 2020:
              country        co2
9882           China  10905.693
47796  United States   4714.628
21655          India   2421.552
38154         Russia   1632.929
23729          Japan   1040.483
22585           Iran    756.606
18227        Germany    648.357
40117   Saudi Arabia    616.086
21829      Indonesia    608.223
42852    South Korea    597.634


In [12]:
# Compare year_a vs year_b for the top emitters of year_b
countries_of_interest = top10_2020["country"].tolist()
col_start, col_end = f"co2_{year_a}", f"co2_{year_b}"

cmp_df = df_countries[df_countries["country"].isin(countries_of_interest) & df_countries["year"].isin([year_a, year_b])]
# One row per country, one co2 column per year
pivot_cmp = cmp_df.pivot(index="country", columns="year", values="co2")
pivot_cmp = pivot_cmp.rename(columns={year_a: col_start, year_b: col_end})
pivot_cmp["absolute_change"] = pivot_cmp[col_end] - pivot_cmp[col_start]
# Percent change is undefined for a zero baseline: mask it to NaN so the
# division yields NaN instead of +/-inf.
baseline = pivot_cmp[col_start].where(pivot_cmp[col_start] != 0)
pivot_cmp["percent_change"] = pivot_cmp["absolute_change"] / baseline * 100
pivot_cmp = pivot_cmp.sort_values(col_end, ascending=False)
# The header uses year_b rather than a hard-coded year so it stays correct
# when the comparison years are changed.
print(f"\nComparison {year_a} vs {year_b} for top {year_b} emitters:\n", pivot_cmp[[
    col_start, col_end, "absolute_change", "percent_change"
]])


Comparison 1990 vs 2020 for top 2020 emitters:
 year           co2_1990   co2_2020  absolute_change  percent_change
country                                                            
China          2484.855  10905.693         8420.838      338.886494
United States  5120.957   4714.628         -406.329       -7.934630
India           577.996   2421.552         1843.556      318.956533
Russia         2536.248   1632.929         -903.319      -35.616351
Japan          1157.393   1040.483         -116.910      -10.101150
Iran            211.075    756.606          545.531      258.453630
Germany        1054.796    648.357         -406.439      -38.532475
Saudi Arabia    182.025    616.086          434.061      238.462299
Indonesia       155.081    608.223          453.142      292.196981
South Korea     250.511    597.634          347.123      138.565971


In [13]:
# Write the comparison table (country index included) to the outputs folder
comparison_csv = OUT_DIR / f"co2_{year_a}_vs_{year_b}_top10.csv"
pivot_cmp.to_csv(comparison_csv)


In [14]:
# Growth rates (CAGR) between two years
def compute_cagr(df_in, start_year, end_year, value_col="co2"):
    """Compute the compound annual growth rate of ``value_col`` per country.

    For every country in ``df_in`` that has exactly one row for ``start_year``
    and exactly one row for ``end_year``, with a positive, non-missing start
    value and a non-missing end value, the rate is

        (value_end / value_start) ** (1 / (end_year - start_year)) - 1

    Countries that do not meet these conditions are left out of the result.

    Args:
        df_in: DataFrame with "country", "year" and ``value_col`` columns.
        start_year: First year of the period.
        end_year: Last year of the period; must differ from ``start_year``.
        value_col: Name of the numeric column to measure.

    Returns:
        DataFrame with columns "country", "value_start", "value_end", "cagr",
        sorted by "cagr" descending. It is empty (but still has these columns)
        when no country qualifies.

    Raises:
        ValueError: If ``start_year`` equals ``end_year``.
        KeyError: If a required column is missing from ``df_in``.
    """
    years = end_year - start_year
    if years == 0:
        raise ValueError("start_year and end_year must differ to compute a CAGR")

    # Fixed column list so an empty result can still be sorted and saved.
    columns = ["country", "value_start", "value_end", "cagr"]
    records = []
    for country, grp in df_in.groupby("country"):
        start_vals = grp.loc[grp["year"] == start_year, value_col].to_numpy()
        end_vals = grp.loc[grp["year"] == end_year, value_col].to_numpy()
        # Require exactly one observation at each end of the period.
        if len(start_vals) != 1 or len(end_vals) != 1:
            continue
        v_start, v_end = start_vals[0], end_vals[0]
        # A missing or non-positive baseline makes the ratio undefined.
        if pd.isna(v_start) or pd.isna(v_end) or not v_start > 0:
            continue
        cagr = (v_end / v_start) ** (1 / years) - 1
        records.append({"country": country, "value_start": v_start, "value_end": v_end, "cagr": cagr})
    return pd.DataFrame(records, columns=columns).sort_values("cagr", ascending=False)

cagr_df = compute_cagr(df_countries, start_year=year_a, end_year=year_b, value_col="co2")
cagr_csv = OUT_DIR / f"cagr_co2_{year_a}_{year_b}.csv"
cagr_df.to_csv(cagr_csv, index=False)
print(f"\nTop 10 CAGR between {year_a} and {year_b} (by CO2):\n", cagr_df.head(10))


Top 10 CAGR between 1990 and 2020 (by CO2):
                country  value_start  value_end      cagr
61   Equatorial Guinea        0.062      6.411  0.167216
102               Laos        0.513     19.492  0.128906
129              Nepal        0.721     15.632  0.107990
204            Vietnam       21.299    362.460  0.099082
116               Mali        0.422      6.448  0.095142
34            Cambodia        1.260     19.034  0.094726
115           Maldives        0.172      1.678  0.078886
21               Benin        0.579      5.518  0.078045
32        Burkina Faso        0.590      5.377  0.076439
151              Qatar       11.411    102.501  0.075920


In [15]:
# Rolling averages for trend smoothing
# Each country gets a trailing rolling mean of co2 and co2_per_capita; with
# min_periods=1 the first rows of a country average over fewer points.
rolling_window = 5

def _rolling_mean(series):
    """Trailing mean over ``rolling_window`` rows, needing at least one value."""
    return series.rolling(window=rolling_window, min_periods=1).mean()

df_countries_sorted = df_countries.sort_values(["country", "year"]).copy()
for col in ("co2", "co2_per_capita"):
    df_countries_sorted[f"{col}_rolling"] = (
        df_countries_sorted.groupby("country")[col].transform(_rolling_mean)
    )


In [16]:
# Visualizations: shared styling applied to every figure below

sns.set(context="talk", style="whitegrid")
plt.rcParams["figure.autolayout"] = True



In [17]:
# 1) Line chart: top N emitters over time (absolute CO2)
top_n = 10
latest_rows = df_countries[df_countries["year"] == year_b]
top_countries_latest = (
    latest_rows.sort_values("co2", ascending=False)["country"].head(top_n).tolist()
)

fig, ax = plt.subplots(figsize=(12, 6))
# One smoothed line per country, in ranking order
for name in top_countries_latest:
    country_rows = df_countries_sorted.loc[df_countries_sorted["country"] == name]
    ax.plot(country_rows["year"], country_rows["co2_rolling"], label=name)
ax.set_title(f"Top {top_n} CO2 Emitters Over Time (Rolling {rolling_window}-yr avg)")
ax.set_xlabel("Year")
ax.set_ylabel("CO2 emissions (million tonnes)")  # units as in OWID
# Legend sits outside the axes so it does not cover the lines
ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left")
savefig(fig, f"top_{top_n}_emitters_over_time.png")
plt.close(fig)

Saved image: /Users/karimhasabelnaby/Documents/MkAI_intern/third_project/outputs/images/top_10_emitters_over_time.png


In [18]:
# 2) Line chart: per-capita CO2 for the top_n countries ranked by per-capita CO2 in year_b
if "co2_per_capita" in df_countries.columns:
    latest_pc = df_countries[df_countries["year"] == year_b]
    top_pc = (
        latest_pc.sort_values("co2_per_capita", ascending=False)["country"].head(top_n).tolist()
    )

    fig, ax = plt.subplots(figsize=(12, 6))
    # One smoothed per-capita line per country
    for name in top_pc:
        country_rows = df_countries_sorted.loc[df_countries_sorted["country"] == name]
        ax.plot(country_rows["year"], country_rows["co2_per_capita_rolling"], label=name)
    ax.set_title(f"Top {top_n} CO2 per-capita Countries Over Time (Rolling {rolling_window}-yr avg)")
    ax.set_xlabel("Year")
    ax.set_ylabel("CO2 per capita (tonnes per person)")
    ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left")
    savefig(fig, f"top_{top_n}_percapita_over_time.png")
    plt.close(fig)


Saved image: /Users/karimhasabelnaby/Documents/MkAI_intern/third_project/outputs/images/top_10_percapita_over_time.png


In [19]:
# 3) Heatmap: countries x year (CO2 per capita) for the 20 largest emitters (by co2) in year_b
latest_by_co2 = df_countries[df_countries["year"] == year_b].sort_values("co2", ascending=False)
top20 = latest_by_co2["country"].head(20).tolist()

# Rows = country, columns = year, cells = co2_per_capita
in_top20 = df_countries_sorted["country"].isin(top20)
heat_df = df_countries_sorted[in_top20].pivot(index="country", columns="year", values="co2_per_capita")

fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(heat_df, ax=ax, cmap="YlOrRd", robust=True)
ax.set_title(f"CO2 per capita (tonnes) — Top 20 emitters in {year_b}")
savefig(fig, f"heatmap_co2_percapita_top20_{year_b}.png")
plt.close(fig)

Saved image: /Users/karimhasabelnaby/Documents/MkAI_intern/third_project/outputs/images/heatmap_co2_percapita_top20_2020.png


In [20]:
# 4) Bar chart: compare year_a vs year_b CO2 for the top emitters of year_b
label_start, label_end = f"co2_{year_a}", f"co2_{year_b}"
# Countries lacking a value for either year are left out of the chart
cmp_plot = pivot_cmp[[label_start, label_end, "absolute_change"]].dropna()
cmp_plot_sorted = cmp_plot.sort_values(label_end, ascending=False)

fig, ax = plt.subplots(figsize=(12, 6))
width = 0.35
indices = np.arange(len(cmp_plot_sorted))
# Two bars per country, centred on the tick: earlier year left, later year right
ax.bar(indices - width/2, cmp_plot_sorted[label_start], width=width, label=str(year_a))
ax.bar(indices + width/2, cmp_plot_sorted[label_end], width=width, label=str(year_b))
ax.set_xticks(indices)
ax.set_xticklabels(cmp_plot_sorted.index, rotation=45, ha="right")
ax.set_ylabel("CO2 emissions (million tonnes)")
# Title uses year_b instead of a hard-coded year so it follows the configured years
ax.set_title(f"CO2 in {year_a} vs {year_b} — Top {len(cmp_plot_sorted)} {year_b} Emitters")
ax.legend()
savefig(fig, f"co2_{year_a}_vs_{year_b}_bar_top10.png")
plt.close(fig)

Saved image: /Users/karimhasabelnaby/Documents/MkAI_intern/third_project/outputs/images/co2_1990_vs_2020_bar_top10.png


In [21]:
# 5) Stacked area: contributions by emission source, summed over all country rows per year
# (only the source columns that exist in the data are used)
source_cols = [c for c in ["coal_co2", "oil_co2", "gas_co2", "cement_co2", "flaring_co2"] if c in df_countries.columns]
if source_cols:
    # Yearly total per source across countries. The previous lookup of a
    # 'World' row was never used by the plot and has been removed.
    agg = df_countries.groupby("year")[source_cols].sum()
    fig, ax = plt.subplots(figsize=(12, 6))
    agg.plot.area(ax=ax)
    ax.set_title("Global CO2 Emissions by Source Over Time (aggregate of countries)")
    ax.set_xlabel("Year")
    ax.set_ylabel("CO2 emissions (million tonnes)")
    savefig(fig, "global_by_source_area.png")
    plt.close(fig)


Saved image: /Users/karimhasabelnaby/Documents/MkAI_intern/third_project/outputs/images/global_by_source_area.png


In [22]:
# 6) Correlation heatmap between emission sources, using the year_b rows only
recent = df_countries[df_countries["year"] == year_b]
# A correlation matrix needs some rows and at least two source columns
enough_sources = len(source_cols) >= 2
if enough_sources and not recent.empty:
    corr = recent[source_cols].corr()
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(corr, ax=ax, cmap="coolwarm", annot=True)
    ax.set_title(f"Correlation between emission sources ({year_b})")
    savefig(fig, f"source_corr_{year_b}.png")
    plt.close(fig)

Saved image: /Users/karimhasabelnaby/Documents/MkAI_intern/third_project/outputs/images/source_corr_2020.png


In [23]:
# Multi-index grouping example: continent-level aggregation
# Only runs when the data has a 'continent' column; otherwise a note is printed.
if "continent" not in df_countries.columns:
    print("Note: 'continent' column not found in dataset; skipping continent plot.")
else:
    region_col = "continent"
    # Sum co2 per (continent, year), then spread continents into columns
    co2_by_region_year = df_countries.groupby([region_col, "year"])["co2"].sum()
    region_agg = co2_by_region_year.unstack(level=0).fillna(0)
    fig, ax = plt.subplots(figsize=(12, 6))
    region_agg.plot(ax=ax)
    ax.set_title("CO2 Emissions by Continent Over Time")
    savefig(fig, "co2_by_continent.png")
    plt.close(fig)


Note: 'continent' column not found in dataset; skipping continent plot.


In [24]:
# Save cleaned dataset & selected outputs
clean_path = OUT_DIR.joinpath("owid_co2_cleaned.csv")
rolling_out = OUT_DIR.joinpath("owid_co2_with_rolling.csv")

# Country-only table after cleaning and derived columns
df_countries.to_csv(clean_path, index=False)
print(f"\nSaved cleaned countries dataset: {clean_path}")

# Same table, sorted by country/year, with the rolling-mean columns
df_countries_sorted.to_csv(rolling_out, index=False)
print(f"Saved rolling dataset: {rolling_out}")

print("\nDone. Outputs saved to:", OUT_DIR)
print("Images saved to:", IMG_DIR)


Saved cleaned countries dataset: /Users/karimhasabelnaby/Documents/MkAI_intern/third_project/outputs/owid_co2_cleaned.csv
Saved rolling dataset: /Users/karimhasabelnaby/Documents/MkAI_intern/third_project/outputs/owid_co2_with_rolling.csv

Done. Outputs saved to: /Users/karimhasabelnaby/Documents/MkAI_intern/third_project/outputs
Images saved to: /Users/karimhasabelnaby/Documents/MkAI_intern/third_project/outputs/images
