# Workshop Programmeren en AI

## Setup
### Imports

In [None]:
# General imports
import os

# Imports for data handling
import pandas as pd
from skimpy import clean_columns
import country_converter as coco

# Imports for plotting
import seaborn as sns
import matplotlib.pyplot as plt

# Settings for notebook
# Set pandas' "display.max_rows" option to 40 for the tables shown in this notebook
pd.set_option('display.max_rows', 40)
# Activate seaborn's default theme for the plots created below
sns.set_theme()

### Retrieve data

#### Our dataset has 31 columns:
- **_Year_**: Year for which these measurements apply.
- **_Area_**: Country where the data is from.
- **_Savanna fires_**: Emissions from fires in savanna ecosystems.
- **_Forest fires_**: Emissions from fires in forested areas.
- **_Crop Residues_**: Emissions from burning or decomposing leftover plant material after crop harvesting.
- **_Rice Cultivation_**: Emissions from methane released during rice cultivation.
- **_Drained organic soils (CO2)_**: Emissions from carbon dioxide released when draining organic soils.
- **_Pesticides Manufacturing_**: Emissions from the production of pesticides.
- **_Food Transport_**: Emissions from transporting food products.
- **_Forestland_**: Land covered by forests.
- **_Net Forest conversion_**: Change in forest area due to deforestation and afforestation.
- **_Food Household Consumption_**: Emissions from food consumption at the household level.
- **_Food Retail_**: Emissions from the operation of retail establishments selling food.
- **_On-farm Electricity Use_**: Electricity consumption on farms.
- **_Food Packaging_**: Emissions from the production and disposal of food packaging materials.
- **_Agrifood Systems Waste Disposal_**: Emissions from waste disposal in the agrifood system.
- **_Food Processing_**: Emissions from processing food products.
- **_Fertilizers Manufacturing_**: Emissions from the production of fertilizers.
- **_IPPU_**: Emissions from industrial processes and product use.
- **_Manure applied to Soils_**: Emissions from applying animal manure to agricultural soils.
- **_Manure left on Pasture_**: Emissions from animal manure on pasture or grazing land.
- **_Manure Management_**: Emissions from managing and treating animal manure.
- **_Fires in organic soils_**: Emissions from fires in organic soils.
- **_Fires in humid tropical forests_**: Emissions from fires in humid tropical forests.
- **_On-farm energy use_**: Energy consumption on farms.
- **_Rural population_**: Number of people living in rural areas.
- **_Urban population_**: Number of people living in urban areas.
- **_Total Population - Male_**: Total number of male individuals in the population.
- **_Total Population - Female_**: Total number of female individuals in the population.
- **_total_emission_**: Total greenhouse gas emissions from various sources.
- **_Average Temperature °C_**: The average increase in temperature (by year) in degrees Celsius.

#### And we add 4 ourselves:
- **_Continent_**: Continent where this area is located.
- **_Subregion_**: (United Nations) subregion where this area is located.
- **_Total Population_**: Sum of the Male and Female Population.
- **_Emission per capita_**: Total emission divided by the total population of an area.


All column names in the data frame are in [snake_case](https://en.wikipedia.org/wiki/Snake_case).

In [None]:
# The CSV is expected in the current working directory; stop right away if it is missing
path_to_data = os.path.join(os.getcwd(), "Agrofood_co2_emission.csv")
assert os.path.exists(path_to_data), "Path to data file was not found"

# Read the file into a DataFrame and pass it through skimpy's clean_columns,
# which is what gives the columns the snake_case names used in the rest of the notebook
df = clean_columns(pd.read_csv(path_to_data), replace={})

In [None]:
# Area names that country_converter cannot resolve on its own; each one is mapped
# by hand to the closest present-day country (a deliberate approximation).
manual_country_fixes = {
    'Belgium-Luxembourg': 'Belgium',
    'Channel Islands': 'United Kingdom',
    'China, Taiwan Province of': 'Taiwan',
    'Netherlands Antilles (former)': 'Netherlands',
    'Pacific Islands Trust Territory': 'Micronesia',
    'Serbia and Montenegro': 'Serbia',
    'USSR': 'Russia',
    'Yugoslav SFR': 'Serbia',
}


def _convert_area_names(converter, names, to):
    """Return a Series with every entry of ``names`` converted to classification ``to``.

    Only the distinct names are passed to ``converter.convert``; the result is then
    mapped back onto the full Series, so each name is looked up once instead of
    once per row.
    """
    distinct_names = names.unique().tolist()
    converted = converter.convert(names=distinct_names, to=to)
    if isinstance(converted, str):
        # NOTE(review): convert() appears to return a bare string for a
        # single-element input — confirm against the installed coco version
        converted = [converted]
    return names.map(dict(zip(distinct_names, converted)))


# Add a "continent" and "subregion" column. This makes plotting a lot more manageable
# than plotting by country, as there are 236 distinct countries in the dataset
cc = coco.CountryConverter()
df['area'] = df['area'].replace(manual_country_fixes)
df["continent"] = _convert_area_names(cc, df["area"], to="continent")
df["subregion"] = _convert_area_names(cc, df["area"], to="UNregion")

# Make sure everything is mapped, and name the offending areas if not
for region_column in ("continent", "subregion"):
    unmapped_areas = df.loc[df[region_column] == 'not found', 'area'].unique()
    assert len(unmapped_areas) == 0, f"No {region_column} found for: {sorted(unmapped_areas)}"

# Add the total population as the sum of the male and female populations.
# With this new column, create a column of emission per capita, which can be used
# as a normalized measure of emissions
df["total_population"] = df["total_population_male"] + df["total_population_female"]
df["emission_per_capita"] = df["total_emission"] / df["total_population"]

### Familiarize yourself with the data

In [None]:
# Count the rows that df.duplicated() flags, then preview the first five rows
duplicate_row_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_row_count}")
display(df.head())

In [None]:
# This cell builds a four-column overview table with one row per DataFrame column:
# its dtype, the number of distinct values, the number of missing values and the
# number of present values. Rows are ordered by the number of distinct values.
dtypes = df.dtypes
unique_counts = df.nunique()
missing_counts = df.isna().sum()
non_null_counts = df.count()

summary_columns = {
    "Data Type": dtypes,
    "Unieke Waardes": unique_counts,
    "# Null Waardes": missing_counts,
    "# Non-null Waardes": non_null_counts,
}
summary = pd.DataFrame(summary_columns)
summary = summary.sort_values(by='Unieke Waardes')

# Render the table in the notebook
display(summary)

In [None]:
# Show the descriptive-statistics table that DataFrame.describe() produces for df
display(df.describe())


!TODO: Outlier detection? Handling missing cells -> fill / remove

In [None]:
# Group by year and compute the mean and standard deviation of emission per capita
grouped = df.groupby("year")[["emission_per_capita"]]
mean_emission = grouped.mean()
std_emission = grouped.std()

yearly_mean = mean_emission["emission_per_capita"]
yearly_std = std_emission["emission_per_capita"]

# Line = yearly mean, shaded band = one standard deviation on either side
plt.figure(figsize=(12, 6))
plt.plot(yearly_mean.index, yearly_mean, label="Average Emission per Capita", color="blue")
plt.fill_between(
    yearly_mean.index,
    yearly_mean - yearly_std,
    yearly_mean + yearly_std,
    color="blue",
    alpha=0.2,
    label="Standard Deviation",
)

# The plotted quantity is emission per capita, so the title and y-label say so
plt.title("Average Emission per Capita per Year with Standard Deviation")
plt.xlabel("Year")
plt.ylabel("Emission per Capita")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Emission per capita of a subregion = summed emission / summed population of its areas.
# Adding up the per-country ratios instead would make the value grow with the number
# of countries in a subregion rather than describe its inhabitants.
# Rows missing either value are left out so numerator and denominator cover the same areas.
complete_rows = df.dropna(subset=["total_emission", "total_population"])
subregion_totals = complete_rows.groupby(["year", "subregion"])[
    ["total_emission", "total_population"]
].sum()
grouped = (subregion_totals["total_emission"] / subregion_totals["total_population"]).unstack()

# Draw on one explicitly created Axes: calling plt.figure() and then DataFrame.plot()
# without `ax` opens a second figure and leaves the first one empty.
fig, ax = plt.subplots(figsize=(14, 8))
grouped.plot(kind='line', ax=ax, logy=True)
ax.set_title("Emission per Capita Over Years by Subregion")
ax.set_xlabel("Year")
ax.set_ylabel("Emission per Capita")
ax.legend(title="Subregion", bbox_to_anchor=(1.05, 1), loc="upper left")
ax.grid(True)
plt.show()


In [None]:
# Yearly totals (summed over all areas) of three food-related emission sources
grouped_year = df.groupby("year")[["food_household_consumption", "food_transport", "food_packaging"]].sum()

# The title covers all three plotted series, not only household consumption
grouped_year.plot(
    kind="line",
    title="Food household consumption, transport and packaging emissions over the years",
    xlabel="Year",
    ylabel="CO2 emissions (kt)",
    figsize=(18, 6),
)
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Yearly totals of three food-related sources plus the overall total emission (log y-axis)
grouped_year = df.groupby("year")[["food_household_consumption", "food_transport", "food_packaging", "total_emission"]].sum()
ax1 = grouped_year.plot(
    kind="line",
    title="Food-related and total emissions compared with total population",
    xlabel="Year",
    ylabel="CO2 emissions (kt)",
    figsize=(18, 6),
    logy=True,
)
ax1.grid(True)
ax1.legend(loc='upper left')

# Second y-axis on the same x-axis: population summed over all areas per year
ax2 = ax1.twinx()
df_total = df.groupby("year")["total_population"].sum()
df_total.plot(ax=ax2, color='black', linestyle='--', label='Total Population', linewidth=2, ylabel="Number of People")
ax2.grid(False)  # keep only the left axis' grid so two misaligned grids do not overlap
ax2.legend(loc='upper right')

plt.show()

In [None]:
# Yearly totals (summed over all areas) of the four fire-related emission sources
grouped_year = df.groupby("year")[["savanna_fires", "forest_fires", "fires_in_organic_soils", "fires_in_humid_tropical_forests"]].sum()
ax1 = grouped_year.plot(
    kind="line",
    title="Emissions from fires",
    xlabel="Year",
    ylabel="CO2 emissions (kt)",
    figsize=(18, 6),
)
ax1.grid(True)
ax1.legend(loc='upper left')

# Second y-axis: temperature. Temperatures are averaged (unweighted) over the areas;
# summing them would scale with the number of areas and is not a value in °C.
ax2 = ax1.twinx()
df_total = df.groupby("year")["average_temperature_c"].mean()
df_total.plot(ax=ax2, color='black', linestyle='--', label='Average Temperature', linewidth=2, ylabel="°C")
ax2.grid(False)  # keep only the left axis' grid so two misaligned grids do not overlap
ax2.legend(loc='upper right')

plt.show()

In [None]:
# Columns plotted under the "Emissions during farming" heading
farming_columns = [
    "crop_residues",
    "rice_cultivation",
    "drained_organic_soils_co_2",
    "pesticides_manufacturing",
    "fertilizers_manufacturing",
    "net_forest_conversion",
    "on_farm_electricity_use",
    "agrifood_systems_waste_disposal",
    "ippu",
    "manure_applied_to_soils",
    "manure_left_on_pasture",
    "manure_management",
    "on_farm_energy_use",
]

# Sum every column over all areas per year
grouped_year = df.groupby("year")[farming_columns].sum()

# One line per column on a logarithmic y-axis
plot_options = dict(
    kind="line",
    title="Emissions during farming",
    xlabel="Year",
    ylabel="CO2 emissions (kt)",
    figsize=(18, 4),
    logy=True,
)
ax1 = grouped_year.plot(**plot_options)
ax1.grid(True)
ax1.legend()
plt.show()

In [None]:
# Correlation matrix of every column except the three text columns
corr_matrix = df.drop(labels=["area", "continent", "subregion"], axis=1).corr()

# Take the tick positions and labels from the matrix that is actually drawn, so the
# labels cannot drift out of step with the cells (and nothing is recomputed per axis)
tick_positions = range(len(corr_matrix.columns))

f = plt.figure(figsize=(19, 15))
plt.matshow(corr_matrix, fignum=f.number)
plt.xticks(tick_positions, corr_matrix.columns, fontsize=14, rotation=90)
plt.yticks(tick_positions, corr_matrix.index, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title("Correlation Matrix", fontsize=16)
plt.show()

In [None]:
# Correlation matrix over everything but the three text columns
text_columns = ["area", "continent", "subregion"]
corr = df.drop(columns=text_columns).corr()

# Large canvas so the annotated cells stay readable
plt.figure(figsize=(19, 15))

# Annotated heatmap; each cell shows its correlation as a whole percentage
heatmap_options = {
    "annot": True,
    "fmt": ".0%",
    "cmap": "coolwarm",
    "cbar_kws": {"shrink": 0.75},
    "annot_kws": {"size": 12},
}
sns.heatmap(corr, **heatmap_options)

# Tick styling and title
plt.xticks(fontsize=14, rotation=90)
plt.yticks(fontsize=14, rotation=0)
plt.title("Correlation Matrix", fontsize=16)

plt.show()

In [None]:
# Yearly totals (summed over all areas) of the four population columns, one line each
population_columns = [
    "rural_population",
    "urban_population",
    "total_population_male",
    "total_population_female",
]
grouped_year = df.groupby("year")[population_columns].sum()
grouped_year.plot()
plt.show()

In [None]:
# Emission source columns that are stacked on top of each other
names = [
    "savanna_fires",
    "forest_fires",
    "fires_in_organic_soils",
    "fires_in_humid_tropical_forests",
    "crop_residues",
    "rice_cultivation",
    "drained_organic_soils_co_2",
    "pesticides_manufacturing",
    "fertilizers_manufacturing",
    "net_forest_conversion",
    "on_farm_electricity_use",
    "agrifood_systems_waste_disposal",
    "ippu",
    "manure_applied_to_soils",
    "manure_left_on_pasture",
    "manure_management",
    "on_farm_energy_use",
    "food_transport",
    "food_household_consumption",
    "food_packaging",
    "food_retail",
    "food_processing",
]
grouped_year = df.groupby("year")[names].sum()
df_total = df.groupby("year")["total_emission"].sum()

# Sort columns by the value in the last row (last year), smallest first
last_year_values = grouped_year.iloc[-1]
sorted_columns = last_year_values.sort_values().index.tolist()
grouped_year_sorted = grouped_year[sorted_columns]

fig, ax = plt.subplots(figsize=(20, 10))
# The labels must follow the sorted column order; passing the unsorted `names`
# list would attach the wrong source name to each stacked area in the legend
ax.stackplot(
    grouped_year_sorted.index,
    grouped_year_sorted.T.values,
    labels=grouped_year_sorted.columns,
)
# Reference line: the dataset's own total_emission column, summed per year
ax.plot(df_total.index, df_total, label="Total Emission", color="red")
plt.legend()
plt.show()

In [None]:
# Total emission per year and continent; year/continent pairs without data count as 0
grouped_year = (
    df.groupby(["year", "continent"])["total_emission"].sum().unstack(fill_value=0)
)

# Optional: sort continents by the value in the last year (for nicer stacking order)
sorted_continents = grouped_year.iloc[-1].sort_values().index.tolist()
grouped_year_sorted = grouped_year[sorted_continents]

# Plot
fig, ax = plt.subplots(figsize=(20, 10))
ax.stackplot(
    grouped_year_sorted.index,
    grouped_year_sorted.T.values,
    labels=grouped_year_sorted.columns,
)

# The data is grouped by continent, so the title and legend name continents
plt.title("Total Emission Over Years by Continent")
plt.xlabel("Year")
plt.ylabel("Total Emission")
plt.legend(title="Continent", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.show()

In [None]:
# A log-scaled density only exists for positive, finite values, so restrict to those
# (comparisons with NaN are False, which drops missing values as well)
per_capita = df["emission_per_capita"]
plottable = df[(per_capita > 0) & (per_capita < float("inf"))]

fig, ax = plt.subplots(figsize=(20, 6))
# log_scale=True makes seaborn estimate the density in log space. Switching only the
# axis to log after plotting keeps a density estimated on the linear values, which
# distorts the curves.
sns.kdeplot(
    data=plottable,
    x="emission_per_capita",
    hue="subregion",
    fill=True,
    common_norm=False,  # normalise each subregion's density separately
    alpha=0.5,
    log_scale=True,
    ax=ax,
)
ax.set_title("Distribution of Emission per Capita by Subregion")

In [None]:
# Pairwise scatter plots of four food-chain emission columns, coloured by continent,
# with KDE curves on the diagonal
food_chain_columns = ["food_processing", "food_packaging", "food_transport", "food_retail"]
pp = sns.pairplot(
    df,
    vars=food_chain_columns,
    hue="continent",
    diag_kind="kde",
    dropna=True,
)

# Show only the lowest 10% of each diagonal plot's y-range (adjust the factor as needed)
for diag_ax in pp.diag_axes:
    _, upper_limit = diag_ax.get_ylim()
    diag_ax.set_ylim(0, upper_limit * 0.1)

# Final figure size: 20 wide by 10 high (inches)
pp.figure.set_size_inches(20, 10)

In [None]:
# Pairwise scatter plots of the four fire columns and the temperature column,
# coloured by continent; seaborn picks the diagonal plot type ("auto")
fire_and_temperature_columns = [
    "fires_in_organic_soils",
    "fires_in_humid_tropical_forests",
    "savanna_fires",
    "forest_fires",
    "average_temperature_c",
]
pp = sns.pairplot(
    df,
    vars=fire_and_temperature_columns,
    hue="continent",
    diag_kind="auto",
    dropna=True,
)

# Show only the lowest 20% of each diagonal plot's y-range (adjust the factor as needed)
for diag_ax in pp.diag_axes:
    _, upper_limit = diag_ax.get_ylim()
    diag_ax.set_ylim(0, upper_limit * 0.2)

# Final figure size: 20 wide by 10 high (inches)
pp.figure.set_size_inches(20, 10)

In [None]:
# European rows only. The column is named "continent"; the previous spelling
# "contintent" does not exist in df and raised a KeyError.
df_europe = df[df["continent"] == "Europe"]

# Household consumption vs. waste disposal from 2015 onwards, coloured by subregion
pp = sns.jointplot(
    data=df_europe[df_europe["year"] >= 2015],
    x="food_household_consumption",
    y="agrifood_systems_waste_disposal",
    hue="subregion",
)

In [None]:
# Temperature vs. forest-fire emissions for the rows from 2015 onwards, coloured by continent
rows_since_2015 = df.loc[df["year"] >= 2015]
pp = sns.jointplot(
    data=rows_since_2015,
    x="average_temperature_c",
    y="forest_fires",
    hue="continent",
)
# Switch the joint (central) axes to a logarithmic y-scale
pp.ax_joint.set_yscale("log")

In [None]:
# Regression plot of forest-fire emissions against temperature for the year 2010 only,
# with one colour (and one fit) per continent
rows_2010 = df.loc[df["year"] == 2010]
pp = sns.lmplot(
    data=rows_2010,
    x="average_temperature_c",
    y="forest_fires",
    hue="continent",
)
# Only the axis is made logarithmic here; lmplot itself receives the untransformed values
pp.set(yscale="log")
pp.figure.set_figwidth(20)