# 1.1 Process the PRIMAP data product

Source data: the PRIMAP-hist v2.6 dataset, available from the [PRIMAP](https://zenodo.org/records/13752654) data repository on Zenodo.

In [None]:
import os
from pathlib import Path

import pandas as pd

In [None]:
# Read the PRIMAP-hist CSV straight from its Zenodo record over HTTP
# (no local copy is kept, so this cell needs network access).
data_url = (
    "https://zenodo.org/records/13752654/files/Guetschow_et_al_2024a-PRIMAP-hist_v2.6_final_13-Sep-2024.csv"
)
df_raw = pd.read_csv(filepath_or_buffer=data_url)

In [None]:
# Output directory: resolved to an absolute path relative to the current
# working directory, and created (with any missing parents) if absent.
_processed_abs = os.path.abspath("../data/processed/")
processed_dir = Path(_processed_abs)
processed_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Preview the first rows of the raw table to check the column layout
# (as the last expression in the cell, the notebook renders it as output).
df_raw.head()

In [None]:
# Select the rows of interest with ONE boolean mask built on df_raw.
# (Chaining several `.loc[df_raw[...] == ...]` calls indexes an already
# filtered frame with masks from the unfiltered one, which only works through
# pandas' implicit re-alignment of the mask.)
#   - category "M.0.EL": presumably the national total excluding LULUCF, as
#     the output file name "…without-lulucf…" suggests — TODO confirm against
#     the PRIMAP-hist documentation
#   - entity "KYOTOGHG (AR6GWP100)": the Kyoto GHG aggregate, AR6 GWP100
#   - scenario "HISTTP": HISTTP is third party and HISTCR is country reported
row_mask = (
    (df_raw["category (IPCC2006_PRIMAP)"] == "M.0.EL")
    & (df_raw["entity"] == "KYOTOGHG (AR6GWP100)")
    & (df_raw["scenario (PRIMAP-hist)"] == "HISTTP")
)
df_tmp = (
    df_raw
    .loc[row_mask]
    # these columns are constant after filtering or not needed downstream
    # NOTE(review): "unit" is dropped without checking that every remaining
    # row uses the same unit — confirm before trusting the conversion below
    .drop(columns=["source", "scenario (PRIMAP-hist)", "provenance", "category (IPCC2006_PRIMAP)", "unit", "entity"])
    .rename(columns={"area (ISO3)": "iso3"})
    .sort_values(by="iso3")
)

# ensure all column names are strings (so `.isdigit()` below is safe)
df_tmp.columns = df_tmp.columns.astype(str)

# reshape from wide (one column per year) to long (one row per iso3/year),
# only keeping data from `start_year` onwards
start_year = 1850
id_vars = [col for col in df_tmp.columns if not col.isdigit()]
value_vars = [col for col in df_tmp.columns if col.isdigit() and int(col) >= start_year]
df_final = (
    df_tmp[id_vars + value_vars]
    .melt(id_vars=id_vars, value_vars=value_vars, var_name="year", value_name="emissions_gg")
    # "year" holds the former column labels, i.e. strings; sorting them as
    # text matches numeric order as long as all labels are four-digit years
    .sort_values(by=["iso3", "year"])
    # gigagrams -> gigatonnes (1 Gt = 1e6 Gg); vectorised instead of a
    # per-element `.apply`
    .assign(emissions_gt=lambda x: x["emissions_gg"] / 1_000_000)
    .drop(columns=["emissions_gg"])
)

In [None]:
# Save the processed table as CSV in the output directory, without the
# DataFrame index column.
output_csv = processed_dir / "primap-histtp-ghg-without-lulucf-1850-2023.csv"
df_final.to_csv(output_csv, index=False)