# Synthetic Greenhouse Gas Emissions Dataset Generation

In [11]:
library(httr)
library(jsonlite)
library(readr)
library(ggplot2)
library(data.table)
library(utils) 
library(dplyr)
library(tidyr)
library(purrr)
library(zoo) 
library(stringr)
library(stats)
library(tibble)

In [12]:
# Read the semicolon-delimited source table. One line is skipped before the
# header row is read.
data_raw <- read.csv(
  file = "3A5EGHG1.csv",
  header = TRUE,
  sep = ";",
  skip = 1,
  stringsAsFactors = FALSE
)
# Show the first ten records to check the parse
head(data_raw, n = 10)

Unnamed: 0_level_0,Year,Sector.and.Source.Categories,Carbon.Dioxide..CO2.,Methane..CH4.,Nitrous.Oxide..N2O.,Hydroflourocarbons..HFCs.
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<chr>,<chr>
1,2010,ENERGY TOTAL EMISSIONS,50697.000,1887.000,520.000,.
2,2010,ENERGY TOTAL CO2 EQUIVALENT,53105.000,.,.,.
3,2010,..Energy Industries,32803.000,49.000,168.000,.
4,2010,..Manufacturing Industries and Construction,11887.000,55.000,95.000,.
5,2010,..Other Sectors,5995.000,1692.000,257.000,.
6,2010,..Fugitive Emissions - Solid Fuels,..,90.000,..,.
7,2010,..Fugitive Emissions - Oil and Natural Gas,12.000,1.000,0.000,.
8,2010,TRANSPORT TOTAL EMISSIONS,23718.000,125.000,331.000,.
9,2010,TRANSPORT TOTAL CO2 EQUIVALENT,24174.000,.,.,.
10,2010,...Domestic Aviation,706.000,0.000,6.000,.


In [13]:
# Short, code-friendly column names, assigned by position (six columns)
names(data_raw) <- c("Year", "Sector", "CO2", "CH4", "N2O", "HFCs")

# Cells holding exactly "." or ".." are placeholders for absent values;
# turn each of them into NA, one column at a time.
data_raw[] <- lapply(data_raw, function(column) {
  column[column %in% c(".", "..")] <- NA
  column
})

# Look at the last rows
tail(data_raw, n = 5)


Unnamed: 0_level_0,Year,Sector,CO2,CH4,N2O,HFCs
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<chr>,<chr>
134,2020,..Solid Waste Disposal,,11689.692,,
135,2020,..Biological Treatment of Solid Waste,,391.664,222.335,
136,2020,..Incineration and Open Burning of Waste,0.322,1.288,0.265,
137,2020,..Wastewater Treatment and Discharge,,16563.68,1253.185,
138,2020,TOTAL NATIONAL GHG EMISSIONS (CO2-e),204325.244,,,


In [14]:
# Cast the four gas columns from text to numeric in a single pass
data_raw[c("CO2", "CH4", "N2O", "HFCs")] <- lapply(
  data_raw[c("CO2", "CH4", "N2O", "HFCs")],
  as.numeric
)


In [15]:
# Sub-category labels are prefixed with a run of dots ("..", "...");
# drop that prefix, then strip any surrounding whitespace.
data_raw$Sector <- trimws(gsub("^\\.+", "", data_raw$Sector))


In [16]:
# View cleaned sample: display the whole cleaned table.
# (The shorter ten-row preview below is left disabled.)
# head(data_raw, 10)
data_raw

Year,Sector,CO2,CH4,N2O,HFCs
<int>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
2010,ENERGY TOTAL EMISSIONS,50697,1887,520,
2010,ENERGY TOTAL CO2 EQUIVALENT,53105,,,
2010,Energy Industries,32803,49,168,
2010,Manufacturing Industries and Construction,11887,55,95,
2010,Other Sectors,5995,1692,257,
2010,Fugitive Emissions - Solid Fuels,,90,,
2010,Fugitive Emissions - Oil and Natural Gas,12,1,0,
2010,TRANSPORT TOTAL EMISSIONS,23718,125,331,
2010,TRANSPORT TOTAL CO2 EQUIVALENT,24174,,,
2010,Domestic Aviation,706,0,6,


In [17]:
# Per-column summary of the cleaned table (value ranges and NA counts)
summary(data_raw)

      Year         Sector               CO2                CH4         
 Min.   :2010   Length:138         Min.   :-71354.7   Min.   :    0.0  
 1st Qu.:2010   Class :character   1st Qu.:   127.8   1st Qu.:    9.0  
 Median :2015   Mode  :character   Median :  6477.0   Median :  112.2  
 Mean   :2015                      Mean   : 17636.3   Mean   : 5017.9  
 3rd Qu.:2020                      3rd Qu.: 28895.8   3rd Qu.: 3000.8  
 Max.   :2020                      Max.   :232987.6   Max.   :38434.2  
                                   NA's   :41         NA's   :65       
      N2O               HFCs     
 Min.   :    0.0   Min.   : 771  
 1st Qu.:   18.7   1st Qu.:1137  
 Median :  272.8   Median :2234  
 Mean   : 1397.4   Mean   :2028  
 3rd Qu.: 1071.3   3rd Qu.:2867  
 Max.   :14880.7   Max.   :3078  
 NA's   :74        NA's   :132   

In [18]:
# Distinct sector labels present after cleaning
unique_sectors <- unique(data_raw[["Sector"]])
length(unique_sectors)  # count of distinct sectors
unique_sectors

In [19]:
# --- Clean TOTAL sectors before any modeling ---

# Aggregate rows are written in capitals and contain the word "TOTAL"
# (e.g. "ENERGY TOTAL EMISSIONS"). The grand-total row carries a mixed-case
# unit suffix, "TOTAL NATIONAL GHG EMISSIONS (CO2-e)", so comparing the raw
# label with toupper() lets it through and it would be modelled as if it were
# a sector. Parenthesised annotations are therefore ignored when checking
# that the label is upper case. Every row the raw comparison removed is
# still removed.
data_raw <- data_raw %>%
  mutate(.label = gsub("\\([^)]*\\)", "", Sector)) %>%
  filter(!(grepl("TOTAL", Sector, fixed = TRUE) & .label == toupper(.label))) %>%
  select(-.label)

# --- Synthetic Dataset Generation ---

# Step 1: Keep original 2010–2020 rows with at least one gas value present
data_clean <- data_raw %>%
  filter(!is.na(CO2) | !is.na(CH4) | !is.na(N2O) | !is.na(HFCs))

# Unique sectors, gas column names, and the years to forecast
sectors <- unique(data_clean$Sector)
gases <- c("CO2", "CH4", "N2O", "HFCs")
years_future <- 2021:2050

# Fill interior gaps in a historical series by linear interpolation.
#
# df: data frame with columns `Year` and `Value`.
# Returns `df` ordered by Year, with `Value` passed through na.approx()
# against `Year`. Because na.rm = FALSE, NAs before the first or after the
# last observation are kept as NA rather than dropped.
impute_linear <- function(df) {
  by_year <- arrange(df, Year)
  by_year$Value <- na.approx(by_year$Value, x = by_year$Year, na.rm = FALSE)
  by_year
}

# Forecast one gas for one sector over a set of future years.
#
# sector: sector label, matched against `data$Sector`.
# gas:    name of the gas column to forecast (e.g. "CO2").
# data:   historical table with `Year`, `Sector` and the gas columns.
#         Defaults to the global `data_clean`, which is what the function
#         always read before this argument existed.
# years:  years to forecast. Defaults to the global `years_future`.
#
# Returns a long-format data frame (Year, Sector, Gas, Value) with one row
# per requested year. Value is:
#   * a linear-trend prediction (lm of Value on Year) when at least three
#     historical points remain after interpolation,
#   * the last available historical value when one or two points remain,
#   * NA when the sector has no data for this gas.
simulate_emissions <- function(sector, gas, data = data_clean,
                               years = years_future) {
  # Historical series for this sector, with the gas column renamed to Value
  history <- data %>%
    filter(Sector == sector) %>%
    select(Year, Value = all_of(gas))

  # Interpolate interior gaps, then drop whatever is still missing
  # (leading/trailing NAs are not filled by impute_linear)
  observed <- impute_linear(history) %>%
    filter(!is.na(Value))

  n_points <- nrow(observed)
  if (n_points >= 3) {
    trend <- lm(Value ~ Year, data = observed)
    # predict() returns a named vector; drop the names so they do not end up
    # as row names of the result
    forecast <- unname(predict(trend, newdata = data.frame(Year = years)))
  } else if (n_points > 0) {
    # Too few points for a regression: carry the last known value forward
    forecast <- rep(tail(observed$Value, 1), length(years))
  } else {
    # Nothing observed: keep the forecast missing as well
    forecast <- rep(NA_real_, length(years))
  }

  # Labels are repeated explicitly so an empty `years` yields an empty frame
  data.frame(
    Year = years,
    Sector = rep(sector, length(years)),
    Gas = rep(gas, length(years)),
    Value = forecast
  )
}

# Step 2: Forecast every sector/gas pair and stack the results (long format)
sector_gas_grid <- expand_grid(Sector = sectors, Gas = gases)
synthetic_data_long <- map2_dfr(
  sector_gas_grid$Sector,
  sector_gas_grid$Gas,
  simulate_emissions
)

# Step 3: One column per gas (Year, Sector, CO2, CH4, N2O, HFCs)
synthetic_data <- arrange(
  pivot_wider(synthetic_data_long, names_from = Gas, values_from = Value),
  Year, Sector
)

# Step 4: Historical rows are taken as-is; their NAs are kept, not imputed
real_data <- arrange(
  select(data_clean, Year, Sector, all_of(gases)),
  Year, Sector
)

# Step 5: Historical rows followed by forecasts, ordered by sector then year
combined_data <- arrange(bind_rows(real_data, synthetic_data), Sector, Year)

# Optional: strip stray whitespace from the sector labels
combined_data$Sector <- stringr::str_trim(combined_data$Sector)

# Step 6: Write the combined table to disk
write_csv(combined_data, "phil_synthetic_ghg_emissions_2010_2050.csv")

# Status message
cat("Saved combined dataset with imputed historical data and synthetic forecasts from 2010 to 2050.\n")

Saved combined dataset with imputed historical data and synthetic forecasts from 2010 to 2050.


In [20]:
# Preview the first twenty rows of the combined table
head(combined_data, n = 20)

Unnamed: 0_level_0,Year,Sector,CO2,CH4,N2O,HFCs
Unnamed: 0_level_1,<int>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1,2010,Biological Treatment of Solid Waste,,35.0,32.0,
2,2015,Biological Treatment of Solid Waste,,70.936,40.282,
3,2020,Biological Treatment of Solid Waste,,391.664,222.335,
4,2021,Biological Treatment of Solid Waste,,379.8651,212.4067,
5,2022,Biological Treatment of Solid Waste,,415.5315,231.4402,
6,2023,Biological Treatment of Solid Waste,,451.1979,250.4737,
7,2024,Biological Treatment of Solid Waste,,486.8643,269.5072,
8,2025,Biological Treatment of Solid Waste,,522.5307,288.5407,
9,2026,Biological Treatment of Solid Waste,,558.1971,307.5742,
10,2027,Biological Treatment of Solid Waste,,593.8635,326.6077,
