<a href="https://colab.research.google.com/github/joekelly211/masfi/blob/main/8_predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports & Subdirectories

In [None]:
import os
import sys

from google.colab import drive

# Root folder of the project on the shared Google Drive
base_dir = "/gdrive/Shareddrives/masfi"

# (Re)mount Google Drive so that base_dir becomes reachable
drive.mount('/gdrive', force_remount=True)

# Make the project folder importable, without adding it to sys.path twice
_path_to_add = os.path.realpath(base_dir)
if sys.path.count(_path_to_add) == 0:
    sys.path.append(_path_to_add)

In [None]:
# Capture outputs
%%capture
# Imports and upgrades
!pip install geopandas
!pip install pmdarima
!pip install rasterio

In [None]:
# Reload imports, replacing those in the cache
# %load_ext autoreload
# %autoreload 2
# Imports
import geopandas as gpd
from google.auth import default
from google.colab import auth
from google.colab import runtime
import gspread
import glob
import math
from matplotlib import pyplot as plt
import numpy as np
from os.path import exists, join
from os import makedirs
from osgeo import gdal
import pandas as pd
import pmdarima as pm
import rasterio
from rasterio import mask as msk

In [None]:
# Project sub-directories, all located directly under base_dir
areas_dir, scenarios_dir, uncertainty_dir, predictions_dir = (
    join(base_dir, folder_name)
    for folder_name in ("1_areas", "6_scenarios", "7_uncertainty", "8_predictions")
)
# Folder holding the user-supplied sample polygon GeoPackages
sample_polygons_dir = join(predictions_dir, "sample_polygons")

# Create directories (missing parents are created as well)
makedirs(sample_polygons_dir, exist_ok=True)

In [None]:
# Global function: export an array as a .tif
template_tif_path = join(areas_dir, "template.tif")
nodatavalue = -1111111
compress = True
def export_array_as_tif(input_array, output_tif, template=template_tif_path, nodatavalue=nodatavalue, compress=compress):
  """Write a 2D array to a single-band Float32 GeoTIFF georeferenced like a template raster.

  Parameters:
    input_array: 2D array written to band 1. It is written as-is, so it is
      expected to match the template's width and height.
    output_tif: path of the GeoTIFF to create.
    template: path of the raster whose size, geotransform and projection are copied.
    nodatavalue: value registered as 'nodata' on the output band.
    compress: if truthy, write with DEFLATE compression (PREDICTOR=2, ZLEVEL=9);
      any falsy value writes an uncompressed file.

  Raises:
    RuntimeError: if the template cannot be opened or the output cannot be created
      (when GDAL exceptions are enabled, GDAL may raise its own error first).
  """
  template_dataset = gdal.Open(template)
  if template_dataset is None:
    raise RuntimeError(f"GDAL could not open the template raster: {template}")
  template_band = template_dataset.GetRasterBand(1)
  template_dimensions, template_projection = template_dataset.GetGeoTransform(), template_dataset.GetProjection()

  # One Create() call for both cases; an empty option list means no compression
  creation_options = ["COMPRESS=DEFLATE", "PREDICTOR=2", "ZLEVEL=9"] if compress else []
  output_dataset = gdal.GetDriverByName("GTiff").Create(output_tif, template_band.XSize, template_band.YSize, bands=1,
                                                        eType=gdal.GDT_Float32, options=creation_options)
  if output_dataset is None:
    raise RuntimeError(f"GDAL could not create the output raster: {output_tif}")

  output_band = output_dataset.GetRasterBand(1)
  output_band.WriteArray(input_array)
  output_band.SetNoDataValue(nodatavalue)
  output_dataset.SetGeoTransform(template_dimensions)
  output_dataset.SetProjection(template_projection)

  # GDAL only guarantees the file is complete on disk once the dataset is flushed and
  # released. Do this explicitly instead of relying on garbage collection, because
  # callers re-open the file immediately afterwards.
  output_band.FlushCache()
  output_dataset.FlushCache()
  output_band = None
  output_dataset = None
  template_band = None
  template_dataset = None

# Select model, area and sample polygons

In [None]:
# Choose where predictions are read from: scenarios_dir or uncertainty_dir
source_dir = scenarios_dir

# List every available model as a ready-to-paste assignment
for model_name in os.listdir(source_dir):
  print("selected_model = '" + model_name + "'")

In [None]:
selected_model = 'agbd_240718_164421'

selected_model_dir = join(source_dir, selected_model)
# List the prediction areas available for the model as ready-to-paste assignments
scenarios_prefix_length = len("scenarios_") # Uncertainty sub-directories carry this prefix
for entry in os.listdir(selected_model_dir):
  if source_dir == scenarios_dir:
    # Skip the .csv / .json files stored next to the area folders
    if not entry.endswith(('.csv', '.json')):
      print("prediction_area = '" + entry + "'")
  if source_dir == uncertainty_dir:
    if entry != 'model_iterations':
      print("prediction_area = '" + entry[scenarios_prefix_length:] + "'")

In [None]:
prediction_area = 'terengganu'

# Model-area stats directory
model_area_predictions_dir = join(predictions_dir, selected_model + "_" + prediction_area)
makedirs(model_area_predictions_dir, exist_ok=True)

# Locate the prediction rasters. Rasters must already be masked (e.g. to forest).
if source_dir == scenarios_dir:
  prediction_raster_dir = join(selected_model_dir, prediction_area, 'scenario_predictions')
if source_dir == uncertainty_dir:
  prediction_raster_dir = join(selected_model_dir, 'scenarios_' + prediction_area, 'statistics_masked')

# Full path of every raster, plus the sorted unique scenario names.
# The scenario name is the part of the file name before the first "__".
raster_names = os.listdir(prediction_raster_dir)
prediction_raster_dirs = [join(prediction_raster_dir, raster_name) for raster_name in raster_names]
scenarios = sorted({raster_name.split("__")[0] for raster_name in raster_names})

In [None]:
# Select sample area polygons. This should be a single .gpkg with the field 'name' differentiating polygons.
sample_polygons = list(os.listdir(sample_polygons_dir))

if sample_polygons:
  # Print one ready-to-paste assignment per file found
  for sample_polygon in sample_polygons:
    print("selected_sample_polygons = '" + sample_polygon + "'")
else:
  print(f"No sample areas found. Upload .gpkg polygons to {sample_polygons_dir}")

In [None]:
selected_sample_polygons = 'terengganu_fr_lite.gpkg'

# Read the chosen GeoPackage
selected_sample_polygons_dir = join(sample_polygons_dir, selected_sample_polygons)
selected_sample_polygons_gpkg = gpd.read_file(selected_sample_polygons_dir)

# Output folders: one parent named after the GeoPackage (minus its 5-character '.gpkg'
# extension) and one sub-folder per kind of output
sample_polygons_predictions_dir = join(model_area_predictions_dir, selected_sample_polygons[:-5])
arima_input_dir = join(sample_polygons_predictions_dir, 'arima_input')
bau_forecasting_dir = join(sample_polygons_predictions_dir, 'bau_forecasting')
detailed_stats_dir = join(sample_polygons_predictions_dir, 'detailed_stats')
detailed_stats_scenario_dir = join(sample_polygons_predictions_dir, 'detailed_stats_scenario')
for output_dir in (sample_polygons_predictions_dir, arima_input_dir, bau_forecasting_dir,
                   detailed_stats_dir, detailed_stats_scenario_dir):
  makedirs(output_dir, exist_ok=True)

# Statistics

In [None]:
# Print the available scenarios as a ready-to-paste list literal
scenario_lines = ['selected_scenarios = [']
scenario_lines.extend(f'  "{scenario_name}",' for scenario_name in scenarios)
scenario_lines.append(']')
print("\n".join(scenario_lines))

In [None]:
selected_scenarios = [
  "2008",
  "2009",
  "2010",
  "2011",
  "2012",
  "2013",
  "2014",
  "2015",
  "2016",
  "2017",
  "2018",
  "2019",
  "2020",
  "2021",
  "2022",
  "2027_nodef",
  "2027_nodist",
  "2032_nodef",
  "2032_nodist",
  "2037_nodef",
  "2037_nodist",
  "9999_comrec",
  "9999_comrest",
]

# Keep only rasters of the selected scenarios. When reading from uncertainty_dir the
# 'mean' rasters are the predictions and the 'uncertainty' rasters are kept separately;
# when reading from scenarios_dir every matching raster is a prediction.
prediction_rasters = []
uncertainty_rasters = []
source_has_uncertainty = source_dir == uncertainty_dir
for raster_path in prediction_raster_dirs:
  for scenario in selected_scenarios:
    if f"{scenario}__" not in raster_path:
      continue
    if not source_has_uncertainty:
      prediction_rasters.append(raster_path)
      continue
    if 'mean.tif' in raster_path:
      prediction_rasters.append(raster_path)
    if 'uncertainty.tif' in raster_path:
      uncertainty_rasters.append(raster_path)

# Uncertainty statistics are only generated when uncertainty rasters were found
generate_uncertainty_stats = len(uncertainty_rasters) > 0

# Sort rasters chronologically (assuming year is first in the filename)
prediction_rasters.sort()
uncertainty_rasters.sort()

In [None]:
# Calculate precise pixel size for the centre of the template area
# See https://gis.stackexchange.com/questions/142326/calculating-longitude-length-in-miles
# See https://thoughtco.com/degree-of-latitude-and-longitude-distance-4070616

# Proxy length of one degree of latitude in km (the actual change with latitude is non-linear)
lat_dist_equator_km = 110.567
lat_dist_poles_km = 111.699
lat_dist_diff = lat_dist_poles_km - lat_dist_equator_km
lat_dist_change_deg_km = lat_dist_diff / 90 # Linear change in km per degree of latitude
long_dist_equator_km = 111.321 # Equation calculates at different latitudes

# Get the pixel dimensions (in degrees) using the first prediction as a template
if len(prediction_rasters) == 0:
  raise ValueError("No prediction rasters match the selected scenarios. Check 'selected_scenarios' and the prediction raster directory.")
template_prediction = gdal.Open(prediction_rasters[0])
if template_prediction is None:
  raise RuntimeError(f"GDAL could not open the prediction raster: {prediction_rasters[0]}")
template_geotransform = template_prediction.GetGeoTransform()
pixel_height_deg = 0 - template_geotransform[5] # Element 5 is the (negative) pixel height of a north-up raster
pixel_width_deg = template_geotransform[1]

# Approximate pixel size, using the average degree length and ignoring latitude
approx_resolution = (np.average([pixel_height_deg, pixel_width_deg]) * np.average([lat_dist_equator_km, lat_dist_poles_km]) * 1000)
approx_pixel_size_ha = approx_resolution**2 / 10000

print(f"Without precise correction, the approximate resolution is {approx_resolution} m, while the approximate pixel area is {approx_pixel_size_ha} ha.\n")

print("The pixel size will be further corrected based on the position of each sample polygon.")

In [None]:
# Empty dataframes for the statistics: one row per selected scenario, no columns yet
df_base = pd.DataFrame(index = selected_scenarios).rename_axis('scenario')
df_forest_cover_ha = df_base.copy()
df_agbd_mean_mg_ha = df_base.copy()
df_agbd_stdev_mg_ha = df_base.copy()
df_agb_total_tg = df_base.copy()
# The uncertainty dataframes are only needed when uncertainty rasters are present
if generate_uncertainty_stats:
  df_agbd_mean_mg_ha_ci95 = df_base.copy()
  df_agbd_mean_mg_ha_uncertainty = df_base.copy()
  df_agb_total_tg_ci95 = df_base.copy()

# Table of polygon areas, filled while looping over the polygons
df_polygon_area_km2 = pd.DataFrame(columns = ["Name", "Area (km^2)"])

# Copy the first prediction to a raster whose registered 'nodata' value does not occur
# in the data. Masking that copy to a polygon then leaves every pixel inside the polygon
# unmasked, so all of them can be counted.
template_prediction = gdal.Open(prediction_rasters[0])
template_nodatavalue = template_prediction.GetRasterBand(1).GetNoDataValue()
template_array = template_prediction.ReadAsArray()
# Shift the nodata value slightly to obtain one that is absent from the data
absent_nodatavalue = template_nodatavalue + 0.1
assert absent_nodatavalue not in template_array, "New nodata value is present in the template. Change to a value that is absent."
template_no_nodata_dir = join(sample_polygons_predictions_dir, 'template_no_nodata.tif')
export_array_as_tif(template_array, template_no_nodata_dir, template = prediction_rasters[0],
                    nodatavalue=absent_nodatavalue, compress=False)

# Helper: add one polygon's per-scenario values to a statistics dataframe
def _with_polygon_column(df_stats, polygon_name, values):
  """Return `df_stats` with an extra column `polygon_name` holding `values` (one per index row)."""
  return pd.concat([df_stats, pd.DataFrame({polygon_name: values}, index=df_stats.index)], axis=1)

# Loop through each polygon stored in the GPKG to generate statistics.
# NOTE: the values are collected in `prediction_rasters` order but indexed by
# `selected_scenarios`, so this assumes exactly one prediction raster per selected
# scenario with both lists in the same (sorted) order. pandas raises a ValueError
# if the lengths differ.
polygon_area_records = []
for index, row in selected_sample_polygons_gpkg.iterrows():

  # Define the polygon
  sample_polygon_geometry, sample_polygon_name = row["geometry"], row["name"]
  # A MultiPolygon exposes its parts through `.geoms`; a single Polygon has no such
  # attribute, so it is wrapped in a list instead of raising an AttributeError.
  polygons = list(getattr(sample_polygon_geometry, "geoms", [sample_polygon_geometry]))

  # Latitude of the centroid
  polygon_centroid_lat = sample_polygon_geometry.centroid.y

  # Length of one degree of latitude at the polygon's latitude. The linear proxy grows
  # from the equator towards either pole, hence the absolute latitude (a negative,
  # southern latitude would otherwise give a value below the equatorial length).
  latitude_m_per_degree = 1000 * (lat_dist_equator_km + (lat_dist_change_deg_km * abs(polygon_centroid_lat)))

  # First estimate of the pixel size at the polygon's latitude
  precise_pixel_height_m = latitude_m_per_degree * pixel_height_deg
  precise_pixel_width_m = (math.cos((math.pi / 180) * polygon_centroid_lat) * latitude_m_per_degree * pixel_width_deg)
  precise_pixel_size_ha = precise_pixel_height_m * precise_pixel_width_m / 10000

  # Mask the 'no nodata' raster to the polygon with an absent value to count all pixels within the polygon
  with rasterio.open(template_no_nodata_dir) as template_no_nodata:
    no_nodata_template_array_masked, _transform = msk.mask(template_no_nodata, polygons, crop=True, filled=False)

  # Count all (incl. nodata) pixels within the polygon, and estimate their total area.
  # This only depends on the polygon, so it is done once rather than once per raster.
  pixel_count_polygon = np.ma.count(no_nodata_template_array_masked)
  pixels_area_polygon_ha = np.multiply(pixel_count_polygon, precise_pixel_size_ha, dtype='float64')

  # Calculate sample_polygon_geometry area (ellipsoidal as opposed to planimetric)
  sample_polygons_crs = selected_sample_polygons_gpkg.crs
  temp_gdf = gpd.GeoDataFrame({'name': [sample_polygon_name], 'geometry': [sample_polygon_geometry]}, crs=sample_polygons_crs)
  temp_gdf_utm = temp_gdf.estimate_utm_crs()
  polygon_area_ha = np.divide(temp_gdf.to_crs(temp_gdf_utm).area.iloc[0], 10000, dtype='float64')
  polygon_area_records.append({'Name': sample_polygon_name, 'Area (km^2)': polygon_area_ha / 100})

  # Correct the pixel size so that (pixels in polygon) x (pixel size) equals the UTM
  # polygon area. The ratio must be polygon area / pixel area: the reverse amplifies the
  # error of the first estimate instead of removing it.
  if pixel_count_polygon > 0:
    polygon_to_pixel_area_ratio = np.divide(polygon_area_ha, pixels_area_polygon_ha, dtype='float64')
  else:
    polygon_to_pixel_area_ratio = 1.0 # No pixel centre falls inside the polygon: no correction possible
  pixel_size_ha_corrected = np.multiply(precise_pixel_size_ha, polygon_to_pixel_area_ratio, dtype='float64')

  # Create an empty list for each prediction raster statistic
  values_forest_cover_ha, values_agbd_mean_mg_ha, values_agbd_stdev_mg_ha, values_agb_total_tg = [], [], [], []

  # If uncertainty rasters are present, create an empty list for each uncertainty raster statistic
  if generate_uncertainty_stats:
    values_agbd_mean_mg_ha_ci95, values_agbd_mean_mg_ha_uncertainty, values_agb_total_tg_ci95  = [], [], []

  # Loop through prediction rasters
  for prediction_raster in prediction_rasters:

    # Mask prediction to sample_polygon_geometry
    with rasterio.open(prediction_raster) as prediction:
      prediction_array_masked, _transform = msk.mask(prediction, polygons, crop=True, filled=False)

    # Count pixels within polygon, excluding those previously masked i.e. nodata
    pixel_count_polygon_masked = np.ma.count(prediction_array_masked)
    pixels_area_polygon_masked_ha = np.multiply(pixel_count_polygon_masked, pixel_size_ha_corrected, dtype='float64')

    # Calculate forest area
    forest_cover_ha = pixels_area_polygon_masked_ha # Already masked to forest in current workflow

    # Calculate total, mean and stdev of aboveground biomass
    agbd_mean_mg_ha = np.ma.mean(prediction_array_masked, dtype='float64') # Float64 minimises error for large number of values
    agbd_mean_stdev_ha = np.ma.std(prediction_array_masked, dtype='float64')
    agb_total_mg = np.multiply(agbd_mean_mg_ha, forest_cover_ha, dtype='float64')
    agb_total_tg = np.divide(agb_total_mg, 1000000, dtype='float64') # Convert Mg (megagram = ton) to Tg (teragram = megaton)

    # Append results to statistics list
    values_forest_cover_ha.append(forest_cover_ha)
    values_agbd_mean_mg_ha.append(agbd_mean_mg_ha)
    values_agbd_stdev_mg_ha.append(agbd_mean_stdev_ha)
    values_agb_total_tg.append(agb_total_tg)

    if generate_uncertainty_stats:
      # Find the uncertainty raster that shares the prediction's file name stem
      prediction_stem = prediction_raster.split("__mean.tif")[0]
      matching_uncertainty_rasters = [uncertainty_raster for uncertainty_raster in uncertainty_rasters
                                      if prediction_stem in uncertainty_raster]

      if not matching_uncertainty_rasters:
        print(f"There is no uncertainty raster for {prediction_raster.split('/')[-1]}")
        # Record the gap as NaN so the lists stay aligned with the scenario index
        values_agbd_mean_mg_ha_ci95.append(np.nan)
        values_agbd_mean_mg_ha_uncertainty.append(np.nan)
        values_agb_total_tg_ci95.append(np.nan)
      else:
        matching_uncertainty_raster = matching_uncertainty_rasters[-1]

        # Open and mask uncertainty raster to polygon
        with rasterio.open(matching_uncertainty_raster) as uncertainty:
          uncertainty_array_masked, _transform = msk.mask(uncertainty, polygons, crop=True, filled=False)

        # See https://stats.stackexchange.com/questions/223924/how-to-add-up-partial-confidence-intervals-to-create-a-total-confidence-interval#comment426260_223924

        # Compress masked array data to 1D
        prediction_1d = np.ma.compressed(prediction_array_masked)
        uncertainty_1d = np.ma.compressed(uncertainty_array_masked)
        # Convert uncertainty percentages to ratios
        uncertainty_ratios = np.divide(uncertainty_1d, 100, dtype='float64')
        # Multiply the prediction values (mean AGBD Mg/ha) by uncertainty ratios for CI95 values
        prediction_ci95s = np.multiply(prediction_1d, uncertainty_ratios, dtype='float64')

        # Method 1 - Simple calculation of mean CI95 (higher estimate). Assumption: pixel values are completely correlated (all measure the same thing).
        agbd_mean_mg_ha_ci95_1 = np.mean(prediction_ci95s, dtype = 'float64')
        # Method 2 - Square prediction CI95s. Sum and then square root for total CI.
        # Then divide by observations to calculate mean CI95. Assumption: pixel values are completely independent.
        sum_squares = np.sum(np.square(prediction_ci95s, dtype='float64'), dtype='float64')
        total_ci95 = np.sqrt(sum_squares, dtype='float64')
        agbd_mean_mg_ha_ci95_2 = np.divide(total_ci95, np.ma.count(prediction_ci95s), dtype='float64')
        # Method 3 - Used in Liang et al 2023 to calculate change uncertainty. Identical results to method 2.
        predictions_x_uncertainties = np.multiply(prediction_1d, uncertainty_1d, dtype='float64')
        sum_squares_pxu = np.sum(np.square(predictions_x_uncertainties, dtype='float64'), dtype='float64')
        sqrt_divided_sum = np.sqrt(sum_squares_pxu, dtype='float64') / np.sum(prediction_1d, dtype='float64')
        agbd_mean_mg_ha_ci95_3 = np.multiply(np.divide(sqrt_divided_sum, 100, dtype='float64'), agbd_mean_mg_ha, dtype='float64')

        # Method 1 is the one reported; methods 2 and 3 are kept as switchable alternatives
        agbd_mean_mg_ha_ci95 = agbd_mean_mg_ha_ci95_1

        # Calculate total AGB CI95
        agb_total_mg_ci95 = np.multiply(agbd_mean_mg_ha_ci95, forest_cover_ha, dtype='float64')
        agb_total_tg_ci95 = np.divide(agb_total_mg_ci95, 1000000, dtype='float64') # Convert total CI to Tg
        # Calculate percentage uncertainty
        agbd_mean_mg_ha_uncertainty = np.multiply(np.divide(agbd_mean_mg_ha_ci95, agbd_mean_mg_ha, dtype='float64'), 100, dtype='float64')
        # Append results to statistics list
        values_agbd_mean_mg_ha_ci95.append(agbd_mean_mg_ha_ci95)
        values_agbd_mean_mg_ha_uncertainty.append(agbd_mean_mg_ha_uncertainty)
        values_agb_total_tg_ci95.append(agb_total_tg_ci95)

  # Add this polygon as a new column to the main DataFrame of each statistic
  df_forest_cover_ha = _with_polygon_column(df_forest_cover_ha, sample_polygon_name, values_forest_cover_ha)
  df_agbd_mean_mg_ha = _with_polygon_column(df_agbd_mean_mg_ha, sample_polygon_name, values_agbd_mean_mg_ha)
  df_agbd_stdev_mg_ha = _with_polygon_column(df_agbd_stdev_mg_ha, sample_polygon_name, values_agbd_stdev_mg_ha)
  df_agb_total_tg = _with_polygon_column(df_agb_total_tg, sample_polygon_name, values_agb_total_tg)

  if generate_uncertainty_stats:
    df_agbd_mean_mg_ha_ci95 = _with_polygon_column(df_agbd_mean_mg_ha_ci95, sample_polygon_name, values_agbd_mean_mg_ha_ci95)
    df_agbd_mean_mg_ha_uncertainty = _with_polygon_column(df_agbd_mean_mg_ha_uncertainty, sample_polygon_name, values_agbd_mean_mg_ha_uncertainty)
    df_agb_total_tg_ci95 = _with_polygon_column(df_agb_total_tg_ci95, sample_polygon_name, values_agb_total_tg_ci95)

# Build the polygon area table once, instead of growing it row by row inside the loop
df_polygon_area_km2 = pd.DataFrame(polygon_area_records, columns=["Name", "Area (km^2)"])


# Ordered list of the statistics dataframes; the uncertainty dataframes are slotted in
# only when uncertainty statistics were generated
df_stats_list = [df_forest_cover_ha, df_agbd_mean_mg_ha]
if generate_uncertainty_stats:
  df_stats_list.extend([df_agbd_mean_mg_ha_ci95, df_agbd_mean_mg_ha_uncertainty])
df_stats_list.extend([df_agbd_stdev_mg_ha, df_agb_total_tg])
if generate_uncertainty_stats:
  df_stats_list.append(df_agb_total_tg_ci95)

# Index the polygon area (km2) dataframe by the polygon 'Name'
df_polygon_area_km2 = df_polygon_area_km2.set_index('Name')

# Export statistics for ARIMA input.
# Each dataframe is paired explicitly with its output file name. Picking the name by
# comparing dataframe contents (df.equals) would give both files the same name if the
# two dataframes ever held equal values.
arima_exports = {
  "forest_cover_ha": df_forest_cover_ha,
  "agb_total_tg": df_agb_total_tg,
}
df_arima_list = list(arima_exports.values())
for df_filename, df_arima in arima_exports.items():
  # Alternate scenarios contain an underscore (e.g. '2027_nodef'); ARIMA only takes the
  # plain yearly scenarios. The index is cast to str so integer labels do not break `.str`.
  is_alternate_scenario = df_arima.index.astype(str).str.contains("_", regex=False)
  df_noalts = df_arima[~is_alternate_scenario]
  df_noalts.to_csv(join(arima_input_dir, f'{df_filename}.csv'))

# Summary stats: one row per polygon holding its area followed by one column per
# scenario and statistic (scenario name + statistic suffix)
df_forest_cover_ha_t = df_forest_cover_ha.T.rename_axis("Name", axis=1).add_suffix(" forest cover (ha)")
df_agb_total_tg_t = df_agb_total_tg.T.rename_axis("Name", axis=1).add_suffix(" forest AGB (Tg)")
summary_frames = [df_polygon_area_km2, df_forest_cover_ha_t, df_agb_total_tg_t]
if generate_uncertainty_stats:
  # The CI95 columns are appended last, and only when uncertainty stats exist
  df_agb_total_tg_ci95_t = df_agb_total_tg_ci95.T.rename_axis("Name", axis=1).add_suffix(" forest AGB CI95 (Tg)")
  summary_frames.append(df_agb_total_tg_ci95_t)
summary_stats = pd.concat(summary_frames, axis=1).rename_axis("Name", axis=1)
summary_stats.to_csv(join(sample_polygons_predictions_dir, 'summary_stats.csv'))

# Generate detailed stats by polygon (uncertainty columns are included only if available).
# Each statistic is paired explicitly with its column label, in output column order.
# Identifying the dataframes by their contents (df.equals) could mislabel a column
# whenever two statistics happen to hold equal values.
detailed_stat_columns = [
  ("Forest cover (ha)", df_forest_cover_ha),
  ("Forest AGBD mean (Mg / ha)", df_agbd_mean_mg_ha),
]
if generate_uncertainty_stats:
  detailed_stat_columns += [
    ("Forest AGBD CI95 (Mg / ha)", df_agbd_mean_mg_ha_ci95),
    ("Forest AGBD uncertainty (%)", df_agbd_mean_mg_ha_uncertainty),
  ]
detailed_stat_columns += [
  ("Forest AGBD stdev (Mg / ha)", df_agbd_stdev_mg_ha),
  ("Forest AGB total (Tg)", df_agb_total_tg),
]
if generate_uncertainty_stats:
  detailed_stat_columns.append(("Forest AGB total CI95 (Tg)", df_agb_total_tg_ci95))

for polygon_area in df_forest_cover_ha.columns:
  polygon_area_km2 = df_polygon_area_km2.loc[polygon_area, "Area (km^2)"]
  # Work on a copy so the shared, empty df_base is not modified
  df_detailed_stats = df_base.copy()
  df_detailed_stats["Area (km^2)"] = polygon_area_km2
  for stat_col, df_stats in detailed_stat_columns:
    if polygon_area in df_stats.columns:
      df_detailed_stats[stat_col] = df_stats[polygon_area]
  # Write once per polygon, after all statistics have been added
  df_detailed_stats.to_csv(join(detailed_stats_dir, f'{polygon_area}.csv'))

# Generate detailed stats by scenario: regroup the per-polygon CSVs so that each
# scenario gets one CSV with one row per polygon.
# A dedicated name is used for the collection, so the `scenarios` list printed by the
# scenario selection cell is not overwritten.
scenario_frames = {}
# Loop through the CSV files in 'detailed_stats_dir', sorted so the row order is reproducible.
# NOTE(review): CSVs left in the folder by earlier runs are included as well.
for stats_csv in sorted(os.listdir(detailed_stats_dir)):
  if not stats_csv.endswith('.csv'):
    continue # Ignore anything that is not a statistics CSV
  polygon_name = stats_csv[:-4]
  stats_csv_df = pd.read_csv(join(detailed_stats_dir, stats_csv))
  # Loop through each unique scenario in the file
  for scenario in stats_csv_df['scenario'].unique():
    # Keep the rows of the current scenario, replacing the 'scenario' column by 'Name'
    scenario_df = stats_csv_df[stats_csv_df['scenario'] == scenario].drop(columns='scenario')
    scenario_df.insert(0, 'Name', polygon_name)
    scenario_frames.setdefault(scenario, []).append(scenario_df)

# Concatenate once per scenario instead of growing a dataframe inside the loop
for scenario, polygon_frames in scenario_frames.items():
  output_file_path = join(detailed_stats_scenario_dir, f'{scenario}.csv')
  pd.concat(polygon_frames, ignore_index=True).to_csv(output_file_path, index=False)

# Delete template raster
os.remove(template_no_nodata_dir)

# ARIMA

In [None]:
# Select time series. Each CSV must have the 'scenario' (i.e. time period) in the first
# column and the names of the sample polygons in the following columns.
time_series_csv = list(os.listdir(arima_input_dir))
if not time_series_csv:
  print("Run the statistics section first to generate input for ARIMA")

# Print the available CSVs as a ready-to-paste list literal
time_series_lines = ['time_series = [']
time_series_lines.extend(f'  "{series_name}",' for series_name in time_series_csv)
time_series_lines.append(']')
print("\n".join(time_series_lines))

In [None]:
time_series = [
  "forest_cover_ha.csv",
  "agb_total_tg.csv",
]

time_column = 'scenario'
time_label = 'Year' # Name of time unit for plots
forecast_years = 15
verbose = False # Change verbose to True to see summary and diagnostics

# Y-axis label for each known time series; unknown files fall back to their file name
forecast_labels = {
  'forest_cover_ha.csv': 'Forest cover (ha)',
  'agb_total_tg.csv': 'Forest total AGB (Tg)',
}

for csv in time_series:
  forecast_dir = join(bau_forecasting_dir, f"{csv[:-4]}")
  makedirs(forecast_dir, exist_ok=True)
  time_series_df = pd.read_csv(join(arima_input_dir,csv), header=0)
  starting_time_unit = time_series_df[time_column].astype('int').iloc[0]
  study_areas = list(time_series_df.columns[1:])
  forecast_label = forecast_labels.get(csv, csv[:-4])

  # Forecasts of this time series, one dataframe per study area
  study_area_forecasts = []

  for study_area in study_areas:
    filename = study_area.replace(' ','_').lower()
    # https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.auto_arima.html
    model = pm.auto_arima(time_series_df[study_area],
                          seasonal=False,
                          trace=verbose,
                          # start_p=1,
                          # start_q=1,
                          # test='adf',       # use adftest to find optimal 'd'
                          # max_p=3, max_q=3, # maximum p and q
                          # m=1,              # frequency of series
                          # d=None,           # let model determine 'd'
                          # stationary=False,
                          # suppress_warnings=True,
                          # information_criterion = 'aic', # Either ‘aic’, ‘bic’, ‘hqic’, ‘oob’
                          stepwise=True,
                          # n_jobs = -1,
                          maxiter = 10000,
                          error_action="ignore"
                          )
    if verbose == True:
      print(model.summary())
      model.plot_diagnostics(figsize=(7,5))
      plt.show()

    # Forecast. The forecast periods directly follow the observed ones, which are
    # assumed to be consecutive and to start at 'starting_time_unit'.
    n_periods = forecast_years
    n_observations = len(time_series_df[study_area])
    fc, confint = model.predict(n_periods=n_periods, return_conf_int=True)
    # np.asarray accepts both return types of predict (pandas Series or numpy array)
    fc_values = np.asarray(fc)
    confint = np.asarray(confint)
    index_of_fc = np.arange(
        n_observations+starting_time_unit,
        n_observations+n_periods+starting_time_unit
        )

    # Create series
    fc_series = pd.Series(fc_values, index=index_of_fc, name=f'{study_area} forecast')
    lower_series = pd.Series(confint[:, 0], index=index_of_fc, name=f'{study_area} lower CI')
    upper_series = pd.Series(confint[:, 1], index=index_of_fc, name=f'{study_area} upper CI')

    # Compile dataframe
    df_forecast = pd.concat([fc_series,upper_series,lower_series],axis=1)
    df_forecast.index.name = time_label
    study_area_forecasts.append(df_forecast)

    # Plot the observations, the forecast and its confidence band
    plt.plot(time_series_df[time_column],time_series_df[study_area])
    plt.plot(fc_series.index, fc_series.values, color='darkgreen')
    plt.fill_between(lower_series.index, lower_series, upper_series, color='k', alpha=.15)
    plt.title(f'{study_area}')
    plt.xlabel(f'{time_label}')
    plt.ylabel(f'{forecast_label}')
    plt.savefig(join(forecast_dir,f'plot_{filename}.png'))
    plt.close()

    # Save results
    df_forecast.to_csv(join(forecast_dir,f'results_{filename}.csv'))

  # Combine the forecasts of all study areas side by side, one row per forecast period.
  # Built from this run's results rather than from every CSV in the folder, so result
  # files left over from earlier runs cannot leak into the combined table.
  if study_area_forecasts:
    forecast_final_df = pd.concat(study_area_forecasts, axis=1)
  else:
    forecast_final_df = pd.DataFrame()
  forecast_final_df.index.name = time_label
  forecast_final_df.to_csv(join(bau_forecasting_dir,f'{csv}'))

# Gsheet

In [None]:
# https://docs.gspread.org/en/latest/user-guide.html#opening-a-spreadsheet
# The following scenarios must be predicted: historic scenarios 2008 - 2022,
# alternate scenarios 'no_def' and 'no_dist' for 2027, 2032, 2037 and '9999_comrec'.
# 2022 - 2037 must be forecast using the ARIMA section.

def _upload_dataframe(worksheet, df):
  """Write `df` (header row followed by its values) to `worksheet`, starting at the top-left cell.

  Missing values are uploaded as empty cells, as NaN cannot be sent as valid JSON.
  """
  df_upload = df.astype(object).where(df.notna(), "")
  worksheet.update([df_upload.columns.values.tolist()] + df_upload.values.tolist())

def _create_polygon_sheets(spreadsheet, template_sheet, title_prefix, polygon_names):
  """Duplicate `template_sheet` once per polygon at the end of `spreadsheet`, then delete the template.

  Each copy is named '<title_prefix>: <polygon name>' and receives the 1-based
  position of its polygon in cell A1.
  """
  first_insert_index = len(spreadsheet.worksheets()) # Index just after the last existing sheet
  for polygon_no, polygon_name in enumerate(polygon_names, start=1):
    polygon_sheet = spreadsheet.duplicate_sheet(source_sheet_id = template_sheet.id,
                                                insert_sheet_index = first_insert_index - 1 + polygon_no,
                                                new_sheet_name = f"{title_prefix}: {polygon_name}")
    polygon_sheet.update_cell(1, 1, polygon_no)
  spreadsheet.del_worksheet(template_sheet)

# Authenticate
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

# Define AGB stats template, needs to be placed in the 'sample_polygons_predictions_dir'
# Should also be the last sheet of this name opened.
gspread_agb_stats_template = gc.open('agb_stats_template')

# Define the Gsheet name
gspread_agb_stats_name = f"stats_{selected_model}_{selected_sample_polygons[:-5]}"
gpspread_agb_stats_backup_name = f"{gspread_agb_stats_name}_backup"

# Delete the previous backup Gsheet (if it exists). Only "not found" is treated as
# "no backup yet"; any other failure is raised instead of being silently ignored.
gpsread_agb_stats_backup = None
try:
  gpsread_agb_stats_backup = gc.open(gpspread_agb_stats_backup_name)
except gspread.exceptions.SpreadsheetNotFound:
  print("There is no existing stats Gsheet backup.")
else:
  gc.del_spreadsheet(gpsread_agb_stats_backup.id) # Danger: deletes permanently
  print("The previous stats Gsheet backup for this model, area and sample polygon combination has been deleted and replaced.")

# Copy the existing Gsheet (if it exists) to the backup, then delete it. If the backup
# copy fails, the error is raised before anything is deleted.
gspread_agb_stats = None
try:
  gspread_agb_stats = gc.open(gspread_agb_stats_name)
except gspread.exceptions.SpreadsheetNotFound:
  print("Creating the stats Gsheet for this model, area and sample polygon combination for the first time.")
else:
  gc.copy(gspread_agb_stats.id, title=gpspread_agb_stats_backup_name, copy_permissions=True)
  gc.del_spreadsheet(gspread_agb_stats.id) # Danger: deletes permanently
  print("The previous stats Gsheet for this model, area and sample polygon combination has been deleted and replaced.")

# Create a new gsheet from the template
new_sheet = gc.copy(gspread_agb_stats_template.id, title=gspread_agb_stats_name, copy_permissions=False)
gspread_agb_stats = gc.open_by_key(new_sheet.id)

# Define worksheets
summary_stats_sheet = gspread_agb_stats.worksheet("Summary stats")
arima_forest_cover_sheet = gspread_agb_stats.worksheet("BAU with CI forest cover (ha)")
arima_total_agb_sheet = gspread_agb_stats.worksheet("BAU with CI forest AGB (Tg)")
template_forest_cover = gspread_agb_stats.worksheet("Template forest cover (ha)")
template_forest_agb = gspread_agb_stats.worksheet("Template forest AGB (Tg)")
template_potential_agb = gspread_agb_stats.worksheet("Template potential AGB (%)")

# Add summary stats
summary_stats_dir = join(sample_polygons_predictions_dir, 'summary_stats.csv')
df_summary_stats = pd.read_csv(summary_stats_dir)
df_summary_stats = df_summary_stats.rename(columns = {"Unnamed: 0":"Name"})
_upload_dataframe(summary_stats_sheet, df_summary_stats)

# Add ARIMA forecasts for forest cover (ha)
arima_forest_cover_ha_dir = join(bau_forecasting_dir, 'forest_cover_ha.csv')
df_arima_forest_cover_ha = pd.read_csv(arima_forest_cover_ha_dir)
df_arima_forest_cover_ha = df_arima_forest_cover_ha.rename(columns = {"Unnamed: 0":"Year"})
_upload_dataframe(arima_forest_cover_sheet, df_arima_forest_cover_ha)

# Add ARIMA forecasts for forest AGB total (Tg)
arima_agb_total_tg_dir = join(bau_forecasting_dir, 'agb_total_tg.csv')
df_arima_agb_total_tg = pd.read_csv(arima_agb_total_tg_dir)
df_arima_agb_total_tg = df_arima_agb_total_tg.rename(columns = {"Unnamed: 0":"Year"})
_upload_dataframe(arima_total_agb_sheet, df_arima_agb_total_tg)

# Create the 'Forest cover (ha)', 'Forest AGB (Tg)' and 'Potential AGB (%)' sheets for all polygons
sample_polygon_names = list(df_summary_stats["Name"])
_create_polygon_sheets(gspread_agb_stats, template_forest_cover, "Forest cover (ha)", sample_polygon_names)
_create_polygon_sheets(gspread_agb_stats, template_forest_agb, "Forest AGB (Tg)", sample_polygon_names)
_create_polygon_sheets(gspread_agb_stats, template_potential_agb, "Potential AGB (%)", sample_polygon_names)

# Disconnect runtime

In [None]:
# Useful for stopping background execution
# NOTE(review): presumably this releases the Colab runtime assigned to the notebook, which
# would discard the kernel state; outputs were already written to Drive by earlier cells.
runtime.unassign()