# Supplementary 1

Jorge Lizarazo & Emna Gharbia

<div style="text-align: left"> 
This document outlines the process of loading and merging forestry data from a GeoPackage file using the Fiona Python library. 
</div>

These are the libraries used:

In [1]:
import fiona
import pandas as pd
from dask import delayed, compute

## Listing Available Layers
<div style="text-align: left"> 
First, we store the path to our GeoPackage file in a variable called input_file. We then list all the layers available in the file and define a function that prints the columns of each layer, in order to understand the structure and contents of the dataset.
 </div>

In [2]:


# GeoPackage file that every later cell reads from
input_file = "CARTE_ECO_ORI_PROV.gpkg"

# Ask Fiona for the names of the layers stored in the file and show them
layers = fiona.listlayers(input_file)
print(f"Available layers: {layers}")

# Function to read and display columns of each layer
def list_layer_columns(input_file, layer):
    """Print the attribute column names declared in the schema of one layer.

    Parameters
    ----------
    input_file : str
        Path of the dataset passed to ``fiona.open``.
    layer : str
        Name of the layer whose schema is inspected.

    Returns
    -------
    None
        The column names are printed, not returned.
    """
    with fiona.open(input_file, layer=layer) as dataset:
        # Only the schema is consulted; no features are iterated.
        field_names = [field for field in dataset.schema['properties']]
    print(f"Columns in layer '{layer}': {field_names}")

# Show the schema columns of every layer found in the file, one layer at a time
for layer_name in layers:
    list_layer_columns(input_file, layer=layer_name)




Available layers: ['pee_ori', 'meta_ori', 'vue_peup_etage_ori', 'vue_peup_essence_ori', 'vue_peup_meta_ori', 'etage_ori', 'essence_ori', 'layer_styles']
Columns in layer 'pee_ori': ['geocode', 'origine', 'an_origine', 'perturb', 'an_perturb', 'reb_ess1', 'reb_ess2', 'reb_ess3', 'et_domi', 'part_str', 'type_couv', 'gr_ess', 'cl_dens', 'cl_haut', 'cl_age', 'etagement', 'couv_gaule', 'cl_pent', 'dep_sur', 'cl_drai', 'type_eco', 'co_ter', 'type_ter', 'strate', 'met_at_str', 'superficie', 'toponyme', 'no_prg', 'ver_prg', 'shape_length', 'shape_area']
Columns in layer 'meta_ori': ['geocode', 'latitude', 'longitude', 'no_prg', 'ver_prg', 'ver_carto', 'ver_eco', 'ver_cmp', 'statut_acq', 'met_prod', 'pro_sou', 'an_pro_sou', 'an_saisie', 'met_ori', 'pro_ori', 'an_pro_ori', 'resolution', 'or_cl_pent', 'correction', 'dt_correct', 'ty_correct', 'in_etage', 'in_essence', 'us_for', 'in_son_pee', 'no_uco', 'in_cmp_pee', 'in_climat', 'in_contr', 'in_produ', 'in_station', 'in_bois', 'in_biom']
Columns i

## Selecting and Extracting Specific Columns
<div style="text-align: left"> 
We use the read_layer_attributes function to read specific columns from each layer. This function leverages fiona for efficient data loading and Dask for parallel processing. 
 </div>


In [3]:
@delayed
def read_layer_attributes(input_file, layer, columns):
    """Read selected attribute columns of one layer into a pandas DataFrame.

    The function is wrapped with ``dask.delayed``: calling it only builds a
    lazy task, and the layer is read when that task is passed to ``compute``.

    Parameters
    ----------
    input_file : str
        Path of the dataset passed to ``fiona.open``.
    layer : str
        Name of the layer to read.
    columns : iterable of str
        Attribute names to keep. Only feature properties are copied; the
        geometry is not included in the result.

    Returns
    -------
    pandas.DataFrame
        One row per feature and one column per requested attribute, in the
        requested order. A layer without features yields an empty frame that
        still carries the requested columns.

    Raises
    ------
    KeyError
        If a requested column is not declared in the layer schema.
    """
    # Drop repeated names while keeping the caller's order.
    wanted = list(dict.fromkeys(columns))
    with fiona.open(input_file, layer=layer) as src:
        # Fail early with a readable message instead of a bare KeyError
        # raised from inside the per-feature comprehension.
        missing = [col for col in wanted if col not in src.schema['properties']]
        if missing:
            raise KeyError(f"Layer '{layer}' has no column(s): {missing}")
        records = [{col: feat['properties'][col] for col in wanted} for feat in src]
    # Passing columns= keeps the expected columns even when the layer is
    # empty, so later lookups such as df['geocode'] do not fail.
    return pd.DataFrame(records, columns=wanted)

<div style="text-align: left"> 
We load the layers concurrently using Dask to improve performance.
</div>

In [4]:
# Attribute columns to load from each layer (every list includes 'geocode')
meta_ori_columns = ['latitude', 'longitude', 'no_uco', 'in_essence', 'geocode']
pee_ori_columns = [
    'an_origine', 'type_ter', 'superficie', 'no_prg',
    'shape_length', 'shape_area', 'geocode',
]
vue_peup_etage_ori_columns = ['etage', 'densite', 'cl_age_et', 'cl_age', 'geocode']
vue_peup_essence_ori_columns = ['essence', 'geocode', 'st_ess_pc']

# Build one lazy Dask task per layer; nothing is read from disk at this point
layer_requests = [
    ('meta_ori', meta_ori_columns),
    ('pee_ori', pee_ori_columns),
    ('vue_peup_etage_ori', vue_peup_etage_ori_columns),
    ('vue_peup_essence_ori', vue_peup_essence_ori_columns),
]
pending_reads = [
    read_layer_attributes(input_file, layer_name, layer_columns)
    for layer_name, layer_columns in layer_requests
]

# Run all reads in a single compute() call and unpack one DataFrame per layer.
# Note: stations_for_pee_ori_d holds the 'vue_peup_etage_ori' layer.
meta_ori_d, pee_ori_d, stations_for_pee_ori_d, vue_peup_essence_ori_d = compute(
    *pending_reads
)


## Merging Datasets
<div style="text-align: left"> 
We perform join operations on the geocode column to merge the datasets.
 </div>

In [5]:
# Cast the join key to str in each frame so the merges compare like with like
for frame in (meta_ori_d, pee_ori_d, stations_for_pee_ori_d):
    frame['geocode'] = frame['geocode'].astype(str)

# Inner joins on geocode: only geocodes present in all three frames are kept.
# NOTE(review): vue_peup_essence_ori_d is loaded earlier but is not merged
# here - confirm whether leaving it out is intentional.
merged_df = (
    meta_ori_d
    .merge(pee_ori_d, on='geocode', how='inner')
    .merge(stations_for_pee_ori_d, on='geocode', how='inner')
)


## Exporting

In [6]:
# Destination of the merged table
csv_output_file = "data_final_forestry.csv"

# Write the merged frame as CSV without the pandas index column
merged_df.to_csv(path_or_buf=csv_output_file, index=False)

# A JSON Lines export is currently disabled; to enable it, choose an output
# path and call merged_df.to_json(<path>, orient='records', lines=True).

print(f"CSV file saved to: {csv_output_file}")

CSV file saved to: data_final_forestry.csv
