# Create Data Sets

This file allows the creation of the data sets from netcdf files of:

* Agroclimatic Indicators: https://cds.climate.copernicus.eu/cdsapp#!/dataset/sis-agroclimatic-indicators?tab=overview
* Crop data for years 2005 and 2010: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/PRFF8V
* Spatial data relevant to crops: any additional spatial layers you want to add must be provided as netcdf4 files.

In [None]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import xarray as xr
import numpy as np
import re
import json
import rasterio
import pickle
import json

from shared_methods import compute_ten_day_feature, compute_seasonal_feature

In [None]:
# Name of the CSV written at the end of the notebook.
out_file = 'climate_monthly_seasonal_2005_2010_with_spatial_features.csv'

# The data folder structure is described in a stored JSON file.
with open('json_data/file_data.json', 'r') as config_handle:
    file_data = json.load(config_handle)

# Unpack the settings used by the cells below; a missing key raises KeyError.
(
    seasonal_data_folder,
    ten_day_data_folder,
    crop_data_2005_file,
    crop_data_2010_file,
    spatial_attributes_folder,
    ten_day_feature_files,
    seasonal_feature_files,
    spatial_attribute_files,
) = (
    file_data[key]
    for key in (
        'seasonal_data_folder',
        'ten_day_data_folder',
        'crop_data_2005_file',
        'crop_data_2010_file',
        'spatial_attributes_folder',
        'ten_day_feature_files',
        'seasonal_feature_files',
        'spatial_attribute_files',
    )
)

In [None]:
# Build the cleaned crop table for one crop from the 2005 and 2010 crop files.
# Set the crop of interest here.
crop = 'maize'

# The crop columns are looked up under the first four letters of the crop
# name followed by '_a'; after cleaning they carry the full crop name.
crop_prefix = crop[:4]
short_2010, short_2005 = f'{crop_prefix}_a_2010', f'{crop_prefix}_a_2005'
full_2010, full_2005 = f'{crop}_a_2010', f'{crop}_a_2005'

# Load both crop tables.
crop_data_2005 = pd.read_csv(crop_data_2005_file, encoding="ISO-8859-1")
crop_data_2010 = pd.read_csv(crop_data_2010_file, encoding="ISO-8859-1")

# Join the two years on 'alloc_key', which refers to the same place on the
# Earth; columns present in both tables get a '_2010' / '_2005' suffix.
merged = crop_data_2010.merge(
    crop_data_2005,
    how='inner',
    on='alloc_key',
    suffixes=['_2010', '_2005'],
)

# Keep only the columns of interest and give them their final names.
kept_columns = ['alloc_key', 'x', 'y', 'iso3_2010', short_2010, short_2005]
final_names = {
    short_2010: full_2010,
    short_2005: full_2005,
    'iso3_2010': 'iso3',
    'x': 'lon',
    'y': 'lat',
}
merged = merged[kept_columns].rename(columns=final_names)

# Drop rows with missing values, then keep only the rows whose crop value is
# strictly positive in both years.
data = merged.dropna()
positive_in_both_years = (data[full_2005] > 0) & (data[full_2010] > 0)
data = data[positive_in_both_years]

In [None]:
# Add the seasonal indicators to the data set.
# Each indicator listed below is processed for every year in `years`; the work
# is delegated to compute_seasonal_feature, whose return value replaces `data`.
years = ['2010', '2009', '2006', '2005', '2004']
seasonal_features = ['CDD', 'CFD', 'CWD', 'WW', 'WSDI', 'CSDI']

for indicator in tqdm(seasonal_features):
    # One source file per indicator, located inside the seasonal data folder.
    indicator_path = seasonal_data_folder + seasonal_feature_files[indicator]
    for season_year in years:
        data = compute_seasonal_feature(
            indicator, indicator_path, season_year, data
        )

In [None]:
# Add the ten-day indicators to the data set.
# Every key of the ten_day_feature_files dict is processed. To restrict the
# set, replace ten_day_features with an explicit list of keys from that dict.
ten_day_features = ten_day_feature_files.keys()
years = ['2010', '2009', '2006', '2005', '2004']

for indicator in tqdm(ten_day_features):
    # One source file per indicator, located inside the ten-day data folder.
    indicator_path = ten_day_data_folder + ten_day_feature_files[indicator]
    for indicator_year in years:
        data = compute_ten_day_feature(
            indicator, indicator_path, indicator_year, data
        )

In [None]:
# Collapse each bunch of years into a single column holding the row-wise mean
# of the bunch, stored under the bunch's first year; the columns of the other
# years in the bunch are dropped. This is justified because the crop data in
# this project is a mean over 1 year either side of the year in question.
#
# For our project, there was no data for 2011 interim agroclimatic indicators,
# so a mean of 2009 and 2010 sufficed.
year_bunches = [['2010', '2009'], ['2005', '2004', '2006']]

# Feature names are recovered from the columns that carry the '-2010' tag.
features = [col.replace('-2010', '') for col in data.columns if '-2010' in col]

for feature_name in features:
    for bunch in year_bunches:
        target_year, *other_years = bunch
        bunch_columns = [f'{feature_name}-{year}' for year in bunch]
        # skipna=True: a missing year is left out of the mean rather than
        # turning the whole row into NaN.
        data[f'{feature_name}-{target_year}'] = data[bunch_columns].mean(
            axis=1, skipna=True
        )
        data = data.drop(
            columns=[f'{feature_name}-{year}' for year in other_years]
        )

In [None]:
# Sample every spatial raster at the coordinates of the data set and store the
# sampled values as a new column named after the attribute.
spatial_attributes = [
    'soil_types',
    'climate_zones',
    'elevation',
    'slope',
    'irrigation',
]

# Attributes treated as categorical: these are sampled with 'nearest'.
categorical_spatial = ['soil_types', 'climate_zones']

# Both coordinate arrays share the single dimension 'z', so the interpolation
# is evaluated per (lat, lon) pair rather than on the full lat x lon grid.
lats_ = xr.DataArray(data['lat'].values, dims='z')
lons_ = xr.DataArray(data['lon'].values, dims='z')

for attribute in tqdm(spatial_attributes):
    raster_path = spatial_attributes_folder + spatial_attribute_files[attribute]
    # NOTE(review): xr.open_rasterio is deprecated in recent xarray releases in
    # favour of rioxarray - confirm against the pinned xarray version.
    with xr.open_rasterio(raster_path) as raster:
        grid = raster.load()
    # Use the same axis names as the data set and drop length-one dimensions.
    grid = grid.rename({'x': 'lon', 'y': 'lat'}).squeeze()
    # The categorical features must not be interpolated, so they stick to the
    # nearest cell; everything else is interpolated linearly.
    method = 'nearest' if attribute in categorical_spatial else 'linear'
    data[attribute] = grid.interp(lat=lats_, lon=lons_, method=method)

In [None]:
# Write the assembled data set to the CSV file named by out_file.
data.to_csv(path_or_buf=out_file)