# Create Data Sets

This notebook builds the data set by combining the following sources (NetCDF files for the climate and spatial data, CSV files for the crop data):

* Agroclimatic Indicators: https://cds.climate.copernicus.eu/cdsapp#!/dataset/sis-agroclimatic-indicators?tab=overview
* Crop data for years 2005 and 2010: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/PRFF8V
* Spatial data relevant to crops: *** Need link for this ***

In [None]:
!pip install tqdm==4.42.1
!pip install requests
!pip install tabulate
!pip install "colorama>=0.3.8"
!pip install future
!pip install netcdf4==1.5.3
!pip install xarray
!pip install numpy==1.18.1
!pip install rasterio
!pip install xgboost==0.90
!pip install joblib==0.14.1

In [None]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import xarray as xr
import numpy as np
import re
import json
import rasterio
import pickle
import joblib
import xgboost as xgb
import pandas as pd
import os, sys
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import xgboost as xgb
from sklearn.metrics import r2_score
import random

In [2]:
# This cell holds the locations of the input data (folders and file names) and the name of the
# file the created data set is saved to.

# Add the locations of the data folders here, and the crop file locations
seasonal_data_folder = '../data/seasonal_era_agriclimatic/'
ten_day_data_folder = '../data/10-day-data/'
crop_data_2005_file = '../data/spam2005v3r2_global_yield/spam2005V3r2_global_Y_TA.csv'
crop_data_2010_file = '../data/spam2010v1r1_global_yield/spam2010V1r1_global_Y_TA.csv'

# The climate feature files below are so named because that is their names on the Copernicus Website:
# https://cds.climate.copernicus.eu/cdsapp#!/dataset/sis-agroclimatic-indicators?tab=overview

# Feature name -> file name (inside ten_day_data_folder) of the 10-day indicators.
# Every key must appear only once: a repeated key silently overwrites the earlier entry.
ten_day_feature_files = {
    'BEDD': 'BEDD_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'FD': 'FD_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'R20mm': 'R20mm_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'R10mm': 'R10mm_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'ID': 'ID_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TG': 'TG_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TN': 'TN_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'DTR': 'DTR_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'RR1': 'RR1_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'RR': 'RR_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'SDII': 'SDII_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'SU': 'SU_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TNn': 'TNn_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TR': 'TR_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TX': 'TX_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TXn': 'TXn_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TXx': 'TXx_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
}

# Feature name -> file name (inside seasonal_data_folder) of the seasonal indicators.
seasonal_feature_files = {
    'CDD': 'CDD_C3S-glob-agric_WFDEI_hist_season_19810101-20101231_v1.nc',
    'CFD': 'CFD_C3S-glob-agric_WFDEI_hist_season_19810101-20101231_v1.nc',
    'CWD': 'CWD_C3S-glob-agric_WFDEI_hist_season_19810101-20101231_v1.nc',
    'WW': 'WW_C3S-glob-agric_WFDEI_hist_season_19810101-20101231_v1.nc',
    'WSDI': 'WSDI_C3S-glob-agric_WFDEI_hist_season_19810101-20101231_v1.nc',
    'CSDI': 'CSDI_C3S-glob-agric_WFDEI_hist_season_19810101-20101231_v1.nc',
}

spatial_attributes_folder = '../data/spatial_attributes_netcdf/'

# Attribute name -> file name (inside spatial_attributes_folder) of the spatial attributes.
spatial_attribute_files = {
    'soil_types': 'soil_types.nc',
    'climate_zones': 'climate_zones.nc',
    'slope': 'slope_resampled_percent.nc',
    'elevation': 'elevation_resampled.nc',
    'irrigation': 'irrigation_aei_rf.nc',
}

# Name of the CSV file the finished data set is written to.
out_file = 'climate_monthly_seasonal_2005_2010_with_spatial_features.csv'

In [None]:
# Load in the crop data and clean it.

# Choose the crop here
crop = 'maize'

# Read the crop data in
crop_data_2005 = pd.read_csv(crop_data_2005_file, encoding='ISO-8859-1')
crop_data_2010 = pd.read_csv(crop_data_2010_file, encoding='ISO-8859-1')

# Merge the data together on the 'alloc_key' which refers to the same place on the Earth.
# Columns present in both files get the '_2010' / '_2005' suffix of the file they came from.
merged = crop_data_2010.merge(crop_data_2005, how='inner', on='alloc_key', suffixes=('_2010', '_2005'))

# Only keep the columns we are interested in. The crop columns in the input files are
# addressed by the first four letters of the crop name (crop[0:4]).
kept_columns = ['alloc_key', 'x', 'y', 'iso3_2010', f'{crop[0:4]}_a_2010', f'{crop[0:4]}_a_2005']
merged = merged[kept_columns]

# Rename the columns
merged = merged.rename(columns={
    f'{crop[0:4]}_a_2010': f'{crop}_a_2010',
    f'{crop[0:4]}_a_2005': f'{crop}_a_2005',
    'iso3_2010': 'iso3',
    'x': 'lon',
    'y': 'lat'
})

# Remove rows containing NaNs and only keep rows whose yield is non zero in both years.
# The explicit copy makes `data` an independent DataFrame, so the feature columns assigned
# to it in the following cells do not trigger pandas' SettingWithCopyWarning.
data = merged.dropna()
data = data[(data[f'{crop}_a_2005'] > 0) & (data[f'{crop}_a_2010'] > 0)].copy()


In [None]:
# Compute the 10 day features and converts them to monthly features
def compute_ten_day_feature(feature, file_location, year, dataset, interpolation_method='linear'):
    """Add the monthly values of one 10-day feature for one year to ``dataset``.

    The feature is read from the file at ``file_location`` and interpolated at the
    ``lat``/``lon`` position of every row of ``dataset`` for the time stamps
    ``{year}-{month}-05``, ``{year}-{month}-15`` and ``{year}-{month}-25`` of each month.
    The three values of a month are summed into the column ``{feature}-{month}-{year}``.

    Parameters
    ----------
    feature : str
        Name of the variable to read from the file; also used as column-name prefix.
    file_location : str
        Path of the file opened with ``xr.open_dataset``.
    year : str
        Year to extract, used both in the time selection and in the column names.
    dataset : pandas.DataFrame
        Data set with a ``lat`` and a ``lon`` column. The new columns are assigned to
        this object in place before the intermediate 10-day columns are dropped.
    interpolation_method : str, optional
        Method forwarded to the xarray ``interp`` call.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``dataset.drop``: it holds the twelve monthly columns but
        not the intermediate 10-day columns.
    """
    days = ['05', '15', '25']
    months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']

    # Sample at the rows of the data frame that was passed in (not at a notebook global),
    # so the function agrees with compute_seasonal_feature and works for any data frame.
    lats = xr.DataArray(dataset['lat'].values, dims='z')
    lons = xr.DataArray(dataset['lon'].values, dims='z')

    # Load everything into memory so the file can be closed before the interpolation.
    with xr.open_dataset(file_location) as ds:
        feature_data = ds.load()

    ten_day_columns = []
    for month in months:
        for day in days:
            column = f'{feature}-{month}-{day}-{year}'
            timed = feature_data.sel(time=f'{year}-{month}-{day}').squeeze()
            feature_interpolated = timed.interp(lat=lats, lon=lons, method=interpolation_method)
            dataset[column] = getattr(feature_interpolated, feature)
            ten_day_columns.append(column)

    # Take the sum of the three 10-day values of each month, making the features monthly
    # as opposed to 10-day. Each monthly column is computed exactly once.
    for month in months:
        dataset[f'{feature}-{month}-{year}'] = (
            dataset[f'{feature}-{month}-05-{year}']
            + dataset[f'{feature}-{month}-15-{year}']
            + dataset[f'{feature}-{month}-25-{year}']
        )

    return dataset.drop(columns=ten_day_columns)

# Computes the seasonal features.
def compute_seasonal_feature(feature, file_location, year, dataset, interpolation_method='linear'):
    """Add the four quarterly values of one seasonal feature for one year to ``dataset``.

    For every quarter the feature is read from the file at ``file_location`` at the time
    stamp assigned to that quarter, interpolated at the ``lat``/``lon`` position of every
    row of ``dataset`` and stored in the column ``{feature}-{quarter}-{year}``.

    Parameters
    ----------
    feature : str
        Name of the variable to read from the file; also used as column-name prefix.
    file_location : str
        Path of the file opened with ``xr.open_dataset``.
    year : str
        Year to extract, used both in the time selection and in the column names.
    dataset : pandas.DataFrame
        Data set with a ``lat`` and a ``lon`` column; it is extended in place.
    interpolation_method : str, optional
        Method forwarded to the xarray ``interp`` call.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, now holding the four quarterly columns.
    """
    sample_lats = xr.DataArray(list(dataset['lat'].values), dims='z')
    sample_lons = xr.DataArray(list(dataset['lon'].values), dims='z')

    # Month-day part of the time stamp that is selected for each quarter.
    quarter_month_day = {'Q1': '01-16', 'Q2': '04-16', 'Q3': '07-16', 'Q4': '10-16'}

    # The data is fully loaded, so it stays usable after the file has been closed.
    with xr.open_dataset(file_location) as source:
        loaded = source.load()

    for quarter, month_day in quarter_month_day.items():
        snapshot = loaded.sel(time=f'{year}-{month_day}').squeeze()
        at_points = snapshot.interp(lat=sample_lats, lon=sample_lons, method=interpolation_method)
        dataset[f'{feature}-{quarter}-{year}'] = getattr(at_points, feature)

    return dataset

In [None]:
# Add the seasonal features of every listed year to the data set, one feature file at a time.
years = ['2010', '2009', '2006', '2005', '2004']
seasonal_features = ['CDD', 'CFD', 'CWD', 'WW', 'WSDI', 'CSDI']

for feature in tqdm(seasonal_features):
    # The file name is looked up in the seasonal_feature_files dict by feature name.
    file_location = f'{seasonal_data_folder}{seasonal_feature_files[feature]}'
    for year in years:
        data = compute_seasonal_feature(
            feature=feature,
            file_location=file_location,
            year=year,
            dataset=data,
        )

In [None]:
# By default every feature listed in the ten_day_feature_files dict (defined near the top of
# the notebook) is processed. To use a subset, replace ten_day_features by the chosen names.
years = ['2010', '2009', '2006', '2005', '2004']
ten_day_features = ten_day_feature_files.keys()

for feature in tqdm(ten_day_features):
    # The file name is looked up in the ten_day_feature_files dict by feature name.
    file_location = f'{ten_day_data_folder}{ten_day_feature_files[feature]}'
    for year in years:
        data = compute_ten_day_feature(
            feature=feature,
            file_location=file_location,
            year=year,
            dataset=data,
        )

In [None]:
# This cell takes bunches of multiple years of data and replaces them by the mean over those years.
# This is justified because the crop data in this project is a mean over 1 year either side of the
# year in question.

# For our project, there was no data for 2011 interim agriclimatic indicators, so a mean of 2009 and 2010 sufficed.

# The first year of each bunch names the column that receives the mean; the columns of the
# remaining years of the bunch are removed afterwards.
year_bunches = [['2010', '2009'], ['2005', '2004', '2006']]

# Feature names without the year suffix, taken from the columns that end up with '-2010' in them.
features = fs = [col.replace('-2010', '') for col in data.columns if '-2010' in col]

columns_to_drop = []
for f in features:
    for b in year_bunches:
        yearly_columns = [f'{f}-{year}' for year in b]
        # NaNs are skipped, so a row keeps a mean as long as one year of the bunch has a value.
        data[yearly_columns[0]] = data[yearly_columns].mean(axis=1, skipna=True)
        columns_to_drop.extend(yearly_columns[1:])

# Drop all the merged-away year columns in one go instead of copying the frame once per feature.
data = data.drop(columns=columns_to_drop)

In [None]:
# Add one column per spatial attribute to the data set, sampled at each row's lat/lon.
spatial_attributes = ['soil_types', 'climate_zones', 'elevation', 'slope', 'irrigation']

# Attributes holding categories rather than quantities.
categorical_spatial = ['soil_types', 'climate_zones']

lats_ = xr.DataArray(list(data['lat'].values), dims='z')
lons_ = xr.DataArray(list(data['lon'].values), dims='z')

for feature in tqdm(spatial_attributes):
    raster_path = spatial_attributes_folder + spatial_attribute_files[feature]
    with xr.open_rasterio(raster_path) as ds:
        feature_data = ds.load()
    # Use the same coordinate names as the sample points.
    feature_data = feature_data.rename({'x': 'lon', 'y': 'lat'}).squeeze()

    # The categorical features must not be interpolated between categories, so they take the
    # value of the nearest grid point; every other attribute is interpolated linearly.
    interpolation_method = 'nearest' if feature in categorical_spatial else 'linear'
    data[feature] = feature_data.interp(lat=lats_, lon=lons_, method=interpolation_method)

In [None]:
# Save the assembled data set to the CSV file named by out_file.
# NOTE(review): to_csv is called with its default arguments, so the DataFrame index is
# presumably written as an extra first column - confirm that readers of the file expect it.
data.to_csv(out_file)