In [None]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import xarray as xr
import matplotlib.pyplot as plt
import numpy as np

# Add the locations of the data folders here, and the crop file locations
seasonal_data_folder = '../data/seasonal_era_agriclimatic/'
ten_day_data_folder = '../data/10-day-data/'
crop_data_2005_file = '../data/spam2005v3r2_global_yield/spam2005V3r2_global_Y_TA.csv'
crop_data_2010_file = '../data/spam2010v1r1_global_yield/spam2010V1r1_global_Y_TA.csv'

# Leave this: it maps each ten-day indicator name to the name of its file inside
# `ten_day_data_folder`. Every file name follows the same template, so the mapping is
# generated from the indicator names (each listed exactly once).
ten_day_feature_files = {
    name: f'{name}_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc'
    for name in (
        'BEDD', 'FD', 'R20mm', 'R10mm', 'ID', 'TG', 'TN', 'DTR', 'RR1',
        'RR', 'SDII', 'SU', 'TNn', 'TR', 'TX', 'TXn', 'TXx',
    )
}

# The CSU file turned out to be identical to the CSDI file, so CSU is not included.
# Maps each seasonal indicator name to the name of its file inside `seasonal_data_folder`.
seasonal_feature_files = {
    name: f'{name}_C3S-glob-agric_WFDEI_hist_season_19810101-20101231_v1.nc'
    for name in ('CDD', 'CFD', 'CWD', 'WW', 'WSDI', 'CSDI')
}

In [None]:
# Create the crop data table: join the 2005 and 2010 yield files on `alloc_key` and keep,
# for every crop in `crops`, one yield column per year.
crops = ['maize']
crop_data_2005 = pd.read_csv(crop_data_2005_file, encoding="ISO-8859-1")
crop_data_2010 = pd.read_csv(crop_data_2010_file, encoding="ISO-8859-1")

merged = crop_data_2010.merge(crop_data_2005, how='inner', on='alloc_key', suffixes=('_2010', '_2005'))

# The source columns are looked up by the first four characters of the crop name
# (e.g. 'maiz_a'); they are renamed so that the full crop name is used downstream.
yield_renames = {
    f'{crop[0:4]}_a_{year}': f'{crop}_a_{year}'
    for crop in crops
    for year in ('2010', '2005')
}

# NOTE(review): merge only suffixes column names present in BOTH files, so selecting plain
# 'x'/'y' next to 'iso3_2010' assumes x and y exist in just one of the two files - confirm.
kept_columns = ['alloc_key', 'x', 'y', 'iso3_2010'] + list(yield_renames)
merged = merged[kept_columns].rename(columns={
    **yield_renames,
    'iso3_2010': 'iso3',
    'x': 'lon',
    'y': 'lat'
})

del crop_data_2005
del crop_data_2010
data = merged.dropna()
del merged

# Only include non zero yields (in both years) for every requested crop
for crop in crops:
    data = data[(data[f'{crop}_a_2005'] > 0) & (data[f'{crop}_a_2010'] > 0)]

# Take an independent copy: `data` is a filtered slice at this point, and later cells add
# columns to it, which would otherwise raise SettingWithCopyWarning.
data = data.copy()
data

In [None]:
# This makes columns holding, for every row, the coordinates of the nearest point in the
# climate data. This massively speeds up the computation of the climatic indicators.
def snap_to_grid_centre(coords):
    """Snap coordinates to the centre of the half-degree cell that contains them.

    Each value is mapped to `floor(value) + 0.25` when its fractional part is <= 0.5 and to
    `floor(value) + 0.75` otherwise (this assumes the climate grid has its points at x.25
    and x.75, as the offsets imply - TODO confirm against the NetCDF files).

    `np.floor` is used rather than `.astype(int)`: the latter truncates towards zero, which
    disagrees with the floor-based `% 1` for negative values and used to move every
    negative latitude/longitude one whole degree too far in the positive direction
    (e.g. -10.3 became -9.25 instead of -10.25).

    Parameters
    ----------
    coords : pandas.Series
        Latitudes or longitudes in decimal degrees.

    Returns
    -------
    pandas.Series
        Snapped coordinates, indexed like `coords`.
    """
    cell_floor = np.floor(coords)
    in_lower_half = (coords % 1) <= 0.5
    return cell_floor + np.where(in_lower_half, 0.25, 0.75)


# `assign` returns a new frame, so the cell can be re-run safely and needs no scratch column.
data = data.assign(
    nearest_lat=snap_to_grid_centre(data['lat']),
    nearest_lon=snap_to_grid_centre(data['lon']),
)
data

In [None]:
# Compute the seasonal features: one column per (indicator, year, quarter), named
# '<indicator>-<quarter>-<year>'.
quarters = ['Q1', 'Q2', 'Q3', 'Q4']
years = ['2010', '2005']
seasonal_features = [
    'CDD',
    'CFD',
    'CWD',
    'WW',
    'WSDI',
    'CSDI'
]
# Month-day part of the timestamp that is selected for each quarter
quarter_time_mapping = {
    'Q1': '01-16',
    'Q2': '04-16',
    'Q3': '07-16',
    'Q4': '10-16'
}

# Sharing the dimension name 'z' makes xarray pick one (lat, lon) pair per row instead of
# the full lat x lon cross product.
lats_ = xr.DataArray(list(data['nearest_lat'].values), dims='z')
lons_ = xr.DataArray(list(data['nearest_lon'].values), dims='z')

for feature in tqdm(seasonal_features):
    # The file does not depend on the year, so it is read once per indicator rather than
    # once per (indicator, year).
    with xr.open_dataset(seasonal_data_folder + seasonal_feature_files[feature]) as ds:
        feature_data = ds.load()
    for year in years:
        for quarter in quarters:
            feature_name = f'{feature}-{quarter}-{year}'
            time = f'{year}-{quarter_time_mapping[quarter]}'
            time_data = feature_data.sel(time=time).squeeze().sel(lat=lats_).sel(lon=lons_)
            # Assign the raw values positionally (rows are in the same order as lats_/lons_)
            data[feature_name] = time_data[feature].values
            del time_data
    del feature_data

data

In [None]:
# Compute the 10 day features: one column per (indicator, month, day, year), named
# '<indicator>-<month>-<day>-<year>'.

# Choose the indicators to extract in `ten_day_features` below; each one must be a key of
# the ten_day_feature_files dict.
days = ['05', '15', '25']
years = ['2010', '2005']
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
ten_day_features = [
    'BEDD',
    'FD',
    'R20mm',
    'RR1',
    'DTR'
]

# Sharing the dimension name 'z' makes xarray pick one (lat, lon) pair per row instead of
# the full lat x lon cross product.
lats_ = xr.DataArray(list(data['nearest_lat'].values), dims='z')
lons_ = xr.DataArray(list(data['nearest_lon'].values), dims='z')

# Collect the new columns first and attach them in one go: inserting hundreds of columns
# one at a time fragments the DataFrame and makes pandas emit a PerformanceWarning.
ten_day_columns = {}
for feature in tqdm(ten_day_features):
    with xr.open_dataset(ten_day_data_folder + ten_day_feature_files[feature]) as ds:
        feature_data = ds.load()
    for day in days:
        for month in months:
            for year in years:
                time = f'{year}-{month}-{day}'
                time_data = feature_data.sel(time=time).squeeze().sel(lat=lats_).sel(lon=lons_)
                ten_day_columns[f'{feature}-{month}-{day}-{year}'] = time_data[feature].values
    del feature_data
    del time_data

# Dropping any same-named columns first keeps the cell re-runnable without producing
# duplicate column labels.
data = pd.concat(
    [
        data.drop(columns=list(ten_day_columns), errors='ignore'),
        pd.DataFrame(ten_day_columns, index=data.index),
    ],
    axis=1,
)
del ten_day_columns
data

In [None]:
# Write the assembled feature table out as a CSV file
file_path = 'head_of_soils_recommendations.csv'
data.to_csv(path_or_buf=file_path)

In [None]:
# Reload a previously written CSV file and preview its first rows
file_path = 'head_of_soils_recommendations.csv'
data = pd.read_csv(filepath_or_buffer=file_path)
data.head()

In [None]:
# Create a custom train and test set adding in the features you want. It reads an already computed 
# dataset but you can then remove features to see the effect on score.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import xarray as xr
import matplotlib.pyplot as plt
import numpy as np
import random

# Which data set are we reading in?
read_file_path = 'head_of_soils_recommendations.csv'

# Set the random seed for the whole process of creating the training and test set.
random_seed = 42
random.seed(a=random_seed)
np.random.seed(random_seed)

# Test set fraction. What fraction of the data set to make a test set
test_size = 0.2


# Only read a sample fraction of the data, speeds things up, but you'll get worse performance.
# Use 1.0 as the sample fraction to get the full data set.
# Row 0 is the header and is always kept; every other row is skipped with probability
# (1 - sample_fraction).
sample_fraction = 1.0
data = pd.read_csv(read_file_path, skiprows=lambda i: i > 0 and random.random() > sample_fraction)

# Remove datapoints where the relative change in yield between 2005 and 2010 is at least
# `limit`. Such a change is unlikely to be due to changes in climate.
limit_change_in_yield = True
limit = 1.0

# Which type of features to include (if True must already exist in read dataset)
include_coordinates = True
include_2005_maize = True
include_seasonal = True
include_ten_day = True
include_2005_climate = True
include_soil_type = False
include_growing_zones = False

# Drop nans? (rows with a NaN in ANY column of the file are removed, not only in the
# selected features)
drop_nans = True


# Choose a subset of your features! (must already exist in original file)
ten_day_features = [
    'BEDD',
    'R20mm',
    'DTR'
]

seasonal_features = [
    'CDD',
    'CFD',
    'CWD',
    'WW',
    'WSDI',
    'CSDI'
]

if limit_change_in_yield:
    relative_change = (data['maize_a_2010'] - data['maize_a_2005']).abs() / data['maize_a_2005']
    data = data[relative_change < limit]
    del relative_change

# Column names fed to the model, in this order
features = []

if include_growing_zones:
    # NOTE(review): this list is defined but never added to `features`, so the flag
    # currently has no effect - confirm the intended column names before wiring it in.
    growing_zones = [
        'Inland water bodies',
        'Subtropics - summer rainfall',
        'Subtropics - winter rainfall',
        'Temperature - continental',
        'Temperature - oceanic',
        'Temperature - subcontinental',
        'Tropics'
    ]

if include_soil_type:
    features = features + ['soil_type']

# Climate columns exist per year; optionally restrict them to 2010 only
if include_2005_climate:
    years = ['2010', '2005']
else:
    years = ['2010']

if include_seasonal:
    quarters = ['Q1', 'Q2', 'Q3', 'Q4']
    for feature in seasonal_features:
        # Iterate over `quarters` defined just above; the loop previously referenced the
        # undefined name `seasons` and an unbound `quarter`, which fails on a fresh kernel.
        for quarter in quarters:
            for year in years:
                features.append(f'{feature}-{quarter}-{year}')

if include_ten_day:
    days = ['05', '15', '25']
    months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
    for feature in ten_day_features:
        for day in days:
            for month in months:
                for year in years:
                    features.append(f'{feature}-{month}-{day}-{year}')

if include_coordinates:
    features = features + ['lon', 'lat']

if include_2005_maize:
    features = features + ['maize_a_2005']

target = 'maize_a_2010'

if drop_nans:
    data = data.dropna()

X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_seed)

# Free the full table; only the four splits are needed from here on
del X
del y
del data

In [None]:
# Baseline model: fit a plain random forest regressor on the training split
regressor = RandomForestRegressor(
    n_estimators=100,
    n_jobs=10,
    random_state=random_seed,
).fit(X_train, y_train)

# Evaluate on the held-out split (`score` is R^2 for scikit-learn regressors)
score = regressor.score(X_test, y_test)
print(score)

In [None]:
# Predict on Hadgem data