In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# this cell updates packages and then restarts runtime
%%capture
!pip install pandas==1.0.0
!pip install pandarallel
!pip install tqdm==4.42.1

import os
os.kill(os.getpid(), 9)

In [0]:
%%capture
import os
path = r'/content/drive/My Drive/Team Plants/data/'
os.chdir(path)

!pip install -e .
#!apt-get -qq install python-cartopy python3-cartopy

#!pip uninstall -y shapely
#!pip install shapely --no-binary shapely

os.chdir('/content/drive/My Drive/Team Plants')

In [0]:
%%capture
!pip install affine rasterio
from affine import Affine

In [0]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import xarray as xr
import matplotlib.pyplot as plt
import numpy as np

seasonal_data_folder = 'data/seasonal_era_agriclimatic/'
ten_day_data_folder = 'data/10-day-data/'
crop_data_2005_file = 'data/spam2005v3r2_global_yield/spam2005V3r2_global_Y_TA.csv'
crop_data_2010_file = 'data/spam2010v1r1_global_yield/spam2010V1r1_global_Y_TA.csv'



# Leave this, it just stores the names of the ten day feature files
ten_day_feature_files = {
    'BEDD': 'BEDD_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'FD': 'FD_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'R20mm': 'R20mm_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'R10mm': 'R10mm_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'ID': 'ID_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TG': 'TG_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TN': 'TN_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'DTR': 'DTR_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'RR1': 'RR1_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'RR': 'RR_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'SDII': 'SDII_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'SU': 'SU_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TG': 'TG_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TNn': 'TNn_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TR': 'TR_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TX': 'TX_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TXn': 'TXn_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TXx': 'TXx_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc'
}

# I noticed the CSU file is actually the same as the CSDI file, so I havent included it
seasonal_feature_files = {
    'CDD': 'CDD_C3S-glob-agric_WFDEI_hist_season_19810101-20101231_v1.nc',
    'CFD': 'CFD_C3S-glob-agric_WFDEI_hist_season_19810101-20101231_v1.nc',
    'CWD': 'CWD_C3S-glob-agric_WFDEI_hist_season_19810101-20101231_v1.nc',
    'WW': 'WW_C3S-glob-agric_WFDEI_hist_season_19810101-20101231_v1.nc',
    'WSDI': 'WSDI_C3S-glob-agric_WFDEI_hist_season_19810101-20101231_v1.nc',
    'CSDI': 'CSDI_C3S-glob-agric_WFDEI_hist_season_19810101-20101231_v1.nc'
}


#Folder and filenames for spatial attributes 

spatial_attributes_folder = 'data/spatial_attributes_netcdf/'

spatial_attribute_files = {
    'soil_types': 'soil_types.nc',
    'climate_zones': 'climate_zones.nc'
}

In [4]:
# Create crop data file
crops = ['maize']
crop_data_2005 = pd.read_csv('data/spam2005v3r2_global_yield/spam2005V3r2_global_Y_TA.csv',  encoding = "ISO-8859-1")
crop_data_2010 = pd.read_csv('data/spam2010v1r1_global_yield/spam2010V1r1_global_Y_TA.csv',  encoding = "ISO-8859-1")

merged = crop_data_2010.merge(crop_data_2005, how='outer', on='alloc_key', suffixes=['_2010', '_2005'])
kept_columns = ['alloc_key', 'x', 'y', 'iso3_2010']
for crop in crops:
    kept_columns += [f'{crop[0:4]}_a_2010', f'{crop[0:4]}_a_2005']
merged = merged[kept_columns]
for crop in crops:
    merged = merged.rename(columns={
    f'{crop[0:4]}_a_2010': f'{crop}_a_2010',
    f'{crop[0:4]}_a_2005': f'{crop}_a_2005',
})
    
merged = merged.rename(columns={'iso3_2010': 'iso3'})

del crop_data_2005
del crop_data_2010
data = merged.dropna()
del merged
data = data[data['maize_a_2005'] > 0]
data = data[data['maize_a_2010'] > 0]
data

Unnamed: 0,alloc_key,x,y,iso3,maize_a_2010,maize_a_2005
0,4383640,123.291667,53.541667,CHN,3918.1,2819.2
3,4393629,122.375000,53.458333,CHN,3119.3,2028.5
8,4403648,123.958333,53.375000,CHN,3230.1,2337.9
9,4403649,124.041667,53.375000,CHN,3119.3,2010.2
10,4413637,123.041667,53.291667,CHN,3918.1,2819.2
...,...,...,...,...,...,...
832721,7670982,-98.208333,26.125000,USA,5939.7,6106.9
832722,7670983,-98.125000,26.125000,USA,6050.9,6107.8
832723,7670984,-98.041667,26.125000,USA,6111.0,6131.7
832724,7670985,-97.958333,26.125000,USA,6152.1,6181.2


In [5]:
# This makes columns which compute the nearest the x y coordinates in the climate data.
# This massively speeds up the computation of the climatic indicators.
data['nearest_lat'] = 0
data['bool'] = 0
data['bool'] = ((data['y'] % 1) <= 0.5).astype(float)
data['nearest_lat'] =  data['bool'] * (data['y'].astype(int) + 0.25) + (1 - data['bool']) * (data['y'].astype(int) + 0.75)
# data[data['y'] % 1 > 0.5]['nearest_lat'] = data[data['y'] % 1 > 0.5]['y'].astype(int) + 0.75

data['nearest_lon'] = 0
data['bool'] = 0
data['bool'] = ((data['x'] % 1) <= 0.5).astype(float)
data['nearest_lon'] =  data['bool'] * (data['x'].astype(int) + 0.25) + (1 - data['bool']) * (data['x'].astype(int) + 0.75)
# data[data['y'] % 1 > 0.5]['nearest_lat'] = data[data['y'] % 1 > 0.5]['y'].astype(int) + 0.75
data = data.drop(columns='bool')
data

Unnamed: 0,alloc_key,x,y,iso3,maize_a_2010,maize_a_2005,nearest_lat,nearest_lon
0,4383640,123.291667,53.541667,CHN,3918.1,2819.2,53.75,123.25
3,4393629,122.375000,53.458333,CHN,3119.3,2028.5,53.25,122.25
8,4403648,123.958333,53.375000,CHN,3230.1,2337.9,53.25,123.75
9,4403649,124.041667,53.375000,CHN,3119.3,2010.2,53.25,124.25
10,4413637,123.041667,53.291667,CHN,3918.1,2819.2,53.25,123.25
...,...,...,...,...,...,...,...,...
832721,7670982,-98.208333,26.125000,USA,5939.7,6106.9,26.25,-97.25
832722,7670983,-98.125000,26.125000,USA,6050.9,6107.8,26.25,-97.25
832723,7670984,-98.041667,26.125000,USA,6111.0,6131.7,26.25,-97.25
832724,7670985,-97.958333,26.125000,USA,6152.1,6181.2,26.25,-96.75


In [0]:
# Compute the seasonal features
quarters = ['Q1', 'Q2', 'Q3', 'Q4']
years = ['2010', '2005']
seasonal_features = [
    'CDD',
    'CFD',
    'CWD',
    'WW',
    'WSDI',
    'CSDI'
]
quarter_time_mapping = {
    'Q1': '01-16',
    'Q2': '04-16',
    'Q3': '07-16',
    'Q4': '10-16'
}

lats_ = xr.DataArray(list(data['nearest_lat'].values), dims='z')
lons_ = xr.DataArray(list(data['nearest_lon'].values), dims='z')


for feature in tqdm(seasonal_features):
    for year in years:
        with xr.open_dataset(seasonal_data_folder + seasonal_feature_files[feature]) as ds:
            feature_data = ds.load()
        for quarter in quarters:
            feature_name = f'{feature}-{quarter}-{year}'
            time = f'{year}-{quarter_time_mapping[quarter]}'
            time_data = feature_data.sel(time=time).squeeze().sel(lat=lats_).sel(lon=lons_)
            data[feature_name] = getattr(time_data, feature)
            del time_data
    del feature_data
    
data = data.dropna()

In [7]:
# Compute the 10 day features
days = ['05', '15', '25']
years = ['2010', '2005']
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
ten_day_features = [
    'BEDD',
    'FD',
    'R20mm',
    'RR1',
    'DTR'
]

lats_ = xr.DataArray(list(data['nearest_lat'].values), dims='z')
lons_ = xr.DataArray(list(data['nearest_lon'].values), dims='z')

for feature in tqdm(ten_day_features):
    with xr.open_dataset(ten_day_data_folder + ten_day_feature_files[feature]) as ds:
        feature_data = ds.load()
    for day in days:
        for month in months:
            for year in years:
                time = f'{year}-{month}-{day}'
                time_data = feature_data.sel(time=time).squeeze().sel(lat=lats_).sel(lon=lons_)
                data[f'{feature}-{month}-{day}-{year}'] = getattr(time_data, feature)
    del feature_data
    del time_data
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0,alloc_key,x,y,iso3,maize_a_2010,maize_a_2005,nearest_lat,nearest_lon,CDD-Q1-2010,CDD-Q2-2010,CDD-Q3-2010,CDD-Q4-2010,CDD-Q1-2005,CDD-Q2-2005,CDD-Q3-2005,CDD-Q4-2005,CFD-Q1-2010,CFD-Q2-2010,CFD-Q3-2010,CFD-Q4-2010,CFD-Q1-2005,CFD-Q2-2005,CFD-Q3-2005,CFD-Q4-2005,CWD-Q1-2010,CWD-Q2-2010,CWD-Q3-2010,CWD-Q4-2010,CWD-Q1-2005,CWD-Q2-2005,CWD-Q3-2005,CWD-Q4-2005,WW-Q1-2010,WW-Q2-2010,WW-Q3-2010,WW-Q4-2010,WW-Q1-2005,WW-Q2-2005,WW-Q3-2005,WW-Q4-2005,...,DTR-05-15-2010,DTR-05-15-2005,DTR-06-15-2010,DTR-06-15-2005,DTR-07-15-2010,DTR-07-15-2005,DTR-08-15-2010,DTR-08-15-2005,DTR-09-15-2010,DTR-09-15-2005,DTR-10-15-2010,DTR-10-15-2005,DTR-11-15-2010,DTR-11-15-2005,DTR-12-15-2010,DTR-12-15-2005,DTR-01-25-2010,DTR-01-25-2005,DTR-02-25-2010,DTR-02-25-2005,DTR-03-25-2010,DTR-03-25-2005,DTR-04-25-2010,DTR-04-25-2005,DTR-05-25-2010,DTR-05-25-2005,DTR-06-25-2010,DTR-06-25-2005,DTR-07-25-2010,DTR-07-25-2005,DTR-08-25-2010,DTR-08-25-2005,DTR-09-25-2010,DTR-09-25-2005,DTR-10-25-2010,DTR-10-25-2005,DTR-11-25-2010,DTR-11-25-2005,DTR-12-25-2010,DTR-12-25-2005
0,4383640,123.291667,53.541667,CHN,3918.1,2819.2,53.75,123.25,27.0,29.0,9.0,17.0,57.0,33.0,14.0,36.0,90.0,61.0,0.0,65.0,90.0,65.0,1.0,64.0,3.0,5.0,6.0,4.0,2.0,3.0,7.0,5.0,3.0,0.0,1.0,2.0,0.0,0.0,1.0,3.0,...,13.332382,15.252075,13.324167,14.803393,12.333570,14.756067,11.631696,15.343668,14.546430,11.065396,12.438498,11.142166,11.286641,14.996429,11.249286,10.087871,15.151639,12.930721,14.618134,18.445944,18.416126,18.118044,17.290451,15.084249,18.048315,17.350435,16.417456,15.666064,12.421432,18.544703,15.281061,14.009233,11.894153,14.988565,13.214319,13.867992,10.006902,17.991137,11.188674,12.582870
3,4393629,122.375000,53.458333,CHN,3119.3,2028.5,53.25,122.25,27.0,22.0,9.0,17.0,34.0,49.0,10.0,13.0,90.0,61.0,1.0,65.0,90.0,65.0,1.0,64.0,3.0,5.0,4.0,4.0,1.0,2.0,8.0,3.0,2.0,0.0,2.0,1.0,0.0,0.0,2.0,1.0,...,14.373129,16.072353,14.150744,14.985700,13.123743,14.822324,11.951498,15.621585,16.604532,12.483157,14.118250,12.118223,10.831568,14.309111,10.062572,10.678294,14.043386,14.118988,16.645658,19.131752,19.742460,18.905256,17.086720,14.848269,17.864985,17.181152,17.213867,16.614248,12.155520,18.846418,15.123383,14.364394,11.784802,15.658490,13.753125,14.851035,11.463429,19.127764,10.960008,11.977540
8,4403648,123.958333,53.375000,CHN,3230.1,2337.9,53.25,123.75,27.0,22.0,9.0,16.0,48.0,49.0,11.0,11.0,90.0,61.0,1.0,65.0,90.0,65.0,2.0,61.0,3.0,5.0,4.0,3.0,2.0,3.0,5.0,3.0,1.0,1.0,2.0,0.0,0.0,0.0,1.0,1.0,...,13.012189,14.915680,13.354315,14.232178,11.977076,14.324661,11.409525,15.301145,14.778852,11.762842,11.742081,11.759287,11.670111,15.063660,11.752439,11.255476,15.117078,15.012938,15.213829,18.536697,17.824129,18.527342,17.360651,14.678027,17.886518,18.207722,16.557888,15.687100,12.452676,18.201635,15.806396,14.238156,11.348834,14.887103,14.353187,13.712119,9.362610,17.653688,11.661184,11.915107
9,4403649,124.041667,53.375000,CHN,3119.3,2010.2,53.25,124.25,20.0,22.0,9.0,17.0,48.0,49.0,11.0,37.0,90.0,61.0,0.0,65.0,90.0,65.0,1.0,61.0,3.0,5.0,4.0,3.0,2.0,3.0,5.0,3.0,1.0,1.0,3.0,0.0,0.0,0.0,2.0,1.0,...,12.535214,14.541928,12.888736,13.793360,11.077112,13.991834,11.085077,14.730136,14.251724,11.557001,10.826090,11.548694,11.414854,14.990764,12.067510,11.229442,15.145199,14.770938,14.250885,18.149925,17.242399,17.814438,17.363144,14.670880,17.674713,18.273426,16.177734,15.402698,12.426830,17.743471,15.812142,14.066384,11.296124,14.793427,14.058295,13.091297,8.589572,16.730436,11.452671,11.731344
10,4413637,123.041667,53.291667,CHN,3918.1,2819.2,53.25,123.25,27.0,29.0,9.0,16.0,27.0,49.0,8.0,14.0,90.0,61.0,1.0,73.0,90.0,65.0,1.0,61.0,3.0,5.0,4.0,4.0,2.0,2.0,7.0,3.0,3.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,...,13.435962,15.141947,13.650900,14.538767,12.499225,14.663321,11.649561,15.718985,15.526764,12.019254,12.674429,11.831363,11.680980,15.017365,11.050886,10.994413,14.787366,15.027884,16.146458,18.597643,18.362791,19.157080,17.692719,15.005264,17.833313,17.882652,16.831717,15.784909,12.334378,18.547461,15.497644,14.201300,11.520777,15.101382,14.612255,14.250738,9.922144,18.349087,11.572720,11.905176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832697,7660984,-98.041667,26.208333,USA,6148.3,6278.7,26.25,-97.25,16.0,23.0,33.0,16.0,12.0,48.0,24.0,19.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,11.0,13.0,10.0,3.0,2.0,6.0,7.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,...,9.539728,11.099600,11.322531,12.479111,11.477127,9.565717,10.018768,12.190143,9.422900,10.563825,13.942777,11.993628,13.248532,10.313437,14.381592,9.076151,11.906857,13.228763,13.828892,11.113091,16.484173,15.589267,14.154409,13.394033,11.191373,14.075293,9.930944,10.069348,10.064284,11.290084,13.312131,12.264083,12.297220,18.556065,13.426499,15.883956,13.732272,15.976010,12.377209,13.244690
832720,7670981,-98.291667,26.125000,USA,5713.1,5832.9,26.25,-97.25,16.0,23.0,33.0,16.0,12.0,48.0,24.0,19.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,11.0,13.0,10.0,3.0,2.0,6.0,7.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,...,9.539728,11.099600,11.322531,12.479111,11.477127,9.565717,10.018768,12.190143,9.422900,10.563825,13.942777,11.993628,13.248532,10.313437,14.381592,9.076151,11.906857,13.228763,13.828892,11.113091,16.484173,15.589267,14.154409,13.394033,11.191373,14.075293,9.930944,10.069348,10.064284,11.290084,13.312131,12.264083,12.297220,18.556065,13.426499,15.883956,13.732272,15.976010,12.377209,13.244690
832721,7670982,-98.208333,26.125000,USA,5939.7,6106.9,26.25,-97.25,16.0,23.0,33.0,16.0,12.0,48.0,24.0,19.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,11.0,13.0,10.0,3.0,2.0,6.0,7.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,...,9.539728,11.099600,11.322531,12.479111,11.477127,9.565717,10.018768,12.190143,9.422900,10.563825,13.942777,11.993628,13.248532,10.313437,14.381592,9.076151,11.906857,13.228763,13.828892,11.113091,16.484173,15.589267,14.154409,13.394033,11.191373,14.075293,9.930944,10.069348,10.064284,11.290084,13.312131,12.264083,12.297220,18.556065,13.426499,15.883956,13.732272,15.976010,12.377209,13.244690
832722,7670983,-98.125000,26.125000,USA,6050.9,6107.8,26.25,-97.25,16.0,23.0,33.0,16.0,12.0,48.0,24.0,19.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,11.0,13.0,10.0,3.0,2.0,6.0,7.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,...,9.539728,11.099600,11.322531,12.479111,11.477127,9.565717,10.018768,12.190143,9.422900,10.563825,13.942777,11.993628,13.248532,10.313437,14.381592,9.076151,11.906857,13.228763,13.828892,11.113091,16.484173,15.589267,14.154409,13.394033,11.191373,14.075293,9.930944,10.069348,10.064284,11.290084,13.312131,12.264083,12.297220,18.556065,13.426499,15.883956,13.732272,15.976010,12.377209,13.244690


In [127]:
# Compute the spatial attribute features

spatial_attributes = [
    'soil_types',
    'climate_zones',
    ]

lats_ = xr.DataArray(list(data['nearest_lat'].values), dims='z')
lons_ = xr.DataArray(list(data['nearest_lon'].values), dims='z')

for feature in tqdm(spatial_attributes):
    with xr.open_dataset(spatial_attributes_folder + spatial_attribute_files[feature]) as ds:
        feature_data = ds.load()
        feature_name = feature
        feature_data = feature_data.rename({'__xarray_dataarray_variable__': 'variable_name'})
        sel_data = feature_data.sel(lat=lats_).sel(lon=lons_).variable_name
        data[feature_name] = sel_data.values[0]
    del sel_data
    del feature_data




  0%|          | 0/2 [00:00<?, ?it/s][A[A[A


 50%|█████     | 1/2 [00:00<00:00,  2.00it/s][A[A[A


100%|██████████| 2/2 [00:00<00:00,  2.23it/s]


In [128]:
data

Unnamed: 0,alloc_key,x,y,iso3,maize_a_2010,maize_a_2005,nearest_lat,nearest_lon,CDD-Q1-2010,CDD-Q2-2010,CDD-Q3-2010,CDD-Q4-2010,CDD-Q1-2005,CDD-Q2-2005,CDD-Q3-2005,CDD-Q4-2005,CFD-Q1-2010,CFD-Q2-2010,CFD-Q3-2010,CFD-Q4-2010,CFD-Q1-2005,CFD-Q2-2005,CFD-Q3-2005,CFD-Q4-2005,CWD-Q1-2010,CWD-Q2-2010,CWD-Q3-2010,CWD-Q4-2010,CWD-Q1-2005,CWD-Q2-2005,CWD-Q3-2005,CWD-Q4-2005,WW-Q1-2010,WW-Q2-2010,WW-Q3-2010,WW-Q4-2010,WW-Q1-2005,WW-Q2-2005,WW-Q3-2005,WW-Q4-2005,...,DTR-06-15-2010,DTR-06-15-2005,DTR-07-15-2010,DTR-07-15-2005,DTR-08-15-2010,DTR-08-15-2005,DTR-09-15-2010,DTR-09-15-2005,DTR-10-15-2010,DTR-10-15-2005,DTR-11-15-2010,DTR-11-15-2005,DTR-12-15-2010,DTR-12-15-2005,DTR-01-25-2010,DTR-01-25-2005,DTR-02-25-2010,DTR-02-25-2005,DTR-03-25-2010,DTR-03-25-2005,DTR-04-25-2010,DTR-04-25-2005,DTR-05-25-2010,DTR-05-25-2005,DTR-06-25-2010,DTR-06-25-2005,DTR-07-25-2010,DTR-07-25-2005,DTR-08-25-2010,DTR-08-25-2005,DTR-09-25-2010,DTR-09-25-2005,DTR-10-25-2010,DTR-10-25-2005,DTR-11-25-2010,DTR-11-25-2005,DTR-12-25-2010,DTR-12-25-2005,soil_types,climate_zones
0,4383640,123.291667,53.541667,CHN,3918.1,2819.2,53.75,123.25,27.0,29.0,9.0,17.0,57.0,33.0,14.0,36.0,90.0,61.0,0.0,65.0,90.0,65.0,1.0,64.0,3.0,5.0,6.0,4.0,2.0,3.0,7.0,5.0,3.0,0.0,1.0,2.0,0.0,0.0,1.0,3.0,...,13.324167,14.803393,12.333570,14.756067,11.631696,15.343668,14.546430,11.065396,12.438498,11.142166,11.286641,14.996429,11.249286,10.087871,15.151639,12.930721,14.618134,18.445944,18.416126,18.118044,17.290451,15.084249,18.048315,17.350435,16.417456,15.666064,12.421432,18.544703,15.281061,14.009233,11.894153,14.988565,13.214319,13.867992,10.006902,17.991137,11.188674,12.582870,40,9
3,4393629,122.375000,53.458333,CHN,3119.3,2028.5,53.25,122.25,27.0,22.0,9.0,17.0,34.0,49.0,10.0,13.0,90.0,61.0,1.0,65.0,90.0,65.0,1.0,64.0,3.0,5.0,4.0,4.0,1.0,2.0,8.0,3.0,2.0,0.0,2.0,1.0,0.0,0.0,2.0,1.0,...,14.150744,14.985700,13.123743,14.822324,11.951498,15.621585,16.604532,12.483157,14.118250,12.118223,10.831568,14.309111,10.062572,10.678294,14.043386,14.118988,16.645658,19.131752,19.742460,18.905256,17.086720,14.848269,17.864985,17.181152,17.213867,16.614248,12.155520,18.846418,15.123383,14.364394,11.784802,15.658490,13.753125,14.851035,11.463429,19.127764,10.960008,11.977540,40,9
8,4403648,123.958333,53.375000,CHN,3230.1,2337.9,53.25,123.75,27.0,22.0,9.0,16.0,48.0,49.0,11.0,11.0,90.0,61.0,1.0,65.0,90.0,65.0,2.0,61.0,3.0,5.0,4.0,3.0,2.0,3.0,5.0,3.0,1.0,1.0,2.0,0.0,0.0,0.0,1.0,1.0,...,13.354315,14.232178,11.977076,14.324661,11.409525,15.301145,14.778852,11.762842,11.742081,11.759287,11.670111,15.063660,11.752439,11.255476,15.117078,15.012938,15.213829,18.536697,17.824129,18.527342,17.360651,14.678027,17.886518,18.207722,16.557888,15.687100,12.452676,18.201635,15.806396,14.238156,11.348834,14.887103,14.353187,13.712119,9.362610,17.653688,11.661184,11.915107,40,6
9,4403649,124.041667,53.375000,CHN,3119.3,2010.2,53.25,124.25,20.0,22.0,9.0,17.0,48.0,49.0,11.0,37.0,90.0,61.0,0.0,65.0,90.0,65.0,1.0,61.0,3.0,5.0,4.0,3.0,2.0,3.0,5.0,3.0,1.0,1.0,3.0,0.0,0.0,0.0,2.0,1.0,...,12.888736,13.793360,11.077112,13.991834,11.085077,14.730136,14.251724,11.557001,10.826090,11.548694,11.414854,14.990764,12.067510,11.229442,15.145199,14.770938,14.250885,18.149925,17.242399,17.814438,17.363144,14.670880,17.674713,18.273426,16.177734,15.402698,12.426830,17.743471,15.812142,14.066384,11.296124,14.793427,14.058295,13.091297,8.589572,16.730436,11.452671,11.731344,40,6
10,4413637,123.041667,53.291667,CHN,3918.1,2819.2,53.25,123.25,27.0,29.0,9.0,16.0,27.0,49.0,8.0,14.0,90.0,61.0,1.0,73.0,90.0,65.0,1.0,61.0,3.0,5.0,4.0,4.0,2.0,2.0,7.0,3.0,3.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,...,13.650900,14.538767,12.499225,14.663321,11.649561,15.718985,15.526764,12.019254,12.674429,11.831363,11.680980,15.017365,11.050886,10.994413,14.787366,15.027884,16.146458,18.597643,18.362791,19.157080,17.692719,15.005264,17.833313,17.882652,16.831717,15.784909,12.334378,18.547461,15.497644,14.201300,11.520777,15.101382,14.612255,14.250738,9.922144,18.349087,11.572720,11.905176,40,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
832697,7660984,-98.041667,26.208333,USA,6148.3,6278.7,26.25,-97.25,16.0,23.0,33.0,16.0,12.0,48.0,24.0,19.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,11.0,13.0,10.0,3.0,2.0,6.0,7.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,...,11.322531,12.479111,11.477127,9.565717,10.018768,12.190143,9.422900,10.563825,13.942777,11.993628,13.248532,10.313437,14.381592,9.076151,11.906857,13.228763,13.828892,11.113091,16.484173,15.589267,14.154409,13.394033,11.191373,14.075293,9.930944,10.069348,10.064284,11.290084,13.312131,12.264083,12.297220,18.556065,13.426499,15.883956,13.732272,15.976010,12.377209,13.244690,76,-997
832720,7670981,-98.291667,26.125000,USA,5713.1,5832.9,26.25,-97.25,16.0,23.0,33.0,16.0,12.0,48.0,24.0,19.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,11.0,13.0,10.0,3.0,2.0,6.0,7.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,...,11.322531,12.479111,11.477127,9.565717,10.018768,12.190143,9.422900,10.563825,13.942777,11.993628,13.248532,10.313437,14.381592,9.076151,11.906857,13.228763,13.828892,11.113091,16.484173,15.589267,14.154409,13.394033,11.191373,14.075293,9.930944,10.069348,10.064284,11.290084,13.312131,12.264083,12.297220,18.556065,13.426499,15.883956,13.732272,15.976010,12.377209,13.244690,76,-997
832721,7670982,-98.208333,26.125000,USA,5939.7,6106.9,26.25,-97.25,16.0,23.0,33.0,16.0,12.0,48.0,24.0,19.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,11.0,13.0,10.0,3.0,2.0,6.0,7.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,...,11.322531,12.479111,11.477127,9.565717,10.018768,12.190143,9.422900,10.563825,13.942777,11.993628,13.248532,10.313437,14.381592,9.076151,11.906857,13.228763,13.828892,11.113091,16.484173,15.589267,14.154409,13.394033,11.191373,14.075293,9.930944,10.069348,10.064284,11.290084,13.312131,12.264083,12.297220,18.556065,13.426499,15.883956,13.732272,15.976010,12.377209,13.244690,76,-997
832722,7670983,-98.125000,26.125000,USA,6050.9,6107.8,26.25,-97.25,16.0,23.0,33.0,16.0,12.0,48.0,24.0,19.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,11.0,13.0,10.0,3.0,2.0,6.0,7.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,...,11.322531,12.479111,11.477127,9.565717,10.018768,12.190143,9.422900,10.563825,13.942777,11.993628,13.248532,10.313437,14.381592,9.076151,11.906857,13.228763,13.828892,11.113091,16.484173,15.589267,14.154409,13.394033,11.191373,14.075293,9.930944,10.069348,10.064284,11.290084,13.312131,12.264083,12.297220,18.556065,13.426499,15.883956,13.732272,15.976010,12.377209,13.244690,76,-997


In [0]:
# Save data to csv
file_path = f'head_of_soils_recommendations_MGM.csv'
data.to_csv(file_path)

In [0]:
data = pd.read_csv(file_path)

In [0]:
# Create a custom data set adding in the features you want. It reads an already computed 
# dataset but you can then remove features to see the effect on score.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import xarray as xr
import matplotlib.pyplot as plt
import numpy as np
import random

random_seed = 42
random.seed(a=random_seed)
np.random_seed(random_seed)

read_file_path = f'head_of_soils_recommendations.csv'

# Only read a sample fraction of the data, speeds things up, but you'll get worse performance.
# Use 1.0 as the sample fraction to get the full data set
sample_fraction = 0.3
data = pd.read_csv(read_file_path, skiprows=lambda i: i>0 and random.random() > sample_fraction)

limit_change_in_yield = True
limit = 0.5

include_coordinates = False
include_2005_maize = True
include_seasonal = True
include_ten_day = True
include_2005_climate = True
include_soil_type = False
include_growing_zones = False

# Remove datapoints where yield changes by more than 50%. This is unlikely to be
# due to changes in climate change.

if limit_change_in_yield:
    data = data[abs(data['maize_a_2010'] - data['maize_a_2005']) / data['maize_a_2005'] < limit]

# Choose a subset of your features! (must already exist in original file)
ten_day_features = [
    'BEDD',
    'R20mm',
    'DTR'
]

seasonal_features = [
    'CDD',
    'CFD',
    'CWD',
    'WW',
    'WSDI',
    'CSDI'
]

features = []

if include_growing_zones:
    growing_zones = [
        'Inland water bodies',
        'Subtropics - summer rainfall',
        'Subtropics - winter rainfall',
        'Temperature - continental',
        'Temperature - oceanic',
        'Temperature - subcontinental',
        'Tropics'
    ]
    
if include_soil_type:
    features = features + ['soil_type']

if include_2005_climate:
    years = ['2010', '2005']
else:
    years = ['2010']

if include_seasonal:
    seasons = ['Q1', 'Q2', 'Q3', 'Q4']
    years = ['2010', '2005']
    for feature in seasonal_features:
        for season in seasons:
            for year in years:
                features.append(f'{feature}-{season}-{year}')

if include_ten_day:
    days = ['05', '15', '25']
    years = ['2010', '2005']
    months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
    for feature in ten_day_features:
        for day in days:
            for month in months:
                for year in years:
                    features.append(f'{feature}-{month}-{day}-{year}')

if include_coordinates:
    coordinates = [
        'x',
        'y'
    ]
    features = features + coordinates
    
if include_2005_maize:
    features = features + ['maize_a_2005']

target = 'maize_a_2010'
data = data.dropna()
X = data[features]
y = data[[target]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
del X
del y
del data

In [0]:
regressor = RandomForestRegressor(n_estimators=100, n_jobs=10, random_state=42)
regressor.fit(X_train, y_train)
score = regressor.score(X_test, y_test)
print(score)

  


0.9503162643445006
