In [8]:
import pandas as pd
from pandarallel import pandarallel
from tqdm import tqdm
pandarallel.initialize(progress_bar=True, nb_workers=11)
tqdm.pandas()
import xarray as xr
import matplotlib.pyplot as plt
import numpy as np
from numpy import polyfit
from numpy import isnan
from matplotlib.pyplot import hist2d
starting_folder = 'data/world_maize_2010_2005_climate_zones_soil.csv'
data = pd.read_csv(starting_folder)

INFO: Pandarallel will run on 11 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  from pandas import Panel


In [9]:
# This makes columns which compute the nearest the x y coordinates in the climate data.
# This massively speeds up the computation of the climatic indicators.
def get_coord(lon):
    left_over = lon % 1
    if left_over <= 0.5:
        return int(lon) + 0.25
    return int(lon) + 0.75

data['nearest_lon'] = 0
data['nearest_lat'] = 0

def compute_nearest_lon_lat(row):
    row['nearest_lon'] = get_coord(row['x'])
    row['nearest_lat'] = get_coord(row['y'])
    return row

data = data.apply(compute_lon_lat, axis=1)

In [3]:
data = data.parallel_apply(compute_lon_lat, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=27988), Label(value='0 / 27988')))…

In [22]:
# Compute the 10 day features
days = ['05', '15', '25']
years = ['2010', '2005']
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
ten_day_features = [
    'BEDD',
    'FD',
    'R20mm',
    'R10mm',
    'ID',
    'TG',
    'TN'
]

ten_day_feature_files = {
    'BEDD': 'BEDD_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'FD': 'FD_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'R20mm': 'R20mm_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'R10mm': 'R10mm_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'ID': 'ID_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TG': 'TG_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TN': 'TN_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc'
    
}

lats_ = xr.DataArray(list(data['nearest_lat'].values), dims='z')
lons_ = xr.DataArray(list(data['nearest_lon'].values), dims='z')

for feature in tqdm(ten_day_features):
    with xr.open_dataset('data/10-day-data/' + ten_day_features_files[feature]) as ds:
        feature_data = ds.load()
    for day in days:
        for month in months:
            for year in years:
                time = f'{year}-{month}-{day}'
                time_data = feature_data.sel(time=time).squeeze().sel(lat=lats_).sel(lon=lons_)
                data[f'{feature}-{month}-{day}-{year}'] = getattr(time_data, feature)
    del feature_data       
data




  0%|          | 0/7 [00:00<?, ?it/s][A[A[A


 14%|█▍        | 1/7 [00:18<01:50, 18.39s/it][A[A[A


 29%|██▊       | 2/7 [00:36<01:31, 18.28s/it][A[A[A


 43%|████▎     | 3/7 [00:54<01:13, 18.33s/it][A[A[A


 57%|█████▋    | 4/7 [01:12<00:54, 18.25s/it][A[A[A


 71%|███████▏  | 5/7 [01:31<00:36, 18.20s/it][A[A[A


 86%|████████▌ | 6/7 [01:49<00:18, 18.42s/it][A[A[A


100%|██████████| 7/7 [02:09<00:00, 18.44s/it][A[A[A


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,iso3_2005,x,y,maiz_a_2005,maiz_a_2010,delta_maiz,...,TN-08-25-2010,TN-08-25-2005,TN-09-25-2010,TN-09-25-2005,TN-10-25-2010,TN-10-25-2005,TN-11-25-2010,TN-11-25-2005,TN-12-25-2010,TN-12-25-2005
0,236476,236476,236476,532948,CIV,-3.208,5.208,2292.2,2867.5,575.3,...,297.634949,296.061005,297.619720,297.783386,297.636108,297.469391,297.911469,298.337830,297.374176,297.710327
1,298568,298568,298568,646330,USA,-76.042,35.958,6665.7,6052.0,-613.7,...,,,,,,,,,,
2,298565,298565,298565,646327,USA,-76.292,35.958,7071.2,6420.1,-651.1,...,,,,,,,,,,
3,298564,298564,298564,646326,USA,-76.458,35.958,7471.4,6671.0,-800.4,...,,,,,,,,,,
4,31935,31935,31935,63870,CHN,116.042,29.792,8656.9,7956.7,-700.2,...,296.422455,295.465942,290.605164,294.665863,284.680389,284.649384,281.639771,280.437836,273.817108,276.974823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307857,203857,203857,203857,461737,RUS,61.375,54.625,2170.3,2179.6,9.3,...,283.211670,281.326385,280.650055,275.239502,271.140594,268.903534,264.375763,267.163147,249.836380,257.231140
307858,203858,203858,203858,461738,RUS,61.458,54.625,2062.6,2071.4,8.8,...,283.211670,281.326385,280.650055,275.239502,271.140594,268.903534,264.375763,267.163147,249.836380,257.231140
307859,5231,5231,5231,11383,CHN,123.542,43.792,5086.2,4958.5,-127.7,...,289.088287,288.543335,280.441833,283.576477,272.107391,272.306671,260.910950,265.788422,253.626526,251.440872
307860,2,2,2,9,CHN,123.042,53.292,2819.2,3918.1,1098.9,...,279.688202,278.321472,268.814636,274.138275,258.392792,261.701141,244.233917,242.027145,244.451035,237.541504


In [23]:
data.to_csv(f'{ten_day_features}_maize_world.csv')

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
seasons = ['Q1', 'Q2', 'Q3', 'Q4']
seasonal_features = ['CFD', 'CWD', 'CDD', 'WW', 'CSDI', 'WSDI']
days = ['05', '15', '25']
years = ['2010', '2005']
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
ten_day_features = [
    'BEDD',
    'FD',
    'R20mm',
    'R10mm',
    'ID',
    'TG',
    'TN'
]
data = pd.read_csv(f'{ten_day_features}_maize_world.csv')
features = []

for feature in seasonal_features:
    for season in seasons:
        for year in years:
            features.append(f'{feature}-{season}_{year}')

for feature in ten_day_features:
    for day in days:
        for month in months:
            for year in years:
                features.append(f'{feature}-{month}-{day}-{year}')

features = features + [ 'x', 'y', 'maiz_a_2005', 'soil_type']
growing_zones = [
    'Inland water bodies',
    'Subtropics - summer rainfall',
    'Subtropics - winter rainfall',
    'Temperature - continental',
    'Temperature - oceanic',
    'Temperature - subcontinental',
    'Tropics'
]

features = features + growing_zones
target = 'maiz_a_2010'
data = data.dropna()
X = data[features]
y = data[[target]]
del data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
del X
del y

In [6]:
regressor = RandomForestRegressor(n_estimators=200, n_jobs=10)
regressor.fit(X_train, y_train)
score = regressor.score(X_test, y_test)
print(score)

  


0.9784714890027345
