In [None]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import xarray as xr
import numpy as np
import re
import json
import rasterio
import pickle
import joblib
import xgboost as xgb
import pandas as pd
import os, sys
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import xgboost as xgb
from sklearn.metrics import r2_score
import random
from pandarallel import pandarallel
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
pandarallel.initialize(progress_bar=True, nb_workers=10)

# File names of the ten-day ("dek") indicator files, keyed by the indicator
# name. The key is also the prefix of the file name.
# The original literal listed 'TG' twice with the same value; the duplicate
# entry has been removed (the resulting dict is unchanged).
ten_day_feature_files = {
    'BEDD': 'BEDD_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'FD': 'FD_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'R20mm': 'R20mm_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'R10mm': 'R10mm_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'ID': 'ID_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TG': 'TG_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TN': 'TN_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'DTR': 'DTR_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'RR1': 'RR1_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'RR': 'RR_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'SDII': 'SDII_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'SU': 'SU_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TNn': 'TNn_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TR': 'TR_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TX': 'TX_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TXn': 'TXn_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
    'TXx': 'TXx_C3S-glob-agric_WFDEI_hist_dek_19810101-20101231_v1.nc',
}

# File names of the seasonal indicator files, keyed by the indicator name.
# Every file name is the indicator name followed by the same fixed suffix.
seasonal_feature_files = {
    indicator: f'{indicator}_C3S-glob-agric_WFDEI_hist_season_19810101-20101231_v1.nc'
    for indicator in ('CDD', 'CFD', 'CWD', 'WW', 'WSDI', 'CSDI')
}

# Compute the 10 day features
def compute_ten_day_feature(feature, file_location, year, dataset, interpolation_method='linear', in_place_2010=False):
    """Add monthly sums of a ten-day indicator, interpolated to the rows of ``dataset``.

    For every month of ``year`` the three time steps dated the 5th, 15th and
    25th are selected from the file, interpolated to each row's (lat, lon)
    and summed into one column per month.

    Args:
        feature: Name of the data variable inside the file; also used as the
            prefix of the created columns.
        file_location: Path handed to ``xr.open_dataset``.
        year: Year used to build the time labels (``f'{year}-{month}-{day}'``).
        dataset: Table with ``lat`` and ``lon`` columns (presumably a pandas
            DataFrame -- it must support column assignment and
            ``drop(columns=...)``). Columns are added to this object.
        interpolation_method: ``method`` passed to ``interp``. The original
            code ignored this argument and always used ``'linear'``; the
            default keeps that behaviour.
        in_place_2010: If true, the monthly columns are named
            ``'{feature}-{month}-2010'`` regardless of ``year``; otherwise
            ``'{feature}-{month}-{year}'``.

    Returns:
        The result of dropping the temporary per-time-step columns from
        ``dataset``.
    """
    days = ['05', '15', '25']
    months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
    # Point-wise interpolation: both coordinate arrays share the dimension
    # 'z', so each (lat, lon) pair is interpolated as one point.
    lats_ = xr.DataArray(list(dataset['lat'].values), dims='z')
    lons_ = xr.DataArray(list(dataset['lon'].values), dims='z')
    with xr.open_dataset(file_location) as ds:
        feature_data = ds.load()
        for day in days:
            for month in months:
                time = f'{year}-{month}-{day}'
                timed = feature_data.sel(time=time).squeeze()
                feature_interpolated = timed.interp(lat=lats_, lon=lons_, method=interpolation_method)
                dataset[f'{feature}-{month}-{day}-{year}'] = feature_interpolated[feature]

    all_ten_days = []
    for month in months:
        ten_day_columns = [f'{feature}-{month}-{day}-{year}' for day in days]
        all_ten_days.extend(ten_day_columns)
        target_year = '2010' if in_place_2010 else year
        feature_name = f'{feature}-{month}-{target_year}'
        first, second, third = ten_day_columns
        dataset[feature_name] = dataset[first] + dataset[second] + dataset[third]

    # The per-time-step columns were only needed to build the monthly sums.
    dataset = dataset.drop(columns=all_ten_days)

    return dataset

def compute_seasonal_feature(feature, file_location, year, dataset, interpolation_method='linear', in_place_2010=False):
    """Add one column per quarter of a seasonal indicator, interpolated to the rows of ``dataset``.

    Args:
        feature: Name of the data variable inside the file; also the column prefix.
        file_location: Path handed to ``xr.open_dataset``.
        year: Year used to build the time label of each quarter.
        dataset: Table with ``lat`` and ``lon`` columns; columns are added to it.
        interpolation_method: ``method`` passed to ``interp``.
        in_place_2010: If true, columns are named ``'{feature}-{quarter}-2010'``
            regardless of ``year``; otherwise ``'{feature}-{quarter}-{year}'``.

    Returns:
        The same ``dataset`` object with the four quarterly columns set.
    """
    # Month-day part of the time label that is selected for each quarter.
    quarter_dates = {
        'Q1': '01-16',
        'Q2': '04-16',
        'Q3': '07-16',
        'Q4': '10-16',
    }
    # Both coordinate arrays share the dimension 'z', so every (lat, lon)
    # pair is interpolated as a single point.
    target_lats = xr.DataArray(list(dataset['lat'].values), dims='z')
    target_lons = xr.DataArray(list(dataset['lon'].values), dims='z')

    with xr.open_dataset(file_location) as opened:
        loaded = opened.load()

    column_year = '2010' if in_place_2010 else year
    for quarter, month_day in quarter_dates.items():
        snapshot = loaded.sel(time=f'{year}-{month_day}').squeeze()
        at_points = snapshot.interp(lat=target_lats, lon=target_lons, method=interpolation_method)
        dataset[f'{feature}-{quarter}-{column_year}'] = getattr(at_points, feature)
    return dataset

def autograd_variable(input):
    """Wrap ``input`` in ``autograd.Variable`` and move it to the GPU when CUDA is available."""
    wrapped = autograd.Variable(input)
    return wrapped.cuda() if torch.cuda.is_available() else wrapped

class NN(nn.Module):
    """Fully connected network with a single output unit.

    Four hidden layers of 1000 units, each followed by dropout and ReLU,
    then a linear layer producing one value per input row.

    Args:
        feature_count: Number of input features.
        dropout: Dropout probability used after every hidden linear layer.
    """

    def __init__(self, feature_count, dropout=0.05):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(feature_count, 1000),
            nn.Dropout(p=dropout),
            nn.ReLU(),
            nn.Linear(1000, 1000),
            nn.Dropout(p=dropout),
            nn.ReLU(),
            nn.Linear(1000, 1000),
            nn.Dropout(p=dropout),
            nn.ReLU(),
            nn.Linear(1000, 1000),
            nn.Dropout(p=dropout),
            nn.ReLU(),
            nn.Linear(1000, 1),
        )

    def forward(self, x):
        """Return the network output for ``x`` (moved to the GPU if CUDA is available)."""
        return self.fc(autograd_variable(x))

    def predict(self, x):
        """Return the network output for ``x`` without gradient tracking.

        The module is switched to evaluation mode for the duration of the
        call so that dropout is disabled and repeated calls give the same
        result; the previous training/evaluation mode is restored afterwards.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                if torch.cuda.is_available():
                    x = x.cuda()
                return self.forward(x)
        finally:
            # Leave the module in whatever mode the caller had set.
            self.train(was_training)

In [None]:
# Longitude is cut into slices of `degree_separation` degrees between -180
# and 180; `lon_range` holds the slice edges.
degree_separation = 3
lon_slices = int(360/degree_separation) + 1
lon_range = np.linspace(-180,180,lon_slices)

def select_bin(row):
    """Return the bin (0-3) of the longitude slice that contains ``row['lon']``.

    Slices are numbered from west to east and the bin is the slice index
    modulo 4. Slices are half-open ``[edge, next_edge)``, except the last
    one, which also includes its upper edge (180). The original strict
    comparisons returned None for a longitude lying exactly on an edge.

    Returns None when the longitude is NaN or outside the covered range.
    """
    lon = row['lon']
    # NaN fails both comparisons and therefore also ends up here.
    if not (lon_range[0] <= lon <= lon_range[-1]):
        return None
    # Index of the last edge that is <= lon.
    idx = int(np.searchsorted(lon_range, lon, side='right')) - 1
    # lon equal to the upper bound belongs to the last slice.
    idx = min(idx, len(lon_range) - 2)
    return idx % 4

# Compute the bin of every row in parallel and store it in the 'bin' column.
bin_per_row = final_data.parallel_apply(select_bin, axis=1)
final_data['bin'] = bin_per_row

In [None]:
# For every '<name>-2005' column store '<name>-2010' minus '<name>-2005' as
# 'delta-<name>' and drop the '<name>-2010' column afterwards.
for column in tqdm(final_data.columns):
    if '-2005' not in column:
        continue
    stem = column.replace('-2005', '')
    column_2010 = f'{stem}-2010'
    final_data[f'delta-{stem}'] = final_data[column_2010] - final_data[column]
    final_data = final_data.drop(columns=[column_2010])

In [None]:
# Load the pickled regressors from disk.
# NOTE(review): pickle can execute arbitrary code while loading; only open trusted files.
with open('regressors.pickle', 'rb') as pickle_file:
    regressors = pickle.load(pickle_file)

In [None]:
final_data = pd.read_csv('deltas.csv')

# Model inputs: every column whose name contains 'delta', followed by a
# fixed list of additional columns.
delta_columns = [name for name in final_data.columns if 'delta' in name]
fixed_columns = ['lon', 'lat', 'slope', 'irrigation', 'elevation', 'climate_zones', 'soil_types', 'maize_a_2005']
prediction_features = delta_columns + fixed_columns
prediction_features

In [None]:
# For each saved regressor, predict on the rows of another bin
# ((bin_idx + 2) % 4) and write the predictions next to 'maize_a_2010'.
for bin_idx in range(4):
    # NOTE(review): pickle can execute arbitrary code while loading; only open trusted files.
    with open(f'regressors_{bin_idx}.pickle', 'rb') as handle:
        regressor = pickle.load(handle)
    test_bin = ((bin_idx + 2) % 4)
    # Work on a copy: assigning a new column to a filtered slice of
    # final_data is a chained assignment and triggers SettingWithCopyWarning.
    test_data = final_data[final_data['bin'] == test_bin].copy()
    test_data['prediction'] = regressor.predict(test_data[prediction_features])
    test_data[['prediction', 'maize_a_2010']].to_csv(f'test_data_predictions_{bin_idx}')
    # Release the regressor before the next one is loaded.
    del regressor

In [None]:
# Read the pickled regressors back into memory.
# NOTE(review): pickle can execute arbitrary code while loading; only open trusted files.
with open('regressors.pickle', 'rb') as source:
    regressors = pickle.load(source)

In [None]:
from collections import defaultdict

# Average importance of every prediction feature over all loaded regressors.
# The divisor is the number of regressors instead of a hard-coded 4, so the
# result stays a mean if the number of regressors changes.
# Assumes feature_importances_ is ordered like prediction_features -- TODO confirm.
prediction_features_counts = defaultdict(float)
regressor_count = float(len(regressors))
for r in regressors.values():
    for importance, feature_name in zip(r.feature_importances_, prediction_features):
        prediction_features_counts[feature_name] += importance / regressor_count

# Ten features with the highest average importance, as (importance, name) pairs.
sorted([(v, k) for k, v in prediction_features_counts.items()], reverse=True)[0:10]

In [None]:
# Write every regressor to its own pickle file, named after its key.
for bin_key, fitted in regressors.items():
    with open(f'regressors_{bin_key}.pickle', 'wb') as target:
        pickle.dump(fitted, target, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Remove every column whose name contains '-2010'.
columns_2010 = [name for name in final_data.columns if '-2010' in name]
final_data = final_data.drop(columns=columns_2010)

In [None]:
# Model inputs: every column whose name contains 'delta', followed by a
# fixed list of additional columns.
delta_columns = [name for name in final_data.columns if 'delta' in name]
fixed_columns = ['lon', 'lat', 'slope', 'irrigation', 'elevation', 'climate_zones', 'soil_types', 'maize_a_2005']
prediction_features = delta_columns + fixed_columns
prediction_features

In [None]:
prediction_years = [2019, 2020, 2021, 2024, 2025, 2026, 2039, 2040, 2042]
rcps = ['2p6', '4p5', '6p0', '8p5']
# NOTE(review): out_file is not used anywhere in this cell.
out_file = f'../data/hadgem_predictions_year_{prediction_years}_final.csv'
model = 'hadgem'

# Predict for different years: for every RCP and year, recompute the
# indicator columns from the projection files (stored under the '-2010'
# names), rebuild the deltas against the '-2005' columns and predict with
# the regressor saved for each row's bin.
for rcp in rcps:
    folder = f'../data/{model}/{model}_rcp_{rcp}_agroclimatic_indicators/'
    for year in tqdm(prediction_years):
        print(f'Starting Model: {model} Year {year} RCP {rcp}')
        # The projection files are split into two periods.
        if int(year) in range(2011,2041):
            file_ending = '20110101-20401231_v1.nc'
        elif int(year) in range(2041,2071):
            file_ending = '20410101-20701231_v1.nc'
        else:
            # Without this branch file_ending would be undefined, or silently
            # reused from the previous iteration.
            raise ValueError(f'No projection file covers year {year} (expected 2011-2070)')

        for feature in tqdm(seasonal_feature_files.keys()):
            file_location = f'{folder}{feature}_C3S-glob-agric_hadgem2-es_rcp{rcp}_season_{file_ending}'
            final_data = compute_seasonal_feature(feature, file_location, str(year), final_data, in_place_2010=True)

        for feature in tqdm(ten_day_feature_files.keys()):
            file_location = f'{folder}{feature}_C3S-glob-agric_hadgem2-es_rcp{rcp}_dek_{file_ending}'
            final_data = compute_ten_day_feature(feature, file_location, str(year), final_data, in_place_2010=True)

        # NOTE(review): axis=1 fills missing values from neighbouring columns
        # of the same row, i.e. across different features -- confirm intended.
        final_data = final_data.interpolate(axis=1)
        for f in tqdm(final_data.columns):
            if '-2005' in f:
                feature = f.replace('-2005', '')
                final_data[f'delta-{feature}'] = final_data[f'{feature}-2010'] - final_data[f]
                final_data = final_data.drop(columns=[f'{feature}-2010'])

        # Float placeholder, so that writing float predictions below does not
        # change the dtype of the column. Rows whose bin is not 0-3 keep it.
        final_data[f'{rcp}_{year}_predict'] = 0.0
        for bin_ in [0, 1, 2, 3]:
            # NOTE(review): pickle can execute arbitrary code while loading; only open trusted files.
            with open(f'regressors_{bin_}.pickle', 'rb') as handle:
                regressor = pickle.load(handle)
            in_bin = final_data['bin'] == bin_
            final_data.loc[in_bin, f'{rcp}_{year}_predict'] = regressor.predict(final_data[in_bin][prediction_features])
            # Release the regressor before the next one is loaded.
            del regressor
        print(f'Finished year {year} rcp {rcp} model {model}')


final_data.to_csv('predictions')

In [None]:
# Export every column whose name contains 'pred' together with the
# reference and location columns.
export_columns = [name for name in final_data.columns if 'pred' in name]
export_columns += ['maize_a_2010', 'lon', 'lat', 'alloc_key']
final_data[export_columns].to_csv('final_predictions.csv')

In [None]:
# Every column whose name contains 'pred' plus the reference and location
# columns. Taken as an explicit copy so that columns can be added to
# `predictions` without writing into a slice of final_data
# (SettingWithCopyWarning).
predictions = final_data[[c for c in final_data.columns if 'pred' in c] + ['maize_a_2010', 'lon', 'lat', 'alloc_key']].copy()

In [None]:
# For each target year, take the mean and standard deviation over all RCPs
# and the three years around it.
prediction_years = [2040, 2025, 2020]
for year in tqdm(prediction_years):
    # The 2040 window uses 2042 in place of 2041.
    following_year = year + 2 if year == 2040 else year + 1
    window_years = [str(year), str(following_year), str(year - 1)]
    window_columns = [
        f'{rcp}_{window_year}_predict'
        for rcp in rcps
        for window_year in window_years
    ]
    window_predictions = predictions[window_columns]
    predictions[f'{year}_mean'] = window_predictions.mean(axis=1)
    predictions[f'{year}_std'] = window_predictions.std(axis=1)


predictions.to_csv(f'Predictions for Years: {prediction_years}.csv')