# Prediction Notebook
This notebook contains the code to produce predictions on GCM data using a pickled regressor.

It computes each agroclimatic indicator from the GCM data for the chosen year and then predicts using those features. This is very RAM-intensive, especially for a full dataset.

It does no feature engineering, so it assumes the input data already contains the precomputed non-climatic indicators.

Once the years have been predicted, the mean and standard deviation for a target year (e.g. 2025) are computed from the predictions of all RCPs for that year and for one year either side (4 RCPs × 3 years = 12 predictions in total).

In [None]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import numpy as np
import json
import pickle
import joblib
import pickle
from sklearn.ensemble import RandomForestRegressor

from shared_methods import compute_ten_day_feature, compute_seasonal_feature

In [None]:
# Add the correct file information.
# file_data.json supplies the data folders, crop files and feature-file lookups
# that are unpacked into the module-level names below.
with open('json_data/file_data.json', 'r') as config_handle:
    file_data = json.load(config_handle)

seasonal_data_folder = file_data['seasonal_data_folder']
ten_day_data_folder = file_data['ten_day_data_folder']
crop_data_2005_file = file_data['crop_data_2005_file']
crop_data_2010_file = file_data['crop_data_2010_file']
spatial_attributes_folder = file_data['spatial_attributes_folder']
ten_day_feature_files = file_data['ten_day_feature_files']
seasonal_feature_files = file_data['seasonal_feature_files']
spatial_attribute_files = file_data['spatial_attribute_files']

# Paths specific to this prediction run: where predictions are meant to be
# written, the pickled regressor to load, and the input feature table.
out_file = '../data/predictions.csv'
regressor_pickle_file = '../data/regressors_0.pickle'
data_file = '../data/deltas'

In [None]:
# Column bookkeeping for prediction: the target column, the columns that are
# excluded from the model inputs, and the agroclimatic indicator names.
target = 'maize_a_2010'
non_feature_columns = ['alloc_key', target]
# Indicator names are the keys of the feature-file lookups.
seasonal_features = seasonal_feature_files.keys()
ten_day_features = ten_day_feature_files.keys()

In [None]:
# Load the fitted regressor and the base feature table.
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load regressor pickles that come from a trusted source.
with open(regressor_pickle_file, 'rb') as handle:
    regressor = pickle.load(handle)
data = pd.read_csv(data_file)

# Every column except the non-feature columns is a model input.
# Dropping labels from the column Index (instead of from the DataFrame) avoids
# building a full copy of the data just to read its column names; a missing
# label still raises KeyError.
prediction_features = data.columns.drop(non_feature_columns)

In [None]:
# Loop through each rcp and year, compute the agroclimatic features for each year and predict based on those
# computed features.
prediction_years = [2024, 2025, 2026]
rcps = ['2p6', '4p5', '6p0', '8p5']
model = 'hadgem'


def _file_ending_for_year(year):
    """Return the indicator file-name suffix for the 30-year window containing ``year``.

    Raises:
        ValueError: if ``year`` falls outside the windows handled here (2011-2070).
    """
    year = int(year)
    if 2011 <= year <= 2040:
        return '20110101-20401231_v1.nc'
    if 2041 <= year <= 2070:
        return '20410101-20701231_v1.nc'
    raise ValueError(
        f'No agroclimatic indicator file is configured for year {year}; '
        'supported years are 2011-2070.'
    )


# Resolve every year's suffix up front so an unsupported year fails immediately
# instead of raising NameError (or silently reusing the previous year's suffix)
# part-way through the long-running loop below.
file_endings = {year: _file_ending_for_year(year) for year in prediction_years}

# Predict for different years
for rcp in rcps:
    folder = f'../data/{model}/{model}_rcp_{rcp}_agroclimatic_indicators/'
    for year in tqdm(prediction_years):
        print(f'Starting Model: {model} Year {year} RCP {rcp}')
        file_ending = file_endings[year]

        # NOTE(review): compute_seasonal_feature / compute_ten_day_feature come from
        # shared_methods; presumably they overwrite each feature column in `data`
        # with the values for `year` (in_place_2010=True) -- confirm.
        for feature in tqdm(seasonal_features):
            # Get the correct file location for hadgem data
            file_location = f'{folder}{feature}_C3S-glob-agric_hadgem2-es_rcp{rcp}_season_{file_ending}'
            data = compute_seasonal_feature(feature, file_location, str(year), data, in_place_2010=True)

        for feature in tqdm(ten_day_features):
            # Get the correct file location for hadgem data
            file_location = f'{folder}{feature}_C3S-glob-agric_hadgem2-es_rcp{rcp}_dek_{file_ending}'
            data = compute_ten_day_feature(feature, file_location, str(year), data, in_place_2010=True)

        # Fill missing values by interpolating across the columns of each row.
        data = data.interpolate(axis=1)

        # One prediction column per (rcp, year) pair, e.g. '4p5_2025_predict'.
        data[f'{rcp}_{year}_predict'] = regressor.predict(data[prediction_features])
        print(f'Finished year {year} rcp {rcp} model {model}')


# NOTE(review): `out_file` is defined earlier in the notebook but is not used
# here; the table is written to a file literally named 'predictions' in the
# working directory. Confirm which path is intended before changing it.
data.to_csv('predictions')

In [None]:
# Compute the mean and standard deviation for a prediction year by using the rcps for that year and the same for
# plus and minus one year either side of that year (12 predictions in total).

prediction_years = [2025]
for year in tqdm(prediction_years):
    mean_included_years = [str(year), str(year + 1), str(year - 1)]
    # Names of the prediction columns for every rcp in the three-year window.
    mean_included_year_columns = [
        f'{rcp}_{included_year}_predict'
        for rcp in rcps
        for included_year in mean_included_years
    ]
    # The '<rcp>_<year>_predict' columns live in `data`; the name `predictions`
    # used previously was never defined and raised NameError.
    window_predictions = data[mean_included_year_columns]
    data[f'{year}_mean'] = window_predictions.mean(axis=1)
    # pandas' default is the sample standard deviation (ddof=1).
    data[f'{year}_std'] = window_predictions.std(axis=1)


# NOTE(review): the file name renders as "Predictions for Years: [2025].csv";
# the ':' is not a valid file-name character on Windows -- confirm the name.
data.to_csv(f'Predictions for Years: {prediction_years}.csv')