In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import os

In [None]:
# Seed NumPy's global RNG so the np.random.binomial draws made in
# simulate_inundation are reproducible from a fresh kernel.
np.random.seed(5)

In [None]:
# Number of playas to simulate. Values below 71848 make the simulation cell
# use only the first 418 * n_playa rows of pred_df (handy for quick tests).
n_playa = 71848 # 71848 is all of them
# Number of independent binomial draws made for every playa/time-step.
n_iter = 100

In [None]:
# When True, the simulation cell loads './frac_inundated.npy' if that file
# exists instead of re-running the simulation; set to False to force a re-run.
reload_run = True

# Load data

In [None]:
pred_df = pd.read_csv('../data/all_preds.csv')

# Add months and years back in: the CSV carries no date column, so rebuild the
# 418 month-end timestamps starting in 1984-03 and repeat them once per block
# of 418 rows.
n_months = 418
month_ends = pd.date_range('1984-03', periods=n_months, freq='M')
n_repeats = pred_df.shape[0] // n_months
pred_df['date'] = np.tile(month_ends, n_repeats)

# Index by (id, date) so rows can later be grouped by date.
pred_df = pred_df.set_index(['id', 'date'])

# Simulate inundation and make plots

In [None]:
def simulate_inundation(pred_df, n_playa, n_iter, n_time=418):
    """Draw binary inundation histories from the predicted probabilities.

    For every iteration, one Bernoulli draw (binomial with n=1) is made per
    playa/time-step using the corresponding value of ``pred_df['pred']`` as
    the success probability.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Must contain a ``'pred'`` column with exactly ``n_playa * n_time``
        values; they are reshaped to ``(n_playa, n_time)``, i.e. each
        consecutive run of ``n_time`` rows is treated as one playa.
    n_playa : int
        Number of playas (first axis of the result).
    n_iter : int
        Number of independent simulation draws (last axis of the result).
    n_time : int, default 418
        Number of time-steps per playa (second axis of the result).

    Returns
    -------
    numpy.ndarray
        ``np.byte`` array of shape ``(n_playa, n_time, n_iter)`` holding only
        0s and 1s.

    Raises
    ------
    AssertionError
        If the result does not contain both a 0 and a 1 (this includes the
        case ``n_iter == 0``, where the -1 sentinel is never overwritten).
    ValueError
        From the reshape, if ``pred_df['pred']`` does not hold
        ``n_playa * n_time`` values.
    """
    # Pre-fill with the sentinel -1 so that any slice left unwritten is caught
    # by the min() check below. np.full writes the sentinel directly and avoids
    # the second full-size temporary that `np.zeros(...) - 1` would allocate.
    binary_histories = np.full((n_playa, n_time, n_iter), -1, dtype=np.byte)

    # The probabilities are identical for every iteration, so reshape once.
    probs = pred_df['pred'].values.reshape([n_playa, n_time])

    for i in range(n_iter):
        binary_histories[:, :, i] = np.random.binomial(n=1, p=probs)
        if i % 10 == 0:
            # Report the index of the iteration that just finished
            # (previously this printed n_iter, so every message was the same).
            print(i, 'done')

    # Checks
    assert binary_histories.shape == (n_playa, n_time, n_iter)
    assert binary_histories.min() == 0.
    assert binary_histories.max() == 1.

    # we expect no values between 0 and 1
    # Had to comment out because it duplicates the array and exceeds memory
    # But checked it on smaller test cases and it should be good

    # assert not np.logical_and(binary_histories > 0, binary_histories < 1).any()
    return binary_histories


In [None]:
# Reuse the cached per-time-step fractions when allowed; otherwise simulate.
cache_path = './frac_inundated.npy'
use_cache = reload_run and os.path.exists(cache_path)

if use_cache:
    frac_inundated = np.load(cache_path)
else:
    # For a partial run keep only the rows of the first n_playa playas
    # (418 rows per playa); for the full run pass the frame untouched.
    sim_input = pred_df.iloc[0:(418*n_playa)] if n_playa < 71848 else pred_df
    binary_histories = simulate_inundation(sim_input, n_playa, n_iter)

    # Average over the playa axis: one inundated fraction per (time, draw).
    frac_inundated = binary_histories.mean(axis=0)


In [None]:
# Summary statistics taken across axis 1 of frac_inundated (the simulation
# draws when frac_inundated comes from the simulation cell), giving one value
# per time-step.
inundation_sd = np.std(frac_inundated, axis=1)
inundation_mean_of_means = np.mean(frac_inundated, axis=1)
# np.percentile expects q on a 0-100 scale, so the 97.5th / 2.5th percentiles
# are q=97.5 and q=2.5 (both lines previously used q=0.975, i.e. the same
# 0.975th percentile).
inundation_975 = np.percentile(frac_inundated, 97.5, axis=1)
inundation_025 = np.percentile(frac_inundated, 2.5, axis=1)

In [None]:
# Plot the observed fraction inundated against the simulated mean and a
# +/- 10 standard-deviation band, with the train/validation/test periods marked.
dates = pred_df.index.get_level_values(1)[:418]
observed_frac = pred_df['true'].groupby('date').mean().values
upper_band = inundation_mean_of_means + 10*inundation_sd
lower_band = inundation_mean_of_means - 10*inundation_sd
band_style = dict(linestyle='--', color='darkorange', linewidth=1)

fig, ax = plt.subplots(figsize=[15,5])
ax.plot(dates, observed_frac, label='True')

# Only the upper band carries a label so the legend shows a single band entry.
ax.plot(dates, upper_band, label='Mean +/- 10*SD', **band_style)
ax.plot(dates, lower_band, **band_style)
ax.plot(dates, inundation_mean_of_means, label='Mean Pred', color='red')

# Commenting out spreads, not very interesting
# ax.plot(dates, inundation_975, label='97.5%')
# ax.plot(dates, inundation_025, label='2.5%')
ax.legend()
ax.set_xlabel("Time")
ax.set_ylabel("Fraction inundated")

# Vertical lines at the start of the test (2014) and validation (2010) periods.
ax.axvline(dt.datetime(2014,1,1), color='black')
ax.axvline(dt.datetime(2010,1,1), color='black')

# Period labels, placed just to the right of where each period begins.
ax.text(dt.datetime(2010,3,1), 0.135, 'Validation', size=15)
ax.text(dt.datetime(2014,3,1), 0.135, 'Test', size=15)
ax.text(dt.datetime(1984,5,1), 0.135, 'Train', size=15)
plt.show()


In [None]:
# Write frac_inundated to file to save time: the simulation cell loads this
# file instead of re-simulating when reload_run is True and the file exists.
# Note this runs unconditionally, so a value loaded from the cache is simply
# written back unchanged.
np.save('./frac_inundated.npy', frac_inundated)