In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss, f1_score, roc_curve, precision_score, recall_score, accuracy_score, auc
import numpy as np
import datetime as dt
import os

In [None]:
# Playa attribute table, indexed by playa id and trimmed to the columns used in this notebook.
playa_att_df = (
    pd.read_csv('../data/playa_nogeometry_whucs.csv')
    .set_index('id')
    [['state','acres','countyfips','cluster','farmed','hydromod','healthy', 'huc12', 'author']]
)
# HUC4 code = first four characters of the HUC12 code rendered as a string.
playa_att_df['huc4'] = playa_att_df['huc12'].astype(str).str.slice(0, 4)

In [None]:
# Precipitation table: only the playa id, the 'ppt' value (renamed to 'precip')
# and the 'system:index' string are read.
precip_df = pd.read_csv(
    '../data/prism.csv',
    usecols=['id','ppt', 'system:index'],
)
precip_df = precip_df.rename(columns={'ppt':'precip'})

In [None]:
# Characters 0-3 of 'system:index' are read as the year and 4-5 as the month;
# the day is fixed to 1 so a full date can be assembled afterwards.
precip_df = precip_df.assign(
    year=precip_df['system:index'].str[0:4].astype('int16'),
    month=precip_df['system:index'].str[4:6].astype('int16'),
    day=1,
)

In [None]:
# Assemble a timestamp per row, keep rows from March 1984 onwards, and index by (id, date).
precip_df['date'] = pd.to_datetime(precip_df[['year','month', 'day']])
precip_df = (
    precip_df[precip_df['date'] >= pd.Timestamp('1984-03-01')]
    .drop(columns=['year','month','day','system:index'])  # helper columns are no longer needed
    .set_index(['id','date'])
)


In [None]:
# Model output table; the calibrated probabilities ('pred_cal') are used as 'pred' everywhere below.
pred_df = pd.read_csv('../data/all_preds_best_mean_inun_v2_calibrated.csv')
pred_df = pred_df.assign(pred=pred_df['pred_cal'])

In [None]:
# Add months and years back in.
# The same 418 monthly timestamps (starting 1984-03) are repeated for every block of 418 rows.
# NOTE(review): assumes each playa contributes exactly 418 consecutive rows in date order -- confirm
# against whatever wrote the predictions file.
pred_df['date'] = np.tile(
    pd.date_range('1984-03', periods=418, freq='M'),
    pred_df.shape[0] // 418,
)
pred_df = pred_df.set_index(['id','date'])

# Summary statistics of attributes (non-modeling)

In [None]:
# Convert playa area from acres to hectares, then print a few headline attribute statistics.
playa_att_df['ha'] = 0.40469*playa_att_df['acres']
for label, column, statistic in (
    ('Fraction farmed: ', 'farmed', 'mean'),
    ('Fraction modified: ', 'hydromod', 'mean'),
    ('Mean Size :', 'ha', 'mean'),
    ('Median Size :', 'ha', 'median'),
):
    print(label, getattr(playa_att_df[column], statistic)())

In [None]:
# One row per playa: the per-playa maximum of every prediction column (so 'true' is the
# maximum observed value for that playa), inner-joined to the playa attributes.
att_pred_df = (
    pred_df
    .reset_index()
    .groupby('id')
    .max()
    .join(playa_att_df, how='inner')
)


In [None]:
# Share of playas whose maximum 'true' value is 0, split at the median playa size (acres).
# The median is computed from the attribute table (same call as the size-quantile cell)
# instead of being pasted in as a literal, so it cannot go stale when the input data change.
median_acres = np.quantile(playa_att_df['acres'], 0.5)
print('never inundated, smaller than median:',1-att_pred_df.loc[att_pred_df['acres']<median_acres, 'true'].mean())
print('never inundated, larger than median:', 1-att_pred_df.loc[att_pred_df['acres']>median_acres, 'true'].mean())

In [None]:
# Fraction of playas (among those with predictions) below two area thresholds, in hectares.
for label, threshold_ha in (
    ('Fraction smaller than 1 ha:', 1),
    ('Fraction smaller than 0.5 ha:', 0.5),
):
    print(label, att_pred_df['ha'].lt(threshold_ha).mean())

In [None]:
# Summary restricted to the northern states.
# NOTE(review): 'NB' is not the USPS abbreviation for Nebraska ('NE'); confirm which code
# the 'state' column actually uses, otherwise Nebraska playas are silently excluded.
northern_att_pred_df = att_pred_df[att_pred_df['state'].isin(['KS', 'NB', 'CO'])]
for label, value in (
    ('northern states, farmed:', northern_att_pred_df['farmed'].mean()),
    ('northern states, median size:', northern_att_pred_df['ha'].median()),
    ('northern states, not inundated fraction:', 1-northern_att_pred_df['true'].mean()),
):
    print(label, value)

# Basic statistics

In [None]:
def calc_f1_score(true, pred, cutoff=0.25):
    """Return the F1 score of `pred` thresholded at `cutoff` against `true`.

    Parameters
    ----------
    true : array-like
        Ground-truth labels, passed to ``f1_score`` unchanged.
    pred : array-like supporting ``>`` with a scalar (e.g. numpy array, pandas Series)
        Predicted scores; values strictly greater than `cutoff` count as positive.
    cutoff : float, default 0.25
        Decision threshold applied to `pred`.

    Returns
    -------
    Whatever ``f1_score`` returns; ``zero_division=1`` is passed through to it.
    """
    predicted_positive = pred > cutoff
    return f1_score(true, predicted_positive, zero_division=1)

In [None]:
# Chronological split flag: 0 = train (before 2011), 1 = validation (2011-2014), 2 = test (2015 on).
pred_dates = pred_df.index.get_level_values('date')
pred_df['set_flag'] = 2
pred_df.loc[pred_dates < pd.Timestamp('2015-01-01'), 'set_flag'] = 1
pred_df.loc[pred_dates < pd.Timestamp('2011-01-01'), 'set_flag'] = 0

In [None]:
# Hard label: a prediction counts as positive when the probability exceeds 0.25
# (the same value as calc_f1_score's default cutoff).
pred_df = pred_df.assign(pred_binary=pred_df['pred'].gt(0.25))

In [None]:
# Slice the predictions into the three chronological splits defined by 'set_flag'.
train_pred_df = pred_df[pred_df['set_flag'].eq(0)]
val_pred_df = pred_df[pred_df['set_flag'].eq(1)]
test_pred_df = pred_df[pred_df['set_flag'].eq(2)]

#### Basic accuracy stats

In [None]:
# Training-split metrics. Label-based metrics use 'pred_binary'; log loss and the F1 helper
# take the probabilities in 'pred'.
for label, metric, column in (
    ('Train Accuracy: ', accuracy_score, 'pred_binary'),
    ('Train loss: ', log_loss, 'pred'),
    ('Train Recall:', recall_score, 'pred_binary'),
    ('Train Precision:', precision_score, 'pred_binary'),
    ('Train F1:', calc_f1_score, 'pred'),
):
    print(label, metric(train_pred_df['true'], train_pred_df[column]))

In [None]:
# Validation-split metrics. Label-based metrics use 'pred_binary'; log loss and the F1 helper
# take the probabilities in 'pred'.
for label, metric, column in (
    ('Val Accuracy: ', accuracy_score, 'pred_binary'),
    ('Val loss: ', log_loss, 'pred'),
    ('Val Recall:', recall_score, 'pred_binary'),
    ('Val Precision:', precision_score, 'pred_binary'),
    ('Val F1:', calc_f1_score, 'pred'),
):
    print(label, metric(val_pred_df['true'], val_pred_df[column]))

In [None]:
# Test-split metrics. Label-based metrics use 'pred_binary'; log loss and the F1 helper
# take the probabilities in 'pred'.
for label, metric, column in (
    ('Test Accuracy: ', accuracy_score, 'pred_binary'),
    ('Test loss: ', log_loss, 'pred'),
    ('Test Recall:', recall_score, 'pred_binary'),
    ('Test Precision:', precision_score, 'pred_binary'),
    ('Test F1:', calc_f1_score, 'pred'),
):
    print(label, metric(test_pred_df['true'], test_pred_df[column]))

#### Baseline using random or all zeros

In [None]:
# Baselines on the validation split: always predict 0, and a fair coin flip.
all_zero_val = np.zeros(val_pred_df['true'].shape)
print('All 0 Val Accuracy:', accuracy_score(val_pred_df['true'], all_zero_val))
print('All 0 Val Loss:', log_loss(val_pred_df['true'], all_zero_val))
# A seeded generator makes the random baseline print the same numbers on every run.
rand_val = np.random.default_rng(0).binomial(1, 0.5, val_pred_df['true'].shape)
print('Random Val Accuracy:', accuracy_score(val_pred_df['true'], rand_val))
print('Random Val Loss:', log_loss(val_pred_df['true'],rand_val))

In [None]:
# Baselines on the test split: always predict 0, and a fair coin flip.
all_zero_test = np.zeros(test_pred_df['true'].shape)
print('All 0 test Accuracy:', accuracy_score(test_pred_df['true'], all_zero_test))
print('All 0 test Loss:', log_loss(test_pred_df['true'], all_zero_test))
# A seeded generator makes the random baseline print the same numbers on every run.
rand_test = np.random.default_rng(0).binomial(1, 0.5, test_pred_df['true'].shape)
print('Random test Accuracy:', accuracy_score(test_pred_df['true'], rand_test))
print('Random test Loss:', log_loss(test_pred_df['true'],rand_test))

# ROC Curves

In [None]:
# ROC curve and AUC on the training split.
fpr, tpr, thresholds = roc_curve(
    train_pred_df['true'],
    train_pred_df['pred'],
)
print('Train AUC: ', auc(fpr, tpr))

plt.plot(fpr, tpr)
plt.xlabel('False Pos Rate', size=13)
plt.ylabel('True Pos Rate', size=13)
# Black y = x reference line, drawn past the visible range so it spans the whole panel.
plt.plot([-1,2], [-1,2], 'k-', lw=1)
plt.xlim(-0.02, 1.02)
plt.ylim(-0.02, 1.02)
plt.title('Train ROC', size=14)
plt.show()

In [None]:
# ROC curve and AUC on the validation split.
fpr, tpr, thresholds = roc_curve(
    val_pred_df['true'],
    val_pred_df['pred'],
)
print('Val AUC: ', auc(fpr, tpr))

plt.plot(fpr, tpr)
plt.xlabel('False Pos Rate', size=13)
plt.ylabel('True Pos Rate', size=13)
# Black y = x reference line, drawn past the visible range so it spans the whole panel.
plt.plot([-1,2], [-1,2], 'k-', lw=1)
plt.xlim(-0.02, 1.02)
plt.ylim(-0.02, 1.02)
plt.title('Val ROC', size=14)
plt.show()

In [None]:
# ROC curve and AUC on the test split.
fpr, tpr, thresholds = roc_curve(
    test_pred_df['true'],
    test_pred_df['pred'],
)
print('Test AUC: ', auc(fpr, tpr))
plt.plot(fpr, tpr)
plt.xlabel('False Pos Rate', size=13)
plt.ylabel('True Pos Rate',size=13)
# Black y = x reference line, drawn past the visible range so it spans the whole panel.
plt.plot([-1,2], [-1,2], 'k-', lw=1)
plt.xlim(-0.02, 1.02)
plt.ylim(-0.02, 1.02)
plt.title('Test ROC', size=14)
plt.show()

In [None]:
# Paper Figure: Test and Val together
# Left panel = validation ROC, right panel = test ROC; both panels share the same styling.
fig, ax = plt.subplots(1,2,figsize=(12,4))
for panel, frame, title in zip(ax, (val_pred_df, test_pred_df), ('Val ROC', 'Test ROC')):
    fpr, tpr, thresholds = roc_curve(frame['true'], frame['pred'])
    panel.plot(fpr, tpr)
    panel.set_xlabel('False Pos Rate', size=13)
    panel.set_ylabel('True Pos Rate',size=13)
    # Black y = x reference line.
    panel.plot([-1,2], [-1,2], 'k-', lw=1)
    panel.set_xlim([-0.02,1.02])
    panel.set_ylim([-0.02,1.02])
    panel.set_title(title, size=14)
# NOTE(review): absolute, user-specific output path; this cell fails on any other machine.
plt.savefig('/home/ksolvik/research/misc_projects/playa/deliverables/figures/val_test_roc.png', dpi=300, bbox_inches='tight')


# Error analysis broken down by playa size and other attributes

In [None]:
# Attach size, farmed and hydromod attributes to every test-period prediction row,
# then restore the (id, date) index.
test_pred_df_atts = (
    test_pred_df
    .reset_index()
    .merge(playa_att_df[['acres','farmed', 'hydromod']].reset_index(), on='id')
    .set_index(['id','date'])
)

In [None]:
# Size quantiles
# Quartile boundaries (in acres) over all playas, computed once and printed.
size_first_q, size_second_q, size_third_q = np.quantile(playa_att_df['acres'], [0.25, 0.5, 0.75])
print(size_first_q)
print(size_second_q)
print(size_third_q)

# Split the test rows into size bins. Upper bounds are inclusive so that a playa sitting
# exactly on a quantile boundary lands in exactly one bin instead of being dropped from all.
test_acres = test_pred_df_atts['acres']
test_pred_size_first_q = test_pred_df_atts.loc[test_acres<=size_first_q]
test_pred_size_second_q = test_pred_df_atts.loc[(test_acres>size_first_q) & (test_acres<=size_second_q)]
test_pred_size_third_q = test_pred_df_atts.loc[(test_acres>size_second_q) & (test_acres<=size_third_q)]
test_pred_size_fourth_q = test_pred_df_atts.loc[test_acres>size_third_q]
# Coarser split at the median.
test_pred_size_bigger = test_pred_df_atts.loc[test_acres>size_second_q]
test_pred_size_smaller = test_pred_df_atts.loc[test_acres<=size_second_q]


In [None]:
# Test metrics for the smallest size quartile.
for label, metric, column in (
    ('first_q Size Test Accuracy: ', accuracy_score, 'pred_binary'),
    ('first_q Size Test loss: ', log_loss, 'pred'),
    ('first_q Size Test Recall:', recall_score, 'pred_binary'),
    ('first_q Size Test Precision:', precision_score, 'pred_binary'),
    ('first_q Size Test F1:', calc_f1_score, 'pred'),
):
    print(label, metric(test_pred_size_first_q['true'], test_pred_size_first_q[column]))

In [None]:
# Test metrics for the second size quartile.
for label, metric, column in (
    ('second_q Size Test Accuracy: ', accuracy_score, 'pred_binary'),
    ('second_q Size Test loss: ', log_loss, 'pred'),
    ('second_q Size Test Recall:', recall_score, 'pred_binary'),
    ('second_q Size Test Precision:', precision_score, 'pred_binary'),
    ('second_q Size Test F1:', calc_f1_score, 'pred'),
):
    print(label, metric(test_pred_size_second_q['true'], test_pred_size_second_q[column]))

In [None]:
# Test metrics for the third size quartile.
for label, metric, column in (
    ('third_q Size Test Accuracy: ', accuracy_score, 'pred_binary'),
    ('third_q Size Test loss: ', log_loss, 'pred'),
    ('third_q Size Test Recall:', recall_score, 'pred_binary'),
    ('third_q Size Test Precision:', precision_score, 'pred_binary'),
    ('third_q Size Test F1:', calc_f1_score, 'pred'),
):
    print(label, metric(test_pred_size_third_q['true'], test_pred_size_third_q[column]))

In [None]:
# Test metrics for the largest size quartile.
for label, metric, column in (
    ('fourth_q Size Test Accuracy: ', accuracy_score, 'pred_binary'),
    ('fourth_q Size Test loss: ', log_loss, 'pred'),
    ('fourth_q Size Test Recall:', recall_score, 'pred_binary'),
    ('fourth_q Size Test Precision:', precision_score, 'pred_binary'),
    ('fourth_q Size Test F1:', calc_f1_score, 'pred'),
):
    print(label, metric(test_pred_size_fourth_q['true'], test_pred_size_fourth_q[column]))

In [None]:
# Test metrics for playas larger than the median size.
for label, metric, column in (
    ('bigger Size Test Accuracy: ', accuracy_score, 'pred_binary'),
    ('bigger Size Test loss: ', log_loss, 'pred'),
    ('bigger Size Test Recall:', recall_score, 'pred_binary'),
    ('bigger Size Test Precision:', precision_score, 'pred_binary'),
    ('bigger Size Test F1:', calc_f1_score, 'pred'),
):
    print(label, metric(test_pred_size_bigger['true'], test_pred_size_bigger[column]))

In [None]:
# Test metrics for playas smaller than the median size.
for label, metric, column in (
    ('smaller Size Test Accuracy: ', accuracy_score, 'pred_binary'),
    ('smaller Size Test loss: ', log_loss, 'pred'),
    ('smaller Size Test Recall:', recall_score, 'pred_binary'),
    ('smaller Size Test Precision:', precision_score, 'pred_binary'),
    ('smaller Size Test F1:', calc_f1_score, 'pred'),
):
    print(label, metric(test_pred_size_smaller['true'], test_pred_size_smaller[column]))

In [None]:
# Farmed vs not farmed: split the test rows on the 'farmed' flag (1 / 0).
test_pred_farmed = test_pred_df_atts[test_pred_df_atts['farmed'].eq(1)]
test_pred_unfarmed = test_pred_df_atts[test_pred_df_atts['farmed'].eq(0)]


In [None]:
# Test metrics for farmed playas.
# NOTE(review): the word 'Size' in these labels appears to be left over from the size-quantile cells.
for label, metric, column in (
    ('farmed Size Test Accuracy: ', accuracy_score, 'pred_binary'),
    ('farmed Size Test loss: ', log_loss, 'pred'),
    ('farmed Size Test Recall:', recall_score, 'pred_binary'),
    ('farmed Size Test Precision:', precision_score, 'pred_binary'),
    ('farmed Size Test F1:', calc_f1_score, 'pred'),
):
    print(label, metric(test_pred_farmed['true'], test_pred_farmed[column]))


In [None]:
# Test metrics for playas that are not farmed.
# NOTE(review): the word 'Size' in these labels appears to be left over from the size-quantile cells.
for label, metric, column in (
    ('unfarmed Size Test Accuracy: ', accuracy_score, 'pred_binary'),
    ('unfarmed Size Test loss: ', log_loss, 'pred'),
    ('unfarmed Size Test Recall:', recall_score, 'pred_binary'),
    ('unfarmed Size Test Precision:', precision_score, 'pred_binary'),
    ('unfarmed Size Test F1:', calc_f1_score, 'pred'),
):
    print(label, metric(test_pred_unfarmed['true'], test_pred_unfarmed[column]))

In [None]:
# Modified vs not: split the test rows on the 'hydromod' flag (1 / 0).
test_pred_mod = test_pred_df_atts[test_pred_df_atts['hydromod'].eq(1)]
test_pred_unmod = test_pred_df_atts[test_pred_df_atts['hydromod'].eq(0)]

In [None]:
# Test metrics for hydrologically modified playas.
# NOTE(review): the word 'Size' in these labels appears to be left over from the size-quantile cells.
for label, metric, column in (
    ('mod Size Test Accuracy: ', accuracy_score, 'pred_binary'),
    ('mod Size Test loss: ', log_loss, 'pred'),
    ('mod Size Test Recall:', recall_score, 'pred_binary'),
    ('mod Size Test Precision:', precision_score, 'pred_binary'),
    ('mod Size Test F1:', calc_f1_score, 'pred'),
):
    print(label, metric(test_pred_mod['true'], test_pred_mod[column]))


In [None]:
# Test metrics for playas without hydrological modification.
# NOTE(review): the word 'Size' in these labels appears to be left over from the size-quantile cells.
for label, metric, column in (
    ('unmod Size Test Accuracy: ', accuracy_score, 'pred_binary'),
    ('unmod Size Test loss: ', log_loss, 'pred'),
    ('unmod Size Test Recall:', recall_score, 'pred_binary'),
    ('unmod Size Test Precision:', precision_score, 'pred_binary'),
    ('unmod Size Test F1:', calc_f1_score, 'pred'),
):
    print(label, metric(test_pred_unmod['true'], test_pred_unmod[column]))


# Calculate loss per playa

In [None]:
# Helper that plots one playa's true vs. predicted record (used below for the best, worst, and a few mid-ranked playas)
def plot_single_playa_trajectory(df, id, ax=None, wide=False):
    """Plot the 'true' and 'pred' series of one playa with the train/val/test periods marked.

    Parameters
    ----------
    df : DataFrame
        Frame for which ``df.loc[id]`` yields that playa's rows with 'true' and 'pred' columns.
    id : index label
        Playa to plot.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, pandas creates a new figure and the function
        calls ``plt.show()`` itself at the end.
    wide : bool, default False
        Only used when `ax` is omitted: selects a flatter figure (15 x 2.2 instead of
        15 x 4) and a slightly lower label height.

    Returns
    -------
    None
    """
    playa_df = df.loc[id]
    owns_figure = ax is None
    if owns_figure:
        # Keep the Axes pandas creates. Previously `ax` stayed None on this path and the
        # formatting calls below raised AttributeError.
        ax = playa_df[['true','pred']].plot(figsize=[15,2.2] if wide else [15,4])
        label_y = 1.05 if wide else 1.1
    else:
        playa_df[['true','pred']].plot(ax=ax, legend=False)
        label_y = 1.05

    # Period labels, placed just after the start of each period.
    ax.text(dt.datetime(2011,3,1), label_y, 'Validation', size=14)
    ax.text(dt.datetime(2015,3,1), label_y, 'Test', size=14)
    ax.text(dt.datetime(1984,3,1), label_y, 'Train', size=14)

    ax.set_ylim(-0.025,1.025)
    ax.set_xlabel('Date', size=13)
    ax.set_ylabel('P(Inundation)', size=13)
    ax.set_xlim(dt.datetime(1984,1,1), dt.datetime(2019,1,1))
    # Vertical rules at the start of the test and validation periods.
    ax.axvline(dt.datetime(2015,1,1), color='black')
    ax.axvline(dt.datetime(2011,1,1), color='black')

    if owns_figure:
        plt.show()

In [None]:
def plot_precip_trajectory(df, id):
    """Plot the 'precip' series of one playa and mark the validation/test period starts.

    Parameters
    ----------
    df : DataFrame
        Frame for which ``df.loc[id]`` yields that playa's rows with a 'precip' column.
    id : index label
        Playa to plot.

    Draws on a new 15 x 5 figure and shows it; returns None.
    """
    playa_precip = df.loc[id]['precip']
    playa_precip.plot(figsize=[15,5])
    # Vertical rules at the start of the test and validation periods.
    for period_start in (dt.datetime(2015,1,1), dt.datetime(2011,1,1)):
        plt.axvline(period_start, color='black')
    # Period labels; y = 1.1 is in data (precipitation) units.
    for label_date, label in (
        (dt.datetime(2011,3,1), 'Validation'),
        (dt.datetime(2015,3,1), 'Test'),
        (dt.datetime(1984,3,1), 'Train'),
    ):
        plt.text(label_date, 1.1, label, size=14)
    plt.show()

### Test loss only

In [None]:
# Ids of playas whose 'true' value is above zero at least once during the test period.
max_inun_test = test_pred_df.groupby('id').agg({'true':'max'})
test_nonzero_ids = max_inun_test.index[(max_inun_test['true'] > 0).to_numpy()]

In [None]:
# Per-playa log loss over the test period, cached on disk so it is only computed once.
if not os.path.exists('../data/per_playa_test_loss_v2.csv'):
    # Work through the test rows in chunks of 5000 playas x 48 rows to save some memory.
    # NOTE(review): assumes every playa has exactly 48 contiguous test rows, so that no
    # playa straddles a chunk boundary -- confirm.
    per_loop = 5000
    rows_per_chunk = per_loop*48
    loss_chunks = []
    for start_point in np.arange(0, test_pred_df.shape[0], rows_per_chunk):
        # iloc clips at the end of the frame, so the last chunk needs no explicit bound.
        chunk = test_pred_df.iloc[start_point:start_point + rows_per_chunk]
        loss_chunks.append(
            chunk.groupby('id').apply(lambda df: log_loss(df['true'],df['pred'], labels=[0,1]))
        )
    # Series.append was removed in pandas 2.0; concatenate all chunks once instead.
    if loss_chunks:
        per_playa_test_loss_series = pd.concat(loss_chunks).rename('loss')
    else:
        per_playa_test_loss_series = pd.Series(dtype='float64', name='loss')

    per_playa_test_loss_series.to_csv('../data/per_playa_test_loss_v2.csv')
else:
    # header=0 discards the header line of the cached file; without it that line was parsed
    # as a data row and could turn the id column into strings.
    per_playa_test_loss_series = pd.read_csv(
        '../data/per_playa_test_loss_v2.csv', header=0, names=['id','loss'], index_col='id'
    )['loss']

In [None]:
# Test-period loss for the playas selected by test_nonzero_ids, sorted ascending
# (lowest loss first).
test_nonzero_per_playa_loss = (
    per_playa_test_loss_series
    .loc[test_nonzero_ids]
    .sort_values()
)

### Paper figure

In [None]:
# Three stacked panels: a low-loss playa, one from the middle of the ranking, and the highest-loss one.
# NOTE(review): panel (a) uses position 1 (second-lowest loss), not position 0 -- confirm this is intentional.
fig, axs = plt.subplots(3, figsize=(15,6.6))
example_positions = (1, int(test_nonzero_per_playa_loss.shape[0]/2)-1, -1)
for panel, position, tag in zip(axs, example_positions, ('(a)', '(b)', '(c)')):
    plot_single_playa_trajectory(pred_df, test_nonzero_per_playa_loss.index[position], ax=panel)
    panel.text(dt.datetime(1984,3,1), 0.86, tag, size=13)
axs[2].legend(['True','Predicted'], loc=9, prop={'size':12})
plt.subplots_adjust(hspace = 0.35)
# NOTE(review): absolute, user-specific output path; this cell fails on any other machine.
plt.savefig('/home/ksolvik/research/misc_projects/playa/deliverables/figures/inun_record_goodmedian_bad.png', dpi=300, bbox_inches='tight')
plt.show()

### Misc exploratory plots

In [None]:
# Lowest test loss: inundation record followed by the matching precipitation record.
best_test_playa_id = test_nonzero_per_playa_loss.index[0]
plot_single_playa_trajectory(pred_df, best_test_playa_id, wide=True)
plot_precip_trajectory(precip_df, best_test_playa_id)

In [None]:
# Highest test loss: inundation record followed by the matching precipitation record.
worst_test_playa_id = test_nonzero_per_playa_loss.index[-1]
plot_single_playa_trajectory(pred_df, worst_test_playa_id, wide=True)
plot_precip_trajectory(precip_df, worst_test_playa_id)

In [None]:
# Three playas around the middle of the test-loss ranking: the midpoint, then its two neighbours.
median_test_position = int(test_nonzero_per_playa_loss.shape[0]/2)
for offset in (0, 1, -1):
    plot_single_playa_trajectory(
        pred_df,
        test_nonzero_per_playa_loss.index[median_test_position + offset],
        wide=True,
    )

### Test f1

In [None]:
# Per-playa F1 score over the test period, cached on disk so it is only computed once.
if not os.path.exists('../data/per_playa_test_f1_v2.csv'):
    # Work through the test rows in chunks of 5000 playas x 48 rows to save some memory.
    # NOTE(review): assumes every playa has exactly 48 contiguous test rows, so that no
    # playa straddles a chunk boundary -- confirm.
    per_loop = 5000
    rows_per_chunk = per_loop*48
    f1_chunks = []
    for start_point in np.arange(0, test_pred_df.shape[0], rows_per_chunk):
        # iloc clips at the end of the frame, so the last chunk needs no explicit bound.
        chunk = test_pred_df.iloc[start_point:start_point + rows_per_chunk]
        f1_chunks.append(
            chunk.groupby('id').apply(lambda df: calc_f1_score(df['true'],df['pred']))
        )
    # Series.append was removed in pandas 2.0; concatenate all chunks once instead.
    if f1_chunks:
        per_playa_test_f1_series = pd.concat(f1_chunks).rename('f1')
    else:
        per_playa_test_f1_series = pd.Series(dtype='float64', name='f1')

    per_playa_test_f1_series.to_csv('../data/per_playa_test_f1_v2.csv')
else:
    # header=0 discards the header line of the cached file; without it that line was parsed
    # as a data row and could turn the id column into strings.
    per_playa_test_f1_series = pd.read_csv(
        '../data/per_playa_test_f1_v2.csv', header=0, names=['id','f1'], index_col='id'
    )['f1']

### All loss

In [None]:
# Ids of playas whose 'true' value is above zero at least once over the whole record.
max_inun = pred_df.groupby('id').agg({'true':'max'})
nonzero_ids = max_inun.index[(max_inun['true'] > 0).to_numpy()]

In [None]:
# Persist the per-playa maximum of 'true' for use outside this notebook.
max_inun_table = pd.DataFrame(max_inun)
max_inun_table.to_csv('../data/max_inundation.csv')

In [None]:
# Per-playa log loss over the whole record, cached on disk so it is only computed once.
if not os.path.exists('../data/per_playa_all_loss_v2.csv'):
    # Work through the rows in chunks of 2000 playas x 418 rows to save some memory.
    # NOTE(review): assumes every playa has exactly 418 contiguous rows, so that no
    # playa straddles a chunk boundary -- confirm.
    per_loop = 2000
    rows_per_chunk = per_loop*418
    loss_chunks = []
    for start_point in np.arange(0, pred_df.shape[0], rows_per_chunk):
        # iloc clips at the end of the frame, so the last chunk needs no explicit bound.
        chunk = pred_df.iloc[start_point:start_point + rows_per_chunk]
        loss_chunks.append(
            chunk.groupby('id').apply(lambda df: log_loss(df['true'],df['pred'], labels=[0,1]))
        )
    # Series.append was removed in pandas 2.0; concatenate all chunks once instead.
    if loss_chunks:
        per_playa_loss_series = pd.concat(loss_chunks).rename('loss')
    else:
        per_playa_loss_series = pd.Series(dtype='float64', name='loss')

    per_playa_loss_series.to_csv('../data/per_playa_all_loss_v2.csv')
else:
    # header=0 discards the header line of the cached file; without it that line was parsed
    # as a data row and could turn the id column into strings.
    per_playa_loss_series = pd.read_csv(
        '../data/per_playa_all_loss_v2.csv', header=0, names=['id','loss'], index_col='id'
    )['loss']

In [None]:
# Whole-record loss for the playas selected by nonzero_ids, sorted ascending (lowest loss first).
nonzero_per_playa_loss = (
    per_playa_loss_series
    .loc[nonzero_ids]
    .sort_values()
)

In [None]:
# Lowest whole-record loss: inundation record followed by the matching precipitation record.
best_playa_id = nonzero_per_playa_loss.index[0]
plot_single_playa_trajectory(pred_df, best_playa_id)
plot_precip_trajectory(precip_df, best_playa_id)

In [None]:
# Highest whole-record loss: inundation record followed by the matching precipitation record.
worst_playa_id = nonzero_per_playa_loss.index[-1]
plot_single_playa_trajectory(pred_df, worst_playa_id)
plot_precip_trajectory(precip_df, worst_playa_id)

In [None]:
# Three playas around the middle of the whole-record loss ranking: the midpoint, then its two neighbours.
median_position = int(nonzero_per_playa_loss.shape[0]/2)
for offset in (0, 1, -1):
    plot_single_playa_trajectory(pred_df, nonzero_per_playa_loss.index[median_position + offset])

# Playa Inundation over time (predicted vs real)

In [None]:
# Mean of 'true' and of the thresholded prediction across all playas, per date.
pred_df[['true','pred_binary']].groupby('date').mean().plot(figsize=[15,5])
# Split markers follow the set_flag definition: validation starts 2011-01-01, test 2015-01-01
# (they were previously drawn one year early, at 2010 and 2014).
plt.axvline(dt.datetime(2015,1,1), color='black')
plt.axvline(dt.datetime(2011,1,1), color='black')
plt.text(dt.datetime(2011,3,1), 0.14, 'Validation', size=15)
plt.text(dt.datetime(2015,3,1), 0.14, 'Test', size=15)
plt.text(dt.datetime(1984,5,1), 0.14, 'Train', size=15)
plt.legend(['True','Predicted'], loc=9)
plt.ylim([0,0.15])
plt.show()

# Mean precipitation across all playas, per date, with the same split markers.
precip_df[['precip']].groupby('date').mean().plot(figsize=[15,5])
plt.axvline(dt.datetime(2015,1,1), color='black')
plt.axvline(dt.datetime(2011,1,1), color='black')
plt.text(dt.datetime(2011,3,1), -0.5, 'Validation', size=15)
plt.text(dt.datetime(2015,3,1), -0.5, 'Test', size=15)
# A comma was missing before 'Train', which made the whole cell a SyntaxError.
plt.text(dt.datetime(1984,5,1), -0.5, 'Train', size=15)
plt.show()

# Spatial representation of val/test error

In [None]:
# Re-index the test and validation rows by playa id only ('date' becomes an ordinary column)
# so they can be joined to the attribute table.
test_pred_df = (
    test_pred_df
    .reset_index()
    .set_index('id')
)
val_pred_df = (
    val_pred_df
    .reset_index()
    .set_index('id')
)

In [None]:
# Attach playa attributes to every prediction row; rows whose id is missing from the
# attribute table are dropped by the inner join.
val_pred_df = val_pred_df.join(playa_att_df, how='inner')
test_pred_df = test_pred_df.join(playa_att_df, how='inner')

## By state to start

In [None]:
# Log loss and F1 per state on the test and validation rows, plus the playa count per state.
test_by_state = test_pred_df.groupby('state')
val_by_state = val_pred_df.groupby('state')
state_test_loss = test_by_state.apply(lambda df: log_loss(df['true'],df['pred'], labels=[0,1]))
state_val_loss = val_by_state.apply(lambda df: log_loss(df['true'],df['pred'], labels=[0,1]))
state_test_f1 = test_by_state.apply(lambda df: calc_f1_score(df['true'],df['pred']))
state_val_f1 = val_by_state.apply(lambda df: calc_f1_score(df['true'],df['pred']))
state_count = playa_att_df.groupby('state').size()

In [None]:
# 2 x 3 grid of bar charts: counts and validation metrics on the top row, test metrics
# below (the bottom-left panel is left empty).
fig, ax = plt.subplots(2,3, figsize=[15,8])
for series, panel, title in (
    (state_count, ax[0,0], 'Playa Count'),
    (state_val_loss, ax[0,1], 'Val Loss'),
    (state_val_f1, ax[0,2], 'Val F1'),
    (state_test_loss, ax[1,1], 'Test Loss'),
    (state_test_f1, ax[1,2], 'Test F1'),
):
    series.plot.bar(ax=panel, title=title)
fig.tight_layout()

## By HUC 4

In [None]:
# Log loss and F1 per HUC4 code on the test and validation rows, plus the playa count per HUC4.
test_by_huc = test_pred_df.groupby('huc4')
val_by_huc = val_pred_df.groupby('huc4')
huc_test_loss = test_by_huc.apply(lambda df: log_loss(df['true'],df['pred'], labels=[0,1]))
huc_val_loss = val_by_huc.apply(lambda df: log_loss(df['true'],df['pred'], labels=[0,1]))
huc_test_f1 = test_by_huc.apply(lambda df: calc_f1_score(df['true'],df['pred']))
huc_val_f1 = val_by_huc.apply(lambda df: calc_f1_score(df['true'],df['pred']))
huc_count = playa_att_df.groupby('huc4').size()

In [None]:
# 2 x 3 grid of bar charts per HUC4: counts and validation metrics on the top row, test
# metrics below (the bottom-left panel is left empty).
fig, ax = plt.subplots(2,3, figsize=[15,8])
for series, panel, title in (
    (huc_count, ax[0,0], 'HUC4 Playa Count'),
    (huc_val_loss, ax[0,1], 'Val Loss'),
    (huc_val_f1, ax[0,2], 'Val F1'),
    (huc_test_loss, ax[1,1], 'Test Loss'),
    (huc_test_f1, ax[1,2], 'Test F1'),
):
    series.plot.bar(ax=panel, title=title)
fig.tight_layout()

## By Author

In [None]:
# Log loss and F1 per 'author' value on the test and validation rows, plus the playa count per author.
test_by_author = test_pred_df.groupby('author')
val_by_author = val_pred_df.groupby('author')
author_test_loss = test_by_author.apply(lambda df: log_loss(df['true'],df['pred'], labels=[0,1]))
author_val_loss = val_by_author.apply(lambda df: log_loss(df['true'],df['pred'], labels=[0,1]))
author_test_f1 = test_by_author.apply(lambda df: calc_f1_score(df['true'],df['pred']))
author_val_f1 = val_by_author.apply(lambda df: calc_f1_score(df['true'],df['pred']))
author_count = playa_att_df.groupby('author').size()

In [None]:
# 2 x 3 grid of bar charts per author: counts and validation metrics on the top row, test
# metrics below (the bottom-left panel is left empty). Tick labels are rotated 45 degrees.
fig, ax = plt.subplots(2,3, figsize=[15,10])
for series, panel, title in (
    (author_count, ax[0,0], 'Author Playa Count'),
    (author_val_loss, ax[0,1], 'Val Loss'),
    (author_val_f1, ax[0,2], 'Val F1'),
    (author_test_loss, ax[1,1], 'Test Loss'),
    (author_test_f1, ax[1,2], 'Test F1'),
):
    series.plot.bar(ax=panel, title=title, rot=45)
fig.tight_layout()