In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
import numpy as np
import datetime as dt

In [None]:
# Load the playa attribute table, key it by playa id, and keep a fixed
# subset of attribute columns (in this order).
playa_att_df = (
    pd.read_csv('../data/playa_nogeometry_whucs.csv')
    .set_index('id')
    .loc[:, ['state', 'acres', 'countyfips', 'cluster', 'farmed',
             'hydromod', 'healthy', 'huc12', 'author']]
)

In [None]:
# Load the model predictions. Later cells rely on the 'id', 'true' and 'pred'
# columns of this table.
pred_df = pd.read_csv('../data/all_preds.csv')

In [None]:
# Add months and years back in.
# Every playa is expected to contribute one row per month: 418 consecutive
# month-end dates starting in March 1984, repeated once per playa.
# NOTE(review): this assumes the rows are grouped by playa and ordered by
# month within each playa - confirm against how all_preds.csv is written.
months_per_playa = 418
n_playas, leftover_rows = divmod(pred_df.shape[0], months_per_playa)
if leftover_rows:
    # Fail with an explicit message instead of a generic length mismatch.
    raise ValueError(
        'pred_df has %d rows, which is not a multiple of %d months per playa'
        % (pred_df.shape[0], months_per_playa)
    )
# Use an offset object rather than the 'M' alias: the alias is deprecated in
# pandas >= 2.2, while MonthEnd() is accepted by old and new versions alike
# and yields the same month-end dates.
monthly_dates = pd.date_range('1984-03', periods=months_per_playa, freq=pd.offsets.MonthEnd())
pred_df['date'] = np.tile(monthly_dates, n_playas)

In [None]:
# Index by (playa id, month) so that one playa's rows can be selected with
# pred_df.loc[id] and rows can be grouped by either level name.
pred_df = pred_df.set_index(['id', 'date'])

# Calculate loss per playa

In [None]:
# Work through the playas in batches to limit peak memory. Each playa has
# 418 monthly rows, so a batch of `per_loop` playas spans per_loop * 418 rows;
# `starts` holds the first row offset of every batch.
per_loop = 2000
starts = np.arange(start=0, stop=len(pred_df), step=per_loop * 418)

In [None]:
# Largest 'true' value recorded for each playa, then the ids of the playas
# that have at least one positive 'true' value.
max_inun = pred_df.groupby('id').agg({'true': 'max'})
has_positive_true = max_inun['true'].to_numpy() > 0
nonzero_ids = max_inun.index[has_positive_true]

In [None]:
# Log loss of each playa's predictions, computed one batch of rows at a time
# (offsets in `starts`) so only a slice of pred_df is grouped at once.
# NOTE(review): a playa is scored correctly only if all of its rows fall in
# the same batch, i.e. rows are contiguous per playa - confirm.
#
# Batch results are collected in a list and concatenated once at the end:
# Series.append was removed in pandas 2.0, and appending inside the loop
# re-copied the accumulated Series on every iteration.
batch_losses = []
for start_point in starts:
    end = min(pred_df.shape[0], start_point + per_loop*418)
    batch_losses.append(
        pred_df.iloc[start_point:end]
        .groupby('id')
        .apply(lambda df: log_loss(df['true'], df['pred'], labels=[0, 1]))
    )

# pd.concat raises on an empty list, so fall back to the empty float Series
# the original accumulator started from when there are no batches.
if batch_losses:
    per_playa_loss_series = pd.concat(batch_losses)
else:
    per_playa_loss_series = pd.Series(dtype='float64')

In [None]:
# Keep only the playas with at least one positive 'true' value and order them
# by ascending log loss: lowest-loss playa first, highest-loss playa last.
nonzero_per_playa_loss = per_playa_loss_series.loc[nonzero_ids].sort_values()

In [None]:
# Plot best, worst, and a few in the middle
def plot_single_playa_trajectory(df, id):
    """Plot the 'true' and 'pred' columns of a single playa.

    Parameters
    ----------
    df : DataFrame-like
        Prediction table; ``df.loc[id]`` must select the rows of one playa
        and expose 'true' and 'pred' columns.
    id : hashable
        Key of the playa to plot. The name shadows the ``id`` builtin inside
        this function.

    Returns
    -------
    None
        The 15x5 figure is drawn as a side effect.
    """
    df.loc[id][['true', 'pred']].plot(figsize=[15, 5])

In [None]:
# Best case: first entry of the ascending-sorted losses, i.e. the lowest log loss.
plot_single_playa_trajectory(pred_df,nonzero_per_playa_loss.index[0])

In [None]:
# Worst case: last entry of the ascending-sorted losses, i.e. the highest log loss.
plot_single_playa_trajectory(pred_df, nonzero_per_playa_loss.index[-1])

In [None]:
# Three playas from the middle of the loss ranking: the midpoint first,
# then the entry just after it and the entry just before it.
mid = int(nonzero_per_playa_loss.shape[0] / 2)
for offset in (0, 1, -1):
    plot_single_playa_trajectory(pred_df, nonzero_per_playa_loss.index[mid + offset])

# Playa Inundation over time (predicted vs real)

In [None]:
# Binarize the predictions: a predicted value strictly above 0.25 counts as positive.
pred_df = pred_df.assign(pred_binary=lambda frame: frame['pred'].gt(0.25))

In [None]:
# Per-date mean of the observed and binarized-predicted columns across playas.
mean_by_date = pred_df[['true', 'pred_binary']].groupby('date').mean()
ax = mean_by_date.plot(figsize=[15, 5])
# Black vertical markers at 2014-01-31 and 2010-01-31.
# NOTE(review): presumably these mark the period boundaries used for
# set_flag (2014-01-01 / 2010-01-01) - confirm.
ax.axvline(dt.datetime(2014, 1, 31), color='black')
ax.axvline(dt.datetime(2010, 1, 31), color='black')

# Spatial representation of val/test error
## By state to start

In [None]:
# Tag each row by the period its date falls in:
#   0 -> before 2010-01-01
#   1 -> from 2010-01-01 up to (not including) 2014-01-01
#   2 -> 2014-01-01 onwards
# np.select takes the first matching condition, so the earlier cutoff is listed first.
row_dates = pred_df.index.get_level_values(1)
pred_df['set_flag'] = np.select(
    [row_dates < pd.Timestamp('2010-01-01'), row_dates < pd.Timestamp('2014-01-01')],
    [0, 1],
    default=2,
).astype('int64')

In [None]:
# Rows flagged 2 form the test frame, rows flagged 1 the validation frame.
split_flag = pred_df['set_flag']
test_pred_df = pred_df[split_flag.eq(2)]
val_pred_df = pred_df[split_flag.eq(1)]

In [None]:
# Overall log loss over all test rows. labels=[0, 1] matches the other
# log_loss calls in this notebook and keeps the metric defined even when the
# rows contain only one of the two classes.
log_loss(test_pred_df['true'], test_pred_df['pred'], labels=[0, 1])

In [None]:
# Overall log loss over all validation rows. labels=[0, 1] matches the other
# log_loss calls in this notebook and keeps the metric defined even when the
# rows contain only one of the two classes.
log_loss(val_pred_df['true'], val_pred_df['pred'], labels=[0, 1])

In [None]:
# Replace the (id, date) MultiIndex with a plain 'id' index ('date' becomes a
# regular column) so the frames line up with playa_att_df, which is indexed by 'id'.
test_pred_df = test_pred_df.reset_index().set_index('id')
val_pred_df = val_pred_df.reset_index().set_index('id')

In [None]:
# Attach the playa attributes to every prediction row by matching on the 'id'
# index. The inner join drops rows whose id is missing from either table.
test_pred_df = test_pred_df.join(playa_att_df, how='inner')
val_pred_df = val_pred_df.join(playa_att_df, how='inner')

In [None]:
# Log loss per state, computed the same way for the test and validation frames.
state_test_loss, state_val_loss = (
    frame.groupby('state').apply(
        lambda grp: log_loss(grp['true'], grp['pred'], labels=[0, 1])
    )
    for frame in (test_pred_df, val_pred_df)
)

In [None]:
# Validation chart first, then test. The two bar charts are otherwise
# indistinguishable, so each one gets a title and a y-axis label.
for chart_title, loss_by_state in (
    ('Validation log loss by state', state_val_loss),
    ('Test log loss by state', state_test_loss),
):
    ax = loss_by_state.plot.bar(title=chart_title)
    ax.set_ylabel('log loss')
    plt.show()

## By HUC 4

In [None]:
# Derive the HUC4 code as the first four characters of the HUC12 code.
# NOTE(review): if huc12 was parsed as a number, any leading zero is already
# lost before the string conversion - confirm the codes here never start with 0.
for split_df in (test_pred_df, val_pred_df):
    split_df['huc4'] = split_df['huc12'].astype(str).str.slice(0, 4)

In [None]:
# Log loss per HUC4 code, computed the same way for the test and validation frames.
huc_test_loss, huc_val_loss = (
    frame.groupby('huc4').apply(
        lambda grp: log_loss(grp['true'], grp['pred'], labels=[0, 1])
    )
    for frame in (test_pred_df, val_pred_df)
)

In [None]:
# Validation chart first, then test. The two bar charts are otherwise
# indistinguishable, so each one gets a title and a y-axis label.
for chart_title, loss_by_huc in (
    ('Validation log loss by HUC4', huc_val_loss),
    ('Test log loss by HUC4', huc_test_loss),
):
    ax = loss_by_huc.plot.bar(title=chart_title)
    ax.set_ylabel('log loss')
    plt.show()

## By Author

In [None]:
# Log loss per 'author' value, computed the same way for the test and validation frames.
author_test_loss, author_val_loss = (
    frame.groupby('author').apply(
        lambda grp: log_loss(grp['true'], grp['pred'], labels=[0, 1])
    )
    for frame in (test_pred_df, val_pred_df)
)

In [None]:
# Validation chart first, then test. The two bar charts are otherwise
# indistinguishable, so each one gets a title and a y-axis label.
for chart_title, loss_by_author in (
    ('Validation log loss by author', author_val_loss),
    ('Test log loss by author', author_test_loss),
):
    ax = loss_by_author.plot.bar(title=chart_title)
    ax.set_ylabel('log loss')
    plt.show()