In [None]:
# general
import time
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import warnings

# modeling
import lightgbm as lgb
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import shap

# custom
from scripts import HyperparameterTuning as HT
from scripts import Plots, Metrics

# Load and pre-process data

In [None]:
# options
env_path = 'data/environmental_data.csv'
bat_path = 'data/bat-level_data.csv'
random_state = 1337
dataset = 'bat' # 'env', 'bat'
target = 'shortage'

# fail early on a mistyped dataset name; otherwise anything other than 'env'
# would silently select the bat-level feature list below
if dataset not in ('env', 'bat'):
    raise ValueError(f"dataset must be 'env' or 'bat', got {dataset!r}")

# load configs: one feature name per line; blank lines (e.g. a trailing
# newline at the end of the file) are ignored instead of becoming '' features
with open('config/env_features.txt', 'r') as f:
    env_features = [line.strip() for line in f if line.strip()]
with open('config/bat_features.txt', 'r') as f:
    bat_features = [line.strip() for line in f if line.strip()]

# rename.tsv: first tab-separated column is the key, second is its replacement
rename = {}
with open('config/rename.tsv', 'r') as f:
    for line in f:
        fields = line.strip().split('\t')
        if fields == ['']:
            # blank line: nothing to map (previously raised IndexError)
            continue
        rename[fields[0]] = fields[1]

# book keeping
features = env_features if dataset == 'env' else bat_features

# read both datasets, order each by (year, month), and outer-join them on
# those two calendar columns
date_keys = ['cal_year', 'cal_month']
df_env, df_bat = (
    pd.read_csv(path).sort_values(by=date_keys, ascending=[True, True])
    for path in (env_path, bat_path))
df = pd.merge(df_env, df_bat, on=date_keys, how='outer')

# encode the string labels as 1 / 0 (applied frame-wide), then keep only the
# rows whose target is known and store the target as integers
for label, code in (('shortage', 1), ('not_shortage', 0)):
    df = df.replace(label, code)
df = df.dropna(subset=[target])
df[target] = df[target].astype(int)

# add the season code (1-4) of each calendar month as an extra float column
season_months = [(12, 1, 2), (3, 4, 5), (6, 7, 8), (9, 10, 11)]
to_season = {
    month: season
    for season, months in enumerate(season_months, start=1)
    for month in months}
df['cal_season'] = df['cal_month'].apply(
    lambda month: to_season[month]).astype(float)

# keep only rows that have at least half of the feature values present
min_present = 0.5*len(features)
df = df.dropna(subset=features, thresh=min_present)

# count the remaining gaps before they are filled (reported in the summary)
feature_frame = df[features]
n_missing = feature_frame.isna().sum().sum()
n_total = len(df) * len(features)

# fill the gaps with scikit-learn's iterative imputer
# NOTE(review): the imputer is fit on every remaining row, including rows that
# are later used for validation/testing -- confirm this is intended
imp = IterativeImputer(max_iter=100, random_state=random_state)
df[features] = imp.fit_transform(feature_frame)

# calendar columns: store year/month as integers, then shift season/month so
# their smallest value becomes 0 and mark them as categorical
for column in ('cal_year', 'cal_month'):
    df[column] = df[column].astype(int)
for column in ('cal_season', 'cal_month'):
    shifted = df[column] - df[column].min()
    df[column] = shifted.astype('category')

# renumber the rows from 0
df = df.reset_index(drop=True)

# summary
n_months = len(df)
n_shortages = df[target].sum()
print('Missing proportion: {:.2f}% values'.format(n_missing / n_total * 100))
print('Data shape: {} months, {} features'.format(*df[features].shape))
print('Food shortages: {} / {} ({:.2f}%)'.format(
    n_shortages, n_months, n_shortages / n_months * 100))

# Split data into train/val/test sets

In [None]:
# options
start_year = df['cal_year'].min() + 4
test_year = 2018
split_size = 2

# convert dataframes to numpy arrays for modeling
X, y = df[features].values, df[target].values
years = df['cal_year'].values

# get indices of training and validation sets (expanding window): each split
# trains on every year before `sy` and validates on the following
# `split_size` years
split_years = np.arange(start_year, test_year, split_size)
train_idxs, val_idxs = [], []
for sy in split_years:
    # clip the last window at test_year: when (test_year - start_year) is not
    # a multiple of split_size the unclipped window would include test rows,
    # leaking them into validation and duplicating them in the predictions
    val_end = min(sy + split_size, test_year)
    train_idxs.append(np.argwhere(years < sy).flatten())
    val_idxs.append(np.argwhere((years >= sy) & (years < val_end)).flatten())
test_idx = np.argwhere(years >= test_year).flatten()

# compute train/val score weights (number of training rows in each split)
weights = [len(t_idx) for t_idx in train_idxs]

# plot train/val/test splits
Plots.cv_splits(
    years,
    cv_splits=[train_idxs, val_idxs, test_idx],
    dataset=dataset,
    save_name=f'figures/{dataset}_cv_splits.pdf')

# Train GBDT using hyperparameter grid search

In [None]:
# estimator class used for tuning and for every later fit
model_class = lgb.LGBMClassifier

# options: whether to reuse the stored best parameters, and the numpy seed
use_best = False
np.random.seed(random_state)

# scoring hook attached to the model class: minimum validation log loss
def evaluate(self, x, y):
    """Return the lowest validation log loss recorded while fitting.

    `x` and `y` are accepted but not used; the score is read from the
    'binary_logloss' history of the first evaluation set ('valid_0') stored
    in `self.evals_result_`.
    """
    logloss_history = self.evals_result_['valid_0']['binary_logloss']
    return min(logloss_history)
model_class.evaluate = evaluate

# model parameters: grid of candidate values for each tuned hyperparameter
model_hyper = dict(
    learning_rate=[0.01, 0.05, 0.1],       # boosting learning rate (default=0.1)
    num_leaves=[4, 8, 16],                 # max tree leaves for base learners (default=31)
    max_bin=[4, 8, 16],                    # max bins that feature is bucketed (default=255)
    min_child_samples=[1, 5, 10],          # min data in leaf (default=20)
    min_child_weight=[0.001, 0.01, 0.1],   # min sum leaf hessian (default=1e-3)
    reg_alpha=[0.0, 0.01, 0.1],            # l1 regularization (default=0.0)
    reg_lambda=[0.0, 0.01, 0.1],           # l2 regularization (default=0.0)
)

# model parameters held constant across the grid
model_fixed = dict(
    boosting_type='gbdt',        # gradient boosting decision tree
    objective='binary',          # shortage / no shortage binary classification
    n_estimators=100,            # number of iterations
    class_weight='balanced',     # include class balance in loss function
    importance_type='split',     # feature importance by number of splits
    n_jobs=1,                    # -1: use all cores (wasteful on small datasets like this one)
    max_depth=4,                 # constrain max tree depth to prevent overfitting
    cat_l2=0,                    # L2 regularization for categorical features
    cat_smooth=0,                # smoothing for categorical features
    min_split_gain=0.0,          # min. reduction in loss from adding a split
    verbose=-1,                  # -1 = silent, 0 = warn, 1 = info
    random_state=random_state,   # seed
)

# train parameters: nothing is tuned at fit time, everything is fixed
train_hyper = dict()
# early stopping so lgbm records validation performance; the patience equals
# the configured number of iterations
early_stopping_cb = lgb.early_stopping(
    model_fixed['n_estimators'], verbose=False)
train_fixed = dict(
    feature_name='auto',                     # 'None', 'auto'
    categorical_feature=None,                # None, 'auto'
    eval_metric=model_fixed['objective'],
    callbacks=[early_stopping_cb],
)

# best parameters per dataset, written as single-value grids so they can be
# passed through the same tuning call
env_best = dict(
    learning_rate=[0.1],
    num_leaves=[16],
    max_bin=[4],
    min_child_samples=[5],
    min_child_weight=[0.01],
    reg_alpha=[0.0],
    reg_lambda=[0.01],
)
bat_best = dict(
    learning_rate=[0.1],
    num_leaves=[8],
    max_bin=[16],
    min_child_samples=[5],
    min_child_weight=[0.01],
    reg_alpha=[0.0],
    reg_lambda=[0.1],
)

# with use_best set, replace the search grid by the preset of the dataset
best_by_dataset = {'env': env_best, 'bat': bat_best}
if use_best and dataset in best_by_dataset:
    model_hyper = best_by_dataset[dataset]

# tune model hyperparameters over the configured grid and data splits
tuning_kwargs = dict(
    model_class=model_class,
    model_hyper=model_hyper,
    model_fixed=model_fixed,
    train_hyper=train_hyper,
    train_fixed=train_fixed,
    x=X,
    y=y,
    train_indices=train_idxs,
    val_indices=val_idxs,
    weights=weights,
    verbose=True,
)
tuning_result = HT.hyperparameter_tuning(**tuning_kwargs)
model_best, train_best, model_list, score_list = tuning_result

# short pause after tuning (presumably lets progress output settle -- confirm)
time.sleep(0.5)

# best iteration is the number of boosting rounds that minimizes the
# validation score
def get_best_iter(m):
    """Return the 1-based boosting round with the lowest validation log loss.

    The loss history in `m.evals_result_['valid_0']['binary_logloss']` has one
    entry per boosting round, so the entry at position k was produced by a
    model with k + 1 trees. The position of the minimum is therefore shifted
    by one so the result can be used directly as a tree count (n_estimators /
    num_iteration) and is never 0. NaN entries are ignored.
    """
    history = m.evals_result_['valid_0']['binary_logloss']
    return int(np.nanargmin(history)) + 1

# pick the hyperparameter setting with the lowest weighted mean score
# (presumably one score per train/val split, matching `weights` -- confirm)
score_means = []
for split_scores in score_list:
    score_means.append(np.average(split_scores, weights=weights))
best_setting = np.nanargmin(score_means)
best_models = model_list[best_setting]

# weighted mean of the best iteration of each model of that setting, truncated
# to an integer, becomes the number of estimators for all later fits
iterations = list(map(get_best_iter, best_models))
best_iteration = int(np.average(iterations, weights=weights))
model_fixed['n_estimators'] = best_iteration
print('Best iteration:', best_iteration)

# Plot model predictions

In [None]:
def predict(train_idx, val_idx):
    """Fit a fresh model on the `train_idx` rows and score both index sets.

    Reads the notebook-level `X`, `y`, `model_class`, `model_best`,
    `model_fixed`, `train_best`, `train_fixed` and `best_iteration`. The
    `val_idx` rows are passed to the fit as the evaluation set.

    Returns a tuple (probabilities of class 1 on the training rows,
    probabilities of class 1 on the validation rows, fitted model).
    """
    x_fit, y_fit = X[train_idx], y[train_idx]
    x_eval, y_eval = X[val_idx], y[val_idx]

    # fit model
    estimator = model_class(**model_best, **model_fixed)
    model = estimator.fit(
        X=x_fit,
        y=y_fit,
        eval_set=(x_eval, y_eval),
        **train_best,
        **train_fixed,
    )

    # predict probabilities of the positive class (column 1)
    def positive_proba(rows):
        return model.predict_proba(rows, num_iteration=best_iteration)[:, 1]

    y_pred_train = positive_proba(x_fit)
    y_pred_val = positive_proba(x_eval)

    return y_pred_train, y_pred_val, model

# compute predictions for each set: the first training split, every
# validation split (each from a model fit on its own training split), and
# the test rows (from a model fit on all rows positioned before them)
y_prob_train = predict(train_idxs[0], val_idxs[0])[0]
val_chunks = []
for fold_train, fold_val in zip(train_idxs, val_idxs):
    val_chunks.append(predict(fold_train, fold_val)[1])
y_prob_val = np.concatenate(val_chunks)
rows_before_test = np.arange(test_idx[0])
y_prob_test = predict(rows_before_test, test_idx)[1]
y_prob = np.concatenate([y_prob_train, y_prob_val, y_prob_test])

# create 'year-month' date strings indexed 0..N-1
# NOTE(review): cal_month was shifted to start from 0 during preprocessing, so
# the month part here is zero-based -- confirm Plots/Metrics expect that
year_str = df['cal_year'].astype(str)
month_str = df['cal_month'].astype(int).astype(str)
dates = year_str + '-' + month_str
dates = pd.Series(dates.values, index=np.arange(len(dates)))

# compute optimal probability threshold based on f1 score, using the pooled
# validation rows, and binarize all probabilities with it
val_idx = np.concatenate(val_idxs)
threshold = Metrics.get_threshold(y[val_idx], y_prob_val)
y_pred = (y_prob > threshold).astype(int)

# plot predictions over time
prediction_plot_kwargs = dict(
    y_true=y,
    y_probs=[y_prob_train, y_prob_val, y_prob_test],
    cv_splits=[train_idxs, val_idxs, test_idx],
    dates=dates,
    threshold=threshold,
    dataset=dataset,
    save_name=f'figures/{dataset}_predictions.pdf',
)
Plots.predictions(**prediction_plot_kwargs)

# Print model performance metrics

In [None]:
# report metrics for the model probabilities, passing the pooled validation
# rows and the test rows as the index sets
date_labels = dates.values
Metrics.print_metrics(
    y, y_prob, date_labels, val_idx=val_idx, test_idx=test_idx)

# Compute shap values and shap interactions

In [None]:
# book keeping: everything before test_year is training data here
train_idx = np.argwhere(years < test_year).flatten()
test_idx = np.argwhere(years >= test_year).flatten()

# split datasets into frames whose columns carry the renamed feature labels
feature_labels = [rename[f] for f in features]
x_train = pd.DataFrame(X[train_idx], columns=feature_labels)
y_train = y[train_idx]
x_test = pd.DataFrame(X[test_idx], columns=feature_labels)
y_test = y[test_idx]
x_total = pd.DataFrame(X, columns=feature_labels)
y_total = y

# prefit GBM Classifier on training set (test rows serve as evaluation set)
gbdt = predict(train_idx, test_idx)[2]

# compute shap values [N, F]; index 1 on the last axis selects the shortage
# class
explainer = shap.Explainer(gbdt, algorithm='tree', seed=random_state)
train_shap_values = explainer(x_train)[:, :, 1]
test_shap_values = explainer(x_test)[:, :, 1]

# compute SHAP interactions [N, F, F]
train_shap_interactions = explainer.shap_interaction_values(x_train)
test_shap_interactions = explainer.shap_interaction_values(x_test)

# normalize shap values [F]: mean absolute value per feature, scaled so the
# features sum to 1
train_mean_abs = np.abs(train_shap_values.values).mean(0)
train_shap_norms = train_mean_abs / train_mean_abs.sum()
test_mean_abs = np.abs(test_shap_values.values).mean(0)
test_shap_norms = test_mean_abs / test_mean_abs.sum()

# Plot shap values

In [None]:
# options: show the features whose normalized training importance exceeds
# 2.5%, ordered from most to least important on the training set
max_display = (train_shap_norms > 0.025).sum()
order = list(np.argsort(train_shap_norms)[::-1])
cmap = 'coolwarm'

# plot shap values for the training set, then for the test set, with the same
# feature order and display limit
beeswarm_jobs = (
    ('train', train_shap_values, f'figures/{dataset}_shap_values_train.pdf'),
    ('test', test_shap_values, f'figures/{dataset}_shap_values_test.pdf'),
)
for split_name, split_shap_values, split_save_name in beeswarm_jobs:
    Plots.shap_beeswarm(
        shap_values=split_shap_values,
        max_display=max_display,
        split=split_name,
        order=order,
        save_name=split_save_name,
    )

# Plot shap interactions

In [None]:
# get the two top-ranked features: names, training values, shap values, and
# their pairwise interaction (doubled)
f1, f2 = features[order[0]], features[order[1]]
col1, col2 = features.index(f1), features.index(f2)
d1 = x_train[rename[f1]].values.copy()
d2 = x_train[rename[f2]].values.copy()
s1 = train_shap_values.values[:, col1].copy()
s2 = train_shap_values.values[:, col2].copy()
i = 2*train_shap_interactions[:, col1, col2]

# prepare data: feature values as two columns, binary labels marking where
# each shap quantity is positive
x_dt = np.stack((d1, d2), axis=1)
def _is_positive(values):
    return (np.sign(values) > 0).astype(int)
y1 = _is_positive(s1)
y2 = _is_positive(s2)
yi = _is_positive(i)

# individual splits: one per feature, from its own column and shap sign
split_f1 = Metrics.optimal_shap_splits(x_dt[:, 0:1], y1, [f1])[1][0]
split_f2 = Metrics.optimal_shap_splits(x_dt[:, 1:2], y2, [f2])[1][0]
individual_splits = [split_f1, split_f2]

# interaction splits: from both columns and the interaction sign
interaction_splits = Metrics.optimal_shap_splits(x_dt, yi, [f1, f2])[1]

interaction_plot_kwargs = dict(
    feature_names=[f1, f2],
    feature_data=[d1, d2],
    feature_shaps=[s1, s2],
    feature_interactions=i,
    individual_splits=individual_splits,
    interaction_splits=interaction_splits,
    rename=rename,
    save_name=f'figures/{dataset}_shap_interactions.pdf',
)
Plots.shap_interactions(**interaction_plot_kwargs)

# Plot threshold model

In [None]:
#
# plot threshold model
#

# book keeping: one rule per top feature; each maps (values, threshold) to a
# 0/1 array. The comparison direction is specific to the dataset.
if dataset == 'env':
    rules = [
        lambda x, t: (x >= t).astype(int),
        lambda x, t: (x >= t).astype(int)]
elif dataset == 'bat':
    rules = [
        lambda x, t: (x <= t).astype(int),
        lambda x, t: (x > t).astype(int)]
else:
    # previously `rules` stayed undefined and the plot call failed with a
    # NameError; report the actual problem instead
    raise ValueError(
        f"no threshold rules defined for dataset {dataset!r}; "
        "expected 'env' or 'bat'")

# values of the two top features over all rows (train and test)
d1 = x_total[rename[f1]].values.copy()
d2 = x_total[rename[f2]].values.copy()

# plot threshold model
Plots.threshold_model(
    y_true=y,
    dates=dates,
    feature_names=[rename[f1], rename[f2]],
    feature_data=[d1, d2],
    feature_splits=interaction_splits,
    feature_rules=rules,
    train_idx=train_idx,
    test_idx=test_idx,
    dataset=dataset,
    save_name=f'figures/{dataset}_threshold_model.pdf',
)

In [None]:
# print individual-level metrics: apply each rule with the split found for
# its feature alone; the product is 1 only where both rules output 1
split1, split2 = individual_splits
rule1_out = rules[0](d1[test_idx], split1)
rule2_out = rules[1](d2[test_idx], split2)
y_ind_test = rule1_out * rule2_out

# the training positions are filled with zeros so the vector lines up with y
train_placeholder = np.zeros(len(train_idx))
Metrics.print_metrics(
    y,
    np.concatenate([train_placeholder, y_ind_test]),
    dates.values,
    threshold=0.5,
    test_idx=test_idx)

In [None]:
# print interaction-level metrics: apply each rule with the splits found from
# the interaction values; the product is 1 only where both rules output 1
split1, split2 = interaction_splits
rule1_out = rules[0](d1[test_idx], split1)
rule2_out = rules[1](d2[test_idx], split2)
y_int_test = rule1_out * rule2_out

# the training positions are filled with zeros so the vector lines up with y
train_placeholder = np.zeros(len(train_idx))
Metrics.print_metrics(
    y,
    np.concatenate([train_placeholder, y_int_test]),
    dates.values,
    threshold=0.5,
    test_idx=test_idx)