In [None]:
import pandas as pd
import numpy as np
import random
import os
import sys
from math import sqrt
sys.path.append('../..')
from modules import utils
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow_probability as tfp
tfk = tf.keras
tf.keras.backend.set_floatx('float64')
tfd = tfp.distributions
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.ensemble import IsolationForest
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Single seed shared by every random number source used in this notebook.
SEED = 42
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here probably only reaches child processes - confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
# Seed Python's `random`, NumPy's legacy global generator and TensorFlow in turn.
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

In [None]:
def neg_log_likelihood(x, rv_x):
    """Return the negative log-probability of `x` under the distribution `rv_x`.

    Usable as a Keras loss for models whose output is a distribution object:
    Keras calls the loss as ``loss(y_true, y_pred)``, so `x` receives the
    observed targets and `rv_x` the predicted distribution.

    Args:
        x: Observed value(s) to score.
        rv_x: Object exposing a ``log_prob(x)`` method.

    Returns:
        ``-rv_x.log_prob(x)``.
    """
    return -rv_x.log_prob(x)

#### The data

In [None]:
# Read the raw readings, parsing the 'timestamp' column into datetimes on load.
jinja_df = pd.read_csv(
    '../data/jinja_data.csv',
    parse_dates=['timestamp'],
)
jinja_df.head()

In [None]:
# Distinct values of each identifying column, in order of first appearance.
latitudes, longitudes, device_ids = (
    jinja_df[column].unique()
    for column in ('latitude', 'longitude', 'device_number')
)
# Display how many distinct values each column has.
len(latitudes), len(longitudes), len(device_ids)

In [None]:
cols = ['timestamp', 'latitude', 'longitude', 'pm2_5_calibrated_value']

# Preprocess each device on its own and concatenate once at the end;
# calling pd.concat inside the loop would re-copy the accumulated frame
# on every iteration.
device_frames = []
for device_id in device_ids:
    device_df = utils.get_device_data(jinja_df, device_id, cols)
    processed_df = utils.preprocessing(device_df)
    device_frames.append(processed_df)

# ignore_index=True yields a fresh 0..n-1 index, as reset_index(drop=True) did.
# pd.concat raises on an empty list, so fall back to an empty frame.
final_df = (
    pd.concat(device_frames, ignore_index=True)
    if device_frames
    else pd.DataFrame()
)
final_df.head()

#### Model training and validation

In [None]:
def bnn(X_train, y_train, epochs=1000, optimizer='RMSProp', dropout=0.2):
    """Build, train and return a small probabilistic regression network.

    The network takes 3 input features, passes them through one hidden ReLU
    layer of 10 units (with dropout before and after) and outputs a
    1-dimensional `MultivariateNormalTriL` distribution. A KL-divergence
    activity regulariser (weight 1/32) ties the output distribution to a
    standard-normal prior.

    Training uses a batch size of 32 and holds out 20% of the supplied data
    for validation. The best model by validation loss is written to
    '../models/bnn_checkpoint.h5' (overwritten on every call), and training
    stops early after 300 epochs without improvement in validation loss.

    Args:
        X_train: Training features with 3 columns.
        y_train: Training targets aligned with `X_train`.
        epochs: Maximum number of training epochs.
        optimizer: Keras optimizer name or instance passed to `compile`.
        dropout: Dropout rate used by all three dropout layers.

    Returns:
        The fitted Keras model, carrying the weights of the epoch with the
        lowest validation loss.
    """
    # Standard-normal prior over the single output dimension.
    prior = tfd.Independent(tfd.Normal(loc=tf.zeros(1, dtype=tf.float64), scale=1.0),
                            reinterpreted_batch_ndims=1)
    model = tfk.Sequential([
        tfk.layers.InputLayer(input_shape=(3,), name='input'),
        tfk.layers.Dropout(dropout, name='dropout1'),
        tfk.layers.Dense(10, activation='relu', name='dense_1'),
        tfk.layers.Dropout(dropout, name='dropout2'),
        # Emits exactly as many values as the output distribution needs as parameters.
        tfk.layers.Dense(tfp.layers.MultivariateNormalTriL.params_size(1), activation=None,
                         name='distribution_weights'),
        tfk.layers.Dropout(dropout, name='dropout3'),
        tfp.layers.MultivariateNormalTriL(
            1,
            activity_regularizer=tfp.layers.KLDivergenceRegularizer(prior, weight=1/32),
            name='output')], name='model')

    # The model outputs a distribution, so train it on the negative
    # log-likelihood of the targets. A mean-squared-error loss would be
    # computed against a random sample drawn from that distribution.
    model.compile(optimizer=optimizer, loss=neg_log_likelihood)
    checkpoint = ModelCheckpoint('../models/bnn_checkpoint.h5', monitor='val_loss',
                                 save_best_only=True, save_weights_only=False)
    # restore_best_weights: without it the returned model would hold the
    # weights of the final epoch, up to `patience` epochs past the best one.
    early_stopping = EarlyStopping(monitor='val_loss', patience=300,
                                   restore_best_weights=True)
    model.fit(X_train, y_train, batch_size=32, epochs=epochs,
              callbacks=[checkpoint, early_stopping], validation_split=0.2)
    return model

In [None]:
def cross_validation(final_df, idx):
    """Hold out one device's rows, train on the rest and return the test RMSE.

    Relies on the notebook-level `latitudes`, `longitudes`, `device_ids` and
    `jinja_df`. The held-out rows are those whose latitude equals
    `latitudes[idx]`.
    NOTE(review): this assumes `latitudes[idx]` and `device_ids[idx]` refer to
    the same device; the first assertion only checks that the row counts agree.

    Args:
        final_df: Preprocessed frame whose last column is the target and whose
            remaining columns are the model features.
        idx: Position of the held-out device in `latitudes` / `device_ids`.

    Returns:
        Root-mean-squared error of the model's predictive mean on the
        held-out rows.

    Raises:
        AssertionError: If any of the consistency checks on the split fails.
    """
    is_held_out = final_df.latitude == latitudes[idx]
    device_df = jinja_df[jinja_df.device_number == device_ids[idx]]
    # The held-out row count must match the device's raw rows that have a reading.
    assert is_held_out.sum() == len(device_df) - device_df.pm2_5_calibrated_value.isna().sum()

    test_df = final_df[is_held_out]
    assert len(test_df.longitude.unique()) == 1

    # Split with the boolean mask. A concat + drop_duplicates(keep=False) split
    # would also discard training rows that merely share identical values.
    train_df = final_df[~is_held_out]
    assert len(train_df.longitude.unique()) == len(longitudes) - 1
    assert len(final_df) == len(test_df) + len(train_df)

    # Every column except the last is a feature; the last column is the target.
    X_train = train_df.iloc[:, :-1].to_numpy()
    y_train = train_df.iloc[:, -1].to_numpy()
    X_test = test_df.iloc[:, :-1].to_numpy()
    y_test = test_df.iloc[:, -1].to_numpy()

    model = bnn(X_train, y_train)
    # The model outputs a distribution and model.predict() would return one
    # random sample per row. Score the predictive mean so the RMSE is not
    # inflated by sampling noise.
    y_pred = model(X_test).mean().numpy()

    rmse = sqrt(mean_squared_error(y_test, y_pred))
    return rmse

In [None]:
# One fold per entry of `latitudes`: collect the RMSE returned for each fold.
rmse_list = []
for i, _ in enumerate(latitudes):
    rmse = cross_validation(final_df, i)
    rmse_list += [rmse]
    print(f'{device_ids[i]} successful')

In [None]:
# Average the per-fold RMSE values into a single summary figure.
mean_rmse = np.asarray(rmse_list).mean()
mean_rmse

In [None]:
# Display the individual per-fold RMSE values, in the order the folds were run.
rmse_list