In [None]:
import pandas as pd
import numpy as np
import sys
from math import sqrt
sys.path.append('../..')
from modules import utils
import gpflow
from gpflow import set_trainable
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import warnings
import tensorflow as tf
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Single seed shared by every random number generator that is seeded below.
SEED = 42
tf.random.set_seed(SEED)  # TensorFlow global seed
np.random.seed(SEED)      # NumPy legacy global RNG
# Additional seeding hooks that are left disabled in this notebook:
# random.seed(SEED)
# os.environ['PYTHONHASHSEED']=str(SEED)
# gpflow.config.set_default_seed(SEED)

In [None]:
# Read only the coordinate and identifier columns of the Kampala devices file.
kampala_devices = pd.read_csv(
    '../data/kampala_devices.csv',
    usecols=['lat', 'long', 'id'],
)
kampala_devices.head()

In [None]:
# Number of rows (devices) in the devices table.
kampala_devices.shape[0]

In [None]:
# Load the Kampala measurements file, parsing the timestamp column as datetimes.
kampala_df = pd.read_csv(
    '../data/kampala_data_with_met.csv',
    parse_dates=['timestamp'],
)
kampala_df.head()

In [None]:
# Distinct latitudes, longitudes and device numbers found in the measurements,
# each in order of first appearance.
latitudes, longitudes, device_ids = (
    kampala_df[column].unique()
    for column in ('latitude', 'longitude', 'device_number')
)
# NOTE(review): later cells index latitudes and device_ids with the same
# position, which presumes the three arrays line up one-to-one - confirm.
len(latitudes), len(longitudes), len(device_ids)

In [None]:
# Measurement columns extracted for every device before preprocessing.
cols = ['timestamp', 'latitude', 'longitude', 'pm2_5_calibrated_value', 'temperature', 'humidity', 'wind_speed', 'wind_direction']

# Collect the per-device frames in a list and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies every earlier
# row on each iteration.
processed_frames = []
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for _, device_id in kampala_devices.id.items():
    device_df = utils.get_device_data(kampala_df, device_id, cols)
    processed_df = utils.preprocessing(device_df)
    processed_frames.append(processed_df)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# devices table has no rows. ignore_index=True yields a fresh 0..n-1 index.
if processed_frames:
    final_df = pd.concat(processed_frames, ignore_index=True)
else:
    final_df = pd.DataFrame()
final_df.head()

In [None]:
# Row count of the combined frame and how many distinct coordinates it holds.
(
    len(final_df.index),
    len(final_df['latitude'].unique()),
    len(final_df['longitude'].unique()),
)

In [None]:
def _features_and_target(df):
    """Split a frame into a feature array (all but the last column) and an (n, 1) target array (last column)."""
    features = np.array(df.iloc[:, :-1])
    target = np.array(df.iloc[:, -1]).reshape(-1, 1)
    return features, target


def cross_validation(final_df, idx, kernel_variance, lengthscales, likelihood_variance, trainable_kernel,
                     trainable_variance, trainable_lengthscales, max_train_rows=39999, subsample_step=6):
    """Fit a GP regressor with one latitude held out and score it on the held-out rows.

    Rows of ``final_df`` whose ``latitude`` equals ``latitudes[idx]`` (a
    notebook-level array) form the test fold; all remaining rows are used for
    training. In both folds the last column is the target and every other
    column is a feature.

    Parameters
    ----------
    final_df : pandas.DataFrame
        Preprocessed data; must expose a ``latitude`` column.
    idx : int
        Position in the global ``latitudes`` array of the latitude to hold out.
        NOTE(review): presumably one latitude corresponds to one device - confirm.
    kernel_variance : float or None
        Initial RBF variance; ``None`` keeps the GPflow default.
    lengthscales : sequence, ``'train_shape'`` or None
        Initial RBF lengthscales. ``'train_shape'`` expands to a vector of ones
        with one entry per feature; ``None`` keeps the GPflow default.
    likelihood_variance : float or None
        Initial likelihood variance; ``None`` keeps the GPflow default.
    trainable_kernel, trainable_variance, trainable_lengthscales : bool
        Whether the RBF variance, the likelihood variance and the RBF
        lengthscales are optimised.
    max_train_rows : int or None, optional
        When the training fold has more rows than this, it is thinned by
        keeping every ``subsample_step``-th row. ``None`` disables thinning.
    subsample_step : int, optional
        Stride used when thinning the training fold.

    Returns
    -------
    tuple of float
        ``(rmse, mape)`` of the predictive mean on the held-out rows.

    Raises
    ------
    ValueError
        If either the held-out fold or the training fold is empty.
    """
    # A boolean mask gives the exact complement for the training fold. The
    # previous concat + drop_duplicates(keep=False) approach also discarded
    # any training rows that happened to be exact duplicates of each other.
    is_test = final_df.latitude == latitudes[idx]
    test_df = final_df[is_test]
    train_df = final_df[~is_test]

    # Fail before the expensive optimisation rather than after it.
    if test_df.empty:
        raise ValueError(f'no rows found for held-out latitude at position {idx}')
    if train_df.empty:
        raise ValueError(f'no training rows left after holding out position {idx}')

    X_train, y_train = _features_and_target(train_df)
    X_test, y_test = _features_and_target(test_df)

    # Exact GP regression scales poorly with the number of rows, so very large
    # training folds are thinned with a fixed stride.
    if max_train_rows is not None and X_train.shape[0] > max_train_rows:
        X_train = X_train[::subsample_step, :]
        y_train = y_train[::subsample_step, :]

    # The isinstance guard keeps the comparison safe when lengthscales is a
    # NumPy array (an element-wise == would not yield a single boolean).
    if isinstance(lengthscales, str) and lengthscales == 'train_shape':
        lengthscales = np.ones(X_train.shape[1])

    # Only pass the hyperparameters that were supplied so that GPflow's own
    # defaults apply to the rest.
    rbf_kwargs = {}
    if lengthscales is not None:
        rbf_kwargs['lengthscales'] = lengthscales
    if kernel_variance is not None:
        rbf_kwargs['variance'] = kernel_variance
    k = gpflow.kernels.RBF(**rbf_kwargs) + gpflow.kernels.Bias()

    m = gpflow.models.GPR(data=(X_train, y_train), kernel=k, mean_function=None)
    if likelihood_variance is not None:
        m.likelihood.variance.assign(likelihood_variance)

    # kernels[0] is the RBF component of the RBF + Bias sum built above.
    set_trainable(m.kernel.kernels[0].variance, trainable_kernel)
    set_trainable(m.likelihood.variance, trainable_variance)
    set_trainable(m.kernel.kernels[0].lengthscales, trainable_lengthscales)

    # Maximise the log marginal likelihood by minimising its negative.
    opt = gpflow.optimizers.Scipy()

    def objective_closure():
        return - m.log_marginal_likelihood()

    opt.minimize(objective_closure,
                 m.trainable_variables,
                 options=dict(maxiter=100))

    # Only the predictive mean is needed for the error metrics.
    mean, _ = m.predict_f(X_test)
    predictions = mean.numpy()

    rmse = sqrt(mean_squared_error(y_test, predictions))
    mape = mean_absolute_percentage_error(y_test, predictions)
    return rmse, mape

#### The real work: run the cross-validation experiment

In [None]:
# Hyperparameter setting evaluated by this run. The trailing comments keep the
# candidate lists that were commented out alongside each value.
kernel_variance = 625         # kernel_variances = [625, 400, None]
likelihood_variance = 400     # likelihood_variances = [400, 625, None]
lengthscale = None            # lengthscales = [[0.08, 0.08, 1], None, 'train_shape']
trainable_kernel = True       # trainable_kernels = [True, False]
trainable_variance = True     # trainable_variances = [True, False]
trainable_lengthscale = True  # trainable_lengthscales = [False, True]; only changed this

In [None]:
# Evaluate the single hyperparameter setting chosen above, holding out one
# latitude per fold and collecting the per-fold errors.
rmse_list = []
mape_list = []
for i in range(len(latitudes)):
    # NOTE(review): device_ids[i] is used only as a label and presumes that
    # device_ids lines up with latitudes - confirm. The fallback keeps the
    # logging (including the except branch) from raising if it is shorter.
    fold_label = device_ids[i] if i < len(device_ids) else f'fold {i}'
    try:
        rmse, mape = cross_validation(final_df, i, kernel_variance, lengthscale, likelihood_variance,
                                      trainable_kernel, trainable_variance, trainable_lengthscale)
    except Exception as e:
        print(e)
        print(f'{fold_label} failed')
        # Stop at the first failing fold; the averages below then cover only
        # the folds that completed before it.
        break
    else:
        rmse_list.append(rmse)
        mape_list.append(mape)
        print(f'{fold_label} successful')

mean_rmse = np.mean(rmse_list)
mean_mape = np.mean(mape_list)
results_dict = {'lengthscale': lengthscale, 'likelihood_variance': likelihood_variance,
                'kernel_variance': kernel_variance, 'trainable_kernel': trainable_kernel,
                'trainable_variance': trainable_variance, 'trainable_lengthscale': trainable_lengthscale,
                'avg_rmse': mean_rmse, 'rmse_list': rmse_list, 'avg_mape': mean_mape, 'mape_list': mape_list}
print(results_dict)
# DataFrame.append was removed in pandas 2.0; build the one-row frame directly.
results_df = pd.DataFrame([results_dict])
results_df.to_csv('../results/basic_results_with_met.csv', index=False)

In [None]:
# Display the average MAPE over the folds computed in the previous cell.
mean_mape

###### 