In [1]:
import pandas as pd
import numpy as np
import random
import sys
import tensorflow as tf
import os
from math import sqrt
sys.path.append('../..')
from modules import utils
import gpflow
from gpflow import set_trainable
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
SEED = 42
# random.seed(SEED)
np.random.seed(SEED)
# os.environ['PYTHONHASHSEED']=str(SEED)
tf.random.set_seed(SEED)
# gpflow.config.set_default_seed(SEED)

In [3]:
jinja_df = pd.read_csv('../data/jinja_data_with_met.csv', parse_dates=['timestamp'])
jinja_df.head()

Unnamed: 0,timestamp,pm2_5_calibrated_value,pm2_5_raw_value,pm10_raw_value,pm10_calibrated_value,site_id,device_number,site_name,latitude,longitude,city,temperature,humidity,wind_speed,precipitation,wind_direction,wind_gusts
0,2021-09-01 00:00:00+00:00,,,,,60d058c8048305120d2d6142,689753,"Jinja Main Street, Jinja",0.432968,33.21001,Jinja,19.2,94.075,3.69,0.221,335.75,4.448333
1,2021-09-01 01:00:00+00:00,,,,,60d058c8048305120d2d6142,689753,"Jinja Main Street, Jinja",0.432968,33.21001,Jinja,18.825,94.625,5.1275,0.119,310.083333,6.5675
2,2021-09-01 02:00:00+00:00,,,,,60d058c8048305120d2d6142,689753,"Jinja Main Street, Jinja",0.432968,33.21001,Jinja,18.141667,92.749999,4.653333,0.153,336.25,5.87
3,2021-09-01 03:00:00+00:00,,,,,60d058c8048305120d2d6142,689753,"Jinja Main Street, Jinja",0.432968,33.21001,Jinja,17.958333,91.141667,4.114167,0.17,333.0,5.175833
4,2021-09-01 04:00:00+00:00,,,,,60d058c8048305120d2d6142,689753,"Jinja Main Street, Jinja",0.432968,33.21001,Jinja,17.975,91.733334,3.645,0.136,328.5,4.425833


In [4]:
jinja_df.tail()

Unnamed: 0,timestamp,pm2_5_calibrated_value,pm2_5_raw_value,pm10_raw_value,pm10_calibrated_value,site_id,device_number,site_name,latitude,longitude,city,temperature,humidity,wind_speed,precipitation,wind_direction,wind_gusts
17067,2021-11-30 19:00:00+00:00,58.2813,74.60381,84.830833,76.151581,615c878d580358002ae96dc2,1379965,"Mpumudde, Jinja",0.46146,33.2103,Jinja,20.983333,80.191668,2.28,0.0,34.083333,2.8
17068,2021-11-30 20:00:00+00:00,33.8288,48.961333,59.852,48.60819,615c878d580358002ae96dc2,1379965,"Mpumudde, Jinja",0.46146,33.2103,Jinja,20.658333,81.833333,2.5825,0.0,65.916667,2.865
17069,2021-11-30 21:00:00+00:00,,,,,615c878d580358002ae96dc2,1379965,"Mpumudde, Jinja",0.46146,33.2103,Jinja,20.358333,83.041665,2.853333,0.0,11.5,3.045833
17070,2021-11-30 22:00:00+00:00,,,,,615c878d580358002ae96dc2,1379965,"Mpumudde, Jinja",0.46146,33.2103,Jinja,20.108333,85.3,3.509167,0.0,297.75,3.71
17071,2021-11-30 23:00:00+00:00,,,,,615c878d580358002ae96dc2,1379965,"Mpumudde, Jinja",0.46146,33.2103,Jinja,19.95,85.85,3.285,0.0,323.666667,3.523333


In [5]:
latitudes = jinja_df['latitude'].unique()
longitudes = jinja_df['longitude'].unique()
device_ids = jinja_df['device_number'].unique()
len(latitudes), len(longitudes), len(device_ids)

(10, 10, 10)

In [6]:
final_df = pd.DataFrame()
cols = ['timestamp', 'latitude', 'longitude', 'pm2_5_calibrated_value', 'temperature', 'humidity']
for i, device_id in enumerate(device_ids):
    device_df = utils.get_device_data(jinja_df, device_id, cols)
    processed_df = utils.preprocessing(device_df)
    final_df = pd.concat([final_df, processed_df])
final_df.reset_index(drop=True, inplace=True)
final_df.head()

Unnamed: 0,time,latitude,longitude,pm2_5,temperature,humidity,wind_speed,wind_direction
0,452909.0,0.432968,33.21001,12.2844,18.233333,92.616666,3.0575,324.75
1,452910.0,0.432968,33.21001,11.6507,18.925,93.533333,2.6075,322.166667
2,452911.0,0.432968,33.21001,22.398,20.083333,89.033333,2.8825,98.083333
3,452912.0,0.432968,33.21001,17.4937,20.916667,80.658334,3.006667,291.333333
4,452913.0,0.432968,33.21001,25.1622,20.95,80.083333,3.180833,93.833333


In [7]:
len(final_df.latitude.unique()), len(final_df.longitude.unique())

(10, 10)

#### delete from here

#### end here

In [8]:
def cross_validation(final_df, idx, kernel_variance, lengthscales, likelihood_variance, trainable_kernel, 
                     trainable_variance, trainable_lengthscales):
    device_indices = final_df[final_df.latitude==latitudes[idx]].index
    device_df = jinja_df[jinja_df.device_number == device_ids[idx]]
#     assert(len(device_indices) == len(device_df)-device_df.pm2_5_calibrated_value.isna().sum())
    
    test_df = final_df.loc[device_indices]
    assert(len(test_df.longitude.unique()) == 1)
    
    train_df = pd.concat([final_df, test_df]).drop_duplicates(keep=False)
    assert(len(train_df.longitude.unique()) == len(longitudes)-1)
    assert len(final_df) == len(test_df) + len(train_df)
    
    X_train = train_df.iloc[:, 0:-1]
    y_train = train_df.iloc[:, -1]
    X_train, y_train = np.array(X_train), np.array(y_train).reshape(-1, 1)
    if X_train.shape[0] > 9999:
        X_train = X_train[::2, :]
        y_train = y_train[::2, :]
    
    X_test = test_df.iloc[:, 0:-1]
    y_test = test_df.iloc[:, -1]
    X_test, y_test = np.array(X_test), np.array(y_test).reshape(-1, 1)
    #to delete
    #X_train, y_train, X_test, y_test = X_train[:100, :], y_train[:100, :], X_test[:100, :], y_test[:100, :]
    
    if lengthscales == 'train_shape':
        lengthscales = np.ones(X_train.shape[1])
    
    if (lengthscales is None) & (kernel_variance is None):
        k = gpflow.kernels.RBF() + gpflow.kernels.Bias()
    elif lengthscales is None:
        k = gpflow.kernels.RBF(variance=kernel_variance) + gpflow.kernels.Bias()
    elif kernel_variance is None:
        k = gpflow.kernels.RBF(lengthscales=lengthscales) + gpflow.kernels.Bias()
    else:
        k = gpflow.kernels.RBF(lengthscales=lengthscales, variance=kernel_variance) + gpflow.kernels.Bias()
        
    m = gpflow.models.GPR(data=(X_train, y_train), kernel=k, mean_function=None)
    if likelihood_variance is None:
        pass
    else:
        m.likelihood.variance.assign(likelihood_variance)
    set_trainable(m.kernel.kernels[0].variance, trainable_kernel)
    set_trainable(m.likelihood.variance, trainable_variance)
    set_trainable(m.kernel.kernels[0].lengthscales, trainable_lengthscales)
    
    #optimization
    opt = gpflow.optimizers.Scipy()
    def objective_closure():
        return - m.log_marginal_likelihood()
    
    opt_logs = opt.minimize(objective_closure,
                            m.trainable_variables,
                            options=dict(maxiter=100))

    #prediction
    mean, var = m.predict_f(X_test)
    
    rmse = sqrt(mean_squared_error(y_test, mean.numpy()))
    mape = mean_absolute_percentage_error(y_test, mean.numpy())
    return rmse, mape
    
#     return mean.numpy(), var.numpy(), Xtest, Ytest, round(rmse, 2)

#### The real work

In [11]:
# lengthscales = [[0.008, 0.008, 2], None, 'train_shape']
# lengthscale = [0.008, 0.008, 2]
lengthscale = None
# likelihood_variances = [400, 625, None]
likelihood_variance = 400
# kernel_variances = [400, 625, None]
kernel_variance = None
# trainable_kernels = [False, True]
trainable_kernel = True 
# trainable_variances = [False, True]
trainable_variance = False
# trainable_lengthscales = [False, True]
trainable_lengthscale = True

In [12]:
results_df = pd.DataFrame()
# count = 12
# results_df = pd.read_csv(f'../results/basic_results_{count}.csv')
# for lengthscale in lengthscales:
#     for likelihood_variance in likelihood_variances:
#         for kernel_variance in kernel_variances:
#             for trainable_kernel in trainable_kernels:
#                 for trainable_variance in trainable_variances:
#                     for trainable_lengthscale in trainable_lengthscales:
# #                         count+=1
rmse_list, mape_list = [], []
# print(f'EXPERIMENT {count}')
for i in range(len(latitudes)):
    try:
        rmse, mape = cross_validation(final_df, i, kernel_variance, lengthscale, likelihood_variance, 
                                trainable_kernel, trainable_variance, trainable_lengthscale)
        rmse_list.append(rmse)
        mape_list.append(mape)
        print(f'{device_ids[i]} successful')
    except Exception as e:
        print(e)
        print(f'{device_ids[i]} failed')
        break

mean_rmse = np.mean(rmse_list)
mean_mape = np.mean(mape_list)
results_dict= {'lengthscale':lengthscale, 'likelihood_variance':likelihood_variance, 
               'kernel_variance':kernel_variance, 'trainable_kernel':trainable_kernel, 
               'trainable_variance':trainable_variance, 'trainable_lengthscale':trainable_lengthscale, 
               'avg_rmse':mean_rmse, 'rmse_list':rmse_list, 'avg_mape':mean_mape, 'mape_list':mape_list}
print(results_dict)
results_df = results_df.append(results_dict, ignore_index=True)
results_df.to_csv(f'../results/basic_results_with_met_and_mape.csv', index=False)               

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
689753 successful
1014692 successful
1014697 successful
1014698 successful
1290038 successful
1373035 successful
1373037 successful
1373038 successful
1373039 successful
1379965 successful
{'lengthscale': None, 'likelihood_variance': 400, 'kernel_variance': None, 'trainable_kernel': True, 'trainable_variance': False, 'trainable_lengthscale': True, 'avg_rmse': 68.51936750073381, 'rmse_list': [68.09292088669062, 57.224945169361526, 65.62359386809874, 70.96320782758518, 87.24487629107983, 67.01814528405953, 48.69883162874867, 87.03173922145044, 68.67100984157597, 64.62440498868776], 'avg_mape': 0.7359511902960795, 'mape_list': [0.653075255096018, 0.5055916442445856, 0.6936290989133543, 0.7789224057746448, 0.8036097863056118, 0.5989953133325926, 0.5070651632923313, 1.2526342435426876, 0.79953

In [13]:
# results_df=pd.DataFrame()
# results_df = results_df.append(results_dict, ignore_index=True)
# results_df.to_csv(f'../results/basic_results_with_mape.csv', index=False)               

In [14]:
rmse_list

[68.09292088669062,
 57.224945169361526,
 65.62359386809874,
 70.96320782758518,
 87.24487629107983,
 67.01814528405953,
 48.69883162874867,
 87.03173922145044,
 68.67100984157597,
 64.62440498868776]

In [15]:
mean_rmse

68.51936750073381

In [16]:
mape_list

[0.653075255096018,
 0.5055916442445856,
 0.6936290989133543,
 0.7789224057746448,
 0.8036097863056118,
 0.5989953133325926,
 0.5070651632923313,
 1.2526342435426876,
 0.7995380044420238,
 0.7664509880169443]

In [17]:
mean_mape

0.7359511902960795