In [2]:
import itertools
import sys
import warnings
from math import sqrt

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gpflow
from gpflow import set_trainable
from sklearn.metrics import mean_squared_error

sys.path.append('../..')
from modules import utils

warnings.filterwarnings('ignore')
%matplotlib inline

In [3]:
# Read the Jinja sensor export; the `timestamp` column is parsed into datetimes on load.
jinja_df = pd.read_csv(
    filepath_or_buffer='../data/jinja_data.csv',
    parse_dates=['timestamp'],
)
jinja_df.head()

Unnamed: 0,site_name,latitude,longitude,city,timestamp,pm2_5_calibrated_value,pm2_5_raw_value,pm10_raw_value,pm10_calibrated_value,site_id,device_number,device_name
0,"Jinja Main Street, Jinja",0.437337,33.211051,Jinja,2021-09-01 00:00:00+00:00,,,,,60d058c8048305120d2d6142,689753,aq_23
1,"Jinja Main Street, Jinja",0.437337,33.211051,Jinja,2021-09-01 01:00:00+00:00,,,,,60d058c8048305120d2d6142,689753,aq_23
2,"Jinja Main Street, Jinja",0.437337,33.211051,Jinja,2021-09-01 02:00:00+00:00,,,,,60d058c8048305120d2d6142,689753,aq_23
3,"Jinja Main Street, Jinja",0.437337,33.211051,Jinja,2021-09-01 03:00:00+00:00,,,,,60d058c8048305120d2d6142,689753,aq_23
4,"Jinja Main Street, Jinja",0.437337,33.211051,Jinja,2021-09-01 04:00:00+00:00,,,,,60d058c8048305120d2d6142,689753,aq_23


In [4]:
# f = lambda time: pd.Timestamp.fromtimestamp(time*3600)
# f2 = np.vectorize(f)

In [5]:
# Distinct coordinate values and device numbers, each in order of first appearance.
# cross_validation() indexes these three arrays with the same position `idx`.
latitudes, longitudes, device_ids = (
    jinja_df[column].unique()
    for column in ('latitude', 'longitude', 'device_number')
)
len(latitudes), len(longitudes), len(device_ids)

(10, 10, 10)

In [6]:
# Build the modelling frame: extract each device's columns, preprocess them, and
# stack the per-device results into one frame with a fresh 0..n-1 index.
# (Judging by the preview below, utils.preprocessing returns the columns
# time / latitude / longitude / pm2_5 -- confirm against modules/utils.)
cols = ['timestamp', 'latitude', 'longitude', 'pm2_5_calibrated_value']

# Collect the pieces first and concatenate once; growing a DataFrame with
# pd.concat inside the loop re-copies every previous row on each iteration.
processed_frames = []
for device_id in device_ids:
    device_df = utils.get_device_data(jinja_df, device_id, cols)
    processed_frames.append(utils.preprocessing(device_df))

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
final_df = pd.concat(processed_frames, ignore_index=True) if processed_frames else pd.DataFrame()
final_df.head()

Unnamed: 0,time,latitude,longitude,pm2_5
0,452909.0,0.437337,33.211051,12.2844
1,452910.0,0.437337,33.211051,11.6507
2,452911.0,0.437337,33.211051,22.398
3,452912.0,0.437337,33.211051,17.4937
4,452913.0,0.437337,33.211051,25.1622


In [7]:
def cross_validation(final_df, idx, kernel_variance, lengthscales, likelihood_variance, trainable_kernel, 
                     trainable_variance, trainable_lengthscales):
    """Fit a GPR on all devices but one and return the RMSE on the held-out device.

    The held-out device is the one at position ``idx`` of the notebook-level arrays
    ``latitudes`` / ``device_ids``. The function also reads the notebook globals
    ``jinja_df`` and ``longitudes`` for its sanity checks.

    Parameters
    ----------
    final_df : pandas.DataFrame
        Preprocessed data; every column except the last is used as a model input
        and the last column is the target.
    idx : int
        Position of the held-out device in ``latitudes`` / ``device_ids``.
    kernel_variance : float or None
        Initial RBF variance; ``None`` keeps the gpflow default.
    lengthscales : sequence, 'train_shape' or None
        Initial RBF lengthscales. ``'train_shape'`` means one lengthscale of 1.0 per
        input column; ``None`` keeps the gpflow default.
    likelihood_variance : float or None
        Initial likelihood (noise) variance; ``None`` keeps the gpflow default.
    trainable_kernel, trainable_variance, trainable_lengthscales : bool
        Whether the RBF variance, the likelihood variance and the RBF lengthscales
        respectively are optimised or held at their initial values.

    Returns
    -------
    float
        Root-mean-squared error of the predictive mean on the held-out device.

    Raises
    ------
    AssertionError
        If the held-out/training split is inconsistent with the raw data.
    """
    # Held-out rows are located by latitude, which assumes that every device has a
    # distinct latitude and that latitudes[idx] belongs to device_ids[idx]; the
    # assertions below fail if that does not hold.
    device_indices = final_df[final_df.latitude == latitudes[idx]].index
    device_df = jinja_df[jinja_df.device_number == device_ids[idx]]
    # The device must contribute exactly its non-null calibrated PM2.5 readings.
    assert len(device_indices) == len(device_df) - device_df.pm2_5_calibrated_value.isna().sum()

    test_df = final_df.loc[device_indices]
    assert len(test_df.longitude.unique()) == 1

    # Drop the held-out rows by index label. The previous concat + drop_duplicates(keep=False)
    # approach also removed any training rows that happened to be exact duplicates of
    # each other, which then tripped the length assertion below.
    train_df = final_df.drop(index=device_indices)
    assert len(train_df.longitude.unique()) == len(longitudes) - 1
    assert len(final_df) == len(test_df) + len(train_df)

    X_train = train_df.iloc[:, :-1].to_numpy()
    y_train = train_df.iloc[:, -1].to_numpy().reshape(-1, 1)
    # Keep every second row of large training sets to limit the cost of exact GP regression.
    if X_train.shape[0] > 9999:
        X_train = X_train[::2]
        y_train = y_train[::2]

    X_test = test_df.iloc[:, :-1].to_numpy()
    y_test = test_df.iloc[:, -1].to_numpy().reshape(-1, 1)

    # isinstance guard: comparing an array of lengthscales with a string is not a plain bool.
    if isinstance(lengthscales, str) and lengthscales == 'train_shape':
        lengthscales = np.ones(X_train.shape[1])

    # Only pass the initial values that were supplied, so gpflow defaults apply otherwise.
    rbf_kwargs = {}
    if lengthscales is not None:
        rbf_kwargs['lengthscales'] = lengthscales
    if kernel_variance is not None:
        rbf_kwargs['variance'] = kernel_variance
    k = gpflow.kernels.RBF(**rbf_kwargs) + gpflow.kernels.Bias()

    m = gpflow.models.GPR(data=(X_train, y_train), kernel=k, mean_function=None)
    if likelihood_variance is not None:
        m.likelihood.variance.assign(likelihood_variance)
    # kernels[0] is the RBF term of the sum kernel built above.
    set_trainable(m.kernel.kernels[0].variance, trainable_kernel)
    set_trainable(m.likelihood.variance, trainable_variance)
    set_trainable(m.kernel.kernels[0].lengthscales, trainable_lengthscales)

    # Optimisation: minimise the negative log marginal likelihood (at most 100 iterations).
    opt = gpflow.optimizers.Scipy()

    def objective_closure():
        return - m.log_marginal_likelihood()

    opt.minimize(objective_closure,
                 m.trainable_variables,
                 options=dict(maxiter=100))

    # Prediction on the held-out device; only the predictive mean is scored.
    mean, var = m.predict_f(X_test)

    rmse = sqrt(mean_squared_error(y_test, mean.numpy()))
    return rmse
    
#     return mean.numpy(), var.numpy(), Xtest, Ytest, round(rmse, 2)

#### Hyperparameter grid search (leave-one-device-out cross-validation)

In [8]:
# Candidate values for the grid search. In cross_validation(), a value of None means
# "do not set this parameter", so the gpflow default is used.

# Initial values.
kernel_variances = [None, 425, 600]
likelihood_variances = [None, 425, 600]
lengthscales = [
    None,
    # NOTE(review): the model inputs are the columns (time, latitude, longitude) in
    # that order, so this gives time 0.008 and longitude 2 -- confirm that is
    # intended rather than (latitude, longitude, time).
    [0.008, 0.008, 2],
    # Sentinel replaced inside cross_validation() by one lengthscale of 1.0 per input column.
    'train_shape',
]

# Whether each parameter is optimised (True) or frozen at its initial value (False).
trainable_lengthscales = [True, False]
trainable_variances = [True, False]
trainable_kernels = [True, False]

#### delete from here

In [9]:
# lengthscale = 'train_shape'
# likelihood_variance = None
# kernel_variance = 425
# trainable_kernel = False
# trainable_variance = False
# trainable_lengthscale = False

In [10]:
# rmse_list = []
# lat = latitudes[0]

In [11]:
# rmse = cross_validation(final_df, 0, kernel_variance, lengthscale, likelihood_variance, trainable_kernel, 
#                         trainable_variance, trainable_lengthscale)

#### end here

In [None]:
# Exhaustive grid search: every combination of the candidate settings is one
# "experiment", scored by the held-out RMSE averaged over the devices.
#
# Results are recorded once per experiment. Previously the recording block sat one
# indentation level outside the innermost loop, so only the last
# `trainable_lengthscale` setting of each pair was saved.
results_records = []
results_df = pd.DataFrame()

# itertools.product iterates in the same order as the equivalent nested loops
# (the right-most list varies fastest).
search_grid = itertools.product(lengthscales, likelihood_variances, kernel_variances,
                                trainable_kernels, trainable_variances, trainable_lengthscales)

for count, (lengthscale, likelihood_variance, kernel_variance,
            trainable_kernel, trainable_variance, trainable_lengthscale) in enumerate(search_grid, start=1):
    print(f'EXPERIMENT {count}')
    rmse_list = []
    for i in range(len(latitudes)):
        try:
            rmse = cross_validation(final_df, i, kernel_variance, lengthscale,
                                    likelihood_variance, trainable_kernel, trainable_variance,
                                    trainable_lengthscale)
        except Exception as e:
            # Best effort: a failing fold is reported and skipped so that one bad
            # device does not abort the whole search.
            print(e)
            print(f'{device_ids[i]} failed')
        else:
            rmse_list.append(rmse)
            print(f'{device_ids[i]} successful')

    # np.mean([]) warns and returns nan; make the "every fold failed" case explicit.
    mean_rmse = np.mean(rmse_list) if rmse_list else np.nan
    results_dict = {'lengthscale': lengthscale, 'likelihood_variance': likelihood_variance,
                    'kernel_variance': kernel_variance, 'trainable_kernel': trainable_kernel,
                    'trainable_variance': trainable_variance,
                    'trainable_lengthscale': trainable_lengthscale, 'avg_rmse': mean_rmse,
                    'rmse_list': rmse_list}
    print(results_dict)

    # DataFrame.append was removed in pandas 2.0; rebuild the frame from the records.
    # The CSV is rewritten after every experiment so partial results survive an interruption.
    results_records.append(results_dict)
    results_df = pd.DataFrame(results_records)
    results_df.to_csv('../results/hyperparameter_results.csv', index=False)

EXPERIMENT 1


2023-05-29 16:46:45.668755: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-05-29 16:46:45.668784: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: PL1207-PRO.paris.inria.fr
2023-05-29 16:46:45.668787: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: PL1207-PRO.paris.inria.fr
2023-05-29 16:46:45.668914: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 525.105.17
2023-05-29 16:46:45.668934: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: NOT_FOUND: could not find kernel module information in driver version file contents: "NVRM version: NVIDIA UNIX Open Kernel Module for x86_64  525.105.17  Release Build  (dvs-builder@U16-T02-30-1)  Tue Mar 28 22:29:15 UTC 2023
GCC version:  gcc v

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
689753 successful
1014698 successful
1014692 successful
1014697 successful
1290038 successful
1373035 successful
1373037 successful
1373038 successful
1373039 successful
1379965 successful
EXPERIMENT 2
689753 successful
1014698 successful
1014692 successful
1014697 successful
1290038 successful
1373035 successful
1373037 successful
