In [2]:
import itertools
import sys
import warnings
from math import sqrt

warnings.filterwarnings('ignore')

import gpflow
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gpflow import set_trainable
from sklearn.metrics import mean_squared_error

sys.path.append('../..')
from modules import utils

%matplotlib inline

In [3]:
# Device registry: only the identifier and the coordinates are read from the file.
kampala_devices = pd.read_csv(
    '../data/kampala_devices.csv',
    usecols=['lat', 'long', 'id'],
)
kampala_devices.head()

Unnamed: 0,id,lat,long
0,930434,0.360209,32.610756
1,718028,0.3075,32.6206
2,912224,0.34646,32.70328
3,930426,0.3655,32.6468
4,930427,0.2689,32.588


In [4]:
# Raw sensor readings; the 'timestamp' column is parsed into datetimes on load.
kampala_df = pd.read_csv(
    '../data/kampala_data.csv',
    parse_dates=['timestamp'],
)
kampala_df.head()

Unnamed: 0,site_name,latitude,longitude,city,timestamp,pm2_5_calibrated_value,pm2_5_raw_value,pm10_raw_value,pm10_calibrated_value,site_id,device_number,device_name
0,"Civic Centre, Kampala Central",0.317725,32.592509,Kampala,2021-09-01 00:00:00+00:00,,,,,60d058c8048305120d2d6145,689761,aq_26
1,"Civic Centre, Kampala Central",0.317725,32.592509,Kampala,2021-09-01 01:00:00+00:00,,,,,60d058c8048305120d2d6145,689761,aq_26
2,"Civic Centre, Kampala Central",0.317725,32.592509,Kampala,2021-09-01 02:00:00+00:00,,,,,60d058c8048305120d2d6145,689761,aq_26
3,"Civic Centre, Kampala Central",0.317725,32.592509,Kampala,2021-09-01 03:00:00+00:00,,,,,60d058c8048305120d2d6145,689761,aq_26
4,"Civic Centre, Kampala Central",0.317725,32.592509,Kampala,2021-09-01 04:00:00+00:00,,,,,60d058c8048305120d2d6145,689761,aq_26


In [5]:
# Distinct coordinates and device numbers present in the raw readings.
# NOTE(review): each array is de-duplicated independently, so position i in one
# array does not necessarily describe the same device as position i in another
# (the lengths already differ) -- keep that in mind wherever they are indexed together.
latitudes, longitudes, device_ids = (
    kampala_df[column].unique()
    for column in ('latitude', 'longitude', 'device_number')
)
len(latitudes), len(longitudes), len(device_ids)

(52, 52, 57)

In [6]:
cols = ['timestamp', 'latitude', 'longitude', 'pm2_5_calibrated_value']

# Pre-process every registered device separately, then stitch the pieces together
# with a single concat: concatenating inside the loop re-copies all earlier rows on
# each iteration. Iterating the Series directly yields the ids and avoids
# Series.iteritems(), which was removed in pandas 2.0.
processed_frames = [
    utils.preprocessing(utils.get_device_data(kampala_df, device_id, cols))
    for device_id in kampala_devices['id']
]

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
final_df = (
    pd.concat(processed_frames).reset_index(drop=True)
    if processed_frames
    else pd.DataFrame()
)
final_df.head()

Unnamed: 0,time,latitude,longitude,pm2_5
0,453031.0,0.356989,32.613888,10.5477
1,453032.0,0.356989,32.613888,16.425
2,453033.0,0.356989,32.613888,17.7239
3,453034.0,0.356989,32.613888,16.1533
4,453035.0,0.356989,32.613888,18.0123


In [7]:
# Row count, then the number of distinct latitude and longitude values in final_df.
(len(final_df),) + tuple(len(final_df[axis].unique()) for axis in ('latitude', 'longitude'))

(48112, 35, 35)

In [8]:
def cross_validation(final_df, idx, kernel_variance, lengthscales, likelihood_variance, trainable_kernel,
                     trainable_variance, trainable_lengthscales, max_train_rows=39999, subsample_step=6):
    """Hold out one site, fit a GP regression on the remaining rows and return the hold-out RMSE.

    The hold-out set is every row of ``final_df`` whose ``latitude`` equals
    ``latitudes[idx]``, where ``latitudes`` is the notebook-level array of unique
    latitudes. All columns except the last are used as features; the last column
    is the regression target.

    Parameters
    ----------
    final_df : pandas.DataFrame
        Pre-processed readings with a ``latitude`` column and the target in the last column.
    idx : int
        Position in the notebook-level ``latitudes`` array selecting the hold-out site.
    kernel_variance : float or None
        Initial RBF variance; ``None`` keeps the GPflow default.
    lengthscales : sequence, 'train_shape' or None
        Initial RBF lengthscales. ``'train_shape'`` means one unit lengthscale per
        feature column; ``None`` keeps the GPflow default.
    likelihood_variance : float or None
        Initial Gaussian likelihood variance; ``None`` keeps the GPflow default.
    trainable_kernel, trainable_variance, trainable_lengthscales : bool
        Whether the RBF variance, the likelihood variance and the RBF lengthscales
        are optimised.
    max_train_rows : int, optional
        When the training set has more rows than this, it is thinned.
    subsample_step : int, optional
        Thinning stride: every ``subsample_step``-th training row is kept.

    Returns
    -------
    float
        Root-mean-squared error of the predictive mean on the hold-out rows.

    Raises
    ------
    ValueError
        If the hold-out set or the training set is empty.
    """
    # A boolean mask gives an exact complement. The previous concat + drop_duplicates
    # split also discarded any rows that were genuinely duplicated in the training data.
    is_test = final_df.latitude == latitudes[idx]
    test_df = final_df[is_test]
    train_df = final_df[~is_test]

    # Fail before the expensive fit instead of after it with an opaque metrics error.
    if test_df.empty:
        raise ValueError(f'No rows in final_df match latitude {latitudes[idx]} (idx={idx})')
    if train_df.empty:
        raise ValueError(f'No training rows remain after holding out latitude {latitudes[idx]} (idx={idx})')

    X_train = np.array(train_df.iloc[:, 0:-1])
    y_train = np.array(train_df.iloc[:, -1]).reshape(-1, 1)
    if X_train.shape[0] > max_train_rows:
        # Thin large training sets to keep exact GP regression tractable.
        X_train = X_train[::subsample_step, :]
        y_train = y_train[::subsample_step, :]
    print(f'X_train shape:{X_train.shape}')

    X_test = np.array(test_df.iloc[:, 0:-1])
    y_test = np.array(test_df.iloc[:, -1]).reshape(-1, 1)

    # isinstance guard: comparing an array of lengthscales to a string would be element-wise.
    if isinstance(lengthscales, str) and lengthscales == 'train_shape':
        lengthscales = np.ones(X_train.shape[1])

    # Only pass the hyper-parameters that were supplied so GPflow defaults apply otherwise.
    rbf_kwargs = {}
    if lengthscales is not None:
        rbf_kwargs['lengthscales'] = lengthscales
    if kernel_variance is not None:
        rbf_kwargs['variance'] = kernel_variance
    k = gpflow.kernels.RBF(**rbf_kwargs) + gpflow.kernels.Bias()

    print('Training model .....................')
    m = gpflow.models.GPR(data=(X_train, y_train), kernel=k, mean_function=None)
    if likelihood_variance is not None:
        m.likelihood.variance.assign(likelihood_variance)
    # kernels[0] is the RBF term of the RBF + Bias sum built above.
    set_trainable(m.kernel.kernels[0].variance, trainable_kernel)
    set_trainable(m.likelihood.variance, trainable_variance)
    set_trainable(m.kernel.kernels[0].lengthscales, trainable_lengthscales)

    # Optimisation: minimise the negative log marginal likelihood.
    print('Optimizing model ...........................')
    opt = gpflow.optimizers.Scipy()

    def objective_closure():
        return - m.log_marginal_likelihood()

    opt.minimize(objective_closure,
                 m.trainable_variables,
                 options=dict(maxiter=100))

    # Prediction: only the predictive mean is scored.
    mean, _ = m.predict_f(X_test)

    return sqrt(mean_squared_error(y_test, mean.numpy()))

#### The real work: hyper-parameter grid and leave-one-site-out cross-validation

In [11]:
# Hyper-parameter grid for the GP experiments. Each list currently holds a single
# value; the wider candidate sets from the commented-out grid are noted alongside.
lengthscales = [[0.08, 0.08, 1]]    # wider grid: [[0.08, 0.08, 1], None, 'train_shape']
likelihood_variances = [400]        # wider grid: [400, 625, None]
kernel_variances = [625]            # wider grid: [625, 400, None]
trainable_kernels = [True]          # wider grid: [True, False]
trainable_variances = [True]        # wider grid: [True, False]
trainable_lengthscales = [True]     # wider grid: [False, True]; the only setting changed for this run

In [12]:
# Leave-one-site-out evaluation of every hyper-parameter combination in the grid.
# One CSV with the cumulative results is written after each experiment.
# NOTE(review): the counter is incremented before its first use, so numbering (and
# the CSV file names) start at 2 -- presumably to avoid overwriting an earlier
# run's basic_results_1.csv; confirm before changing.
count = 1
results_records = []
results_df = pd.DataFrame()

# itertools.product varies the right-most list fastest, i.e. the same order as
# nesting one for-loop per list.
param_grid = itertools.product(lengthscales, likelihood_variances, kernel_variances,
                               trainable_kernels, trainable_variances, trainable_lengthscales)

for (lengthscale, likelihood_variance, kernel_variance,
     trainable_kernel, trainable_variance, trainable_lengthscale) in param_grid:
    count += 1
    rmse_list = []
    print(f'EXPERIMENT {count}')
    for i in range(len(latitudes)):
        try:
            print(f'{device_ids[i]} starting ...')
            rmse = cross_validation(final_df, i, kernel_variance, lengthscale, likelihood_variance,
                                    trainable_kernel, trainable_variance, trainable_lengthscale)
        except Exception as e:
            # Skip only the failing site: aborting the loop here would drop every
            # remaining site and report an average over an arbitrary prefix.
            print(e)
            print(f'{device_ids[i]} failed')
            continue
        rmse_list.append(rmse)
        print(f'{device_ids[i]} successful')

    # No successful fold means there is no average to report.
    mean_rmse = np.mean(rmse_list) if rmse_list else float('nan')
    results_dict = {'lengthscale': lengthscale, 'likelihood_variance': likelihood_variance,
                    'kernel_variance': kernel_variance, 'trainable_kernel': trainable_kernel,
                    'trainable_variance': trainable_variance,
                    'trainable_lengthscale': trainable_lengthscale, 'avg_rmse': mean_rmse,
                    'rmse_list': rmse_list}
    print(results_dict)

    # DataFrame.append was removed in pandas 2.0; rebuild the frame from the records instead.
    results_records.append(results_dict)
    results_df = pd.DataFrame(results_records)
    results_df.to_csv(f'../results/basic_results_{count}.csv', index=False)

EXPERIMENT 2
689761 starting ...
X_train shape:(7664, 3)
Training model .....................
Optimizing model ...........................
689761 successful
718028 starting ...
X_train shape:(7770, 3)
Training model .....................
Optimizing model ...........................
718028 successful
718029 starting ...
X_train shape:(7926, 3)
Training model .....................
Optimizing model ...........................
718029 successful
737273 starting ...
X_train shape:(7924, 3)
Training model .....................
Optimizing model ...........................
737273 successful
737276 starting ...
X_train shape:(7944, 3)
Training model .....................
Optimizing model ...........................
737276 successful
755612 starting ...
X_train shape:(7745, 3)
Training model .....................
Optimizing model ...........................
755612 successful
755614 starting ...
X_train shape:(7663, 3)
Training model .....................
Optimizing model .........................