In [1]:
import itertools
import pandas as pd
import numpy as np
import sys
from math import sqrt
sys.path.append('../..')
from modules import utils
import gpflow
from gpflow import set_trainable
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the Kampala sensor readings from CSV; the 'timestamp' column is parsed
# into datetimes at read time. The path is relative to the notebook's working
# directory.
kampala_df = pd.read_csv('../data/kampala_data.csv', parse_dates=['timestamp'])
# Preview the first rows (in the saved output their PM columns are all empty).
kampala_df.head()

Unnamed: 0,site_name,latitude,longitude,city,timestamp,pm2_5_calibrated_value,pm2_5_raw_value,pm10_raw_value,pm10_calibrated_value,site_id,device_number,device_name
0,"Civic Centre, Kampala Central",0.317725,32.592509,Kampala,2021-09-01 00:00:00+00:00,,,,,60d058c8048305120d2d6145,689761,aq_26
1,"Civic Centre, Kampala Central",0.317725,32.592509,Kampala,2021-09-01 01:00:00+00:00,,,,,60d058c8048305120d2d6145,689761,aq_26
2,"Civic Centre, Kampala Central",0.317725,32.592509,Kampala,2021-09-01 02:00:00+00:00,,,,,60d058c8048305120d2d6145,689761,aq_26
3,"Civic Centre, Kampala Central",0.317725,32.592509,Kampala,2021-09-01 03:00:00+00:00,,,,,60d058c8048305120d2d6145,689761,aq_26
4,"Civic Centre, Kampala Central",0.317725,32.592509,Kampala,2021-09-01 04:00:00+00:00,,,,,60d058c8048305120d2d6145,689761,aq_26


In [3]:
# Preview the last rows as well: unlike the head, these show populated PM values
# in the saved output, so the frame is not empty throughout.
kampala_df.tail()

Unnamed: 0,site_name,latitude,longitude,city,timestamp,pm2_5_calibrated_value,pm2_5_raw_value,pm10_raw_value,pm10_calibrated_value,site_id,device_number,device_name
87838,"Kireka, Kira Municipality",0.343328,32.646793,Kampala,2021-11-29 19:00:00+00:00,46.1765,59.200595,69.94131,58.653479,60d058c8048305120d2d6149,1290040,aq_g5_4
87839,"Kireka, Kira Municipality",0.343328,32.646793,Kampala,2021-11-29 20:00:00+00:00,51.7306,64.723837,76.067326,64.870471,60d058c8048305120d2d6149,1290040,aq_g5_4
87840,"Kireka, Kira Municipality",0.343328,32.646793,Kampala,2021-11-29 21:00:00+00:00,45.1198,56.668452,69.724881,56.730288,60d058c8048305120d2d6149,1290040,aq_g5_4
87841,"Kireka, Kira Municipality",0.343328,32.646793,Kampala,2021-11-29 22:00:00+00:00,58.8232,67.012262,78.508452,67.668048,60d058c8048305120d2d6149,1290040,aq_g5_4
87842,"Kireka, Kira Municipality",0.343328,32.646793,Kampala,2021-11-29 23:00:00+00:00,64.8027,86.46119,95.148214,88.287475,60d058c8048305120d2d6149,1290040,aq_g5_4


In [4]:
# Distinct coordinate values and device numbers found in the raw readings.
latitudes, longitudes, device_ids = (
    kampala_df[column].unique()
    for column in ('latitude', 'longitude', 'device_number')
)
# Show how many distinct values each column has. In the saved output there are
# fewer distinct coordinates (51) than devices (56), so positions in `latitudes`
# and `device_ids` do not line up one-to-one.
tuple(len(values) for values in (latitudes, longitudes, device_ids))

(51, 51, 56)

In [6]:
# Display the distinct device numbers collected from the 'device_number' column.
# NOTE(review): this cell's execution count (6) is higher than the following
# cell's (5), i.e. the notebook was last run out of order -- re-run top to bottom.
device_ids

array([ 689761,  718028,  718029,  737273,  737276,  755612,  755614,
        782718,  782719,  782720,  782721,  782722,  832251,  832252,
        832253,  832254,  832255,  870142,  870143,  870144,  870145,
        870146,  870147,  912219,  912220,  912221,  912222,  912223,
        912224,  912225,  930426,  930427,  930429,  930431,  930434,
        967600,  967601, 1014687, 1014691, 1014696, 1290037, 1290039,
       1290041, 1290042, 1351535, 1351542, 1351543, 1351546, 1371822,
       1373036, 1373040, 1375490, 1375491, 1375492, 1575534, 1290040],
      dtype=int64)

In [5]:
# Build the modelling table: extract and preprocess each device's readings
# separately (via the project's `utils` helpers), then stack them into a single
# frame with a fresh 0..n-1 index.
cols = ['timestamp', 'latitude', 'longitude', 'pm2_5_calibrated_value']
processed_frames = [
    utils.preprocessing(utils.get_device_data(kampala_df, device_id, cols))
    for device_id in device_ids
]
# Concatenate once rather than growing `final_df` inside the loop, which
# re-copies every previously accumulated row on each iteration.
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# (what the original loop produced when there were no devices).
final_df = (
    pd.concat(processed_frames, ignore_index=True)
    if processed_frames
    else pd.DataFrame()
)
final_df.head()

Unnamed: 0,time,latitude,longitude,pm2_5
0,452936.0,0.317725,32.592509,51.2402
1,452937.0,0.317725,32.592509,25.9089
2,452938.0,0.317725,32.592509,26.4925
3,452939.0,0.317725,32.592509,25.4096
4,452940.0,0.317725,32.592509,23.5744


#### The real work: hyperparameter grid search with cross-validation

In [None]:
# Candidate values for the grid search in the next cell, which tries every
# combination of these six lists (3 * 3 * 3 * 2 * 2 * 2 = 216 experiments).
# NOTE(review): how `None` and the 'train_shape' sentinel are interpreted is
# decided inside `cross_validation`, which is not visible here -- confirm there.
lengthscales = [[0.08, 0.08, 1], None, 'train_shape']
likelihood_variances = [400, 625, None]
kernel_variances = [400, 625, None]
# Boolean switches passed through to `cross_validation`; presumably they control
# whether the corresponding parameter is optimised or held fixed -- TODO confirm.
trainable_kernels = [False, True]
trainable_variances = [False, True]
trainable_lengthscales = [False, True]

In [None]:
# Exhaustive grid search: score every combination of the hyperparameter lists
# with `cross_validation` and checkpoint the cumulative results table to CSV
# after each combination, so a crash part-way through loses at most one
# experiment.
#
# NOTE(review): `cross_validation` is not defined or imported in the visible
# part of this notebook. Because the handler below catches every Exception, a
# missing definition would surface only as a stream of "failed" lines and NaN
# averages -- make sure it is defined before running this cell.
results_df = pd.DataFrame()
result_rows = []

# itertools.product iterates with the first list as the outermost "loop", so
# the experiment numbering matches the original six nested for-loops.
param_grid = itertools.product(
    lengthscales, likelihood_variances, kernel_variances,
    trainable_kernels, trainable_variances, trainable_lengthscales,
)

for count, (lengthscale, likelihood_variance, kernel_variance,
            trainable_kernel, trainable_variance,
            trainable_lengthscale) in enumerate(param_grid, start=1):
    print(f'EXPERIMENT {count}')
    rmse_list = []

    # NOTE(review): the fold index runs over the distinct latitudes, but the
    # log lines label each fold with device_ids[i]. The two arrays have
    # different lengths in the saved output (51 vs 56), so the printed device
    # number may not be the one actually held out -- confirm against
    # `cross_validation`.
    for i in range(len(latitudes)):
        try:
            rmse = cross_validation(final_df, i, kernel_variance, lengthscale,
                                    likelihood_variance, trainable_kernel,
                                    trainable_variance, trainable_lengthscale)
        except Exception as e:
            # Best effort: a failing fold is reported and skipped so the
            # remaining folds and experiments still run.
            print(e)
            print(f'{device_ids[i]} failed')
        else:
            rmse_list.append(rmse)
            print(f'{device_ids[i]} successful')

    # If every fold failed there is nothing to average; record NaN explicitly
    # instead of relying on np.mean([]) and its (suppressed) RuntimeWarning.
    mean_rmse = np.mean(rmse_list) if rmse_list else np.nan
    results_dict = {'lengthscale': lengthscale,
                    'likelihood_variance': likelihood_variance,
                    'kernel_variance': kernel_variance,
                    'trainable_kernel': trainable_kernel,
                    'trainable_variance': trainable_variance,
                    'trainable_lengthscale': trainable_lengthscale,
                    'avg_rmse': mean_rmse,
                    'rmse_list': rmse_list}
    print(results_dict)

    # DataFrame.append was removed in pandas 2.0; collect plain dict rows and
    # rebuild the frame from the list instead.
    result_rows.append(results_dict)
    results_df = pd.DataFrame(result_rows)
    results_df.to_csv(f'../results/basic_results_{count}.csv', index=False)