In [1]:
import pandas as pd
import numpy as np
import sys
from math import sqrt
sys.path.append('../..')
from modules import utils
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
kampala_devices = pd.read_csv('../data/kampala_devices.csv', usecols=['lat', 'long', 'id'])
kampala_devices.head()

Unnamed: 0,id,lat,long
0,930434,0.360209,32.610756
1,718028,0.3075,32.6206
2,912224,0.34646,32.70328
3,930426,0.3655,32.6468
4,930427,0.2689,32.588


In [3]:
kampala_df = pd.read_csv('../data/kampala_data.csv', parse_dates=['timestamp'])
kampala_df.head()

Unnamed: 0,site_name,latitude,longitude,city,timestamp,pm2_5_calibrated_value,pm2_5_raw_value,pm10_raw_value,pm10_calibrated_value,site_id,device_number,device_name
0,"Civic Centre, Kampala Central",0.317725,32.592509,Kampala,2021-09-01 00:00:00+00:00,,,,,60d058c8048305120d2d6145,689761,aq_26
1,"Civic Centre, Kampala Central",0.317725,32.592509,Kampala,2021-09-01 01:00:00+00:00,,,,,60d058c8048305120d2d6145,689761,aq_26
2,"Civic Centre, Kampala Central",0.317725,32.592509,Kampala,2021-09-01 02:00:00+00:00,,,,,60d058c8048305120d2d6145,689761,aq_26
3,"Civic Centre, Kampala Central",0.317725,32.592509,Kampala,2021-09-01 03:00:00+00:00,,,,,60d058c8048305120d2d6145,689761,aq_26
4,"Civic Centre, Kampala Central",0.317725,32.592509,Kampala,2021-09-01 04:00:00+00:00,,,,,60d058c8048305120d2d6145,689761,aq_26


In [4]:
final_df = pd.DataFrame()
# device_ids = []
cols = ['timestamp', 'latitude', 'longitude', 'pm2_5_calibrated_value', 'device_number']
# for i, device_id in enumerate(device_ids):
for i, device_id in kampala_devices.id.iteritems():
    device_df = utils.get_device_data(kampala_df, device_id, cols)
#     if len(device_df)!=0:
#         device_ids.append(device_id)
    processed_df = utils.preprocessing(device_df)
#     print(f'{device_id}: {len(processed_df)}')
    final_df = pd.concat([final_df, processed_df])
final_df.reset_index(drop=True, inplace=True)
final_df.head()

Unnamed: 0,time,latitude,longitude,pm2_5,device_number
0,453031.0,0.356989,32.613888,10.5477,930434
1,453032.0,0.356989,32.613888,16.425,930434
2,453033.0,0.356989,32.613888,17.7239,930434
3,453034.0,0.356989,32.613888,16.1533,930434
4,453035.0,0.356989,32.613888,18.0123,930434


In [5]:
latitudes = final_df['latitude'].unique()
longitudes = final_df['longitude'].unique()
device_ids = final_df['device_number'].unique()
len(latitudes), len(longitudes), len(device_ids)

(35, 35, 34)

In [6]:
final_df = final_df.drop(['device_number'], axis=1)
final_df.head()

Unnamed: 0,time,latitude,longitude,pm2_5
0,453031.0,0.356989,32.613888,10.5477
1,453032.0,0.356989,32.613888,16.425
2,453033.0,0.356989,32.613888,17.7239
3,453034.0,0.356989,32.613888,16.1533
4,453035.0,0.356989,32.613888,18.0123


In [7]:
def cross_validation(final_df, idx):
    device_indices = final_df[final_df.latitude==latitudes[idx]].index
    device_df = kampala_df[kampala_df.device_number == device_ids[idx]]
#     assert(len(device_indices) == len(device_df)-device_df.pm2_5_calibrated_value.isna().sum())
    
    test_df = final_df.loc[device_indices]
#     assert(len(test_df.longitude.unique()) == 1)
    
    train_df = pd.concat([final_df, test_df]).drop_duplicates(keep=False)
#     assert(len(train_df.longitude.unique()) == len(longitudes)-1)
#     assert len(final_df) == len(test_df) + len(train_df)
    
    X_train = train_df.iloc[:, 0:-1]
    y_train = train_df.iloc[:, -1]
#     X_train, y_train = np.array(X_train), np.array(y_train).reshape(-1, 1)
    
    X_test = test_df.iloc[:, 0:-1]
    y_test = test_df.iloc[:, -1]
#     X_test, y_test = np.array(X_test), np.array(y_test).reshape(-1, 1)
    
    xgb = XGBRegressor(random_state=42)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)
    
    
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    mape = mean_absolute_percentage_error(y_test, y_pred)
    return rmse, mape
    
#     return mean.numpy(), var.numpy(), Xtest, Ytest, round(rmse, 2)

In [8]:
len(latitudes)

35

In [9]:
rmse_list, mape_list = [], []
for i in range(len(latitudes)):
    try:
        rmse, mape = cross_validation(final_df, i)
        rmse_list.append(rmse)
        mape_list.append(mape)
        print(f'Location {i} successful')
    except Exception as e:
        print(e)
        print(f'Location {i} failed')
#         break

mean_rmse = np.mean(rmse_list)
mean_mape = np.mean(mape_list)
mean_rmse, mean_mape

Location 0 successful
Location 1 successful
Location 2 successful
Location 3 successful
Location 4 successful
Location 5 successful
Location 6 successful
Location 7 successful
Location 8 successful
Location 9 successful
Location 10 successful
Location 11 successful
Location 12 successful
Location 13 successful
Location 14 successful
Location 15 successful
Location 16 successful
Location 17 successful
Location 18 successful
Location 19 successful
Location 20 successful
Location 21 successful
Location 22 successful
Location 23 successful
Location 24 successful
Location 25 successful
Location 26 successful
Location 27 successful
Location 28 successful
Location 29 successful
Location 30 successful
Location 31 successful
Location 32 successful
Location 33 successful
index 34 is out of bounds for axis 0 with size 34
Location 34 failed


(18.50559878710033, 0.4805011591530654)

In [10]:
rmse_list

[14.676294273725324,
 12.513742424381853,
 11.551442099257818,
 14.920480061373244,
 18.503969197323578,
 22.80116248080197,
 18.09120956359927,
 9.879221789339367,
 39.04841130283334,
 17.549234822735524,
 21.17256810445026,
 25.576062451143482,
 22.490445403188495,
 34.03747875714645,
 11.584453596128187,
 16.65787583771237,
 22.560011600092547,
 17.28888570191722,
 17.776766670846317,
 10.785037713214383,
 37.34481109695689,
 10.481343724218256,
 14.533559086811717,
 11.943491732740503,
 19.06359063762287,
 14.467953443756771,
 21.125174535537173,
 19.712820154230695,
 19.146708337582645,
 12.918400428053996,
 10.716043703122336,
 18.31399156643644,
 11.471242203604273,
 28.486474259525718]

In [11]:
mape_list

[0.5810813560771748,
 0.6416035467611094,
 0.5410322233592667,
 0.6258470387275604,
 0.2918988686520461,
 0.25620254003925663,
 0.3567466432035326,
 0.25530103605298193,
 0.38079454071148017,
 0.32163983660679846,
 0.2991916601978244,
 1.2171469907278567,
 0.9234864900726542,
 0.32808865414120136,
 0.5683074814926653,
 0.45263158142187987,
 0.298056235253194,
 0.2930426771545587,
 0.7971014747884371,
 0.4456097012911204,
 0.5296725087423992,
 0.44786031033538337,
 0.7302337102689582,
 0.45284076667637985,
 0.7877469440186418,
 0.2932667537710202,
 0.2841441806054401,
 0.31176172839095845,
 0.5613899569959682,
 0.5494120539554178,
 0.24635687364553122,
 0.29609852907353423,
 0.24064199485213428,
 0.7308025231398538]