In [2]:
import pandas as pd
import numpy as np
import sys
from math import sqrt
sys.path.append('../..')
from modules import utils
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
def scale_data(X):
    scaler = MinMaxScaler()
    X_scaled = X.copy()
    X_scaled[:, 0] = scaler.fit_transform(X[:, 0].reshape(-1, 1)).flatten()
    return X_scaled

#### The data

In [4]:
jinja_df = pd.read_csv('../data/jinja_data.csv', parse_dates=['timestamp'])
jinja_df.head()

Unnamed: 0,site_name,latitude,longitude,city,timestamp,pm2_5_calibrated_value,pm2_5_raw_value,pm10_raw_value,pm10_calibrated_value,site_id,device_number,device_name
0,"Jinja Main Street, Jinja",0.437337,33.211051,Jinja,2021-09-01 00:00:00+00:00,,,,,60d058c8048305120d2d6142,689753,aq_23
1,"Jinja Main Street, Jinja",0.437337,33.211051,Jinja,2021-09-01 01:00:00+00:00,,,,,60d058c8048305120d2d6142,689753,aq_23
2,"Jinja Main Street, Jinja",0.437337,33.211051,Jinja,2021-09-01 02:00:00+00:00,,,,,60d058c8048305120d2d6142,689753,aq_23
3,"Jinja Main Street, Jinja",0.437337,33.211051,Jinja,2021-09-01 03:00:00+00:00,,,,,60d058c8048305120d2d6142,689753,aq_23
4,"Jinja Main Street, Jinja",0.437337,33.211051,Jinja,2021-09-01 04:00:00+00:00,,,,,60d058c8048305120d2d6142,689753,aq_23


In [5]:
latitudes = jinja_df['latitude'].unique()
longitudes = jinja_df['longitude'].unique()
device_ids = jinja_df['device_number'].unique()
len(latitudes), len(longitudes), len(device_ids)

(10, 10, 10)

In [6]:
final_df = pd.DataFrame()
cols = ['timestamp', 'latitude', 'longitude', 'pm2_5_calibrated_value']
for i, device_id in enumerate(device_ids):
    device_df = utils.get_device_data(jinja_df, device_id, cols)
    processed_df = utils.preprocessing(device_df)
    final_df = pd.concat([final_df, processed_df])
final_df.reset_index(drop=True, inplace=True)
final_df.head()

Unnamed: 0,time,latitude,longitude,pm2_5
0,452909.0,0.437337,33.211051,12.2844
1,452910.0,0.437337,33.211051,11.6507
2,452911.0,0.437337,33.211051,22.398
3,452912.0,0.437337,33.211051,17.4937
4,452913.0,0.437337,33.211051,25.1622


#### Model training and validation

In [7]:
def svr(X_train, y_train):
    model = SVR().fit(X_train, y_train)
    return model

In [8]:
def cross_validation(final_df, idx):
    device_indices = final_df[final_df.latitude==latitudes[idx]].index
    device_df = jinja_df[jinja_df.device_number == device_ids[idx]]
    assert(len(device_indices) == len(device_df)-device_df.pm2_5_calibrated_value.isna().sum())
    
    test_df = final_df.loc[device_indices]
    assert(len(test_df.longitude.unique()) == 1)
    
    train_df = pd.concat([final_df, test_df]).drop_duplicates(keep=False)
    assert(len(train_df.longitude.unique()) == len(longitudes)-1)
    assert len(final_df) == len(test_df) + len(train_df)
    
    X_train = train_df.iloc[:, 0:-1]
    y_train = train_df.iloc[:, -1]
    X_train, y_train = np.array(X_train), np.array(y_train)#.reshape(-1, 1)
    X_train_scaled = scale_data(X_train)
    
    X_test = test_df.iloc[:, 0:-1]
    y_test = test_df.iloc[:, -1]
    X_test, y_test = np.array(X_test), np.array(y_test)#.reshape(-1, 1)
    X_test_scaled = scale_data(X_test)
    
    model = svr(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    mape = mean_absolute_percentage_error(y_test, y_pred)
    return rmse, mape

#### delete from here

In [9]:
idx = 0
device_indices = final_df[final_df.latitude==latitudes[idx]].index
device_df = jinja_df[jinja_df.device_number == device_ids[idx]]
test_df = final_df.loc[device_indices]
train_df = pd.concat([final_df, test_df]).drop_duplicates(keep=False)
X_train = train_df.iloc[:, 0:-1]
y_train = train_df.iloc[:, -1]
X_train, y_train = np.array(X_train), np.array(y_train)#.reshape(-1, 1)
X_train_scaled = scale_data(X_train)

X_test = test_df.iloc[:, 0:-1]
y_test = test_df.iloc[:, -1]
X_test, y_test = np.array(X_test), np.array(y_test)#.reshape(-1, 1)
X_test_scaled = scale_data(X_test)

In [14]:
%%timeit
model = svr(X_train_scaled, y_train)

2.53 s ± 40.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
model = svr(X_train_scaled, y_train)

In [17]:
%%timeit
y_pred = model.predict(X_test_scaled)

552 ms ± 6.01 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### end here

In [16]:
rmse_list, mape_list = [], []
for i in range(len(latitudes)):
    rmse, mape = cross_validation(final_df, i)
    rmse_list.append(rmse)
    mape_list.append(mape)
    print(f'{device_ids[i]} successful')
rmse_list

689753 successful
1014698 successful
1014692 successful
1014697 successful
1290038 successful
1373035 successful
1373037 successful
1373038 successful
1373039 successful
1379965 successful


[9.234712511732758,
 29.914659912569142,
 16.317699971397147,
 23.427584107774237,
 11.744413772333857,
 27.12214749590781,
 21.22179174029281,
 12.309212405435982,
 28.58676029721792,
 30.150173298506132]

In [17]:
mean_rmse = np.mean(rmse_list)          
mean_rmse

21.00291555131678

In [18]:
mape_list

[0.39409742527707436,
 0.35765608026622286,
 0.43825804924849343,
 0.36119517577825505,
 0.3107259821026086,
 0.40309303801454316,
 0.32640930616121666,
 0.48050101001282647,
 0.40872547450993535,
 0.39103067935436275]

In [19]:
mean_mape = np.mean(mape_list)  
mean_mape

0.3871692220725539