In [1]:
import pandas as pd
import numpy as np
import glob

## Prepare the data

In [3]:
# Load every CSV export under data/ and stack them into a single frame.
# glob.glob() returns paths in arbitrary, filesystem-dependent order, so the
# paths are sorted: row positions (and therefore the seeded train/test split
# further down) stay the same from run to run and machine to machine.
csv_paths = sorted(glob.glob('data/*.csv'))
df = pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)
# NOTE(review): the preview below contains data-like column names
# (e.g. '001e06109416', '1530403200'), which suggests at least one CSV has no
# header row -- confirm and pass explicit column names for that file if so.
df.head()

Unnamed: 0,001e06109416,1065.3788811188808,1530403200,52.572027972028,avg_value_hrf,bmp180,metsense,node_id,parameter,pressure,sensor,subsystem,temperature,timestamp
0,,,,,-0.455178,,,001e06113107,concentration,,co,chemsense,,1525990000.0
1,,,,,19.377097,,,001e0610ee82,temperature,,at3,chemsense,,1525990000.0
2,,,,,23.334481,,,001e0611536c,intensity,,tsl250rd,lightsense,,1525990000.0
3,,,,,14.8585,,,001e0610ee43,temperature,,htu21d,metsense,,1525990000.0
4,,,,,17.963645,,,001e0610f6dd,temperature,,htu21d,metsense,,1525990000.0


In [4]:
# Keep only the columns used by the rest of the notebook.
keep_cols = ['timestamp', 'node_id', 'subsystem', 'sensor', 'parameter', 'avg_value_hrf']
df = df.loc[:, keep_cols]
# 'timestamp' is deliberately left as Unix epoch seconds (no pd.to_datetime
# conversion): it is fed to the model below as a numeric feature.
df.head()

Unnamed: 0,timestamp,node_id,subsystem,sensor,parameter,avg_value_hrf
0,1525990000.0,001e06113107,chemsense,co,concentration,-0.455178
1,1525990000.0,001e0610ee82,chemsense,at3,temperature,19.377097
2,1525990000.0,001e0611536c,lightsense,tsl250rd,intensity,23.334481
3,1525990000.0,001e0610ee43,metsense,htu21d,temperature,14.8585
4,1525990000.0,001e0610f6dd,metsense,htu21d,temperature,17.963645


In [5]:
# Node metadata: the first CSV column becomes the index (node_id in the
# preview below); only the coordinate columns are kept.
dfNodes = pd.read_csv('nodes.csv', index_col=0).loc[:, ['lat', 'lon']]
dfNodes.head()

Unnamed: 0_level_0,lat,lon
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1
001e0610ba46,41.878377,-87.627678
001e0610ba3b,41.858136,-87.616055
001e0610ba8f,41.810342,-87.590228
001e0610ba16,41.891964,-87.611603
001e0610ba8b,41.7806,-87.586456


In [6]:
# Attach each reading's coordinates: left-merge the readings' node_id column
# against the index of dfNodes, so every reading row is kept.
dfData = df.merge(
    dfNodes,
    how='left',
    left_on='node_id',
    right_index=True,
)
dfData.head()

Unnamed: 0,timestamp,node_id,subsystem,sensor,parameter,avg_value_hrf,lat,lon
0,1525990000.0,001e06113107,chemsense,co,concentration,-0.455178,41.751142,-87.71299
1,1525990000.0,001e0610ee82,chemsense,at3,temperature,19.377097,41.921405,-87.677766
2,1525990000.0,001e0611536c,lightsense,tsl250rd,intensity,23.334481,41.88575,-87.62969
3,1525990000.0,001e0610ee43,metsense,htu21d,temperature,14.8585,41.788608,-87.598713
4,1525990000.0,001e0610f6dd,metsense,htu21d,temperature,17.963645,41.895355,-87.726064


In [7]:
# Independent variables: (timestamp, lat, lon) of every PM2.5 reading from the
# alphasense / opc_n2 sensor, as a plain 2-D NumPy array.
pm25_rows = (
    (dfData.subsystem == 'alphasense')
    & (dfData.sensor == 'opc_n2')
    & (dfData.parameter == 'pm2_5')
)
X = dfData.loc[pm25_rows, ['timestamp', 'lat', 'lon']].values

In [8]:
# Predicted variable: the averaged reading (avg_value_hrf) for the same
# alphasense / opc_n2 / pm2_5 rows that make up X, kept as a pandas Series.
is_target_row = (
    (dfData.subsystem == 'alphasense')
    & (dfData.sensor == 'opc_n2')
    & (dfData.parameter == 'pm2_5')
)
y = dfData.loc[is_target_row, 'avg_value_hrf']

### Build the predictive model

In [9]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import scipy as sp

In [11]:
# Hold out 33% of the samples as a test set; the fixed random_state makes the
# shuffle repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=45,
)

In [12]:
# KNN: fit a 3-nearest-neighbour regressor on the training split; neighbours
# are weighted by inverse distance rather than uniformly.
# NOTE(review): the features are raw Unix seconds and decimal degrees, so the
# distance metric is likely dominated by the timestamp column -- consider
# scaling the features before fitting.
knn_params = dict(n_neighbors=3, weights='distance')
neigh = KNeighborsRegressor(**knn_params).fit(X_train, y_train)
neigh

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=3, p=2,
          weights='distance')

In [13]:
# Accuracy check: root mean square error of the KNN on the held-out test set,
# plus summary statistics of the true values for scale.
predVal = neigh.predict(X_test)
# Pair predictions ('p') with true values ('y') by position, one row per sample.
dfTest = pd.DataFrame({'p': predVal, 'y': np.asarray(y_test)})
squared_error = (dfTest['p'] - dfTest['y']) ** 2
rmse = squared_error.mean() ** .5
actual = dfTest['y']
print('RMSE = {} μg/m^3'.format(rmse))
print('Max = {}'.format(actual.max()))
print('Min = {}'.format(actual.min()))
print('Avg = {}'.format(actual.mean()))

RMSE = 1.5234797763729575 μg/m^3
Max = 36.963464285714274
Min = 0.0
Avg = 2.214686732628455


In [14]:
# Coefficient of determination (R^2) of the KNN on the held-out test set;
# for scikit-learn regressors, .score(X, y) returns R^2.
print('Coefficient of determination R^2 of the prediction = {}'.format(neigh.score(X_test, y_test)))

Coefficient of determination R^2 of the prediction = 0.8326757742836199


### Save the model to disk for later use

In [15]:
import pickle

In [16]:
# Persist the fitted KNN model. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open('model-knn.pickle', 'wb') as model_file:
    pickle.dump(neigh, model_file)

In [17]:
# Example using the saved model
import time

# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# were produced by this notebook or another trusted source.
with open('model-knn.pickle', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Predict given: [timestamp in unix time, lat, lon]
prediction = loaded_model.predict([[int(time.time()), 41.913830, -87.650910]])
print('Prediction = {} μg/m^3'.format(prediction[0]))

Prediction = 0.5169955721632646 μg/m^3


## Available Parameters

In [18]:
# Every distinct (subsystem, sensor, parameter) combination present in the
# readings; rows with a missing value in any of the three are ignored.
param_cols = ['subsystem', 'sensor', 'parameter']
df.loc[:, param_cols].dropna().drop_duplicates()

Unnamed: 0,subsystem,sensor,parameter
0,chemsense,co,concentration
1,chemsense,at3,temperature
2,lightsense,tsl250rd,intensity
3,metsense,htu21d,temperature
5,chemsense,at1,temperature
6,lightsense,hmc5883l,magnetic_field_x
7,lightsense,tmp421,temperature
9,chemsense,at0,temperature
10,lightsense,ml8511,intensity
13,chemsense,lps25h,temperature


## Building a Grid

OK, that was a fun test, but to show predictions on a map we need to build a grid of coordinates to predict over.

In [29]:
# Bounding box of the sample coordinates (X columns are timestamp, lat, lon).
lat_values, lon_values = X[:, 1], X[:, 2]
min_lat, max_lat = lat_values.min(), lat_values.max()
min_lon, max_lon = lon_values.min(), lon_values.max()
print('Latitude Range: {} to {}'.format(min_lat, max_lat))
print('Longitude Range: {} to {}'.format(min_lon, max_lon))

Latitude Range: 41.722457 to 41.903632
Longitude Range: -87.74526800000001 to -87.57535


In [40]:
# Number of grid points along each axis.
resolution_lon, resolution_lat = 10, 10
# Evenly spaced coordinates spanning the bounding box, both end points included.
grid_lon = np.linspace(min_lon, max_lon, num=resolution_lon)
grid_lat = np.linspace(min_lat, max_lat, num=resolution_lat)

In [39]:
# Display the latitude grid coordinates.
# NOTE(review): the output shown below lists far more than resolution_lat = 10
# values, so it appears to come from an earlier run with a finer grid --
# re-run this cell to refresh it.
grid_lat

array([41.722457  , 41.72428705, 41.7261171 , 41.72794715, 41.7297772 ,
       41.73160725, 41.7334373 , 41.73526735, 41.7370974 , 41.73892745,
       41.74075751, 41.74258756, 41.74441761, 41.74624766, 41.74807771,
       41.74990776, 41.75173781, 41.75356786, 41.75539791, 41.75722796,
       41.75905801, 41.76088806, 41.76271811, 41.76454816, 41.76637821,
       41.76820826, 41.77003831, 41.77186836, 41.77369841, 41.77552846,
       41.77735852, 41.77918857, 41.78101862, 41.78284867, 41.78467872,
       41.78650877, 41.78833882, 41.79016887, 41.79199892, 41.79382897,
       41.79565902, 41.79748907, 41.79931912, 41.80114917, 41.80297922,
       41.80480927, 41.80663932, 41.80846937, 41.81029942, 41.81212947,
       41.81395953, 41.81578958, 41.81761963, 41.81944968, 41.82127973,
       41.82310978, 41.82493983, 41.82676988, 41.82859993, 41.83042998,
       41.83226003, 41.83409008, 41.83592013, 41.83775018, 41.83958023,
       41.84141028, 41.84324033, 41.84507038, 41.84690043, 41.84

## Try building a model using Kriging
    pip install pykrige

In [19]:
import pykrige.kriging_tools as kt
from pykrige.ok import OrdinaryKriging

In [25]:
# Split the feature matrix into its three columns (timestamp, lat, lon);
# transposing turns each column into a row that can be unpacked directly.
timestamps, latitudes, longitudes = X.T

In [35]:
# Ordinary kriging of the PM2.5 readings over space with a linear variogram.
# Longitude is the x coordinate and latitude the y coordinate; the timestamps
# are not passed to the model.
OK = OrdinaryKriging(
    longitudes,
    latitudes,
    y,  # the avg_value_hrf Series built in the "predicted variable" cell
    variogram_model='linear',
    verbose=False,
    enable_plotting=False,
)

In [41]:
# Predict on the rectangular grid spanned by grid_lon (x) and grid_lat (y).
# z holds the kriged values and ss the kriging variances.
z, ss = OK.execute(style='grid', xpoints=grid_lon, ypoints=grid_lat)

In [54]:
# Display the longitude grid coordinates that were passed to the kriging step.
grid_lon

array([-87.745268  , -87.72638822, -87.70750844, -87.68862867,
       -87.66974889, -87.65086911, -87.63198933, -87.61310956,
       -87.59422978, -87.57535   ])

In [50]:
import folium
from folium import plugins
from folium.plugins import HeatMap

In [60]:
# Flatten the kriged surface into [lat, lon, weight] rows for folium's HeatMap.
# For a 'grid' prediction PyKrige returns z with shape
# (len(ypoints), len(xpoints)), i.e. z[lat_index, lon_index] here, so the
# first axis is paired with grid_lat and the second with grid_lon.
# The index names deliberately avoid `x`/`y`: a loop variable named `y` would
# overwrite the target Series `y` that the kriging and training cells use.
concentrations = [
    [grid_lat[lat_idx], grid_lon[lon_idx], z[lat_idx, lon_idx]]
    for lat_idx in range(z.shape[0])
    for lon_idx in range(z.shape[1])
]

In [61]:
# Base map centred on a fixed [lat, lon] point, with the predicted
# concentrations overlaid as a heat-map layer.
map_center = [41.8781, -87.6298]
mapPM25 = folium.Map(location=map_center, zoom_start=11)
heat_layer = HeatMap(concentrations)
heat_layer.add_to(mapPM25)
mapPM25