## Setting Up

### Importing libs

In [58]:
import pandas as pd
import numpy as np
import geopandas as gpd
import shapely
from shapely.geometry import Polygon
from shapely.geometry import Point
import geog
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.model_selection import train_test_split

### Defining Consts

In [23]:
### Consts
datapath = '../rawdata/sensors/'
metadata_file = datapath + 'nodes.txt'

### Loading datasets

In [30]:
## Loading 311
noiseComplaints = pd.read_pickle('../data/311/311.pkl')
noiseComplaints = gpd.GeoDataFrame(noiseComplaints, crs={'init' : 'epsg:4326'}, geometry='geometry')
noiseComplaints = noiseComplaints['2018-01-01':'2018-12-31']

## Loading taxi regions
taxi_regions = gpd.read_file('zip://../assets/taxi_zones.zip')
taxi_regions = taxi_regions.to_crs({'init':'epsg:3857'})

## Merging datasets

In [73]:
dataset = {}

count = 0

f = open(metadata_file)
for line in f:
    
    # reading sensor metadata
    s, lat, lon = line.split(' ')
    lat = float(lat)
    lon = float(lon)
    
    print('sensorID: ', s)
    
    # creating empty timeseries
    df_timeseries = pd.DataFrame()
    df_timeseries['datetime'] = pd.date_range('2018-01-01', '2018-12-31', freq="1h")
    df_timeseries.set_index(['datetime'], inplace = True)
    
    # reading sensor data
    sensorData = pd.read_pickle(datapath + s + '.pkl')
    sensorData['dbas'] = sensorData['sum'] / sensorData['count']
    
    # filtering noise complaints
    noiseComplaints_temp = noiseComplaints.to_crs({'init':'epsg:3857'})
    noiseComplaints_temp = spatialJoin(lat, lon, s, noiseComplaints_temp)            
    noiseComplaints_temp = noiseComplaints_temp.resample('H').agg({'Descriptor': 'count'})
    noiseComplaints_temp.rename({'Descriptor':'noise'}, inplace=True)
    
    ## adding noise and dbas to the dataframe
    df_timeseries['noise'] = noiseComplaints_temp
    df_timeseries['dbas'] = sensorData[['dbas']]
    
    # adding cos and sin to the dataframe
    df_timeseries['hour'] = df_timeseries.index.hour
    df_timeseries['hour_sin'] = np.sin(df_timeseries['hour'])
    df_timeseries['hour_cos'] = np.cos(df_timeseries['hour'])
    
    ## adding to the dictionary
    dataset[s] = {}
    df_timeseries = df_timeseries.dropna(subset=['dbas'])
    df_timeseries['noise'].fillna(0, inplace=True)
    dataset[s]['training'] = df_timeseries.dropna(subset=['dbas'])
    
    count += 1
    if count > 3:
        break

sensorID:  sonycnode-b827eb0d8af7.sonyc
sensorID:  sonycnode-b827eb0fedda.sonyc
sensorID:  sonycnode-b827eb122f0f.sonyc
sensorID:  sonycnode-b827eb132382.sonyc


In [72]:
df = dataset['sonycnode-b827eb132382.sonyc']['training'].dropna(subset=['dbas'])
df[df.isnull().any(axis=1)]

Unnamed: 0_level_0,noise,dbas,hour,hour_sin,hour_cos
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-01 00:00:00,,83.139089,0,0.0,1.0
2018-01-01 01:00:00,,82.451193,1,0.841471,0.540302
2018-01-01 02:00:00,,82.132457,2,0.909297,-0.416147
2018-01-01 03:00:00,,80.757893,3,0.14112,-0.989992
2018-01-01 04:00:00,,80.138646,4,-0.756802,-0.653644
2018-01-01 05:00:00,,76.648088,5,-0.958924,0.283662
2018-01-01 06:00:00,,76.872214,6,-0.279415,0.96017


## Running the regressor

In [76]:
for sensor in dataset:
    
    errorDF = pd.DataFrame(columns=['actual', 'predicted', 'error', 'std_dev'])
    
    ## defining kernel
    kernel_regressor = DotProduct() + WhiteKernel()

    ## defining regressor
    gp_regressor = GaussianProcessRegressor(kernel=kernel_regressor,random_state=0)

    ## spliting into features and results
    X = dataset[sensor]['training'][['noise', 'hour_sin', 'hour_cos']]
    y = dataset[sensor]['training'][['dbas']]
    
    ## splitting into train test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

    ## training
    gp_regressor.fit(X_train, y_train)
    
    ## predicting
    y_pred, y_pred_std = gp_regressor.predict(X_test, return_std=True)
    
    y_test['predicted'] = y_pred
    y_test['std_dev'] = y_pred_std
    
    dataset[sensor]['summary'] = y_test
    
    
    break

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## Helper Functions

### Geospatial Functions

In [14]:
def spatialJoin(sensorLat, sensorLon, sensorID, geoDataFrame):
    
    d = 500 # meters
    n_points = 20
    angles = np.linspace(0, 360, n_points)
    center = shapely.geometry.Point(sensorLon, sensorLat)
    polygon = Polygon(geog.propagate(center, angles, d))
    
    sinpoly = gpd.GeoDataFrame(crs={'init': 'epsg:4326'})
    sinpoly = sinpoly.append({'geometry': polygon, 'sensorID':sensorID}, ignore_index=True) 
    sinpoly = sinpoly.to_crs({'init':'epsg:3857'})
    
    dataframe = gpd.tools.sjoin(geoDataFrame, sinpoly, how='inner', op="within")
        
    return dataframe