## Setting Up

### Importing libs

In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import shapely
from shapely.geometry import Polygon
from shapely.geometry import Point
import geog
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.model_selection import train_test_split
import folium

### Defining Consts

In [None]:
### Consts
datapath = '../rawdata/sensors/'
metadata_file = datapath + 'nodes.txt'

### Loading datasets

In [None]:
## Loading 311
noiseComplaints = pd.read_pickle('../data/311/311.pkl')
noiseComplaints = gpd.GeoDataFrame(noiseComplaints, crs={'init' : 'epsg:4326'}, geometry='geometry')
noiseComplaints = noiseComplaints['2018-01-01':'2018-12-31']

## Loading taxi regions
taxi_regions = gpd.read_file('zip://../assets/taxi_zones.zip')
taxi_regions = taxi_regions.to_crs({'init':'epsg:3857'})

## Merging datasets

In [6]:
dataset = {}

count = 0

f = open(metadata_file)
for line in f:
    
    # reading sensor metadata
    s, lat, lon = line.split(' ')
    lat = float(lat)
    lon = float(lon)
    
    print('sensorID: ', s)
    
    # creating empty timeseries
    df_timeseries = pd.DataFrame()
    df_timeseries['datetime'] = pd.date_range('2018-01-01', '2018-12-31', freq="1h")
    df_timeseries.set_index(['datetime'], inplace = True)
    
    # reading sensor data
    sensorData = pd.read_pickle(datapath + s + '.pkl')
    sensorData['dbas'] = sensorData['sum'] / sensorData['count']
    
    # filtering noise complaints
    noiseComplaints_temp = noiseComplaints.to_crs({'init':'epsg:3857'})
    noiseComplaints_temp = spatialJoin(lat, lon, s, noiseComplaints_temp)            
    noiseComplaints_temp = noiseComplaints_temp.resample('H').agg({'Descriptor': 'count'})
    noiseComplaints_temp.rename({'Descriptor':'noise'}, inplace=True)
    
    ## adding noise and dbas to the dataframe
    df_timeseries['noise'] = noiseComplaints_temp
    df_timeseries['dbas'] = sensorData[['dbas']]
    
    # adding cos and sin to the dataframe
    df_timeseries['hour'] = df_timeseries.index.hour
    df_timeseries['hour_sin'] = np.sin(df_timeseries['hour'])
    df_timeseries['hour_cos'] = np.cos(df_timeseries['hour'])
    
    ## adding to the dictionary
    dataset[s] = {}
    df_timeseries = df_timeseries.dropna(subset=['dbas'])
    df_timeseries['noise'].fillna(0, inplace=True)
    dataset[s]['training'] = df_timeseries.dropna(subset=['dbas'])
    
    count += 1
    if count > 3:
        break

sensorID:  sonycnode-b827eb0d8af7.sonyc
sensorID:  sonycnode-b827eb0fedda.sonyc
sensorID:  sonycnode-b827eb122f0f.sonyc
sensorID:  sonycnode-b827eb132382.sonyc


## Running the regressor

In [14]:
count = 0

for sensor in dataset:
    
    errorDF = pd.DataFrame(columns=['actual', 'predicted', 'error', 'std_dev'])
    
    ## defining kernel
    kernel_regressor = DotProduct() + WhiteKernel()

    ## defining regressor
    gp_regressor = GaussianProcessRegressor(kernel=kernel_regressor,random_state=0)

    ## spliting into features and results
    X = dataset[sensor]['training'][['noise', 'hour_sin', 'hour_cos']]
    y = dataset[sensor]['training'][['dbas']]
    
    ## splitting into train test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

    ## training
    gp_regressor.fit(X_train, y_train)
    
    ## predicting
    y_pred, y_pred_std = gp_regressor.predict(X_test, return_std=True)
    
    y_test['predicted'] = y_pred
    y_test['std_dev'] = y_pred_std
    
    dataset[sensor]['summary'] = y_test
    dataset[sensor]['regressor'] = gp_regressor
    
    count += 1
    if count == 3:
        break

    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [24]:
dataset['sonycnode-b827eb0fedda.sonyc']['summary']

Unnamed: 0_level_0,dbas,predicted,std_dev
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-08-10 09:00:00,60.615308,60.609428,3.266772
2018-08-10 10:00:00,61.988773,60.704265,3.266778
2018-08-10 11:00:00,63.308503,60.933654,3.266863
2018-08-10 12:00:00,62.333893,61.086696,3.266862
2018-08-10 13:00:00,61.918141,61.022685,3.266777
2018-08-10 14:00:00,61.105473,60.800472,3.266770
2018-08-10 15:00:00,62.536082,60.624359,3.266778
2018-08-10 16:00:00,61.154948,60.656263,3.266766
2018-08-10 17:00:00,62.902482,60.866852,3.266839
2018-08-10 18:00:00,61.795634,61.062511,3.266878


In [59]:
summaryDF_errorMap = pd.DataFrame()

regions = taxi_regions[taxi_regions['borough'] == 'Manhattan']['LocationID'].values
regressor = dataset['sonycnode-b827eb0d8af7.sonyc']['regressor']

for region in regions:
    
    currentDF = pd.DataFrame()
    
    currentRegion = taxi_regions[taxi_regions['LocationID'] == region]
#     foliumMap = folium.Map(location=[40.742, -73.956], zoom_start=12, tiles="cartodbpositron")
#     folium.GeoJson(currentRegion).add_to(foliumMap)
#     display(foliumMap)
    
    
    ## getting all noise complaints in the given region
    noiseComplaints_temp = noiseComplaints.to_crs({'init':'epsg:3857'})
    noiseComplaints_temp = gpd.tools.sjoin(noiseComplaints_temp, currentRegion, how='inner', op="within")
    noiseComplaints_temp = noiseComplaints_temp.resample('H').agg({'Descriptor': 'count'})
    noiseComplaints_temp = noiseComplaints_temp.rename({'Descriptor':'noise'}, inplace=True)
    
    ## adding noise to a single df
#     currentDF = noiseComplaints_temp
    
    # adding cos and sin to the dataframe
#     currentDF['hour'] = currentDF.index.hour
#     currentDF['hour_sin'] = np.sin(currentDF['hour'])
#     currentDF['hour_cos'] = np.cos(currentDF['hour'])
    
    ## predicting
#     y_pred, y_std = regressor.predict(currentDF[['']])
    
    
    

        
        
    print(noiseComplaints_temp.shape)
    print(noiseComplaints_temp.head())
    break
    
    

AttributeError: 'NoneType' object has no attribute 'shape'

## Helper Functions

### Geospatial Functions

In [45]:
def spatialJoin_polygon(noiseDF, geometry, locationID, sensorID):
    
    sinpoly = gpd.GeoDataFrame(crs={'init': 'epsg:4326'})
    sinpoly = sinpoly.append({'geometry': geometry, 'sensorID':sensorID}, ignore_index=True) 
    sinpoly = sinpoly.to_crs({'init':'epsg:3857'})
    
    
    plotPolygon(testing_map, sinpoly)
    
#     dataframe = gpd.tools.sjoin(noiseDF, sinpoly, how='inner', op="within")
    
#     return dataframe

def spatialJoin(sensorLat, sensorLon, sensorID, geoDataFrame):
    
    d = 500 # meters
    n_points = 20
    angles = np.linspace(0, 360, n_points)
    center = shapely.geometry.Point(sensorLon, sensorLat)
    polygon = Polygon(geog.propagate(center, angles, d))
    
    sinpoly = gpd.GeoDataFrame(crs={'init': 'epsg:4326'})
    sinpoly = sinpoly.append({'geometry': polygon, 'sensorID':sensorID}, ignore_index=True) 
    sinpoly = sinpoly.to_crs({'init':'epsg:3857'})
    
    dataframe = gpd.tools.sjoin(geoDataFrame, sinpoly, how='inner', op="within")
        
    return dataframe

### Visualization Functions

In [40]:
def plotPolygon(foliumMap, polygondf):
    folium.GeoJson(polygondf).add_to(foliumMap)
    display(foliumMap)