## Setting Up

### Importing libs

In [82]:
import pandas as pd
import numpy as np
import geopandas as gpd
import shapely
from shapely.geometry import Polygon
from shapely.geometry import Point
import geog
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.model_selection import train_test_split
import folium

### Defining Consts

In [83]:
### Consts
datapath = '../rawdata/sensors/'
metadata_file = datapath + 'nodes.txt'

### Loading datasets

In [84]:
## Loading 311
noiseComplaints = pd.read_pickle('../data/311/311.pkl')
noiseComplaints = gpd.GeoDataFrame(noiseComplaints, crs={'init' : 'epsg:4326'}, geometry='geometry')
noiseComplaints = noiseComplaints['2018-01-01':'2018-12-31']

## Loading taxi regions
taxi_regions = gpd.read_file('zip://../assets/taxi_zones.zip')
taxi_regions = taxi_regions.to_crs({'init':'epsg:3857'})

## Merging datasets

In [85]:
## Using sensors with the best data quality
selectedsensors = [ "sonycnode-b827eb0fedda.sonyc",
                    "sonycnode-b827eb42bd4a.sonyc",
                    "sonycnode-b827eb44506f.sonyc",
                    "sonycnode-b827eb73e772.sonyc",
                    "sonycnode-b827eb84deb5.sonyc",
                    "sonycnode-b827ebb40450.sonyc"]

In [86]:
dataset = {}

f = open(metadata_file)
for line in f:
    
    # reading sensor metadata
    s, lat, lon = line.split(' ')
    lat = float(lat)
    lon = float(lon)
    
    if s in selectedsensors:
        
        print('sensorID: ', s)

        # creating empty timeseries
        df_timeseries = pd.DataFrame()
        df_timeseries['datetime'] = pd.date_range('2018-01-01', '2018-12-31', freq="1h")
        df_timeseries.set_index(['datetime'], inplace = True)

        # reading sensor data
        sensorData = pd.read_pickle(datapath + s + '.pkl')
        sensorData['dbas'] = sensorData['sum'] / sensorData['count']

        # filtering noise complaints
        noiseComplaints_temp = noiseComplaints.to_crs({'init':'epsg:3857'})
        noiseComplaints_temp = spatialJoin(lat, lon, s, noiseComplaints_temp)            
        noiseComplaints_temp = noiseComplaints_temp.resample('H').agg({'Descriptor': 'count'})
        noiseComplaints_temp.rename({'Descriptor':'noise'}, inplace=True)

        ## adding noise and dbas to the dataframe
        df_timeseries['noise'] = noiseComplaints_temp
        df_timeseries['dbas'] = sensorData[['dbas']]

        # adding cos and sin to the dataframe
        df_timeseries['hour'] = df_timeseries.index.hour
        df_timeseries['hour_sin'] = np.sin(df_timeseries['hour'])
        df_timeseries['hour_cos'] = np.cos(df_timeseries['hour'])

        ## adding to the dictionary
        dataset[s] = {}
        df_timeseries = df_timeseries.dropna(subset=['dbas'])
        df_timeseries['noise'].fillna(0, inplace=True)
        dataset[s]['training'] = df_timeseries.dropna(subset=['dbas'])

sensorID:  sonycnode-b827eb0fedda.sonyc
sensorID:  sonycnode-b827eb42bd4a.sonyc
sensorID:  sonycnode-b827eb44506f.sonyc
sensorID:  sonycnode-b827eb73e772.sonyc
sensorID:  sonycnode-b827eb84deb5.sonyc
sensorID:  sonycnode-b827ebb40450.sonyc


## Running the regressor

In [87]:
for sensor in dataset:
    
    errorDF = pd.DataFrame(columns=['actual', 'predicted', 'error', 'std_dev'])
    
    ## defining kernel
    kernel_regressor = DotProduct() + WhiteKernel()

    ## defining regressor
    gp_regressor = GaussianProcessRegressor(kernel=kernel_regressor,random_state=0)

    ## spliting into features and results
    X = dataset[sensor]['training'][['noise']]
    y = dataset[sensor]['training'][['dbas']]
    
    ## splitting into train test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

    ## training
    gp_regressor.fit(X_train, y_train)
    
    ## predicting
    y_pred, y_pred_std = gp_regressor.predict(X_test, return_std=True)
    
    y_test['predicted'] = y_pred
    y_test['std_dev'] = y_pred_std
    
    dataset[sensor]['summary'] = y_test
    dataset[sensor]['regressor'] = gp_regressor

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [88]:
summaryDF_errorMap = pd.DataFrame()

regions = taxi_regions[taxi_regions['borough'] == 'Manhattan']['LocationID'].values

for sensor in dataset:
    
    regressor = dataset[sensor]['regressor']

    for region in regions:

        print(region)

        currentDF = pd.DataFrame()

        currentRegion = taxi_regions[taxi_regions['LocationID'] == region]
        region_geometry = taxi_regions[taxi_regions['LocationID'] == region]['geometry'].values[0] 

        ## getting all noise complaints in the given region
        noiseComplaints_temp = noiseComplaints.to_crs({'init':'epsg:3857'})
        noiseComplaints_temp = gpd.tools.sjoin(noiseComplaints_temp, currentRegion, how='inner', op="within")
        noiseComplaints_temp = noiseComplaints_temp.resample('H').agg({'Descriptor': 'count'})
        noiseComplaints_temp.rename(columns={'Descriptor':'noise'}, inplace=True)

        if(noiseComplaints_temp.shape[0] > 0):

            # adding noise to a single df
            currentDF = noiseComplaints_temp

            # adding cos and sin to the dataframe
            currentDF['hour'] = currentDF.index.hour
            currentDF['hour_sin'] = np.sin(currentDF['hour'])
            currentDF['hour_cos'] = np.cos(currentDF['hour'])

            ## predicting
            X = currentDF[['noise']]
            y_pred, y_std = regressor.predict(X, return_std=True)

            currentDF['std_dev'] = y_std
            currentDF['pred'] = y_pred

            summaryDF_errorMap = summaryDF_errorMap.append(
                {'region':region, 
                 'model':sensor,
                 'std_dev_mean':currentDF['std_dev'].mean(),
                 'std_dev_range':(currentDF['std_dev'].max() - currentDF['std_dev'].min()),
                 'geometry': region_geometry
                }, ignore_index=True)

4
12
13
24
41
42
43
45
48
50
68
74
75
79
87
88
90
100
103
103
103
107
113
114
116
120
125
127
128
137
140
141
142
143
144
148
151
152
153
158
161
162
163
164
166
170
186
194
202
209
211
224
229
230
231
232
233
234
236
237
238
239
243
244
246
249
261
262
263
4
12
13
24
41
42
43
45
48
50
68
74
75
79
87
88
90
100
103
103
103
107
113
114
116
120
125
127
128
137
140
141
142
143
144
148
151
152
153
158
161
162
163
164
166
170
186
194
202
209
211
224
229
230
231
232
233
234
236
237
238
239
243
244
246
249
261
262
263
4
12
13
24
41
42
43
45
48
50
68
74
75
79
87
88
90
100
103
103
103
107
113
114
116
120
125
127
128
137
140
141
142
143
144
148
151
152
153
158
161
162
163
164
166
170
186
194
202
209
211
224
229
230
231
232
233
234
236
237
238
239
243
244
246
249
261
262
263
4
12
13
24
41
42
43
45
48
50
68
74
75
79
87
88
90
100
103
103
103
107
113
114
116
120
125
127
128
137
140
141
142
143
144
148
151
152
153
158
161
162
163
164
166
170
186
194
202
209
211
224
229
230
231
232
233
234
236
237
238


In [201]:
from sklearn.preprocessing import normalize

maxValue = summaryDF_errorMap['std_dev_mean'].max()
minValue = summaryDF_errorMap['std_dev_mean'].min()

test = summaryDF_errorMap[0:63]
testdf = gpd.GeoDataFrame(test, geometry='geometry', crs={'init':'epsg:3857'})
# testdf = testdf.to_crs({'init':'epsg:3857'})

testdf['region'] = testdf['region'].astype(int)
testdf['randomNum'] = testdf.apply(lambda row: randomNum(), axis=1)
testdf = testdf[testdf['std_dev_range'] < 3]

folium_map = folium.Map(location=[40.742, -73.956], zoom_start=12, tiles="cartodbpositron")

folium_map.choropleth(
    geo_data=testdf,
    name='choropleth',
    data=testdf[['region', 'std_dev_mean']],
    columns=['region', 'std_dev_mean'],
    key_on='feature.properties.region',
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.2
)

folium.LayerControl().add_to(folium_map)

display(folium_map)


# summaryDF_errorMap

# test['norm'] = test.apply(lambda row: normalizeArray(row['std_dev_mean'], maxValue, minValue), axis=1)

# summaryDF_errorMap.iloc[0:63]['norm'] = summaryDF_errorMap[0:63].apply(lambda row: normalizeArray(row['std_dev_mean'], maxValue, minValue), axis=1)
# normed_matrix = normalize(summaryDF_errorMap[['std_dev_mean']], axis=1, norm='l1')

# # errorMap_df = gpd.GeoDataFrame(crs={'init': 'epsg:4326'})
# folium_map = folium.Map(location=[40.742, -73.956], zoom_start=12, tiles="cartodbpositron")
# errorMap_df = gpd.GeoDataFrame(summaryDF_errorMap, geometry='geometry', crs={'init': 'epsg:4326'})
# errorMap_df = errorMap_df.to_crs({'init':'epsg:3857'})


# errorMap_df.apply(lambda row: drawPolygons(folium_map, row['geometry'], 3), axis=1)

## Helper Functions

### Processing Functions

In [168]:
import random
def normalizeArray(value, array_max, array_min):
    
    normalized = (value - array_min) / (array_max - array_min)
    return 1

def randomNum():
    
    return random.uniform(0, 1)

### Geospatial Functions

In [5]:
def spatialJoin_polygon(noiseDF, geometry, locationID, sensorID):
    
    sinpoly = gpd.GeoDataFrame(crs={'init': 'epsg:4326'})
    sinpoly = sinpoly.append({'geometry': geometry, 'sensorID':sensorID}, ignore_index=True) 
    sinpoly = sinpoly.to_crs({'init':'epsg:3857'})
    
    
    plotPolygon(testing_map, sinpoly)
    
#     dataframe = gpd.tools.sjoin(noiseDF, sinpoly, how='inner', op="within")
    
#     return dataframe

def spatialJoin(sensorLat, sensorLon, sensorID, geoDataFrame):
    
    d = 500 # meters
    n_points = 20
    angles = np.linspace(0, 360, n_points)
    center = shapely.geometry.Point(sensorLon, sensorLat)
    polygon = Polygon(geog.propagate(center, angles, d))
    
    sinpoly = gpd.GeoDataFrame(crs={'init': 'epsg:4326'})
    sinpoly = sinpoly.append({'geometry': polygon, 'sensorID':sensorID}, ignore_index=True) 
    sinpoly = sinpoly.to_crs({'init':'epsg:3857'})
    
    dataframe = gpd.tools.sjoin(geoDataFrame, sinpoly, how='inner', op="within")
        
    return dataframe

### Visualization Functions

In [78]:
def plotPolygon(foliumMap, polygondf):
    folium.GeoJson(polygondf).add_to(foliumMap)                   
                                     
def drawPolygons(foliumMap, polygon, normValue):
    folium.Polygon(polygon, popup='Ross Island Bridge',fill_color='#132b5e').add_to(foliumMap)
    pass