## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import geopy.distance
import matplotlib.pyplot as plt
from shapely.geometry import Point
import shapely

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold

import matplotlib as mpl

from scipy import stats
from sklearn import metrics
import folium

import math

from descartes import PolygonPatch
import altair as alt

from pprint import pprint
from IPython.display import clear_output

# for the notebook only (not for JupyterLab) run this command once per session
alt.renderers.enable('notebook')

import warnings
warnings.filterwarnings('ignore')

  from numpy.core.umath_tests import inner1d


## Data Paths

In [2]:
### Consts
datapath = '../rawdata/sensors/'
filename = datapath + 'nodes.txt'

## taxi regions
taxi_regions = '../assets/taxi_zones.zip'

## Reading taxi regions

In [3]:
taxi_regions_geodf = gpd.read_file('zip://../assets/taxi_zones.zip')
taxi_regions_geodf = taxi_regions_geodf[taxi_regions_geodf['borough'] == 'Manhattan']
taxi_regions_geodf = taxi_regions_geodf.to_crs({'init':'epsg:3857'})

In [4]:
map_osm = folium.Map(location=[40.742, -73.956], zoom_start=12, tiles="cartodbpositron")
folium.GeoJson(taxi_regions_geodf).add_to(map_osm)
display(map_osm)

## Join Sensor Position x Taxi Regions

In [5]:
f = open(filename)

sensors_geodf = gpd.GeoDataFrame(crs={'init': 'epsg:4326'}) 

for line in f:
    s, lat, lon = line.split(' ')
    
    lat = float(lat)
    lon = float(lon)
    
    sensor_point = shapely.geometry.Point(lon, lat)
    sensors_geodf = sensors_geodf.append({'geometry': sensor_point, 'sensorID':s}, ignore_index=True)        
    
sensors_geodf = sensors_geodf.to_crs({'init':'epsg:3857'})

intersection = gpd.tools.sjoin(sensors_geodf,taxi_regions_geodf, how='inner', op="within")

## Loading datasets

In [18]:
## loading taxi
taxi = pd.read_pickle('../data/taxi/taxi.pkl')
taxi = taxi['2018-01-01':'2018-05-01']

## loading 311
noiseComplaints = pd.read_pickle('../data/311/311.pkl')
noiseComplaints = noiseComplaints['2018-01-01':'2018-05-01']

## loading wind speed
windSpeed = pd.read_pickle('../data/weather/wind.pkl')
windSpeed = windSpeed.resample('H').agg({'Spd[Wind]': 'mean'})

## loading precipitation
precipitation = pd.read_pickle('../data/weather/precipitation.pkl')
precipitation = precipitation.resample('H').agg({'Amt[PrecipHourly1]': 'mean'})

In [20]:
noiseComplaints_start, noiseComplaints_end = noiseComplaints.index[0], noiseComplaints.index[-1]
taxi_start, taxi_end = taxi.index[0], taxi.index[-1]
windSpeed_start, windSpeed_end = windSpeed.index[0], windSpeed.index[-1]
precipitation_start, precipitation_end = precipitation.index[0], precipitation.index[-1]

## Calculating the largest intersection
intersection_start = max(noiseComplaints_start, taxi_start, windSpeed_start, precipitation_start)
intersection_end = min(noiseComplaints_end, taxi_end, windSpeed_end, precipitation_end)

print('311 Range: ', noiseComplaints_start, '----', noiseComplaints_end)
print('Taxi Range: ', taxi_start, '----', taxi_end)
print('Wind Speed Range: ', windSpeed_start,'----', windSpeed_end)
print('Rain Precipitation Range: ', precipitation_start,'----', precipitation_end)
print('Largest Intersection: ', intersection_start,'----', intersection_end)

311 Range:  2018-01-01 00:04:05 ---- 2018-05-01 23:59:00
Taxi Range:  2018-01-01 00:00:00 ---- 2018-05-01 23:59:59
Wind Speed Range:  2010-01-01 01:00:00 ---- 2018-04-02 00:00:00
Rain Precipitation Range:  2010-01-01 01:00:00 ---- 2018-04-02 00:00:00
Largest Intersection:  2018-01-01 00:04:05 ---- 2018-04-02 00:00:00


## Calculating Correlations

In [None]:
## training region
training_region = 79

taxi_regions = taxi_regions_geodf['LocationID'].values

correlations_dataframe = pd.DataFrame()

nOfRegions = taxi_regions.shape[0]

for region in taxi_regions:
    
    print(nOfRegions)
    
    ## filtering
    training_region_df = taxi[taxi['location'] == training_region]
    testing_region_df = taxi[taxi['location'] == region]
    
    ## resampling 
    training_region_df = training_region_df.resample('H').agg({'location': 'count'})
    testing_region_df = testing_region_df.resample('H').agg({'location': 'count'})
    

    training_region_df['testing'] = testing_region_df
    correlation = training_region_df.corr()
    correlation = correlation.iloc[0]['testing']
    
    correlations_dataframe = correlations_dataframe.append({
        'training_region': training_region,
        'testing_region':region,
        'correlation':correlation
    }, ignore_index=True)
    
    nOfRegions = nOfRegions - 1

In [None]:
correlations_dataframe.sort_values(by='correlation', ascending=True)

## Training

In [30]:
taxi_region = 79
sensors_in_region = intersection[intersection['LocationID'] == taxi_region]['sensorID']

# Load sensors
selectedsensors = sensors_in_region.values

f = open(filename)
sensors = {}

for line in f:
    s, lat, lon = line.split(' ')
    if s in selectedsensors:
        
        print('Collecting for sensor ', s)
        
        # collection sensor metadata
        sensors[s] = {}
        sensors[s]['lat'] = float(lat)
        sensors[s]['lon'] = float(lon)
        
        # calculating the taxi region to which the sensor belongs
        sensors[s]['taxi_region'] = taxi_region
        
        # loading sensor data
        sensorData = pd.read_pickle(datapath +s+ '.pkl')
        
        # calculating the intersection with the external datasets
        sensorData_start, sensorData_end = sensorData.index[0], sensorData.index[-1]
        dataframe_start, dataframe_end = max(sensorData_start, intersection_start), min(sensorData_end, intersection_end)
        
        # creating empty timeseries
        df_timeseries = pd.DataFrame()
        df_timeseries['datetime'] = pd.date_range(dataframe_start, dataframe_end, freq="1h")
        df_timeseries.set_index(['datetime'], inplace = True)
        
        # calculating the average over one hour of SPL
        sensorData['dbas'] = sensorData['sum'] / sensorData['count']
        
        # adding sensor data to the empty dataframe
        df_timeseries['dbas'] = sensorData['dbas'][dataframe_start:dataframe_end]
        
        # adding wind speed to the dataframe
        df_timeseries['wind'] = windSpeed[dataframe_start:dataframe_end]
        
        # adding rain precipitation to the dataframe
        df_timeseries['precipitation'] = precipitation[dataframe_start:dataframe_end]
        
        print('\t -Adding 311 data...')
        noiseComplaints_temp = noiseComplaints[noiseComplaints.apply(lambda row: pointWithinCircle([row['Latitude'], row['Longitude']], [lat, lon, 200]), axis=1)]
        noiseComplaints_temp = noiseComplaints_temp.resample('H').agg({'Descriptor': 'count'})
        noiseComplaints_temp.rename({'Descriptor':'noise'}, inplace=True)
        df_timeseries['noise'] = noiseComplaints_temp[dataframe_start:dataframe_end]
        
        print('\t -Adding taxi data...')
        # adding taxi data to the empty dataframe
        taxi_temp = taxi[taxi['location'] == sensors[s]['taxi_region']]
        taxi_temp = taxi_temp.resample('H').agg({'location': 'count'})
        taxi_temp.rename({'location':'trips'}, inplace=True)
        df_timeseries['taxi'] = taxi_temp[dataframe_start:dataframe_end]
                
        # filling the missing entries with 0
        df_timeseries.fillna(df_timeseries.mean(), inplace=True)
        df_timeseries['dbas'] = df_timeseries['dbas'].astype(int)
        
        # adding cos and sin to the dataframe
        df_timeseries['hour'] = df_timeseries.index.hour
        df_timeseries['hour_sin'] = np.sin(df_timeseries['hour'])
        df_timeseries['hour_cos'] = np.cos(df_timeseries['hour'])
        
        # attach the complete dataframe to a given sensor
        sensors[s]['dataframe'] = df_timeseries 

Collecting for sensor  sonycnode-b827eb18a94a.sonyc


ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

## Testing

In [31]:
## testing
taxi_region = 79

sensors_in_region = intersection[intersection['LocationID'] == taxi_region]['sensorID']
print(sensors_in_region)
testing_map = folium.Map(location=[40.742, -73.956], zoom_start=12, tiles="cartodbpositron")

folium.GeoJson(taxi_regions_geodf[taxi_regions_geodf['LocationID'] == taxi_region]).add_to(testing_map)
# sensors_points = sensors_geodf[sensors_geodf['sensorID'].isin(sensors_in_region.values)]

# sensors_points = sensors_points.to_crs({'init':'epsg:4326'})

# plotMapPoints(sensors_points.iloc[0]['geometry'],testing_map)
# plotMapPoints(sensors_points.iloc[1]['geometry'],testing_map)
# plotMapPoints(sensors_points.iloc[2]['geometry'],testing_map)

display(testing_map)

5     sonycnode-b827eb18a94a.sonyc
13    sonycnode-b827eb429cd4.sonyc
35    sonycnode-b827eb977bfb.sonyc
Name: sensorID, dtype: object


## Helper Functions

### Geospatial Functions

In [27]:
# nyc_boundaries = gpd.read_file('../data/nyc_boundaries/NYC_Boundaries.geojson')

def pointWithinCircle(point, circle):
    ## Return if a given point is within a circle
    c = (circle[0], circle[1])
    r = circle[2]
    dist = geopy.distance.distance(c, point).meters
    if dist <= r:
        return True

    return False

def getSensorLocation(lat, lng):
    ## TODO: Refactor function return
    ## get neighboorhood ID of a coordinate
    point = Point(lng, lat)
    taxi_regions['intersect'] = taxi_regions.apply(lambda row: row['geometry'].intersects(point), axis=1)
    region =  taxi_regions[taxi_regions['intersect'] == True]['LocationID']
    return region.values[0]

### Visualization Functions

In [12]:
def plotMapPoints(mapCoordGroups, map_osm):
    
#     for group in mapCoordGroups:
#         for coord in group:    

    folium.CircleMarker(
        location=[mapCoordGroups.y, mapCoordGroups.x],
        radius=5,
        fill=True,
        fill_opacity=0.5,
        fill_color="red",
        color="black").add_to(map_osm)
    
def plotCircleRegions(circleRegions, map_osm): 
    
    for circle in circleRegions: 
        
        circle_str = '('+  str(circle[0]) + ' , ' + str(circle[1]) + ')'
        
        folium.Circle(
            radius=5,
            location=[circle[0], circle[1]],
            popup='The Waterfront',
            color='crimson',
            fill=False,
            tooltip=circle_str).add_to(map_osm)

In [None]:
# areas_coords = [
#     (40.748203, -74.002728),
#     (40.744006, -73.990839),
#     (40.739258, -73.978691),
#     (40.730250, -73.981419),
#     (40.730999, -73.997317),
#     (40.736302, -74.003391),
#     (40.727475, -74.005537),
#     (40.722948, -73.995838),
#     (40.719485, -73.984144),
#     (40.718380, -74.006466),
#     (40.710166, -74.008929),
#     (40.714867, -73.993454),
#     (40.758490, -73.996376),
#     (40.754586, -73.985882),
#     (40.750930, -73.976355),
#     (40.760797, -73.967662),
#     (40.762536, -73.977212),
#     (40.765541, -73.986568)]