### NOTE

As of August 16th, 2021, we found that when querying the HERE Maps data, some smaller radii return a higher POI count than larger radii, especially for the transportation and office categories. We do not know why this happens. It was decided to use a helper function later to make the POI counts consistent across radii.

In [None]:
!pip install geopandas
!pip install pyprobar
!pip install catboost
!pip install xmltodict
!pip install holidays

In [None]:
import pandas as pd
import boto3
import io
import importlib
from shapely.wkt import loads
import here_maps_helper
import evaluation_helper

importlib.reload(here_maps_helper)
importlib.reload(evaluation_helper)

from here_maps_helper import *
from evaluation_helper import compare_feature_combinations

In [None]:
# Allow up to 500 columns when a wide DataFrame is displayed.
pd.options.display.max_columns = 500

# Load open street map groundtruth data

In [None]:
# Fetch the ground-truth CSV object from S3 and parse it into a DataFrame.
client = boto3.client('s3')
s3_response = client.get_object(
    Bucket='bucket-vwfs-pred-park-global-model-serving-dev',
    Key="input/processed/frontend/different_radius_6_radius_seattle_groundtruth_labels_with_openstreetmap_features.csv",
)
# Read the whole object body, decode it as UTF-8 and use the first CSV column as the index.
raw_csv_text = s3_response['Body'].read().decode('utf-8')
event_data = pd.read_csv(io.StringIO(raw_csv_text), index_col=0)

# Include Here geolocation information

In [None]:
# Parse the WKT text in the geometry column into shapely geometry objects.
event_data['geometry'] = event_data['geometry'].map(loads)

In [None]:
# Display the first row as a quick check of the loaded columns.
event_data.head(1)

In [None]:
# Attention! Running the two cells below produces approx. 5000 API calls against the
# HERE discovery API, where we have a monthly limit of 250k requests.
buildings = [
    'restaurant',
    'shopping',
    'office',
    'supermarket',
    'transportation',
    'schools',
]

### Query API to get data for different radius

In [None]:
# Query the HERE data once per radius and save every result to a local CSV.
# Each call generates about 5000 API requests, against a monthly budget of 250,000.
def _query_neighbourhood_and_save(radius):
    """Add the HERE neighbourhood info for one radius to event_data and write it to CSV."""
    features = add_neighbourhood_info_here(event_data, radius, buildings, 'seattle')
    features.to_csv(f"new_seattle_static_map_features_{radius}.csv")
    return features


data_with_neighbourhood_25 = _query_neighbourhood_and_save(25)
data_with_neighbourhood_50 = _query_neighbourhood_and_save(50)
data_with_neighbourhood_100 = _query_neighbourhood_and_save(100)
data_with_neighbourhood_150 = _query_neighbourhood_and_save(150)
data_with_neighbourhood_250 = _query_neighbourhood_and_save(250)
data_with_neighbourhood_500 = _query_neighbourhood_and_save(500)

### Upload files of different radius to S3

In [None]:
# Upload the per-radius feature files to S3.
files = ["new_seattle_static_map_features_25.csv", "new_seattle_static_map_features_50.csv",
         "new_seattle_static_map_features_100.csv", "new_seattle_static_map_features_150.csv",
         "new_seattle_static_map_features_250.csv", "new_seattle_static_map_features_500.csv"]

for file in files:
    # Build the object key once so the log line always names the real upload target
    # (the earlier message printed a key that differed from the one actually used).
    s3_key = f'Here_evaluation/seattle/different_radius_{file}'
    print(f'uploading {file} to object {s3_key} in s3')
    client.upload_file(file, 'bucket-vwfs-pred-park-global-model-serving-dev', s3_key)
    print(f'{file} upload finished')

### Read csv from S3 and add different radius feature together

In [None]:
def _load_neighbourhood_features(radius):
    """Read one per-radius feature CSV from S3, indexed by street id and observation start."""
    frame = pd.read_csv(
        f's3://bucket-vwfs-pred-park-global-model-serving-dev/Here_evaluation/seattle/different_radius_new_seattle_static_map_features_{radius}.csv',
        index_col=0,
    )
    return frame.set_index(['street_id', 'observation_interval_start'])


data_with_neighbourhood_25 = _load_neighbourhood_features(25)
data_with_neighbourhood_50 = _load_neighbourhood_features(50)
data_with_neighbourhood_100 = _load_neighbourhood_features(100)
data_with_neighbourhood_150 = _load_neighbourhood_features(150)
data_with_neighbourhood_250 = _load_neighbourhood_features(250)
data_with_neighbourhood_500 = _load_neighbourhood_features(500)

In [None]:
# Index the base data by (street_id, observation_interval_start), the same index the per-radius frames use.
event_data = event_data.set_index(keys=['street_id', 'observation_interval_start'])

In [None]:
# Gather the base frame and every per-radius frame under one mapping, keyed by radius.
ls_neighbourhood_df = {
    'base': event_data,
    '25': data_with_neighbourhood_25,
    '50': data_with_neighbourhood_50,
    '100': data_with_neighbourhood_100,
    '150': data_with_neighbourhood_150,
    '250': data_with_neighbourhood_250,
    '500': data_with_neighbourhood_500,
}

# The base frame is kept whole; each radius frame is reduced to the columns listed in `buildings`.
dict_neighbourhood_df = {
    key: (df if key == 'base' else df[buildings])
    for key, df in ls_neighbourhood_df.items()
}

In [None]:
# Suffix every HERE column with "_here_<radius>"; the base frame passes through unchanged.
# The "_here" marker is required because some names (transportation, schools) overlap
# with column names that already exist in the data.
dict_neighbourhood_df_renamed = {}
for radius_key, frame in dict_neighbourhood_df.items():
    if radius_key != 'base':
        frame = frame.rename(
            columns={
                name: f'{name}_here_{radius_key}'
                for name in ('restaurant', 'shopping', 'office',
                             'supermarket', 'transportation', 'schools')
            }
        )
    dict_neighbourhood_df_renamed[radius_key] = frame

In [None]:
# Place the base frame and all renamed per-radius frames side by side (column-wise concat on the index).
different_radius_new_6_radius_data_with_neighbourhood = pd.concat(
    list(dict_neighbourhood_df_renamed.values()),
    axis=1,
)

### Save and upload final file to S3

In [None]:
# Write the combined feature table to a local CSV file.
different_radius_new_6_radius_data_with_neighbourhood.to_csv(
    path_or_buf='different_radius_new_6_radius_seattle_static_map_features.csv'
)

In [None]:
# Upload the combined feature CSV to S3.
client.upload_file(
    Filename='different_radius_new_6_radius_seattle_static_map_features.csv',
    Bucket='bucket-vwfs-pred-park-global-model-serving-dev',
    Key="Here_evaluation/seattle/different_radius_new_6_radius_seattle_static_map_features.csv",
)

# Compare here maps features against open street map features

In [None]:
# Reload the combined feature table from S3; no index column is set, so the former index levels come back as columns.
data_with_neighbourhood = pd.read_csv(
    filepath_or_buffer='s3://bucket-vwfs-pred-park-global-model-serving-dev/Here_evaluation/seattle/different_radius_new_6_radius_seattle_static_map_features.csv'
)

In [None]:
# The CSV round trip stores geometries as text; parse the WKT back into shapely objects.
data_with_neighbourhood['geometry'] = data_with_neighbourhood['geometry'].map(loads)

### Evaluate and feature importance

In [None]:
# Radii used in the feature names; the iteration order fixes the column order of the lists below.
_radii = (25, 50, 100, 150, 250, 500)

# OpenStreetMap features: `length` and `highway`, then five categories per radius.
os_feature_names = ['length', 'highway'] + [
    f'{category}_{radius}'
    for radius in _radii
    for category in ('commercial', 'residential', 'transportation', 'schools', 'eventsites')
]

# HERE features: six categories per radius, each marked with "_here_".
here_feature_names = [
    f'{category}_here_{radius}'
    for radius in _radii
    for category in ('restaurant', 'shopping', 'office', 'supermarket', 'transportation', 'schools')
]

time_feat = ['hour', 'weekday']
cat_features = ['highway', 'hour', 'weekday']

# Feature sets to compare; 'cat_features' names the categorical columns.
feature_dict = {
    'os_feat': os_feature_names,
    'here_feat': here_feature_names,
    'here_osm': here_feature_names + os_feature_names,
    'map and time_feat': here_feature_names + os_feature_names + time_feat,
    'cat_features': cat_features,
}

In [None]:
# Categorical columns become strings; every remaining OpenStreetMap feature is cast to float.
data_with_neighbourhood[cat_features] = data_with_neighbourhood[cat_features].astype(str)
numeric_os_features = [feat for feat in os_feature_names if feat not in cat_features]
data_with_neighbourhood[numeric_os_features] = data_with_neighbourhood[numeric_os_features].astype(float)

In [None]:
# Compare the OSM / HERE feature sets from feature_dict with 'availability' as the target column name.
# NOTE(review): the meaning of the positional 50 is defined in evaluation_helper — confirm there.
compare_feature_combinations(data_with_neighbourhood, 50 ,feature_dict, 'availability', val_size=0.05, test_size=0.2, disjunct_locations=False, perform_t_test=True)

# Include static parking information from HERE on-street parking API

In [None]:
# Add static parking information using the helper from here_maps_helper.
# NOTE(review): presumably this issues HERE on-street parking API requests — confirm the request budget before re-running.
data_with_capa = add_static_parking_info_here(data_with_neighbourhood)

### Evaluate and feature importance

In [None]:
# Radii used in the feature names; the iteration order fixes the column order of the list below.
_radii = (25, 50, 100, 150, 250, 500)

# All map features: `highway` and `length`, the per-radius OpenStreetMap columns,
# then the per-radius HERE columns.
map_feature_names = (
    ['highway', 'length']
    + [
        f'{category}_{radius}'
        for radius in _radii
        for category in ('commercial', 'residential', 'transportation', 'schools', 'eventsites')
    ]
    + [
        f'{category}_here_{radius}'
        for radius in _radii
        for category in ('restaurant', 'shopping', 'office', 'supermarket', 'transportation', 'schools')
    ]
)
time_feat = ['hour', 'weekday']
parking_feat = ['current_capacity']
cat_features = ['highway', 'hour', 'weekday']

# Feature sets to compare; 'cat_features' names the categorical columns.
feature_dict = {
    'map_feat': map_feature_names,
    'map and parking_feat': parking_feat + map_feature_names,
    'map and time_feat': time_feat + map_feature_names,
    'map_time_parking_feat': parking_feat + map_feature_names + time_feat,
    'cat_features': cat_features,
}

In [None]:
# Compare the map / time / on-street parking feature sets with 'availability' as the target column name.
# NOTE(review): the meaning of the positional 50 is defined in evaluation_helper — confirm there.
compare_feature_combinations(data_with_capa, 50, feature_dict, 'availability', val_size=0.05, test_size=0.2, disjunct_locations=False, perform_t_test=True)

# Include nearby off-street parking facilities

### Query API to get data for different radius

In [None]:
def _off_street_for_radius(radius):
    """Add the off-street parking info for one radius to data_with_capa."""
    return add_off_street_parking_here(data_with_capa, radius=radius)


# One query per radius, kept as separate statements so earlier results survive a later failure.
data_with_offstreet_25 = _off_street_for_radius(25)
data_with_offstreet_50 = _off_street_for_radius(50)
data_with_offstreet_100 = _off_street_for_radius(100)
data_with_offstreet_150 = _off_street_for_radius(150)
data_with_offstreet_250 = _off_street_for_radius(250)
data_with_offstreet_500 = _off_street_for_radius(500)

In [None]:
# Write each per-radius off-street frame to its own local CSV.
for radius, frame in (
    (25, data_with_offstreet_25),
    (50, data_with_offstreet_50),
    (100, data_with_offstreet_100),
    (150, data_with_offstreet_150),
    (250, data_with_offstreet_250),
    (500, data_with_offstreet_500),
):
    frame.to_csv(f'data_with_offstreet_{radius}.csv')

### Upload files to S3

In [None]:
# Upload the per-radius off-street parking files to S3.
files = ["data_with_offstreet_25.csv", "data_with_offstreet_50.csv",
         "data_with_offstreet_100.csv", "data_with_offstreet_150.csv",
         "data_with_offstreet_250.csv", "data_with_offstreet_500.csv"]

for file in files:
    # Build the object key once so the log line always names the real upload target
    # (the earlier message printed a lowercase prefix that differed from the key actually used).
    s3_key = f'Here_evaluation/seattle/different_radius_{file}'
    print(f'uploading {file} to object {s3_key} in s3')
    client.upload_file(file, 'bucket-vwfs-pred-park-global-model-serving-dev', s3_key)
    print(f'{file} upload finished')

### Read csv from S3 and add different radius feature together

In [None]:
# Read the per-radius off-street files back from S3.
def _load_off_street_features(radius):
    """Read one per-radius off-street CSV from S3, indexed by street id and observation start."""
    frame = pd.read_csv(
        f's3://bucket-vwfs-pred-park-global-model-serving-dev/Here_evaluation/seattle/different_radius_data_with_offstreet_{radius}.csv',
        index_col=0,
    )
    return frame.set_index(['street_id', 'observation_interval_start'])


data_with_offstreet_25 = _load_off_street_features(25)
data_with_offstreet_50 = _load_off_street_features(50)
data_with_offstreet_100 = _load_off_street_features(100)
data_with_offstreet_150 = _load_off_street_features(150)
data_with_offstreet_250 = _load_off_street_features(250)
data_with_offstreet_500 = _load_off_street_features(500)

In [None]:
# Columns taken from each per-radius off-street frame.
with_offstreet_cols = [
    'num_off_street_parking',
    'off_street_capa',
]

In [None]:
# Index the base frame by (street_id, observation_interval_start), the same index the per-radius off-street frames use.
data_with_capa = data_with_capa.set_index(keys=['street_id', 'observation_interval_start'])

In [None]:
# Gather the base frame and every per-radius off-street frame under one mapping, keyed by radius.
ls_with_offstreet_df = {
    'base': data_with_capa,
    '25': data_with_offstreet_25,
    '50': data_with_offstreet_50,
    '100': data_with_offstreet_100,
    '150': data_with_offstreet_150,
    '250': data_with_offstreet_250,
    '500': data_with_offstreet_500,
}

# The base frame is kept whole; each radius frame is reduced to the off-street columns.
dict_with_offstreet_df = {
    key: (df if key == 'base' else df[with_offstreet_cols])
    for key, df in ls_with_offstreet_df.items()
}

In [None]:
# Suffix both off-street columns with the radius; the base frame passes through unchanged.
dict_with_offstreet_df_renamed = {}
for radius_key, frame in dict_with_offstreet_df.items():
    if radius_key != 'base':
        frame = frame.rename(
            columns={
                name: f'{name}_{radius_key}'
                for name in ('num_off_street_parking', 'off_street_capa')
            }
        )
    dict_with_offstreet_df_renamed[radius_key] = frame

In [None]:
# Place the base frame and all renamed off-street frames side by side (column-wise concat on the index).
different_radius_data_with_offstreet = pd.concat(
    list(dict_with_offstreet_df_renamed.values()),
    axis=1,
)

### Save and upload final file to S3

In [None]:
# Write the combined off-street table to a local CSV file.
different_radius_data_with_offstreet.to_csv(
    path_or_buf='different_radius_data_with_offstreet.csv'
)

In [None]:
# Upload the combined off-street CSV to S3.
client.upload_file(
    Filename='different_radius_data_with_offstreet.csv',
    Bucket='bucket-vwfs-pred-park-global-model-serving-dev',
    Key="Here_evaluation/seattle/different_radius_data_with_offstreet.csv",
)

### Read from s3

In [None]:
# Reload the combined off-street table from S3; no index column is set, so the former index levels come back as columns.
different_radius_data_with_offstreet = pd.read_csv(
    filepath_or_buffer='s3://bucket-vwfs-pred-park-global-model-serving-dev/Here_evaluation/seattle/different_radius_data_with_offstreet.csv'
)

### Evaluate and feature importance

In [None]:
# Off-street features: facility count and capacity column for each radius, in radius order.
off_street_parking_feat = [
    f'{prefix}_{radius}'
    for radius in (25, 50, 100, 150, 250, 500)
    for prefix in ('num_off_street_parking', 'off_street_capa')
]

# Every feature set starts from the map + time features.
_map_time_feat = map_feature_names + time_feat
feature_dict = {
    'map_time_feat': _map_time_feat,
    'map_time_on_street': _map_time_feat + parking_feat,
    'map_time_off_street': _map_time_feat + off_street_parking_feat,
    'map_time_parking': _map_time_feat + off_street_parking_feat + parking_feat,
    'cat_features': cat_features,
}

In [None]:
# Compare the feature sets with and without on-street / off-street parking columns, target column name 'availability'.
# NOTE(review): the meaning of the positional 50 is defined in evaluation_helper — confirm there.
compare_feature_combinations(different_radius_data_with_offstreet, 50 ,feature_dict, 'availability', val_size=0.05, test_size=0.2, disjunct_locations=False, perform_t_test=True)

# Include weather information

In [None]:
# The CSV round trip stores geometries as text; parse the WKT back into shapely objects.
different_radius_data_with_offstreet['geometry'] = different_radius_data_with_offstreet['geometry'].map(loads)

In [None]:
# Load the historic weather file; dtype=str keeps every column (including date and time) as text.
seattle_weather = pd.read_csv(
    filepath_or_buffer='s3://bucket-vwfs-pred-park-global-model-serving-dev/input/historic_weather_data/seattle/seattle_weather_groundtruth_dates_2019.csv',
    index_col=0,
    dtype=str,
)

### Create a common daily key to join weather_data with feature df

In [None]:
# Weather side: "<date>-<HH>", where HH is the time string without its last two
# characters, left-padded with zeros to two characters.
seattle_weather['time_key'] = [
    day + '-' + clock[:-2].rjust(2, "0")
    for day, clock in zip(seattle_weather['date'], seattle_weather['time'])
]


def _hourly_key(timestamp):
    """Format a timestamp as the year-month-day-hour join key."""
    return timestamp.strftime('%Y-%m-%d-%H')


# Observation side: the same key format derived from the observation start time.
observation_start = pd.to_datetime(different_radius_data_with_offstreet.observation_interval_start)
different_radius_data_with_offstreet['time_key'] = observation_start.map(_hourly_key)
different_radius_data_with_offstreet['street_id'] = different_radius_data_with_offstreet['street_id'].astype(str)

In [None]:
# Weather columns copied onto the observation data.
weather_features = [
    'tempC',
    'windspeedKmph',
    'precipMM',
]

### Match by the time key and get the columns needed

In [None]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so columns
# written to data_with_weather also appear on different_radius_data_with_offstreet.
data_with_weather = different_radius_data_with_offstreet

In [None]:
# Create each weather column up front, filled with empty-string placeholders;
# the following cell overwrites them with the matched weather values.
for col in weather_features:
    data_with_weather[col] = ""

In [None]:
# Attach the weather readings to every observation through the shared time_key.
# A single lookup table replaces filtering the whole weather frame once per observation row.
# keep='first' mirrors the earlier behaviour of taking the first matching weather row.
weather_by_time_key = (
    seattle_weather
    .drop_duplicates(subset='time_key', keep='first')
    .set_index('time_key')[weather_features]
)

# An observation without a weather match is still an IndexError, but now names the offending keys.
has_no_weather = ~data_with_weather['time_key'].isin(weather_by_time_key.index)
if has_no_weather.any():
    unmatched = data_with_weather.loc[has_no_weather, 'time_key'].unique()[:10].tolist()
    raise IndexError(f'no weather data for time_key(s), first few shown: {unmatched}')

# NOTE(review): the values stay strings because the weather CSV is read with dtype=str —
# confirm whether the evaluation expects numeric weather columns.
for feature_name in weather_features:
    data_with_weather[feature_name] = data_with_weather['time_key'].map(weather_by_time_key[feature_name])
data_with_weather.head()

### Evaluate and feature importance

In [None]:
# Build the feature sets incrementally: map + time, then on-street parking, then weather.
_map_time_feat = map_feature_names + time_feat
_map_time_parking_feat = _map_time_feat + parking_feat
feature_dict = {
    'map_time_feat': _map_time_feat,
    'map_time_parking': _map_time_parking_feat,
    'map_time_parking_weather': _map_time_parking_feat + weather_features,
    'cat_features': cat_features,
}

In [None]:
# Compare the feature sets with and without weather columns, target column name 'availability'.
# NOTE(review): this call passes 10 where the other evaluations pass 50; the meaning of that
# positional argument is defined in evaluation_helper — confirm the difference is intended.
compare_feature_combinations(data_with_weather, 10 ,feature_dict, 'availability', val_size=0.05, test_size=0.2, disjunct_locations=False, perform_t_test=True)

In [None]:
# Write the table with weather columns to a local CSV file.
data_with_weather.to_csv(
    path_or_buf='different_radius_6_radius_seattle_train_data_here.csv'
)

In [None]:
# Upload the table with weather columns to S3.
client.upload_file(
    Filename='different_radius_6_radius_seattle_train_data_here.csv',
    Bucket='bucket-vwfs-pred-park-global-model-serving-dev',
    Key="Here_evaluation/seattle/different_radius_6_radius_seattle_train_data_here.csv",
)

# Include calendar effects

In [None]:
from calendar_helper import add_extra_time_features

In [None]:
# Reload the table with weather columns from S3, using the first CSV column as the index.
data_with_weather = pd.read_csv(
    filepath_or_buffer='s3://bucket-vwfs-pred-park-global-model-serving-dev/Here_evaluation/seattle/different_radius_6_radius_seattle_train_data_here.csv',
    index_col=0,
)

In [None]:
# The CSV round trip stores timestamps as text; convert the column back to datetimes.
data_with_weather['observation_interval_start'] = pd.to_datetime(
    data_with_weather['observation_interval_start']
)

In [None]:
# Add the extra calendar features via calendar_helper.
# NOTE(review): 2019 is presumably the year used for the holiday calendar — confirm against calendar_helper.
data_with_holiday = add_extra_time_features(data_with_weather, 2019)

In [None]:
# Write the table with calendar features to a local CSV file.
data_with_holiday.to_csv(
    path_or_buf='different_radius_6_radius_data_with_holiday.csv'
)

In [None]:
# Upload the table with calendar features to S3 (note: under input/open_data, not Here_evaluation).
client.upload_file(
    Filename='different_radius_6_radius_data_with_holiday.csv',
    Bucket='bucket-vwfs-pred-park-global-model-serving-dev',
    Key='input/open_data/seattle/different_radius_6_radius_data_with_holiday.csv',
)

In [None]:
# Display the resulting frame for visual inspection.
data_with_holiday

### Evaluate and feature importance

In [None]:
# Calendar-derived columns evaluated on top of the simple hour/weekday features.
advanced_time_feat = [
    'month',
    'day_of_month',
    'time_since_last_holiday',
    'time_to_next_holiday',
    'time_to_next_two_day_holiday',
    'time_since_last_two_day_holiday',
]
weather_feat = ['tempC', 'windspeedKmph', 'precipMM']

# Baseline without any time information; the other sets extend it.
_static_feat = map_feature_names + parking_feat + weather_feat
feature_dict = {
    'no_time_feat': _static_feat,
    'simple_time_feat': _static_feat + time_feat,
    'all_time_feat': map_feature_names + time_feat + parking_feat + weather_feat + advanced_time_feat,
    # 'month' is added to the categorical columns for this comparison.
    'cat_features': cat_features + ['month'],
}

In [None]:
# Compare the feature sets with no, simple and full calendar features, target column name 'availability'.
# NOTE(review): the meaning of the positional 50 is defined in evaluation_helper — confirm there.
compare_feature_combinations(data_with_holiday, 50 ,feature_dict, 'availability', val_size=0.05, test_size=0.2, disjunct_locations=False, perform_t_test=True)