In [None]:
!pip install category_encoders
!pip install geopandas
!pip install folium
!pip install geopy
!pip install catboost

In [None]:
import pandas as pd
import geopy
import boto3
import io
import importlib
import matplotlib.pyplot as plt
import geopandas
from shapely import wkt
from sklearn.preprocessing import StandardScaler

import location_similarity_helper as lsh
import location_similarity_plots as lsp
import location_similarity_cluster as lsc
import location_similarity_train_evaluate as lste
import baseline_helper as bh
import baseline_models as bm
import baseline_plots as bp
import baseline_data_helper as bdh

# Re-import the project helper modules so edits made to them on disk are picked up
# without restarting the kernel (same order as before).
for _helper_module in (lsh, lsp, lsc, lste, bm, bp, bdh, bh):
    importlib.reload(_helper_module)

# Use the full option name: the abbreviated 'display.max_column' only works through
# pandas' pattern matching and would break if another option ever matched it too.
pd.set_option('display.max_columns', 500)

In [None]:
# Fetch the CSV object from S3, decode it as UTF-8 and parse it with pandas
# (first column is used as the index).
client = boto3.client('s3')
csv_obj = client.get_object(
    Bucket='bucket-vwfs-pred-park-global-model-serving-dev',
    Key="input/open_data/seattle/different_radius_6_radius_data_with_holiday.csv",
)
csv_string = csv_obj['Body'].read().decode('utf-8')
different_radius_data_with_holiday_raw = pd.read_csv(io.StringIO(csv_string), index_col=0)
# Local alternative kept from the original notebook:
# pd.read_csv("different_radius_6_radius_data_with_holiday.csv", index_col=0)

In [None]:
# Run the project helper on the 'maxspeed' column, then verify that no value
# still contains the substring "['".
different_radius_data_with_holiday_raw = lsh.fix_maxspeed(different_radius_data_with_holiday_raw)

# Raw string: "\[" inside a plain string literal is an invalid escape sequence
# (warned about by recent Python versions); the regex itself is unchanged.
_still_list_like = different_radius_data_with_holiday_raw['maxspeed'].str.contains(r"\['", na=False)
assert not _still_list_like.any(), "maxspeed still contains values with \"['\""

In [None]:
# Display the raw frame after the maxspeed fix.
different_radius_data_with_holiday_raw

In [None]:
# Calendar- and holiday-related columns, grouped for readability; the combined
# list keeps the original order.
_calendar_feat = ['holiday', 'month', 'day_of_month']
_holiday_distance_feat = [
    'time_since_last_holiday',
    'time_to_next_holiday',
    'time_to_next_two_day_holiday',
    'time_since_last_two_day_holiday',
]
advanced_time_feat = _calendar_feat + _holiday_distance_feat

As we have seen in here_evaluation_seattle.ipynb, the time features do not really bring any lift to the model performance; therefore, we drop those features from our dataset to avoid high dimensionality.

In [None]:
# Drop the advanced time features. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell re-runnable: a second run no longer raises KeyError
# because the columns are already gone.
different_radius_data_with_holiday_raw = different_radius_data_with_holiday_raw.drop(
    columns=advanced_time_feat, errors='ignore'
)

In [None]:
# Display the raw frame after dropping the advanced time features.
different_radius_data_with_holiday_raw

In [None]:
# Read the transaction training data directly from S3 (first column as index);
# only 'street_id', 'observation_interval_start' and 'ongoing_trans' are used below.
train_data_with_trans = pd.read_csv('s3://bucket-vwfs-pred-park-global-model-serving-dev/input/open_data/seattle/train_data_with_trans.csv', index_col=0)

In [None]:
# Attach 'ongoing_trans' to the radius data, matching on street and interval start.
# merge() defaults to an inner join, so rows without a match on both keys are dropped.
_merge_keys = ['street_id', 'observation_interval_start']
_ongoing_trans = train_data_with_trans[_merge_keys + ['ongoing_trans']]
different_radius_data_with_holiday = different_radius_data_with_holiday_raw.merge(
    _ongoing_trans, on=_merge_keys
)

In [None]:
# Display the merged frame.
different_radius_data_with_holiday

# 1. Fix the radius problem

We have found that, for the HERE map POIs, the POI counts are sometimes inconsistent across radii: a smaller radius should never contain more POIs than a larger one. Therefore, the code blocks below are designed to 1) check for this inconsistency and 2) fix it by replacing the count of the bigger radius with that of its closest smaller one.

In [None]:
# Run the radius comparison helper once per HERE POI type and key the results by type.
# Per the original note the result is a dictionary of dictionaries, shaped like:
# POI type: {
#   poi_type_radius 25 compare to 50: {
#     street_id
#   }
# }
ls_poi_types = ['restaurant', 'shopping', 'office', 'supermarket', 'transportation', 'schools']
problematic_poi_street_ids = {
    poi_type: bdh.compare_here_poi_in_radius(poi_type, different_radius_data_with_holiday)
    for poi_type in ls_poi_types
}
problematic_poi_street_ids

In [None]:
# Keep only the POI types that have at least one non-empty comparison result;
# print 'empty!' for every POI type whose results are all empty.
problematic_poi_street_ids_without_empty = {}
for poi_type, radius_comparisons in problematic_poi_street_ids.items():
    if any(radius_comparisons.values()):
        problematic_poi_street_ids_without_empty[poi_type] = radius_comparisons
        continue
    print('empty!')
problematic_poi_street_ids_without_empty

In [None]:
# get the lists of street_ids from the non-empty problematic POI dictionary
dict_ls_street_id = bdh.get_list_street_id(problematic_poi_street_ids_without_empty)
# according to those street_id lists, replace the inconsistent radius count values
# (the frame is rebound to whatever the helper returns)
different_radius_data_with_holiday = bdh.radius_count_replacement(dict_ls_street_id=dict_ls_street_id,
                                                                  data=different_radius_data_with_holiday)
different_radius_data_with_holiday

In [None]:
# Replace the index with a fresh 0..n-1 range (the old index is discarded), then display.
different_radius_data_with_holiday = different_radius_data_with_holiday.reset_index(drop=True)
different_radius_data_with_holiday

In [None]:
# Largest value in the 'length' column.
different_radius_data_with_holiday.length.max()

In [None]:
# Every radius shares the same 13 feature stems; only the numeric suffix differs.
_RADIUS_FEATURE_STEMS = [
    'commercial', 'residential', 'transportation', 'schools', 'eventsites',  # osm
    'restaurant_here', 'shopping_here', 'office_here', 'supermarket_here',  # heremaps
    'transportation_here', 'schools_here',
    'num_off_street_parking', 'off_street_capa',  # here_off parking
]


def _radius_columns(radius):
    """Return the feature column names for one radius, e.g. 'commercial_25'."""
    return [f'{stem}_{radius}' for stem in _RADIUS_FEATURE_STEMS]


radius_25_cols = _radius_columns(25)
radius_50_cols = _radius_columns(50)
radius_100_cols = _radius_columns(100)
radius_150_cols = _radius_columns(150)
radius_250_cols = _radius_columns(250)
radius_500_cols = _radius_columns(500)

cat_features = ['highway', 'hour', 'weekday']
other_feat = [
    'street_id', 'observation_interval_start', 'availability', 'length', 'geometry',
    'current_capacity',  # heremap on_street capa
    'tempC', 'windspeedKmph', 'precipMM',  # weather
    # 'ongoing_trans'  # pbp on-going transaction (left out, as in the original list)
]

In [None]:
# Map 'radius_<r>' to its column list, in increasing radius order, then add
# 'radius_all' (the concatenation of all six lists) as the last entry.
radius_cols_dict = {
    f'radius_{radius}': cols
    for radius, cols in zip(
        (25, 50, 100, 150, 250, 500),
        (radius_25_cols, radius_50_cols, radius_100_cols,
         radius_150_cols, radius_250_cols, radius_500_cols),
    )
}
radius_cols = [col for cols in radius_cols_dict.values() for col in cols]
radius_cols_dict['radius_all'] = radius_cols

# 2. Plot all the data points

In [None]:
# Keep only the street id and its geometry (parsed as WKT text in a later cell).
df_geometry = different_radius_data_with_holiday[['street_id', 'geometry']]

In [None]:
# Display the street_id / geometry pairs.
df_geometry

In [None]:
# prep for plotting all data: one row per street, indexed by street_id.
# Built from the source frame (not from the previous df_geometry) so the cell can be
# re-run: after a first run df_geometry no longer has a 'street_id' column.
df_geometry = (
    different_radius_data_with_holiday[['street_id', 'geometry']]
    .drop_duplicates(subset=['street_id'])
    .set_index('street_id')
)

# parse the WKT strings into shapely geometries and get the centroid of each street
df_geometry['geometry'] = df_geometry['geometry'].apply(wkt.loads)
gdf = geopandas.GeoDataFrame(
    df_geometry, geometry=df_geometry['geometry'])
line_centroids = gdf['geometry'].centroid
gdf['line_centroid'] = line_centroids

# 'line_centroid' lives on gdf, not on df_geometry, so read the coordinates from the
# centroid series itself instead of df_geometry.line_centroid (which is not guaranteed
# to exist and raises AttributeError on current pandas).
df_geometry['lon'] = line_centroids.x
df_geometry['lat'] = line_centroids.y

df_street_coords = df_geometry[['lon', 'lat']]
df_study_area = different_radius_data_with_holiday[['street_id', 'study_area']].drop_duplicates().set_index(
    'street_id')
street_coords_study_area = pd.merge(df_street_coords, df_study_area, left_index=True, right_index=True)

# number of distinct study areas
df_study_area.study_area.unique().size

In [None]:
# Geocode the city name and print its coordinates.
city = "Seattle"
locator = geopy.geocoders.Nominatim(user_agent="MyCoder")
location_seattle = locator.geocode(city)

location_seattle = [location_seattle.latitude, location_seattle.longitude]
print("[lat, long]:", location_seattle)

# Integer code per study area. The original literal listed 'University District' twice
# (8 and 12); Python keeps the last value, so 12 was always the code in effect. The dead
# first entry is removed and code 8 is left unassigned, so every code stays as before.
study_area_codes = {
    'Pike-Pine': 0,
    'First Hill': 1,
    'South Lake Union': 2,
    'Commercial Core': 3,
    'Ballard': 4,
    'Chinatown/ID': 5,
    'Greenlake': 6,
    'Pioneer Square': 7,
    'Uptown': 9,
    'Uptown Triangle': 10,
    'Capitol Hill': 11,
    'University District': 12,
    '12th Ave': 13,
    'Fremont': 14,
    'Cherry Hill': 15,
    'Ballard Locks': 16,
    'Roosevelt': 17,
    'Westlake': 18,
    'Columbia City': 19,
}
# Names missing from the mapping become NaN. The column is overwritten, so re-running
# this cell without re-creating street_coords_study_area first maps every value to NaN.
street_coords_study_area['study_area'] = street_coords_study_area['study_area'].map(study_area_codes)

# plot the street centroids on a map of Seattle using the encoded study_area column
# (per the original note, the output html is called map)
lsp.plot_cluster_folium(city='Seattle', data=street_coords_study_area, study_area='study_area',
                        tiles='OpenStreetMap')

As we can see from the generated map, multiple study areas have only very few records, which we believe will not help much in training; we therefore decide to later remove the areas where we only have few observations. In general, the data is concentrated in the city center, and outside of the city center it is quite sparse.

In [None]:
# Report how many distinct study_area values the dataset contains.
n_study_areas = different_radius_data_with_holiday.study_area.unique().size
print(f'There are {n_study_areas} unique areas in the seattle dataset ')

In [None]:
# Number of records per study area, largest first.
different_radius_data_with_holiday.groupby('study_area').size().sort_values(ascending=False)

In [None]:
# study areas with more than 250 records (9 areas, per the original note);
# 250 is the same threshold the training loop uses for its 9-area split
different_radius_data_with_holiday.groupby('study_area').filter(lambda x: len(x) > 250).study_area.unique()

In [None]:
# study areas with more than 450 records (4 areas, per the original note)
different_radius_data_with_holiday.groupby('study_area').filter(lambda x: len(x) > 450).study_area.unique()

# 3. Model Training

Here we trained model with:
1. **all the data** where we generate a random split into train, test and validation sets and use 3 algorithms to train. The Matthews score looks good, but there is no transfer involved, so it does not solve the problem of transferring a model to areas with little to no ground truth.
2. **9 areas' data** As we see in the plots, our data is concentrated in a few main areas and there are areas with only very few samples; we decide to remove those study areas, the assumption being that they will not help much in training due to the limited number of samples.
3. **4 areas' data in the city center** As in the previous iteration of experiments, where we trained with data from only 4 areas, we also wish to explore the baseline result for this setup.

In [None]:
# Result containers: one entry per key of radius_cols_dict ('radius_25', ..., 'radius_all').
diff_radius_result_all = {}
diff_radius_result_9_areas = {}
diff_radius_result_4_areas = {}

# NOTE: the names assigned inside this loop (e.g. df_9_areas_catboost, y_9_areas) keep
# the values of the LAST iteration afterwards; section 6 of this notebook reads them.
for key, ls_cols in radius_cols_dict.items():
    # get the X, y and other aux data for later debug use
    X_raw, y, aux = bh.select_data(col=other_feat + cat_features + ls_cols,
                                   data=different_radius_data_with_holiday)

    # as for catboost, we do not need to encode categorical, here we make a copy of the data to treat catboost differently
    # presumably aux carries 'study_area' (it is grouped on / dropped below) - TODO confirm in bh.select_data
    X_with_study_area_catboost = X_raw.join(aux, how='inner').drop(['geometry'], axis=1).copy()
    # process the data
    ## encode all the categorical features for other algorithms
    # NOTE(review): the encoder receives the full target y before any train/test split
    # visible here - confirm this does not leak the target into the evaluation.
    X_without_study_area_encoded, _ = lsh.encode_categorical(
        encoder='target_encoder',
        col_encoded=cat_features,
        feature=X_raw,
        target=y
    )
    X_with_study_area_encoded = X_without_study_area_encoded.join(aux, how='inner').drop(['geometry'], axis=1)
    ####################
    ## Train all data ##
    ####################
    X_catboost = X_with_study_area_catboost.drop(['study_area'], axis=1)  # data for catboost(not encoded cat)
    df_result_all_areas = bh.train_all_data(X=X_without_study_area_encoded, X_catboost=X_catboost, cat_feat=cat_features, y=y)
    diff_radius_result_all[key] = df_result_all_areas

    ###################
    ## Train 9 areas ##
    ###################
    # As we see in the plots, our data is concentrated in a few main areas and some areas have only very few samples;
    # we remove those study areas, assuming they will not help much in training due to the limited number of samples.
    # Keep only study areas with more than 250 rows.
    df_9_areas = X_with_study_area_encoded.groupby('study_area').filter(lambda x: len(x) > 250)
    print(df_9_areas.study_area.unique())
    ## filter the y based on the index of the dataframe
    y_9_areas = y.loc[df_9_areas.index]
    # catboost X data, y is the same as for the non-catboost models
    df_9_areas_catboost = X_with_study_area_catboost.groupby('study_area').filter(lambda x: len(x) > 250)
    df_result_9_areas = bh.train_different_areas(X_different_areas=df_9_areas,
                                                 X_different_areas_catboost=df_9_areas_catboost,
                                                 cat_feat=cat_features,
                                                 y_different_areas=y_9_areas)
    diff_radius_result_9_areas[key] = df_result_9_areas

    ########################
    ## Train with 4 areas ##
    ########################
    # new data: keep only study areas with more than 450 rows
    df_4_areas = X_with_study_area_encoded.groupby('study_area').filter(lambda x: len(x) > 450)
    y_4_areas = y.loc[df_4_areas.index]
    # for catboost
    df_4_areas_catboost = X_with_study_area_catboost.groupby('study_area').filter(lambda x: len(x) > 450)
    df_result_4_areas = bh.train_different_areas(X_different_areas=df_4_areas,
                                                 X_different_areas_catboost=df_4_areas_catboost,
                                                 cat_feat=cat_features,
                                                 y_different_areas=y_4_areas)
    diff_radius_result_4_areas[key] = df_result_4_areas

# 4. Results

## 4.1 Result when training with all the data

Even though our goal is to investigate the problem of model training in areas with little to no ground-truth labels, we use the whole dataset and split it into train, test and validation sets just to establish a raw baseline, where we **assume that we do not have a problem of lacking target labels in the dataset**.

In [None]:
# All-data result for the radius-25 feature set.
diff_radius_result_all['radius_25']

In [None]:
# All-data result for the radius-50 feature set.
diff_radius_result_all['radius_50']

In [None]:
# All-data result for the radius-100 feature set.
diff_radius_result_all['radius_100']

In [None]:
# All-data result for the radius-150 feature set.
diff_radius_result_all['radius_150']

In [None]:
# All-data result for the radius-250 feature set.
diff_radius_result_all['radius_250']

In [None]:
# All-data result for the radius-500 feature set.
diff_radius_result_all['radius_500']

In [None]:
# All-data result when the columns of all six radii are used together.
diff_radius_result_all['radius_all']

## 4.2 Result with train 9 areas

As we see in the plots, our data is concentrated in a few main areas, and there are areas with only very few samples; we decide to remove those study areas and **the assumption is that they will not help much in the training due to the limited number of samples.**

In [None]:
# 'Matthews' column(s) of the 9-area result for the radius-100 feature set.
diff_radius_result_9_areas['radius_100'][['Matthews']]

In [None]:
# Mean plot of the 'Matthews' results, 9-area split, radius-25 features.
bp.area_split_mean_plot(diff_radius_result_9_areas['radius_25'][['Matthews']],
                        "Mean of the Metric for the 9 Area Splits")

In [None]:
# Mean plot of the 'Matthews' results, 9-area split, radius-50 features.
bp.area_split_mean_plot(diff_radius_result_9_areas['radius_50'][['Matthews']],
                        "Mean of the Metric for the 9 Area Splits")

In [None]:
# Mean plot of the 'Matthews' results, 9-area split, radius-100 features.
bp.area_split_mean_plot(diff_radius_result_9_areas['radius_100'][['Matthews']],
                        "Mean of the Metric for the 9 Area Splits")

In [None]:
# Box plot of the 'Matthews' results, 9-area split, radius-100 features.
bp.area_split_boxplot(diff_radius_result_9_areas['radius_100'][['Matthews']],
                      'Box Plot of Different Metrics for 9 Area Splits of 100 radius')

In [None]:
# Mean plot of the 'Matthews' results, 9-area split, radius-150 features.
bp.area_split_mean_plot(diff_radius_result_9_areas['radius_150'][['Matthews']],
                        "Mean of the Metric for the 9 Area Splits")

In [None]:
# Mean plot of the 'Matthews' results, 9-area split, radius-250 features.
bp.area_split_mean_plot(diff_radius_result_9_areas['radius_250'][['Matthews']],
                        "Mean of the Metric for the 9 Area Splits")

In [None]:
# Mean plot of the 'Matthews' results, 9-area split, radius-500 features.
bp.area_split_mean_plot(diff_radius_result_9_areas['radius_500'][['Matthews']],
                        "Mean of the Metric for the 9 Area Splits")

In [None]:
# Mean plot of the 'Matthews' results, 9-area split, all radii combined.
bp.area_split_mean_plot(diff_radius_result_9_areas['radius_all'][['Matthews']],
                        "Mean of the Metric for the 9 Area Splits")

In [None]:
# Mean of the 'Catboost' Matthews values over the 9-area result rows, all radii combined.
diff_radius_result_9_areas['radius_all'].Matthews.loc[:, 'Catboost'].mean()

In [None]:
# Mean of the 'Catboost' Matthews values over the 9-area result rows, radius-100 features.
diff_radius_result_9_areas['radius_100'].Matthews.loc[:, 'Catboost'].mean()

**Conclusion:**

**Catboost trained on the 100 radius works best for the 9 areas; when we try to use all radii, it seems to confuse the algorithm.**

## 4.3 Result with training for 4 Areas

In this part we wish to test the direct transfer within the 4 areas which are concentrated in the city center.

In [None]:
# Mean plot of the 'Matthews' results, 4-area split, radius-25 features.
bp.area_split_mean_plot(diff_radius_result_4_areas['radius_25'][['Matthews']],
                        "Mean of the Metric for the 4 Area Splits")

In [None]:
# Mean plot of the 'Matthews' results, 4-area split, radius-50 features.
bp.area_split_mean_plot(diff_radius_result_4_areas['radius_50'][['Matthews']],
                        "Mean of the Metric for the 4 Area Splits")

In [None]:
# Mean plot of the 'Matthews' results, 4-area split, radius-100 features.
bp.area_split_mean_plot(diff_radius_result_4_areas['radius_100'][['Matthews']],
                        "Mean of the Metric for the 4 Area Splits")

In [None]:
# Mean plot of the 'Matthews' results, 4-area split, radius-150 features.
bp.area_split_mean_plot(diff_radius_result_4_areas['radius_150'][['Matthews']],
                        "Mean of the Metric for the 4 Area Splits")

In [None]:
# Mean plot of the 'Matthews' results, 4-area split, radius-250 features.
bp.area_split_mean_plot(diff_radius_result_4_areas['radius_250'][['Matthews']],
                        "Mean of the Metric for the 4 Area Splits")

In [None]:
# Mean plot of the 'Matthews' results, 4-area split, radius-500 features.
bp.area_split_mean_plot(diff_radius_result_4_areas['radius_500'][['Matthews']],
                        "Mean of the Metric for the 4 Area Splits")

In [None]:
# Mean plot of the 'Matthews' results, 4-area split, all radii combined.
# The original cell plotted 'radius_100' a second time; the parallel 9-area section
# ends its series with 'radius_all', which was otherwise never shown for the 4 areas.
bp.area_split_mean_plot(diff_radius_result_4_areas['radius_all'][['Matthews']],
                        "Mean of the Metric for the 4 Area Splits")

**Conclusion:**

**The 4 areas in the city center do not work as well as the model trained on 9 areas; therefore, we have determined our baseline as:**

       the catboost model trained on 100 radius data and on the 9 areas split, with a Matthews of 17%

# 5. Select and Save the Data

Here we select the data, and push it to s3

In [None]:
# Keep the radius-100 features only: drop the columns of every other radius.
_other_radius_cols = [
    *radius_25_cols, *radius_50_cols, *radius_150_cols, *radius_250_cols, *radius_500_cols,
]
train_data_with_trans_100_with_transaction = different_radius_data_with_holiday.drop(
    columns=_other_radius_cols)

In [None]:
# Show the first row to check the remaining columns.
train_data_with_trans_100_with_transaction.head(1)

In [None]:
# Write the selected data to a CSV file in the working directory (the index is written too).
train_data_with_trans_100_with_transaction.to_csv('train_data_with_trans_100_with_transaction.csv')

In [None]:
# Upload the locally written CSV to the project bucket with the existing S3 client.
_local_csv = 'train_data_with_trans_100_with_transaction.csv'
print('uploading file to object to s3')
client.upload_file(
    Filename=_local_csv,
    Bucket='bucket-vwfs-pred-park-global-model-serving-dev',
    Key='input/open_data/seattle/train_data_with_trans_100_with_transaction.csv',
)
print('file upload finished')

# 6. Feature Importance of the Best Performing Model

In [None]:
# Left over from the LAST iteration of the training loop, i.e. the 'radius_all' key
# (the final entry of radius_cols_dict), so it still has the columns of every radius.
df_9_areas_catboost

In [None]:
# Target rows matching the 9-area frame; also left over from the last loop iteration.
y_9_areas

In [None]:
# train the model catboost again with 9 area splits
df_9_areas_catboost_100 = df_9_areas_catboost.drop(radius_25_cols 
                                                   + radius_50_cols 
                                                   + radius_150_cols 
                                                   + radius_250_cols 
                                                   + radius_500_cols, 
                                                   axis=1)

In [None]:
# Display the 9-area catboost input restricted to the radius-100 columns.
df_9_areas_catboost_100

In [None]:
# train the best model and get the feature importance for every feature for every area combo
df_result_9_areas_100, feature_importance = bh.train_best_model(X_different_areas_catboost_100=df_9_areas_catboost_100,
                                                                cat_feat=cat_features,
                                                                y_different_areas_100=y_9_areas)

In [None]:
# get the feature importance and concat to a dataframe for plotting
feature_importance = {k: v.set_index('Feature Id') for k, v in feature_importance.items()}
df_feature_importance = pd.concat(feature_importance, axis=1).T
# drop one index level
df_feature_importance.index = df_feature_importance.index.droplevel(-1)

In [None]:
# One box per feature, summarising its importance values across the rows of
# df_feature_importance.
_importance_ax = df_feature_importance.boxplot(
    column=list(df_feature_importance.columns),
    rot=45,
    figsize=(20, 8),
)
_importance_ax.set_title("Boxplot for All Features and All Area Combinations")

**Conclusion:**

**As we can see, the off-street capacity of the parking facilities and the number of residential buildings within a 100 radius seem to have the highest feature importance, while the precipitation of that day has the smallest feature importance.**