# 1. Imports

In [1]:
!pip install category_encoders
!pip install geopandas
!pip install folium
!pip install geopy
!pip install catboost



In [5]:
# 1.1. Local imports
# # this is only used when running in sagemer mode
import location_similarity_helper as lsh
import location_similarity_plots as lsp
import location_similarity_cluster as lsc
import location_similarity_train_evaluate as lste
from baseline_helper import create_area_combinations


# 1.2. External imports
import numpy as np
import pandas as pd
import geopandas
import geopy
import json
import matplotlib.pyplot as plt
from shapely import wkt
from sklearn import preprocessing
import pprint
import os

import importlib

# Re-import the local helper modules so that edits made to them on disk are
# picked up without restarting the kernel.
for _local_module in (lsp, lsh, lsc, lste):
    importlib.reload(_local_module)

# Use the full option name: pandas resolves abbreviated option names by pattern
# matching, which stops working as soon as a second option matches the pattern.
pd.set_option('display.max_columns', 500)

# 2.Load Input Data

In [6]:
sagemaker_mode = False

In [7]:
if sagemaker_mode:
    # Imported lazily: the S3 loader is only needed in SageMaker mode.
    from setup import load_data_from_s3
    # Read the training data from S3.
    bucket_name = 'bucket-vwfs-pred-park-global-model-serving-dev'
    file_name = 'input/open_data/seattle/train_data_with_trans_100_with_transaction.csv'
    train_data_with_trans_100 = load_data_from_s3(bucket_name, file_name)

else:  # if run locally
    print(f'current working directory: {os.getcwd()}')
    # The project root can be overridden through the environment so the notebook
    # also runs on other machines; the default is the original local path.
    project_root = os.environ.get(
        'PRED_PARKING_PROJECT_ROOT', "/Users/prisc/Code/pred-parking-thesis/")
    # os.chdir() returns None, so ask for the new working directory explicitly
    # instead of capturing the return value of chdir.
    os.chdir(project_root)
    cwd = os.getcwd()
    files = os.listdir(cwd)  # all entries of the project root
    print("Files in %r: %s" % (cwd, files))
    # Read the training data relative to the project root.
    train_data_with_trans_100 = pd.read_csv('data/train_data_with_trans_100_with_transaction.csv', index_col=0)

current working directory: /Users/prisc/Code/pred-parking-thesis/location_similarity
Files in None: ['DFA', 'preprocess', 'README.md', 'location_similarity', '.git', 'data', 'PyTorch-Deep-CORAL']


In [8]:
train_data_with_trans_100

Unnamed: 0,street_id,observation_interval_start,availability,length,highway,maxspeed,geometry,study_area,hour,weekday,commercial_100,residential_100,transportation_100,schools_100,eventsites_100,restaurant_here_100,shopping_here_100,office_here_100,supermarket_here_100,transportation_here_100,schools_here_100,capacity,hourly_capacity,current_capacity,num_off_street_parking_100,off_street_capa_100,time_key,tempC,windspeedKmph,precipMM,ongoing_trans
0,1262,2019-03-21 08:00:00,0.0,206.470,residential,25.0,"LINESTRING (-122.30894 47.6061773, -122.308940...",Cherry Hill,8,3,45.0,30.0,0.0,0.0,0.0,0,0,1,0,1,0,0,"{'0': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...",0,1,25,2019-03-21-08,12,10,0.0,1.0
1,18459,2019-03-21 08:00:00,1.0,99.091,tertiary,25.0,"LINESTRING (-122.30894 47.6061773, -122.309064...",Cherry Hill,8,3,15.0,0.0,0.0,0.0,0.0,0,1,1,2,2,0,0,"{'0': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...",0,1,103,2019-03-21-08,12,10,0.0,1.0
2,1257,2019-03-21 08:00:00,1.0,98.755,secondary,25.0,"LINESTRING (-122.3102402 47.6080375, -122.3103...",Cherry Hill,8,3,75.0,105.0,0.0,0.0,0.0,0,0,0,0,0,0,0,"{'0': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...",0,1,15,2019-03-21-08,12,10,0.0,0.0
3,1259,2019-03-21 08:00:00,1.0,96.589,secondary,25.0,"LINESTRING (-122.3089518 47.6080341, -122.3096...",Cherry Hill,8,3,60.0,60.0,0.0,0.0,0.0,0,1,0,0,0,0,0,"{'0': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...",0,0,0,2019-03-21-08,12,10,0.0,1.0
4,1262,2019-03-21 09:00:00,0.0,206.470,residential,25.0,"LINESTRING (-122.30894 47.6061773, -122.308940...",Cherry Hill,9,3,45.0,30.0,0.0,0.0,0.0,0,0,1,0,1,0,0,"{'0': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...",0,1,25,2019-03-21-09,13,11,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5852,18065,2019-06-05 19:12:00,1.0,134.269,tertiary,30.0,"LINESTRING (-122.316779 47.6074153, -122.31677...",12th Ave,19,2,20.0,20.0,0.0,0.0,0.0,7,1,0,1,1,0,0,"{'0': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...",0,1,6,2019-06-05-19,13,13,0.2,5.0
5853,18065,2019-06-05 20:10:00,1.0,134.269,tertiary,30.0,"LINESTRING (-122.316779 47.6074153, -122.31677...",12th Ave,20,2,20.0,20.0,0.0,0.0,0.0,7,1,0,1,1,0,0,"{'0': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...",0,1,6,2019-06-05-20,12,10,0.0,0.0
5854,18065,2019-06-05 21:15:00,1.0,134.269,tertiary,30.0,"LINESTRING (-122.316779 47.6074153, -122.31677...",12th Ave,21,2,20.0,20.0,0.0,0.0,0.0,7,1,0,1,1,0,0,"{'0': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...",0,1,6,2019-06-05-21,10,7,0.0,0.0
5855,18065,2019-06-05 22:12:00,1.0,134.269,tertiary,30.0,"LINESTRING (-122.316779 47.6074153, -122.31677...",12th Ave,22,2,20.0,20.0,0.0,0.0,0.0,7,1,0,1,1,0,0,"{'0': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...",0,1,6,2019-06-05-22,10,8,0.0,0.0


# 3. Filter input data

As shown in the baseline analysis, there are 17 areas in total, but only 9 of them have more than 250 records and the remaining areas are quite spread out; therefore, we keep only these 9 areas.

In [9]:
# Restrict the data set to the study areas selected for the analysis.
selected_areas = [
    'Greenlake', 'South Lake Union', 'Commercial Core',
    'Pike-Pine', 'Uptown', 'Ballard',
    'First Hill', 'Chinatown/ID', 'Pioneer Square',
]
in_selected_area = train_data_with_trans_100["study_area"].isin(selected_areas)
train_data_with_trans_100_filtered = train_data_with_trans_100[in_selected_area]

# Number of distinct streets after and before filtering.
street_count = len(train_data_with_trans_100_filtered["street_id"].unique())
raw_street_count_unique = len(train_data_with_trans_100["street_id"].unique())

print(f'Filtered Data Shape: {train_data_with_trans_100_filtered.shape}')
print(f'Training data has {street_count} streets')
print(f'Original Data without filtering has {raw_street_count_unique} unique streets (ground truth)')

Filtered Data Shape: (5427, 31)
Training data has 393 streets
Original Data without filtering has 427 unique streets (ground truth)


In [10]:
train_data_with_trans_100_filtered

Unnamed: 0,street_id,observation_interval_start,availability,length,highway,maxspeed,geometry,study_area,hour,weekday,commercial_100,residential_100,transportation_100,schools_100,eventsites_100,restaurant_here_100,shopping_here_100,office_here_100,supermarket_here_100,transportation_here_100,schools_here_100,capacity,hourly_capacity,current_capacity,num_off_street_parking_100,off_street_capa_100,time_key,tempC,windspeedKmph,precipMM,ongoing_trans
205,8671,2019-04-04 08:04:00,1.0,68.835,secondary,25.0,"LINESTRING (-122.3261643 47.6789845, -122.3262...",Greenlake,8,3,128.0,0.0,0.0,0.0,0.0,3,0,0,0,1,0,0,"{'0': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...",0,1,20,2019-04-04-08,14,3,0.0,1.0
206,7390,2019-04-04 08:13:00,1.0,58.163,secondary,25.0,"LINESTRING (-122.3282649 47.6783132, -122.3280...",Greenlake,8,3,160.0,64.0,0.0,0.0,0.0,2,0,1,0,2,0,0,"{'0': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...",0,1,20,2019-04-04-08,14,3,0.0,0.0
207,23879,2019-04-04 08:25:00,0.0,181.089,secondary,35.0,"LINESTRING (-122.3406668 47.6286046, -122.3405...",South Lake Union,8,3,33.0,11.0,0.0,0.0,0.0,2,0,0,0,0,0,0,"{'0': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...",0,1,139,2019-04-04-08,14,3,0.0,3.0
208,21688,2019-04-04 08:26:00,0.0,78.891,tertiary,25.0,"LINESTRING (-122.3245469 47.6798127, -122.3245...",Greenlake,8,3,160.0,64.0,0.0,0.0,0.0,14,4,0,2,1,3,0,"{'0': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...",0,1,186,2019-04-04-08,14,3,0.0,2.0
209,16023,2019-04-04 08:27:00,1.0,81.134,tertiary,25.0,"LINESTRING (-122.324689 47.6805157, -122.32492...",Greenlake,8,3,35.0,28.0,0.0,0.0,0.0,12,3,0,0,1,3,0,"{'0': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...",0,2,206,2019-04-04-08,14,3,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5842,980,2019-06-01 22:19:00,1.0,97.326,tertiary,25.0,"LINESTRING (-122.3364381 47.6065823, -122.3365...",Commercial Core,22,5,60.0,30.0,0.0,0.0,0.0,12,15,5,3,3,0,67,"{'0': 2, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...",0,5,771,2019-06-01-22,14,5,0.0,0.0
5843,19356,2019-06-01 22:21:00,1.0,98.617,tertiary,25.0,"LINESTRING (-122.3368912 47.6087555, -122.3370...",Commercial Core,22,5,30.0,60.0,0.0,0.0,0.0,9,8,2,3,5,0,14,"{'0': 9, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...",0,4,1062,2019-06-01-22,14,5,0.0,0.0
5844,17924,2019-06-01 22:27:00,0.0,58.121,primary,25.0,"LINESTRING (-122.339962 47.6113492, -122.34001...",Commercial Core,22,5,180.0,240.0,0.0,0.0,0.0,11,4,1,1,3,0,24,"{'0': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': ...",0,2,1409,2019-06-01-22,14,5,0.0,0.0
5845,13357,2019-06-01 22:29:00,0.0,129.517,secondary,25.0,"LINESTRING (-122.3390905 47.607833, -122.33903...",Commercial Core,22,5,30.0,60.0,0.0,0.0,0.0,20,13,3,2,1,0,33,"{'0': 13, '1': 0, '2': 0, '3': 0, '4': 0, '5':...",0,2,244,2019-06-01-22,14,5,0.0,0.0


# 4. Create Street as Similarity Entity - Calculate Distance Between Street Vectors

    1. Goal: understand the similarity of all the streets in Seattle based on their vector representation, and define the similarity between streets
    2. The following steps are included in this section:

        1) Process the data
        2) Compute the distance (cosine, Euclidean)
        3) Correlate the outputs of the previous step: correlate the Euclidean/cosine distance with the real geo distance
        4) Analyze whether they are really correlated

## 4.1 Preprocess Data

In [11]:
train_data_with_trans_100.columns

Index(['street_id', 'observation_interval_start', 'availability', 'length',
       'highway', 'maxspeed', 'geometry', 'study_area', 'hour', 'weekday',
       'commercial_100', 'residential_100', 'transportation_100',
       'schools_100', 'eventsites_100', 'restaurant_here_100',
       'shopping_here_100', 'office_here_100', 'supermarket_here_100',
       'transportation_here_100', 'schools_here_100', 'capacity',
       'hourly_capacity', 'current_capacity', 'num_off_street_parking_100',
       'off_street_capa_100', 'time_key', 'tempC', 'windspeedKmph', 'precipMM',
       'ongoing_trans'],
      dtype='object')

hour, weekday, current_capacity, tempC, windspeedKmph and precipMM are not included when calculating the street similarity, as they are time dependent.

In [13]:
# Select the columns handed to the similarity preprocessing.
# hour, weekday, current_capacity, tempC, windspeedKmph and precipMM are left out
# of the street similarity because they are time dependent.
selected_features = [
    'street_id', # not a feature; selected because it is needed downstream
    'availability', # not a feature; selected because it is needed downstream
    'length',
    'highway',
    'maxspeed', # passed in, but not used for clustering
    'commercial_100',
    'residential_100',
    'transportation_100',
    'schools_100',
    'eventsites_100',
    'geometry',# not a feature; selected because it is needed downstream
    'restaurant_here_100',
    'shopping_here_100',
    'office_here_100',
    'supermarket_here_100',
    'transportation_here_100',
    'schools_here_100',
    'num_off_street_parking_100',
    'off_street_capa_100',
    'ongoing_trans'
]
# Preprocess the data. df_similarity_features is the basis for clustering the
# streets by the similarity of their feature vectors; the third return value is unused.
# NOTE(review): the meaning of each option is taken from the inline comments; the helper itself is not visible here.
df_features, df_similarity_features, _, df_geometry = \
    lsh.preprocess_for_similarity_analysis(
        train_data_with_trans_100_filtered,
        selected_features,
        options={
            'impute_maxspeed': False,# maxspeed is not used when clustering
            'encode_highway': True, # highway is used when clustering
            'time_dependant_features': None, # time-dependent features are not used when clustering
        }
    )

# Keep one row per street (clustering is currently done on streets only).
# The index is copied into a temporary 'street_id' column so duplicates can be
# dropped on it (first occurrence kept), then the temporary column is removed.
df_similarity_features['street_id'] = df_similarity_features.index
df_similarity_features.drop_duplicates(subset=['street_id'], inplace=True)
df_similarity_features.drop(['street_id'], axis=1, inplace=True)
# Same reduction for the geometries; here street_id is a column and becomes the index.
df_geometry = df_geometry \
    .drop_duplicates(subset=['street_id']) \
    .set_index('street_id')

  elif pd.api.types.is_categorical(cols):


In [14]:
df_features.head(1)

Unnamed: 0_level_0,availability,length,highway,maxspeed,commercial_100,residential_100,transportation_100,schools_100,eventsites_100,geometry,restaurant_here_100,shopping_here_100,office_here_100,supermarket_here_100,transportation_here_100,schools_here_100,num_off_street_parking_100,off_street_capa_100,ongoing_trans
street_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
8671,1.0,68.835,secondary,25.0,128.0,0.0,0.0,0.0,0.0,"LINESTRING (-122.3261643 47.6789845, -122.3262...",3,0,0,0,1,0,1,20,1.0


In [None]:
# `ongoing_trans` is a time-dependent feature, so it is excluded from the
# street-similarity features. Rebinding with errors='ignore' (instead of an
# in-place drop) keeps this cell safe to re-run: a second run is a no-op
# rather than a KeyError.
df_similarity_features = df_similarity_features.drop(columns=['ongoing_trans'], errors='ignore')

In [None]:
df_similarity_features.head(1)

In [None]:
print(f'We have used {len(df_similarity_features.columns)} features to cluster the {len(df_similarity_features)} streets.')

In [None]:
df_geometry.head(1)

In [None]:
#lsp.plot_highway(df_features)

## 4.2 Compute the pairwise distance (Euclidean, cosine) of the streets based on feature vector
Here we normalize the features (columns):

    1) so that within each feature, the values have the same variation
    2) while different features have different variation
    3) (in which case we keep the different magnitudes of the different features, i.e. we preserve the distribution between features)

### 4.2.1 Use l2 to normalize features(columns)

To calculate the distance between the vectors, we first need to normalize the vectors. Multiple ways are possible: below we first apply L2 normalization and then use the Euclidean distance. We try out different normalizations and choose the one that distinguishes the streets the most.

In [None]:
df_similarity_features

In [None]:
# Compute pairwise distances between the street feature vectors.
# For the Euclidean distance we normalize first: axis=0 normalizes each feature
# (column) to unit L2 norm, NOT each street (row).
similarity_features_l2_normalized = preprocessing.normalize(
    df_similarity_features, norm='l2', axis=0)

# Wrap the normalized values in a DataFrame again, with the original street index and feature names.
df_similarity_features_l2_normalized = pd.DataFrame(
    similarity_features_l2_normalized,
    index=df_similarity_features.index,
    columns=df_similarity_features.columns
)

# Pairwise distance matrices: Euclidean on the L2-normalized features, cosine on
# the un-normalized features.
df_pair_dist_l2_normalized = lsh.street_pairwise_dist(
    df_similarity_features_l2_normalized, 'euclidean')
df_pair_cosine = lsh.street_pairwise_dist(df_similarity_features, 'cosine')

# Scale the Euclidean distance matrix so that it is on the same [0, 1] scale as the cosine distance and both can be compared in the plots.
# NOTE(review): the [0, 1] range comes from the original comment; the helper is not visible here - confirm.

df_pair_dist_l2_normalized_scaled = lsh.scale_before_plot_correlation(
    df_pair_dist_l2_normalized)

# Plot the distance matrix of all streets for both the L2-normalized Euclidean and the cosine distance.
# The lighter the color, the less distant and therefore the more similar two streets are.
lsp.plot_distance_matrix(
    df_pair_dist_l2_normalized_scaled,
    df_pair_cosine,
    len(df_similarity_features),
    'L2 Normalized Euclidean Distance Matrix Between Street Vectors',
    'Cosine Distance Matrix Between Street Vectors'
)

### 4.2.2 Use min max to normalize features(columns)

Use min-max normalizer first before calculate the euclidean distance

In [None]:
# NOTE: normalize(norm='max', axis=0) divides each feature (column) by its maximum
# absolute value. This is max-abs scaling rather than min-max scaling (the column
# minimum is not subtracted), even though this notebook labels it "min max".
similarity_features_max_normalized = preprocessing.normalize(
    df_similarity_features, norm='max', axis=0)

# Wrap the normalized values in a DataFrame again, with the original street index and feature names.
df_similarity_features_max_normalized = pd.DataFrame(
    similarity_features_max_normalized,
    index=df_similarity_features.index,
    columns=df_similarity_features.columns

)
# Euclidean distance between the max-normalized street vectors
df_pair_dist_max_normalized = lsh.street_pairwise_dist(
    df_similarity_features_max_normalized, 'euclidean')

# Scaled Euclidean distance; range [0, 1] according to the original comment (helper not visible here - confirm)
df_pair_dist_max_normalized_scaled = lsh.scale_before_plot_correlation(
    df_pair_dist_max_normalized)

# Plot next to the cosine distance matrix computed in the previous section.
lsp.plot_distance_matrix(
    df_pair_dist_max_normalized_scaled,
    df_pair_cosine,
    len(df_similarity_features),
    'Minmax Normalized Euclidean Distance Matrix Between Street Vectors',
    'Cosine Distance Matrix Between Street Vectors'
)


**Conclusion: the darker the color, the more 'distant' the streets are and thus the more distinguishable they are; therefore, the minmax normalized Euclidean distance distinguishes the streets more.**

## 4.3 Compute the pairwise distance of the streets based on geometry

Another way to think about the distance between the streets is of course the geographical distance. Here we use the centroid of each line as the geometry of the street and calculate the pairwise geographical distance between the streets.

In [None]:
# Parse the WKT strings into shapely geometries. The isinstance guard makes the
# cell safe to re-run: once the column already holds geometry objects, they are
# passed through unchanged instead of being fed to wkt.loads again.
df_geometry['geometry'] = df_geometry['geometry'].apply(
    lambda geom: wkt.loads(geom) if isinstance(geom, str) else geom
)
gdf = geopandas.GeoDataFrame(
    df_geometry,
    geometry=df_geometry['geometry']
)
# Represent every street (a line) by the centroid of its geometry.
gdf['line_centroid'] = gdf['geometry'].centroid
gdf.head()

In [None]:
result = lsh.calculate_street_similarity_matrix(gdf)
df_real_dist = pd.DataFrame(data=result)
#df_real_dist.to_csv('df_real_dist.csv')
#df_real_dist = pd.read_csv('df_real_dist.csv', index_col=0)

## 4.4 Correlate the distances calculated above

The goal of this part is to decide which distance metric to use to calculate the similarity, based either on the feature vectors of the streets or on the geometry of the streets. Therefore, we correlate each of the following with the actual geo distance:
1) L2 normalized Euclidean distance
2) max normalized Euclidean distance
3) cosine distance

### 4.4.1 Correlation between two distance metrics (euclinean and cosine)

##### Correlate l2 normalized euclidean and cosine distance

In [None]:
corr_l2euclinean_cosine = df_pair_dist_l2_normalized.corrwith(
    df_pair_cosine, axis=0)
lsp.plot_correlation_distance(corr_l2euclinean_cosine, 'Correlate L2 Normalized Euclidean and Cosine Distance')

##### Correlate Minmax normalized euclidean and cosine distance

In [None]:
corr_maxeuclinean_cosine = df_pair_dist_max_normalized.corrwith(
    df_pair_cosine, axis=0)

lsp.plot_correlation_distance(corr_maxeuclinean_cosine, 'Correlate Minmax Normalized Euclidean and Cosine Distance')

### 4.4.2 Correlation between the l2 or max normalized euclinean distance with the real geo distance

In [None]:
# Cast the row and column labels of the geo-distance matrix to int64
# (presumably so they align with the street ids of the other distance matrices - confirm).
df_real_dist.index = df_real_dist.index.astype('int64', copy=False)
df_real_dist.columns = df_real_dist.columns.astype('int64', copy=False)

##### Correlate l2 normalized euclidean distance and real geo distance

In [None]:
corr_l2euclinean_realdist = df_pair_dist_l2_normalized.corrwith(
    df_real_dist, axis=0)
lsp.plot_correlation_distance(corr_l2euclinean_realdist, 'Correlate L2 Normalized Euclidean and Real Geo Distance')

##### Correlate min max normalized euclidean distance and real geo distance

In [None]:
# Column-wise correlation between the max-normalized Euclidean distances and the
# geo distances. df_real_dist is already a DataFrame, so it is passed directly.
corr_maxeuclinean_realdist = df_pair_dist_max_normalized.corrwith(df_real_dist, axis=0)
lsp.plot_correlation_distance(
    corr_maxeuclinean_realdist,
    'Correlate Minmax Normalized Euclidean and Real Geo Distance',
)

In [None]:
# Column-wise correlation between the cosine distances and the geo distances.
# df_real_dist is already a DataFrame, so it is passed directly.
corr_cosindist_realdist = df_pair_cosine.corrwith(df_real_dist, axis=0)
lsp.plot_correlation_distance(
    corr_cosindist_realdist,
    'Correlate Cosine Distance and Real Geo Distance',
)

**Conclusion: based on the graphs above, we choose the method which correlates most with the real geographical distance; therefore, we decided to use the min-max normalized Euclidean distance as the street similarity distance metric.**

# 5. Create Clusters as Similarity Entity

After creating streets as similarity entities, we now cluster those streets to generate similarity clusters which are representative. We have chosen several clustering algorithms and initialize the clustering process for source and target areas separately.

1. use the min-max normalized Euclidean distance to normalize street vectors (based on the analysis above)
2. for each area combination:
        for each type of similarity measurement (either GPS or vector similarity)
            cluster data based on different clustering algorithms (either kmeans or agglomerative clustering)

## 5.1 Plot the Data Points

In [None]:
# `line_centroid` was assigned on `gdf`, so read it from there instead of relying
# on `df_geometry` exposing the same column (that only holds when pandas shares
# the underlying data between the two frames).
df_geometry['lon'] = gdf['line_centroid'].apply(lambda point: point.x)
df_geometry['lat'] = gdf['line_centroid'].apply(lambda point: point.y)

df_geometry.head()

In [None]:
# Look up the coordinates of the city.
# NOTE: Nominatim is a web geocoding service, so this cell needs network access.

city = "Seattle"
locator = geopy.geocoders.Nominatim(user_agent="MyCoder")
location_seattle = locator.geocode(city)

location_seattle = [location_seattle.latitude, location_seattle.longitude]
print("[lat, long]:", location_seattle)

# Attach the study area to every street centroid so streets can be plotted per area.
df_street_coords = df_geometry[['lon', 'lat']]
df_study_area = train_data_with_trans_100_filtered[['street_id', 'study_area']]\
    .drop_duplicates()\
    .set_index('street_id')
street_coords_study_area = pd.merge(
    df_street_coords, df_study_area, left_index=True, right_index=True)

# Encode the study-area names as integer codes.
# Each area appears exactly once: a repeated key in a dict literal silently
# overrides the earlier entry. 'University District' keeps its effective code 12,
# and code 8 is intentionally unused.
street_coords_study_area['study_area'] = street_coords_study_area['study_area'].map(
    {
        'Pike-Pine': 0,
        'First Hill': 1,
        'South Lake Union': 2,
        'Commercial Core': 3,
        'Ballard': 4,
        'Chinatown/ID': 5,
        'Greenlake': 6,
        'Pioneer Square': 7,
        'Uptown': 9,
        'Uptown Triangle': 10,
        'Capitol Hill': 11,
        'University District': 12,
        '12th Ave': 13,
        'Fremont': 14,
        'Cherry Hill': 15,
        'Ballard Locks': 16,
        'Roosevelt': 17,
        'Westlake': 18,
        'Columbia City': 19
    }
)

# plot the map where shows the 4 districts of Seattle, the output html is called map
#lsp.plot_cluster_folium(
#    data=street_coords_study_area,
#    study_area='study_area',
#    tiles='OpenStreetMap'
#)

In [None]:
train_data_with_trans_100_filtered.study_area.unique()

In [None]:
print(f'There are {len(street_coords_study_area.study_area.unique())} unique areas in seattle after filtering out areas with small number of data')

## 5.2 Clustering

In this section, we cluster the street based on the vector similairity(the triangular matrix of the vector distance) and also on GPS(based on the geometry of the streets)

    1)we split the data into train and test, and initialize clustering process for train and test separately
    2)we tried out 3 clustering algorithms to generate cluster in result

The output is expected to be:

    for each street, there is a number of clustering labels generated by using different clustering algorithms and different street similarity(either GPS or vector similarity)

We have refered to below materials to determine the clustering algorithms:

    https://datascience.stackexchange.com/questions/761/clustering-geo-location-coordinates-lat-long-pairs
    https://community.dataiku.com/t5/Using-Dataiku-DSS/How-to-cluster-geo-points-according-to-their-pairwise-distances/m-p/2931
    https://datascience.stackexchange.com/questions/761/clustering-geo-location-coordinates-lat-long-pairs

### 5.2.1 Prepare the clustering data

In [None]:
# for GPS, we also need to divide by the km_per_radian

km_per_radian = 6371.0088

In [None]:
# Build the per-street coordinate table from `df_geometry` rather than from
# `df_street_coords` itself, so that re-running this cell does not merge
# `study_area` in a second time (which would yield suffixed duplicate columns).
df_street_coords = pd.merge(
    df_geometry[['lon', 'lat']], df_study_area, left_index=True, right_index=True)

In [None]:
df_street_coords

In [None]:
# Split the streets into source (train) and target (test) areas to prepare for clustering.
all_area_combinations = create_area_combinations(selected_areas)

# area_input_data layout:
#   first key  : name of the held-out target area
#   second key : 'Source' or 'Target'
#   value      : dataframe with the streets of the corresponding area(s)
area_input_data = {}
for area_combination in all_area_combinations:
    area_name = area_combination['Target'][0]
    # .copy() makes the target frame an independent object: cluster-label columns
    # are assigned onto these frames later, which on a filtered slice would
    # trigger pandas' SettingWithCopyWarning.
    target_streets = df_street_coords[df_street_coords.study_area == area_name].copy()
    # Source streets are concatenated area by area, in the order given by the combination.
    source_streets = pd.concat([
        df_street_coords[df_street_coords.study_area == source_area]
        for source_area in area_combination['Source']
    ])
    area_input_data[area_name] = {'Target': target_streets, 'Source': source_streets}

### 5.2.2 Call Clustering Function & Create Cluster Result

In [None]:
"""
Loop through different similairty metrics: either sim (represents street vector similairy) or gps(represents GPS coordinates)
    loop through the train and test data set(as we have seperate clustering process)
        loop through different clustering algorithm
"""

# For every held-out area, every similarity basis ('sim' = street vector
# similarity, 'gps' = GPS coordinates), every data split (source and target are
# clustered separately) and every clustering algorithm: create the cluster
# labels, attach them to the split's dataframe and plot the result.
for area_idx, area_name in enumerate(area_input_data.keys()):
    print(area_idx, '====== area_name:', area_name)
    for base in ['sim', 'gps']:
        # Distinct index names per loop level, so the inner loop does not
        # overwrite the area counter of the outer loop.
        for split_idx, data in enumerate([
            area_input_data[area_name]['Source'],
            area_input_data[area_name]['Target'],
        ]):
            is_train = split_idx == 0  # the first entry is the source (training) data
            # 'db_scan' is currently disabled; add it to the list to re-enable it.
            for algorithm in ['kmeans', 'agg_clustering']:
                # column name: clustering algorithm + "_label_" + similarity basis
                label = algorithm + "_label_" + base
                print('performing cluster labeling for', label, is_train)
                # call the clustering algorithm to generate the cluster labels
                cluster_data = lsh.create_cluster_label(area_input_data, df_pair_dist_max_normalized,  area_name, base, algorithm, data, is_train)
                # append the labels back to the split's dataframe
                data[label] = cluster_data

                # number of streets in each cluster
                result_size = data.groupby(label).size()
                print(f'The group result of {result_size}')

                # For DBSCAN, report the share of streets labelled as outliers (-1).
                if algorithm == 'db_scan':
                    outlier_count = len(data[data[label] == -1])
                    # Relative to the number of streets, not the number of clusters.
                    outlier_percentage = outlier_count / len(data) * 100
                    print(
                        f'Percentage of data points which has been clustered as outliers: {outlier_percentage}%')

                # plot the data with folium
                lsp.plot_cluster_folium(
                    data=data,
                    cluster_label=label,
                    train=is_train
                )
print('DONE!')

## 6. Evaluation of the Result


Steps briefly highlighted below:

In [None]:
# Single cluster-label column (kept for ad-hoc experiments).
cluster_col = ['kmeans_label_gps']

# Cluster-label columns to evaluate; the DBSCAN variants are currently disabled.
cluster_cols = [
    ['agg_clustering_label_gps'],
#    ['db_scan_label_gps'],
    ['kmeans_label_gps'],
#    ['db_scan_label_sim'],
    ['kmeans_label_sim'],
    ['agg_clustering_label_sim']
]

# Model input features ('ongoing_trans' is currently excluded).
feature_col = ['length', 'tempC', 'windspeedKmph', 'precipMM', 'highway', 'hour', 'weekday',
               'commercial_100', 'residential_100', 'transportation_100', 'schools_100', 'eventsites_100',
               'restaurant_here_100','shopping_here_100', 'office_here_100', 'supermarket_here_100',
               'transportation_here_100','schools_here_100',  'current_capacity',
               'num_off_street_parking_100',  'off_street_capa_100',  #'ongoing_trans'
              ]
# Report the number of features (the message announces a total, not the list itself).
print(f'There are in total {len(feature_col)} features')

# Prediction target.
target_col = ['availability']

In [None]:
# Assemble the observation-level source (train) and target (test) data per held-out area.
area_cluster_label = {}
for area_combination in all_area_combinations:
    area_name = area_combination['Target'][0]
    print('processing: ', area_name)
    observations = train_data_with_trans_100_filtered
    target_rows = observations[observations.study_area == area_name]
    # Source rows are concatenated area by area, in the order given by the combination.
    source_rows = pd.concat([
        observations[observations.study_area == source_area]
        for source_area in area_combination['Source']
    ])
    area_cluster_label[area_name] = {'Target': target_rows, 'Source': source_rows}

In [None]:
# For every area split (one area held out as target each time), merge the
# per-street cluster labels produced by the different clustering algorithms
# (held in area_input_data) back onto the observation-level source and target data.
area_source_clusters = {}
area_target_clusters = {}
for area_combination in all_area_combinations:
    area_name = area_combination['Target'][0]
    print('processing: ', area_name)

    for split in ('Source', 'Target'):
        street_labels = area_input_data[area_name][split]
        # Move street_id from the index into a column exactly once, so this cell
        # can be re-run without stacking additional index columns.
        if 'street_id' not in street_labels.columns:
            area_input_data[area_name][split] = street_labels.reset_index()

    area_source_clusters[area_name] = pd.merge(area_cluster_label[area_name]['Source'], area_input_data[area_name]['Source'], on=['street_id', 'study_area'])
    area_target_clusters[area_name] = pd.merge(area_cluster_label[area_name]['Target'], area_input_data[area_name]['Target'], on=['street_id', 'study_area'])

In [None]:
"""
Below block does the following:
    1)collect the result for model transfer 
    2)collect result for Matthew and train cluster data size correlation
"""

# Result containers, keyed by the name of the held-out target area.
result_scores = {}
area_result_matthew_size_corr = {}
area_result_matthew_overfit = {}
area_result_feature_importance = {}

for area_combination in all_area_combinations:
    area_name = area_combination['Target'][0]
    print('processing area_name: ', area_name)

    # Train on the source clusters and evaluate on the target clusters of this area.
    evaluation = lste.train_evaluate_all_approaches(
        cluster_cols,
        feature_col,
        target_col,
        area_source_clusters[area_name],
        area_target_clusters[area_name],
        iterations=1000,
    )
    # The helper returns four results: scores, score-vs-size records,
    # overfitting records and feature importances.
    scores, size_corr, overfit, feat_importance = evaluation

    result_scores[area_name] = pd.DataFrame(scores)
    area_result_matthew_size_corr[area_name] = size_corr
    area_result_matthew_overfit[area_name] = overfit
    area_result_feature_importance[area_name] = feat_importance

### 6.1 Result Analysis

In [None]:
for area_combination in all_area_combinations:
    area_name = area_combination['Target'][0]
    print('area:', area_name)
    display(result_scores[area_name])

In [None]:
# get the mattew for different area combinations
matthews = {}
for area_combination in all_area_combinations:
    area_name = area_combination['Target'][0]
    print('area:', area_name)
    #display(result_scores[area_name].loc[['matthews'],:])
    matthews[area_name] = result_scores[area_name].loc[['Matthews'],:]

In [None]:
df_matthews = pd.concat(matthews.values())

In [None]:
# get the average of matthews across different areas
df_matthews.mean(axis=0)

In [None]:
df_matthews.plot.box(figsize=(8, 8), ylim=(-0.3, 0.3), grid=True, yticks=(np.arange(-0.3, 0.3, step=0.1)),
                     title='Matthews for Minimum 25 streets in one cluster')

**As we see from the boxplot above, our clustering algorithms cannot really beat the baseline. As a next step, we investigate the feature importance and why the clustering approach cannot outperform the baseline.**

### 6.2 Analyze Why clustering method is worse than baseline

#### Is it because dataset is too small?

by correlating the matthew and train cluster size

In [None]:
pprint.pprint(area_result_matthew_size_corr)

For the best performing clustering algorithm, we plot the correlation between the training data size and the matthew score on the test cluster in scatter plot

In [None]:
# Average Matthews score per clustering approach across all areas.
mean_matthews = df_matthews.mean(axis=0)
dict_matthews = mean_matthews.to_dict()
# Keep every approach whose average equals the maximum (ties are all retained).
top_score = mean_matthews.max()
best_algo = [algo for algo, value in dict_matthews.items() if value == top_score]
#best_algo = ['kmeans_label_sim']
print(f"The algorithm which gives the best Matthew is {best_algo}")

In [None]:
# Gather, over all areas, the per-test-cluster records of the best algorithm.
best_algo_matthew_size_corr = [
    cluster_record
    for per_algo_results in area_result_matthew_size_corr.values()
    for algo_name, per_cluster_results in per_algo_results.items()
    if algo_name == best_algo[0]
    for cluster_record in per_cluster_results.values()
]

In [None]:
# One row per collected record; the scatter plot that follows reads the
# 'train_cluster_size' and 'matthew' columns of this frame.
df_best_algo_corr = pd.DataFrame(best_algo_matthew_size_corr)

In [None]:
# Scatter of train-cluster size (x) against the Matthews score (y) for the best algorithm.
scatter_title = ('Correlation between Matthew Score of Test Cluster and the Size of Its Train '
                 'Cluster For Best Algorithm')
df_best_algo_corr.plot.scatter(
    x='train_cluster_size',
    y='matthew',
    c='DarkBlue',
    figsize=(10, 8),
    title=scatter_title,
)

**As the plot above shows, there is a positive correlation between the number of data points in the training cluster and the Matthews score on its matched test cluster.**

In [None]:
# Re-display the selected algorithm name(s); best_algo[0] is what the next cells filter on.
best_algo

In [None]:
# Raw dump of the nested overfitting results collected in the training loop (keyed by area name).
pprint.pprint(area_result_matthew_overfit)

In [None]:
# Gather, over all areas, the per-cluster overfitting records of the best algorithm.
best_algo_matthew_overfit = []
for per_algo_results in area_result_matthew_overfit.values():
    for algo_name, per_label_results in per_algo_results.items():
        if algo_name != best_algo[0]:
            continue
        best_algo_matthew_overfit.extend(per_label_results.values())

In [None]:
# One row per collected record; the plots that follow select the '*_matthew*' columns by name.
df_best_algo_matthew_overfit = pd.DataFrame(best_algo_matthew_overfit)

In [None]:
# Inspect the full overfitting table before plotting it.
df_best_algo_matthew_overfit

In [None]:
# Shared y-range spanning every value in the table; `ylim` is reused by the following
# bar plots so that all of them are drawn on the same scale.
overfit_values = df_best_algo_matthew_overfit.to_numpy()
ylim = (overfit_values.min(), overfit_values.max())

# Overview: train, validation (20%) and test Matthews scores side by side, one bar group per row.
overview_columns = ['train_cluster_matthew', 'valid_cluster_matthew_20', 'test_cluster_matthew']
overview_bar_options = {
    'rot': 0,
    'figsize': (18, 5),
    'grid': True,
    'ylim': ylim,
    'title': 'Overfiting with in Training Data and Between Train and Test',
    'xlabel': 'Clusters',
    'ylabel': 'Matthew',
}
df_best_algo_matthew_overfit[overview_columns].plot.bar(**overview_bar_options)

In [None]:
# Column pairs for the two overfitting comparisons: inside the train cluster (80/20 split)
# and between the train cluster and its test cluster.
within_train_columns = ['train_cluster_matthew_80', 'valid_cluster_matthew_20']
train_test_columns = ['train_cluster_matthew', 'test_cluster_matthew']
within_train = df_best_algo_matthew_overfit[within_train_columns]
train_test = df_best_algo_matthew_overfit[train_test_columns]

#### How badly does the algorithm overfit within the training data?

We analyze the Matthews score of the model after splitting each training cluster into 80% train and 20% validation data.

This measures overfitting within the training data. Because the train and validation splits come from the same domain, an overfitting effect here would suggest that the algorithm works in principle but cannot demonstrate its ability because of overfitting.

In [None]:
# Grouped bars of the two within_train columns, one bar group per row, on the shared `ylim` scale.
within_train_bar_options = {
    'rot': 0,
    'figsize': (10, 5),
    'ylim': ylim,
    'title': 'Overfit within Train Clusters by 80-20 Split',
    'grid': True,
    'xlabel': 'Clusters',
    'ylabel': 'Matthew',
}
axes_within_train = within_train.plot.bar(**within_train_bar_options)

#### How badly does the algorithm overfit?

We analyze the Matthews score of the model on the training cluster and on the test cluster.

Here, for the best clustering algorithm, we plot and analyze the performance drop from the train to the test clusters, which includes both overfitting and the domain shift between train and test.

In [None]:
# Grouped bars of the two train_test columns, one bar group per row, on the shared `ylim` scale.
# Colours follow the column order of train_test: first column green, second column red.
train_test_bar_options = {
    'rot': 0,
    'figsize': (10, 5),
    'ylim': ylim,
    'title': 'Overfit Between Train and Test Clusters',
    'grid': True,
    'xlabel': 'Clusters',
    'ylabel': 'Matthew',
    'color': ['green', 'red'],
}
axes_train_test = train_test.plot.bar(**train_test_bar_options)

**As we can conclude from the above, there is very severe overfitting when the model is transferred from the train to the test clusters, possibly combined with domain shift. Within the train clusters (80-20 split), the overfitting trend still exists but is less strong. We therefore conclude that the clustering approach partially solves the problem of domain shift, while at the same time suffering from overfitting due to the very small dataset we have.**

**As our clustering approach does not seem to solve the problem of domain shift completely, we would like to investigate other approaches which could 1) overcome the problem of the small dataset and 2) better align the distributions when transferring the model. Therefore, we will investigate domain adaptation approaches in the other notebook.**

### 6.3 Feature Importance

For the best-performing clustering algorithm, we analyze the feature importance per cluster for the different target areas.

In [None]:
# Spot check of a single entry: area 'Greenlake', algorithm 'agg_clustering_label_gps', key -1.
# NOTE(review): these keys are hard-coded and independent of `best_algo`; presumably -1 is a
# cluster label - confirm, and expect a KeyError if any of the three keys is absent.
area_result_feature_importance['Greenlake']['agg_clustering_label_gps'][-1]

In [None]:
# Gather, over all areas, the per-cluster feature-importance tables of the best algorithm.
best_algo_feature_importance = [
    importance_table
    for per_algo_results in area_result_feature_importance.values()
    for algo_name, per_label_results in per_algo_results.items()
    if algo_name == best_algo[0]
    for importance_table in per_label_results.values()
]

In [None]:
# Spot check of one collected table.
# NOTE(review): index 8 is hard-coded and raises IndexError when fewer than nine tables
# were collected - confirm this is still the intended entry.
best_algo_feature_importance[8]

In [None]:
# Index every importance table by 'Feature Id'.
# New frames are built instead of calling set_index(..., inplace=True): the frames in this
# list are the very objects stored in area_result_feature_importance, so mutating them in
# place would silently change that dictionary as well. Tables that are already indexed by
# 'Feature Id' are kept as they are, which keeps this cell safe to re-run.
best_algo_feature_importance = [
    df if df.index.name == 'Feature Id' else df.set_index('Feature Id')
    for df in best_algo_feature_importance
]

In [None]:
# Place the per-cluster tables side by side (axis=1); pandas aligns them on their index.
# NOTE(review): the resulting column labels come from the individual tables and may repeat.
df_best_algo_feat_importance = pd.concat(best_algo_feature_importance, axis=1)

In [None]:
# Inspect the combined feature-importance table.
df_best_algo_feat_importance

In [None]:
# len() of a DataFrame is its number of rows; presumably one row per feature here,
# given the 'Feature Id' index set on the individual tables - verify.
print(f'We have used {len(df_best_algo_feat_importance)} features to train our model')

In [None]:
# Transpose so that the row labels become columns, then draw one box per column,
# summarising the importance values across all collected test clusters.
importance_by_feature = df_best_algo_feat_importance.T
importance_by_feature.plot.box(
    figsize=(20, 8),
    grid=True,
    title='Feature Importance by Test Cluster',
    rot=45,
)

**Here we have ignored the different areas and plotted the feature importance over all test clusters, regardless of area and test cluster label. We can see that hour, highway, and off-street capacity are the most important features, whereas the transportation_100 count has the least feature importance.**