# Geolocation features and Cluster model
Once we already know how the stations behave each hour, on weekdays, we will add geolocation features to our model. Do they behave the same depending on how high in the city the stations are located? Is there any relationship between how close to the city centre they are? We will find this out in this notebook.

In [1]:
#Importing required libraries
import pandas as pd
import geopy.distance
import geocoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [2]:
#importing datetime_features file generated from 1_datetime
oct_features = pd.read_csv("..\\Dataset\\datetime_features.csv",encoding="utf_8",index_col='station_id')
#importing dataframe extracted from web scrapping Bicing website, thanks to Laurent Guerguy https://github.com/laurent-guerguy
coordinates_df = pd.read_csv("..\\Dataset\\bicing_ws_laurent.csv",encoding="utf_8")

In [3]:
#We only need 'station_id', latitude and longitude columns
coordinates_df = coordinates_df[['1','4','5']]

In [4]:
#We double check column '1' refers to station_id
coordinates_df['1'].unique()
print(f"The dataframe has the stations from its id number {coordinates_df['1'].min()} to the {coordinates_df['1'].max()}")

The dataframe has the stations from its id number 1 to the 496


In [5]:
coordinates_df = coordinates_df.rename(columns={"1":'station_id',"4":"lat","5":"long"})

In [6]:
coordinates_df = coordinates_df.groupby('station_id').mean()

In [7]:
oct_features_gps = pd.merge(oct_features, coordinates_df, on="station_id")

In [8]:
#Checking if we are missing something
oct_features_gps.isnull().sum()

00:00       0
01:00       0
02:00       0
03:00       0
04:00       0
05:00       0
06:00       0
07:00       0
08:00       0
09:00       0
10:00       0
11:00       0
12:00       0
13:00       0
14:00       0
15:00       0
16:00       0
17:00       0
18:00       0
19:00       0
20:00       0
21:00       0
22:00       0
23:00       0
capacity    0
lat         0
long        0
dtype: int64

### Calculating the distance of each station to the city centre
We will use Plaça Catalunya, Lat: 41.3870154, Long: 2.1700471 as the centre of the city and will caluclate the distance, in kilometres, of the station to the centre.

In [9]:
PlCat = (41.3870154, 2.170047)

In [10]:
#Iterating between stations to find the distance from Plaça Ctalunya
stations = range(oct_features_gps.shape[0])
distances = []
for s in stations:
    location = (oct_features_gps.iloc[s]['lat'],oct_features_gps.iloc[s]['long'])
    distances.append(round(geopy.distance.vincenty(PlCat, location).km,2))

  


In [11]:
#Adding distances as feature
oct_features_gps['DisttoCentre'] = distances

In [12]:
oct_features_gps.head()

Unnamed: 0_level_0,00:00,01:00,02:00,03:00,04:00,05:00,06:00,07:00,08:00,09:00,...,18:00,19:00,20:00,21:00,22:00,23:00,capacity,lat,long,DisttoCentre
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,23.45,23.83,23.71,23.85,23.66,20.89,11.78,4.01,5.03,6.0,...,11.9,12.7,15.79,17.61,20.16,21.96,30,41.397952,2.180042,1.47
2,13.52,14.1,13.8,13.48,13.4,11.88,7.34,3.51,3.19,4.34,...,6.91,7.28,8.78,10.33,12.42,12.66,27,41.39553,2.17706,1.11
3,16.59,16.69,16.76,16.66,16.36,16.76,14.39,7.89,4.04,3.5,...,19.57,18.42,16.54,16.12,16.38,17.25,27,41.394055,2.181299,1.22
4,9.78,9.74,9.94,10.39,11.53,14.07,10.06,4.9,2.04,2.82,...,15.48,13.53,11.53,9.66,9.61,10.53,21,41.39348,2.181555,1.2
5,24.42,24.39,24.24,23.87,24.26,23.06,18.66,9.33,8.24,10.15,...,30.29,27.05,27.04,27.42,26.78,24.82,39,41.391075,2.180223,0.96


# Building the model

In [13]:
# Principal Component 
pca = PCA(n_components=4)

principalComponents = pca.fit_transform(oct_features_gps)
principalDf = pd.DataFrame(data = principalComponents
             ,columns = ['pc1', 'pc2', 'pc3', 'pc4'])
principalDf.head()    

Unnamed: 0,pc1,pc2,pc3,pc4
0,24.856437,-23.340989,12.909567,2.574829
1,-0.51341,-13.038903,0.526834,-2.015073
2,23.840721,-15.236268,-3.282567,-8.167943
3,3.339493,-8.876991,-8.139147,-12.236098
4,53.914847,-20.771258,2.61109,2.403146


In [14]:
kmeans = KMeans(n_clusters=5)
october_clusters = kmeans.fit(principalDf)
october_clusters.cluster_centers_

array([[-33.19580173,  -3.6682482 ,  -3.53521616,  -0.22970522],
       [ 37.79893832, -20.9168805 ,   1.88550838,  -1.51885047],
       [ -4.36316007,  33.7547215 ,   4.93940611,   0.08890129],
       [ 31.73961206,  14.36678515,  -6.58119716,   0.83837377],
       [ -6.61762958, -15.59084468,   4.17572449,   0.80335372]])

In [15]:
oct_features_gps['labels'] = october_clusters.fit_predict(principalDf)
oct_features_gps.head()

Unnamed: 0_level_0,00:00,01:00,02:00,03:00,04:00,05:00,06:00,07:00,08:00,09:00,...,19:00,20:00,21:00,22:00,23:00,capacity,lat,long,DisttoCentre,labels
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,23.45,23.83,23.71,23.85,23.66,20.89,11.78,4.01,5.03,6.0,...,12.7,15.79,17.61,20.16,21.96,30,41.397952,2.180042,1.47,0
2,13.52,14.1,13.8,13.48,13.4,11.88,7.34,3.51,3.19,4.34,...,7.28,8.78,10.33,12.42,12.66,27,41.39553,2.17706,1.11,4
3,16.59,16.69,16.76,16.66,16.36,16.76,14.39,7.89,4.04,3.5,...,18.42,16.54,16.12,16.38,17.25,27,41.394055,2.181299,1.22,0
4,9.78,9.74,9.94,10.39,11.53,14.07,10.06,4.9,2.04,2.82,...,13.53,11.53,9.66,9.61,10.53,21,41.39348,2.181555,1.2,4
5,24.42,24.39,24.24,23.87,24.26,23.06,18.66,9.33,8.24,10.15,...,27.05,27.04,27.42,26.78,24.82,39,41.391075,2.180223,0.96,0


In [16]:
oct_features_gps.labels.value_counts()

1    117
4     88
2     69
0     69
3     67
Name: labels, dtype: int64

In [17]:
oct_features_gps.to_csv("..\\Dataset\\clusters.csv",encoding="utf_8",decimal=',', sep=';', index=True)