# Feature Preparation for clustering trips

In [1]:
import pandas as pd

In [2]:
# Load the per-trip records and preview the first two rows.
trips_path = '../00_data/trips.pkl'
trips_df = pd.read_pickle(trips_path)
trips_df.head(n=2)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,duration,start_latitude,...,sustenance_poi_start,public_transport_poi_start,education_poi_start,arts_and_culture_poi_start,sports_poi_start,sustenance_poi_end,public_transport_poi_end,education_poi_end,arts_and_culture_poi_end,sports_poi_end
0,2019-01-01 00:18:00,2019-01-01 00:50:00,3030,3075,5992,Walk-up,Main & 1st,Broadway & 9th,0 days 00:32:00,34.05194,...,80,28,2,4,0,67,51,0,4,4
1,2019-01-01 00:20:00,2019-01-01 00:50:00,3030,3075,5860,Walk-up,Main & 1st,Broadway & 9th,0 days 00:30:00,34.05194,...,80,28,2,4,0,67,51,0,4,4


In [3]:
# Load the hourly aggregates (indexed by hour) and preview the first two rows.
trips_hourly_path = '../00_data/trips_hourly_selected.pkl'
trips_hourly_df = pd.read_pickle(trips_hourly_path)
trips_hourly_df.head(n=2)

Unnamed: 0_level_0,starting_trips,ongoing_trips_prev,in_service_bikes_rolling,available_bikes,max_temp,min_temp,precip,hour,day_of_week,month,day_of_year,week,is_weekday,is_holiday
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-01-01 01:00:00,8.0,2.0,873.0,871.0,15.6,15.6,0.0,1,1,1,1,1,True,True
2019-01-01 02:00:00,11.0,4.0,873.0,869.0,15.0,15.0,0.0,2,1,1,1,1,True,True


Let's merge the hourly data into our trip dataframe. To do that, we need a column whose values match the index of the hourly data, so we floor each trip's start time to the full hour.

In [4]:
# Bucket every trip into the hour it started in; these values are the join key
# against the index of the hourly frame.
floored_start = trips_df['start_time'].dt.floor('H')
trips_df["start_time_floored"] = floored_start
trips_df = trips_df.sort_values(by="start_time")

# Remember the row count so the number of trips lost in the join can be reported.
len_before_merge = trips_df.shape[0]

# No `how` is given, so pandas uses its default inner join: trips without a
# matching hourly record are dropped.
trips_df = pd.merge(
    trips_df,
    trips_hourly_df,
    left_on='start_time_floored',
    right_index=True,
)
trips_df.head(2)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,duration,start_latitude,...,max_temp,min_temp,precip,hour,day_of_week,month,day_of_year,week,is_weekday,is_holiday
210759,2019-01-01 01:29:00,2019-01-01 02:49:00,4211,4211,12244,Walk-up,Pacific & North Venice,Pacific & North Venice,0 days 01:20:00,33.98493,...,15.6,15.6,0.0,1,1,1,1,1,True,True
55231,2019-01-01 01:31:00,2019-01-01 01:35:00,3027,3046,5819,Walk-up,Spring & 3rd,2nd & Hill,0 days 00:04:00,34.04998,...,15.6,15.6,0.0,1,1,1,1,1,True,True


Now we can compute how many rows we lost:

In [5]:
# Number of trips that had no matching hourly record and were lost in the merge.
dropped_trip_count = len_before_merge - trips_df.shape[0]
print(dropped_trip_count)

14


These 14 trips started between 0:00 and 1:00 on the first of January 2019. No hourly
data exists for this hour, so these trips were dropped by the merge.
Now we will adapt some columns to our needs:
- set the duration in minutes
- set an end hour
- change boolean values to 1 and 0
- change user types to numbers
- simplify the points of interest by only indicating whether there are POIs of this category at the station or not


In [6]:
# Trip duration in minutes.
# `.dt.seconds` returns only the seconds *component* of a timedelta (0..86399)
# and silently discards whole days, so a rental of "1 days 00:10:00" would be
# stored as 10 minutes. `.dt.total_seconds()` keeps the full span.
trips_df['duration'] = trips_df['duration'].dt.total_seconds() / 60

# Hour of day in which the trip ended.
trips_df["hour_end"] = trips_df['end_time'].dt.hour

# Encode the boolean calendar flags as 0/1 integers.
trips_df["is_weekday"] = trips_df["is_weekday"].astype("int64")
trips_df["is_holiday"] = trips_df["is_holiday"].astype("int64")

# Encode the user type as its position in this list; non-string values map to
# None. An unknown string raises ValueError so that new categories are noticed
# instead of being silently mis-encoded.
types = ['Walk-up', 'Monthly Pass', 'Annual Pass', 'One Day Pass', 'Flex Pass', 'Testing']
trips_df["user_type"] = trips_df["user_type"].apply(
    lambda x: types.index(x) if isinstance(x, str) else None
)

# Reduce the POI counts to presence indicators: 1 if the station has at least
# one POI of that category, otherwise 0.
for col in ["sustenance_poi_start", "public_transport_poi_start",
            "education_poi_start", "arts_and_culture_poi_start",
            "sports_poi_start", "sustenance_poi_end",
            "public_transport_poi_end", "education_poi_end",
            "arts_and_culture_poi_end", "sports_poi_end",]:
    trips_df[col] = (trips_df[col] > 0).astype("int64")

trips_df.head(2)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,duration,start_latitude,...,min_temp,precip,hour,day_of_week,month,day_of_year,week,is_weekday,is_holiday,hour_end
210759,2019-01-01 01:29:00,2019-01-01 02:49:00,4211,4211,12244,0,Pacific & North Venice,Pacific & North Venice,80.0,33.98493,...,15.6,0.0,1,1,1,1,1,1,1,2
55231,2019-01-01 01:31:00,2019-01-01 01:35:00,3027,3046,5819,0,Spring & 3rd,2nd & Hill,4.0,34.04998,...,15.6,0.0,1,1,1,1,1,1,1,1


Lastly, let us take a look at the columns and decide which of them we need. We drop the others.

In [7]:
# List all columns with their non-null counts and dtypes to decide which ones to keep.
trips_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 225280 entries, 210759 to 130228
Data columns (total 43 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   start_time                  225280 non-null  datetime64[ns]
 1   end_time                    225280 non-null  datetime64[ns]
 2   start_station_id            225280 non-null  int64         
 3   end_station_id              225280 non-null  int64         
 4   bike_id                     225280 non-null  object        
 5   user_type                   225280 non-null  int64         
 6   start_station_name          225280 non-null  object        
 7   end_station_name            225280 non-null  object        
 8   duration                    225280 non-null  float64       
 9   start_latitude              225280 non-null  float64       
 10  start_longitude             225280 non-null  float64       
 11  end_latitude                225280

In [8]:
# Columns that are not used as features for clustering individual trips,
# grouped by the reason for leaving them out.
columns_to_drop = [
    # exact timestamps: the start and end hour columns are accurate enough
    'start_time', 'end_time',
    # station ids and coordinates: relevant for station clustering instead
    'start_station_id', 'end_station_id',
    # station names identify the same thing as the ids, so no information gain
    'start_station_name', 'end_station_name',
    'start_latitude', 'start_longitude',
    'end_latitude', 'end_longitude',
    # join key is covered by the start hour; week and day_of_year correlate with month
    'start_time_floored', 'week', 'day_of_year',
    # system-wide hourly figures: not important for a single trip
    'starting_trips', 'ongoing_trips_prev',
    'in_service_bikes_rolling', 'available_bikes',
    # min_temp correlates with max_temp; the bike id is too specific
    'min_temp', 'bike_id',
    # hex cells are not used for clustering
    'start_hex', 'end_hex',
]
trips_df = (
    trips_df
    .drop(columns=columns_to_drop)
    .rename(columns={"hour": "hour_start", "max_temp": "temp"})
)

In [9]:
# Check which columns remain after dropping and renaming.
trips_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 225280 entries, 210759 to 130228
Data columns (total 22 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   user_type                   225280 non-null  int64  
 1   duration                    225280 non-null  float64
 2   distance                    225280 non-null  float64
 3   speed                       225280 non-null  float64
 4   sustenance_poi_start        225280 non-null  int64  
 5   public_transport_poi_start  225280 non-null  int64  
 6   education_poi_start         225280 non-null  int64  
 7   arts_and_culture_poi_start  225280 non-null  int64  
 8   sports_poi_start            225280 non-null  int64  
 9   sustenance_poi_end          225280 non-null  int64  
 10  public_transport_poi_end    225280 non-null  int64  
 11  education_poi_end           225280 non-null  int64  
 12  arts_and_culture_poi_end    225280 non-null  int64  
 13  sports_po

In [10]:
# Persist the prepared trip features for the trip clustering step.
output_path = '../00_data/trips_with_hourly_features.pkl'
trips_df.to_pickle(output_path)