# Feature Preparation for clustering trips

In [148]:
import numpy as np
import pandas as pd

In [149]:
# Read the per-trip table (one row per trip) from the shared data folder.
# Pickle files must only be loaded from trusted sources; this one is a project-local file.
trips_df = pd.read_pickle('../00_data/trips.pkl')
# Preview the first two rows.
trips_df.iloc[:2]

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,duration,start_latitude,start_longitude,end_latitude,end_longitude,distance,speed
2,2019-01-01 00:18:00,2019-01-01 00:50:00,3030,3075,5992,Walk-up,Main & 1st,Broadway & 9th,0 days 00:32:00,34.05194,-118.24353,34.04211,-118.25619,1.498844,2.810332
3,2019-01-01 00:20:00,2019-01-01 00:50:00,3030,3075,5860,Walk-up,Main & 1st,Broadway & 9th,0 days 00:30:00,34.05194,-118.24353,34.04211,-118.25619,1.498844,2.997688


In [150]:
# Read the hourly table (trip counts, bike availability, weather and calendar
# columns); its index holds one timestamp per hour.
# Pickle files must only be loaded from trusted sources; this one is a project-local file.
trips_hourly_df = pd.read_pickle('../00_data/trips_hourly_selected.pkl')
# Preview the first two rows.
trips_hourly_df.iloc[:2]

Unnamed: 0_level_0,starting_trips,ongoing_trips_prev,in_service_bikes_rolling,available_bikes,max_temp,min_temp,precip,hour,day_of_week,month,day_of_year,week,is_weekday,is_holiday
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-01-01 01:00:00,8.0,2.0,873.0,871.0,15.6,15.6,0.0,1,1,1,1,1,True,True
2019-01-01 02:00:00,11.0,4.0,873.0,869.0,15.0,15.0,0.0,2,1,1,1,1,True,True


In [151]:
# TODO: the data above comes from the feature selection in task 4 -- move it to task 1?

Let's merge the hourly data into our trip dataframe. To do that, we need a column whose values match the index of the hourly data, so we floor each trip's start time to the full hour.

In [152]:
# Remember the row count so the rows lost by the merge can be reported afterwards.
len_before_merge = trips_df.shape[0]

# Join key: the trip's start time rounded down to the full hour, which is the
# granularity of the hourly table's index.
trips_df["start_time_floored"] = trips_df['start_time'].dt.floor('1H')

# Inner join (the pandas default, spelled out here): trips whose floored start
# hour has no row in the hourly table are dropped.
trips_df = pd.merge(
    trips_df,
    trips_hourly_df,
    how='inner',
    left_on='start_time_floored',
    right_index=True,
)
trips_df.iloc[:2]

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,duration,start_latitude,...,max_temp,min_temp,precip,hour,day_of_week,month,day_of_year,week,is_weekday,is_holiday
19,2019-01-01 01:29:00,2019-01-01 02:49:00,4211,4211,12244,Walk-up,Pacific & North Venice,Pacific & North Venice,0 days 01:20:00,33.98493,...,15.6,15.6,0.0,1,1,1,1,1,True,True
20,2019-01-01 01:31:00,2019-01-01 01:35:00,3027,3046,5819,Walk-up,Spring & 3rd,2nd & Hill,0 days 00:04:00,34.04998,...,15.6,15.6,0.0,1,1,1,1,1,True,True


Now we can compute how many rows we lost:

In [153]:
# Number of trips that found no matching hourly row and were dropped by the merge.
print(len_before_merge - trips_df.shape[0])

14


Those 14 trips started between 00:00 and 01:00 on January 1st, 2019. No hourly data exists for this hour, so these trips were dropped by the merge.
Now we will edit some columns to our needs:
- set the duration in minutes
- set an end hour
- change boolean values to 1 and 0

In [None]:
# TODO: add the missing first-hour data manually?

In [158]:
# Convert the trip duration from a timedelta to minutes.
# `.dt.total_seconds()` is used instead of `.dt.seconds`: the latter returns only
# the seconds *component* (0-86399) and silently discards whole days, so a trip
# of exactly 24 hours would be stored as 0 minutes.
# The dtype guard keeps this cell re-runnable: once `duration` is numeric it has
# already been converted and is left alone (`.dt` would raise on a float column).
if pd.api.types.is_timedelta64_dtype(trips_df['duration']):
    trips_df['duration'] = trips_df['duration'].dt.total_seconds() / 60

# Hour of day in which the trip ended.
trips_df["hour_end"] = trips_df['end_time'].dt.hour

# Encode the boolean calendar flags as 0/1 integers. A vectorised cast replaces
# the per-element lambda; the explicit 'int64' keeps the dtype platform independent.
trips_df["is_weekday"] = trips_df["is_weekday"].astype('int64')
trips_df["is_holiday"] = trips_df["is_holiday"].astype('int64')

# NOTE(review): the saved output of this cell lists a `duration_min` column that
# no visible code creates -- presumably left over from an earlier version of this
# cell. It will be absent after Restart & Run All; confirm nothing downstream needs it.
trips_df.head(2)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,duration,start_latitude,...,precip,hour,day_of_week,month,day_of_year,week,is_weekday,is_holiday,duration_min,hour_end
19,2019-01-01 01:29:00,2019-01-01 02:49:00,4211,4211,12244,Walk-up,Pacific & North Venice,Pacific & North Venice,80.0,33.98493,...,0.0,1,1,1,1,1,1,1,80.0,2
20,2019-01-01 01:31:00,2019-01-01 01:35:00,3027,3046,5819,Walk-up,Spring & 3rd,2nd & Hill,4.0,34.04998,...,0.0,1,1,1,1,1,1,1,4.0,1


Lastly let us take a look at the columns and which of them we will need. We drop the others.

In [159]:
# Print column names, non-null counts and dtypes to decide which columns to keep.
trips_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 225627 entries, 19 to 290340
Data columns (total 32 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   start_time                225627 non-null  datetime64[ns]
 1   end_time                  225627 non-null  datetime64[ns]
 2   start_station_id          225627 non-null  int64         
 3   end_station_id            225627 non-null  int64         
 4   bike_id                   225627 non-null  object        
 5   user_type                 225627 non-null  object        
 6   start_station_name        225627 non-null  object        
 7   end_station_name          225627 non-null  object        
 8   duration                  225627 non-null  float64       
 9   start_latitude            225627 non-null  float64       
 10  start_longitude           225627 non-null  float64       
 11  end_latitude              225627 non-null  float64       
 12  e

In [160]:
# Drop every column that is not kept as a feature, then give the two remaining
# hourly columns clearer names. The drop list is grouped by theme; its order
# has no effect on the resulting frame.
trips_df = (
    trips_df
    .drop(columns=[
        # raw and helper timestamps
        'start_time', 'end_time', 'start_time_floored',
        # station identifiers, names and coordinates
        'start_station_id', 'end_station_id',
        'start_station_name', 'end_station_name',
        'start_latitude', 'start_longitude',
        'end_latitude', 'end_longitude',
        # bike and user identifiers
        'bike_id', 'user_type',
        # hourly fleet and demand columns
        'starting_trips', 'ongoing_trips_prev',
        'in_service_bikes_rolling', 'available_bikes',
        # remaining weather and calendar columns that are not kept
        'min_temp', 'week', 'day_of_year',
    ])
    .rename(columns={"hour": "hour_start", "max_temp": "temp"})
)

In [161]:
# Print the schema again to check the remaining columns and their dtypes.
trips_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 225627 entries, 19 to 290340
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   duration      225627 non-null  float64
 1   distance      225627 non-null  float64
 2   speed         225627 non-null  float64
 3   temp          225627 non-null  float64
 4   precip        225627 non-null  float64
 5   hour_start    225627 non-null  int64  
 6   day_of_week   225627 non-null  int64  
 7   month         225627 non-null  int64  
 8   is_weekday    225627 non-null  int64  
 9   is_holiday    225627 non-null  int64  
 10  duration_min  225627 non-null  float64
 11  hour_end      225627 non-null  int64  
dtypes: float64(6), int64(6)
memory usage: 22.4 MB


In [162]:
# Write the prepared feature table to disk as a pickle so it can be reloaded later.
trips_df.to_pickle('../00_data/trips_with_hourly_features.pkl')