In [200]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd

# Workshop 5
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import seaborn as sns
sns.set()
sns.set_style("white")
sns.set_palette("GnBu_d")

# Also import these "new" libraries.
# Note: you may have to download and add them to your environment (e.g. 'conda install -c conda-forge folium').

import folium
from folium import plugins
from folium.plugins import HeatMap
from datetime import datetime #for working with times objects
from datetime import timedelta #for working with times objects
import math
import random
#import timeit

In [201]:
# Read the two CSV inputs (paths are relative to the working directory):
# the 2019 trip records and the weather table.
data, weatherData = (
    pd.read_csv(csv_name)
    for csv_name in ("philadelphia_2019.csv", "weather_hourly_philadelphia.csv")
)

# 1.1 Quick overview

In [202]:
# Preview the first rows of the raw trip records.
data.head()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name
0,2019-01-01 00:19:00,2019-01-01 00:27:00,3049,3007,14495,Indego30,Foglietta Plaza,"11th & Pine, Kahn Park"
1,2019-01-01 00:30:00,2019-01-01 00:37:00,3005,3007,5332,Day Pass,"Welcome Park, NPS","11th & Pine, Kahn Park"
2,2019-01-01 00:52:00,2019-01-01 01:05:00,3166,3169,14623,Indego30,Frankford & Belgrade,2nd & Race
3,2019-01-01 00:55:00,2019-01-01 01:04:00,3058,3103,11706,Indego30,20th & Fairmount,"27th & Master, Athletic Recreation Center"
4,2019-01-01 01:05:00,2019-01-01 01:17:00,3182,3028,11039,Indego30,17th & Sansom,4th & Bainbridge


In [203]:
# Column dtypes and non-null counts of the raw trip records.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744260 entries, 0 to 744259
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   start_time          744260 non-null  object
 1   end_time            744260 non-null  object
 2   start_station_id    744260 non-null  int64 
 3   end_station_id      744260 non-null  int64 
 4   bike_id             744260 non-null  int64 
 5   user_type           744225 non-null  object
 6   start_station_name  744260 non-null  object
 7   end_station_name    744260 non-null  object
dtypes: int64(3), object(5)
memory usage: 45.4+ MB


In [204]:
# Distinct values of the user_type column (a missing value shows up as nan).
user_types = pd.unique(data['user_type'])
user_types

array(['Indego30', 'Day Pass', 'Indego365', 'IndegoFlex', 'Walk-up', nan],
      dtype=object)

In [205]:
# Number of trips per user type (value_counts leaves out missing values by default).
data['user_type'].value_counts()
        

Indego30      505872
Indego365     147580
Day Pass       89137
IndegoFlex      1410
Walk-up          226
Name: user_type, dtype: int64

# 1.2 Data Preparation

## 1.2.1 Dealing with missing values

In [206]:
# Count the missing values in each column.
data.isna().sum()

start_time             0
end_time               0
start_station_id       0
end_station_id         0
bike_id                0
user_type             35
start_station_name     0
end_station_name       0
dtype: int64

Because only 35 of the 744,260 rows contain missing values (all of them in `user_type`), we can simply drop these rows.

In [207]:
# Keep only the rows that have no missing value in any column.
# dropna returns a new frame, so `data` itself is left untouched.
data_clean = data.dropna()
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 744225 entries, 0 to 744259
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   start_time          744225 non-null  object
 1   end_time            744225 non-null  object
 2   start_station_id    744225 non-null  int64 
 3   end_station_id      744225 non-null  int64 
 4   bike_id             744225 non-null  int64 
 5   user_type           744225 non-null  object
 6   start_station_name  744225 non-null  object
 7   end_station_name    744225 non-null  object
dtypes: int64(3), object(5)
memory usage: 51.1+ MB


In [208]:
# Build the working frame: parse start_time into real timestamps, order each
# bike's trips chronologically, and use start_time as the index.
#
# * Derived from data_clean so the rows with missing values that were dropped
#   above actually stay dropped (previously the uncleaned `data` was used).
# * Parse BEFORE sorting: start_time is still an object (string) column here,
#   and a string sort is not chronological when timestamp formats differ.
#   end_time visibly mixes "9/27/2019 13:04" and "2019-10-26 14:10:23" styles,
#   so start_time may well do the same.
# * No in-place mutation, so the cell can be re-run without a KeyError on the
#   already-removed start_time column.
data = (
    data_clean
    .assign(start_time=lambda trips: pd.DatetimeIndex(trips["start_time"]))
    .sort_values(["bike_id", "start_time"])
    .set_index("start_time")  # moves the column into the index (column is removed)
)

In [209]:
# Preview after sorting; start_time is now the index rather than a column.
data.head()

Unnamed: 0_level_0,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name
start_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-09-27 12:48:00,9/27/2019 13:04,3167,3078,1,Indego365,12th & Chestnut,19th & Market
2019-09-27 13:05:00,9/27/2019 14:46,3078,3000,1,Indego365,19th & Market,Virtual Station
2019-10-26 13:10:23,2019-10-26 14:10:23,3162,3000,42,Day Pass,27th & South,Virtual Station
2019-12-11 14:19:12,2019-12-11 14:52:43,3042,3042,44,Indego365,3042,3042
2019-12-11 15:09:05,2019-12-11 16:25:57,3042,3000,44,Indego365,3042,Virtual Station


In [210]:
# Helper functions that extract the date, the weekday and the hour from a timestamp

def get_date(ts):
    """Return the calendar-date part of ``ts``, i.e. the result of ``ts.date()``."""
    calendar_day = ts.date()
    return calendar_day

def get_weekday(ts):
    """Return ``ts.weekday()``: the day of the week as an integer, Monday=0 ... Sunday=6."""
    day_number = ts.weekday()
    return day_number

def get_hour(ts):
    """Return the hour-of-day component of ``ts`` (its ``hour`` attribute)."""
    hour_of_day = ts.hour
    return hour_of_day

In [211]:
# Derive calendar features (date, weekday number, hour) from the trip start time.
#
# start_time is the index at this point; move it back to a regular column,
# which also gives the frame a fresh 0..n-1 row index. The guard keeps the cell
# re-runnable: an unconditional reset_index would add a stray "index" column
# on a second run.
if data.index.name == "start_time":
    data = data.reset_index()

# Vectorised .dt accessors replace the per-row Python calls made through .apply.
data["Date"] = data["start_time"].dt.date
data["Weekday"] = data["start_time"].dt.weekday  # Monday=0 ... Sunday=6
data["Hour"] = data["start_time"].dt.hour

In [212]:
# Preview with the new Date, Weekday and Hour columns.
data.head()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,Date,Weekday,Hour
0,2019-09-27 12:48:00,9/27/2019 13:04,3167,3078,1,Indego365,12th & Chestnut,19th & Market,2019-09-27,4,12
1,2019-09-27 13:05:00,9/27/2019 14:46,3078,3000,1,Indego365,19th & Market,Virtual Station,2019-09-27,4,13
2,2019-10-26 13:10:23,2019-10-26 14:10:23,3162,3000,42,Day Pass,27th & South,Virtual Station,2019-10-26,5,13
3,2019-12-11 14:19:12,2019-12-11 14:52:43,3042,3042,44,Indego365,3042,3042,2019-12-11,2,14
4,2019-12-11 15:09:05,2019-12-11 16:25:57,3042,3000,44,Indego365,3042,Virtual Station,2019-12-11,2,15


In [213]:
# Add a column with the abbreviated weekday name next to the numeric Weekday column.
weekday_dict = {0:"Mon", 1:"Tue", 2:"Wed", 3:"Thu", 4:"Fri", 5:"Sat", 6:"Sun"}
# Series.map does the dictionary lookup directly; a value without an entry
# would become NaN instead of raising a KeyError.
data["Weekday_verbose"] = data["Weekday"].map(weekday_dict)

In [214]:
# Preview with the added Weekday_verbose column.
data.head()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,Date,Weekday,Hour,Weekday_verbose
0,2019-09-27 12:48:00,9/27/2019 13:04,3167,3078,1,Indego365,12th & Chestnut,19th & Market,2019-09-27,4,12,Fri
1,2019-09-27 13:05:00,9/27/2019 14:46,3078,3000,1,Indego365,19th & Market,Virtual Station,2019-09-27,4,13,Fri
2,2019-10-26 13:10:23,2019-10-26 14:10:23,3162,3000,42,Day Pass,27th & South,Virtual Station,2019-10-26,5,13,Sat
3,2019-12-11 14:19:12,2019-12-11 14:52:43,3042,3042,44,Indego365,3042,3042,2019-12-11,2,14,Wed
4,2019-12-11 15:09:05,2019-12-11 16:25:57,3042,3000,44,Indego365,3042,Virtual Station,2019-12-11,2,15,Wed


# Fleet Size and Availability

In [215]:
# Fleet size = number of distinct bike ids that appear in the trip records.
fleet_size = data["bike_id"].unique().size

print(
    "The bike fleet from Indego consists of {} bikes (for the observed time frame)".format(
        fleet_size
    )
)

The bike fleet consists of 1677 bikes (for the observed time frame)
