In [1]:
from utils.path import get_git_root
from utils.load_data import LoadData

# Raw data lives in <root>/data/raw, where <root> is whatever get_git_root()
# returns (presumably the repository root -- confirm in utils.path).
data_path = get_git_root() / "data" / "raw"
# Hand the directory to the project loader and load only the 2020 ridership
# dataset; the loader prints its own progress messages (see output below).
data = LoadData(data_path)
data.load_datasets(select=['bikeshare-ridership-2020'])

Dataset 'bikeshare-ridership-2020' loaded.
All datasets loaded


In [2]:
# Keep the first item returned by unpack(); only one dataset was selected when
# loading, so this is presumably the 'bikeshare-ridership-2020' dataset object.
dat2020 = data.unpack()[0]

In [3]:
from utils.preprocessing import normalize_column_names
import pandas as pd

# Alternative header spellings (keys) and the common header each one is
# renamed to (values). The values are normalised afterwards, so their exact
# spacing/capitalisation only has to be something normalize_column_names
# accepts (the printed result below is snake_case).
column_mapping = {
    'trip_id': 'Trip Id',
    'trip_duration_seconds': 'Trip  Duration',
    'from_station_id': 'Start Station Id',
    'trip_start_time': 'Start Time',
    'from_station_name': 'Start Station Name',
    'to_station_id': 'End Station Id',
    'trip_stop_time': 'End Time',
    'to_station_name': 'End Station Name',
    'bike_id': 'Bike Id',
    'user_type': 'User Type',
    'model': 'Model',
}

# Canonical column order: the normalised target names, in mapping order.
column_order = normalize_column_names(pd.Index([*column_mapping.values()]))

# Give every file the same header: translate known alternative names (labels
# not in the mapping are kept as they are), then normalise the whole header.
# Assigning to df.columns updates each file's frame in place.
for df in dat2020.get_data_files():
    df.columns = normalize_column_names(
        df.columns.map(lambda label: column_mapping.get(label, label))
    )
print(dat2020.get_unique_cols())

{'start_station_name', 'trip_duration', 'end_station_name', 'start_time', 'trip_id', 'end_station_id', 'user_type', 'end_time', 'bike_id', 'start_station_id'}


In [4]:
# Per-file summary: for each column in `column_order`, a missing-value count
# and a companion "<column>_dtype" entry (see the table rendered below).
missing_vals = dat2020.get_na_and_dtypes(column_order)
missing_vals

Unnamed: 0,name,trip_id,trip_id_dtype,trip_duration,trip_duration_dtype,start_station_id,start_station_id_dtype,start_time,start_time_dtype,start_station_name,...,end_time,end_time_dtype,end_station_name,end_station_name_dtype,bike_id,bike_id_dtype,user_type,user_type_dtype,model,model_dtype
0,2020-01.csv,0,int64,0,int64,0,int64,0,object,0,...,0,object,88,object,0,int64,0,object,,
1,2020-02.csv,0,int64,0,int64,0,int64,0,object,0,...,0,object,54,object,0,int64,0,object,,
2,2020-03.csv,0,int64,0,int64,0,int64,0,object,0,...,0,object,67,object,0,int64,0,object,,
3,2020-04.csv,0,int64,0,int64,0,int64,0,object,0,...,0,object,44,object,0,int64,0,object,,
4,2020-05.csv,0,int64,0,int64,0,int64,0,object,0,...,0,object,164,object,0,int64,0,object,,
5,2020-06.csv,0,int64,0,int64,0,int64,0,object,0,...,0,object,75,object,0,int64,0,object,,
6,2020-07.csv,0,int64,0,int64,0,int64,0,object,0,...,0,object,84,object,0,int64,0,object,,
7,2020-08.csv,0,int64,0,int64,0,int64,0,object,0,...,0,object,115,object,0,int64,0,object,,
8,2020-09.csv,0,int64,0,int64,0,int64,0,object,0,...,0,object,61,object,0,int64,0,object,,
9,2020-10.csv,0,int64,0,int64,0,object,0,object,164,...,0,object,202,object,0,object,249,object,,


In [5]:
from utils.preprocessing import get_dtype_cols, count_col_dtypes

# Tally, per column, how many files use each dtype. The output below lists only
# start_station_id, end_station_id and bike_id -- presumably just the columns
# whose dtype is not the same in every file (confirm in utils.preprocessing).
count_col_dtypes(missing_vals)

start_station_id_dtype
int64     11
object     1
Name: count, dtype: int64

end_station_id_dtype
float64    11
object      1
Name: count, dtype: int64

bike_id_dtype
int64      10
object      1
float64     1
Name: count, dtype: int64



In [6]:
# Names of the dtype-describing columns in the summary, kept in `dtype_cols`
# because later cells reuse it to hide those columns again.
dtype_cols = get_dtype_cols(missing_vals)
# drop() returns a new frame for display; `missing_vals` itself is unchanged.
missing_vals.drop(columns=dtype_cols)

Unnamed: 0,name,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type,model
0,2020-01.csv,0,0,0,0,0,88,0,88,0,0,
1,2020-02.csv,0,0,0,0,0,54,0,54,0,0,
2,2020-03.csv,0,0,0,0,0,67,0,67,0,0,
3,2020-04.csv,0,0,0,0,0,44,0,44,0,0,
4,2020-05.csv,0,0,0,0,0,164,0,164,0,0,
5,2020-06.csv,0,0,0,0,0,75,0,75,0,0,
6,2020-07.csv,0,0,0,0,0,84,0,84,0,0,
7,2020-08.csv,0,0,0,0,0,115,0,115,0,0,
8,2020-09.csv,0,0,0,0,0,61,0,61,0,0,
9,2020-10.csv,0,0,0,0,164,60,0,202,0,249,


In [7]:
# Pull out the October file, the one whose user_type column has missing values
# in the summary above.
# NOTE(review): later cells modify this frame and the dataset-level summary
# reflects those edits, so this looks like the dataset's own object rather
# than a copy -- confirm in get_data_files.
dat2020_10 = dat2020.get_data_files('2020-10.csv')
dat2020_10.head()

Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type
0,9970495,918,7418,10/01/2020 00:00,College Park - Yonge St Entrance,7007,10/01/2020 00:15,College St / Huron St,5677,Annual Member
1,9970496,662,7061,10/01/2020 00:00,Dalton Rd / Bloor St W,7143,10/01/2020 00:11,Kendal Ave / Bernard Ave,6634,Annual Member
2,9970497,525,7051,10/01/2020 00:01,Wellesley St E / Yonge St (Green P),7009,10/01/2020 00:09,King St E / Jarvis St,4560,Annual Member
3,9970498,382,7004,10/01/2020 00:01,University Ave / Elm St,7050,10/01/2020 00:07,Richmond St E / Jarvis St Green P,4948,Annual Member
4,9970499,417,7051,10/01/2020 00:01,Wellesley St E / Yonge St (Green P),7292,10/01/2020 00:08,Granby St / Church St - SMART,5400,Annual Member


In [8]:
# Rows without a user_type. In the preview below their values sit one column
# too far to the left (a station id under trip_duration, a timestamp under
# start_station_id, ...), which is why the last column ends up empty.
anomaly = dat2020_10.loc[dat2020_10['user_type'].isna()]
anomaly.head()

Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type
25640,10000084625,7120,10/03/2020 13:28,Gerrard St E / River St,7120,10/03/2020 13:38,Gerrard St E / River St,5250,Annual Member,
25837,10000306555,7120,10/03/2020 13:38,Gerrard St E / River St,7576,10/03/2020 13:48,Front St E / Bayview Avenue,5250,Annual Member,
26029,10000519608,7576,10/03/2020 13:48,Front St E / Bayview Avenue,7357,10/03/2020 13:58,Lake Shore Blvd E / Leslie St,5250,Annual Member,
26248,10000755851,7357,10/03/2020 13:58,Lake Shore Blvd E / Leslie St,7313,10/03/2020 14:12,Coxwell Ave / Lake Shore Blvd E,5250,Annual Member,
26560,10001076784,7313,10/03/2020 14:12,Coxwell Ave / Lake Shore Blvd E,7317,10/03/2020 14:26,Hubbard Blvd / Balsam Av,5250,Annual Member,


In [9]:
# Repair the rows captured in `anomaly`: everything after trip_id is one column
# too far left, so shift those columns right by one. The vacated trip_duration
# slot is filled with 0 as a placeholder.
#
# `anomaly.index` contains row *labels*, so the rows are selected with the
# label-based .loc. The positional .iloc only hits the same rows while the
# frame still has its default 0..n-1 index and would silently touch the wrong
# rows once rows have been dropped or reordered.
#
# This cell is not idempotent: `anomaly` is not recomputed here, so running it
# a second time shifts the same rows again.
#
# NOTE(review): the trip_id values of these rows (e.g. 10000084625) are about
# three digits longer than regular ids in this file (e.g. 9970495). It looks
# like trip_id and trip_duration were fused in the raw data, in which case
# trip_id still needs repairing after this shift -- TODO confirm.
dat2020_10.loc[anomaly.index, dat2020_10.columns[1:]] = (
    dat2020_10.loc[anomaly.index, dat2020_10.columns[1:]]
    .shift(periods=1, fill_value=0, axis='columns')
)
dat2020_10.loc[anomaly.index].head()

Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type
25640,10000084625,0,7120,10/03/2020 13:28,Gerrard St E / River St,7120,10/03/2020 13:38,Gerrard St E / River St,5250,Annual Member
25837,10000306555,0,7120,10/03/2020 13:38,Gerrard St E / River St,7576,10/03/2020 13:48,Front St E / Bayview Avenue,5250,Annual Member
26029,10000519608,0,7576,10/03/2020 13:48,Front St E / Bayview Avenue,7357,10/03/2020 13:58,Lake Shore Blvd E / Leslie St,5250,Annual Member
26248,10000755851,0,7357,10/03/2020 13:58,Lake Shore Blvd E / Leslie St,7313,10/03/2020 14:12,Coxwell Ave / Lake Shore Blvd E,5250,Annual Member
26560,10001076784,0,7313,10/03/2020 14:12,Coxwell Ave / Lake Shore Blvd E,7317,10/03/2020 14:26,Hubbard Blvd / Balsam Av,5250,Annual Member


In [10]:
from utils.preprocessing import calc_trip_duration

# The repaired rows currently hold a placeholder trip_duration of 0. Replace it
# with a value derived from start_time and end_time for just those rows. The
# result appears to be in seconds (13:28 -> 13:38 yields 600 in the output).
dat2020_10.loc[anomaly.index, 'trip_duration'] = calc_trip_duration(dat2020_10, 'start_time', 'end_time', anomaly.index)
dat2020_10.loc[anomaly.index].head()

Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type
25640,10000084625,600,7120,10/03/2020 13:28,Gerrard St E / River St,7120,10/03/2020 13:38,Gerrard St E / River St,5250,Annual Member
25837,10000306555,600,7120,10/03/2020 13:38,Gerrard St E / River St,7576,10/03/2020 13:48,Front St E / Bayview Avenue,5250,Annual Member
26029,10000519608,600,7576,10/03/2020 13:48,Front St E / Bayview Avenue,7357,10/03/2020 13:58,Lake Shore Blvd E / Leslie St,5250,Annual Member
26248,10000755851,840,7357,10/03/2020 13:58,Lake Shore Blvd E / Leslie St,7313,10/03/2020 14:12,Coxwell Ave / Lake Shore Blvd E,5250,Annual Member
26560,10001076784,840,7313,10/03/2020 14:12,Coxwell Ave / Lake Shore Blvd E,7317,10/03/2020 14:26,Hubbard Blvd / Balsam Av,5250,Annual Member


In [11]:
# The id columns of the October file were read as object dtype; convert them
# back to numbers. errors='coerce' turns anything unparseable into NaN instead
# of raising.
for id_col in ('start_station_id', 'end_station_id', 'bike_id'):
    dat2020_10[id_col] = pd.to_numeric(dat2020_10[id_col], errors='coerce')

# Report the resulting dtypes, one per line under a file heading.
print(
    "dat2020-10",
    f"start_station_id: {dat2020_10['start_station_id'].dtype}",
    f"end_station_id: {dat2020_10['end_station_id'].dtype}",
    f"bike_id: {dat2020_10['bike_id'].dtype}",
    sep="\n",
)

dat2020-10
start_station_id: int64
end_station_id: float64
bike_id: int64


In [12]:
# Missing values per column in the October file after the repairs above;
# user_type should now be complete.
dat2020_10.isna().sum()

trip_id                 0
trip_duration           0
start_station_id        0
start_time              0
start_station_name    164
end_station_id         60
end_time                0
end_station_name      202
bike_id                 0
user_type               0
dtype: int64

In [13]:
# Rebuild the per-file summary so it picks up the October fixes, then hide the
# dtype columns again (`dtype_cols` comes from the earlier get_dtype_cols cell).
missing_vals = dat2020.get_na_and_dtypes(column_order)
missing_vals.drop(columns=dtype_cols)

Unnamed: 0,name,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type,model
0,2020-01.csv,0,0,0,0,0,88,0,88,0,0,
1,2020-02.csv,0,0,0,0,0,54,0,54,0,0,
2,2020-03.csv,0,0,0,0,0,67,0,67,0,0,
3,2020-04.csv,0,0,0,0,0,44,0,44,0,0,
4,2020-05.csv,0,0,0,0,0,164,0,164,0,0,
5,2020-06.csv,0,0,0,0,0,75,0,75,0,0,
6,2020-07.csv,0,0,0,0,0,84,0,84,0,0,
7,2020-08.csv,0,0,0,0,0,115,0,115,0,0,
8,2020-09.csv,0,0,0,0,0,61,0,61,0,0,
9,2020-10.csv,0,0,0,0,164,60,0,202,0,0,


In [14]:
# Narrow the summary to bike_id: missing count and dtype for each file.
missing_vals[['name', 'bike_id', 'bike_id_dtype']]

Unnamed: 0,name,bike_id,bike_id_dtype
0,2020-01.csv,0,int64
1,2020-02.csv,0,int64
2,2020-03.csv,0,int64
3,2020-04.csv,0,int64
4,2020-05.csv,0,int64
5,2020-06.csv,0,int64
6,2020-07.csv,0,int64
7,2020-08.csv,0,int64
8,2020-09.csv,0,int64
9,2020-10.csv,0,int64


In [15]:
# Look at the December rows that have no bike_id. Missing values in an
# otherwise integer column are presumably why one file reported bike_id as
# float64 in the dtype tally.
dat2020_12 = dat2020.get_data_files('2020-12.csv')
dat2020_12[dat2020_12['bike_id'].isna()]

Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type
37959,10575709,861,7089,12/11/2020 15:43,Church St / Wood St,7163.0,12/11/2020 15:57,Yonge St / Wood St,,Casual Member
37970,10575721,783,7089,12/11/2020 15:44,Church St / Wood St,7163.0,12/11/2020 15:57,Yonge St / Wood St,,Casual Member
38067,10575854,1000,7163,12/11/2020 15:58,Yonge St / Wood St,7542.0,12/11/2020 16:15,Queen St W / John St,,Casual Member
38068,10575855,1002,7163,12/11/2020 15:58,Yonge St / Wood St,7542.0,12/11/2020 16:15,Queen St W / John St,,Casual Member
38189,10575991,1400,7542,12/11/2020 16:15,Queen St W / John St,7259.0,12/11/2020 16:38,Lower Spadina Ave / Lake Shore Blvd,,Casual Member
...,...,...,...,...,...,...,...,...,...,...
74102,10619126,985,7503,12/22/2020 14:56,Gerrard St E / Malvern Ave - SMART,7337.0,12/22/2020 15:12,Gerrard Square Mall (1010 Gerrard St E),,Annual Member
74173,10619199,892,7337,12/22/2020 15:13,Gerrard Square Mall (1010 Gerrard St E),7610.0,12/22/2020 15:28,Kingston Rd / Beech Ave - SMART,,Annual Member
80018,10625844,1570,7610,12/23/2020 21:57,Kingston Rd / Beech Ave - SMART,7015.0,12/23/2020 22:23,King St W / Bay St (West Side),,Annual Member
80116,10625977,757,7015,12/23/2020 23:36,King St W / Bay St (West Side),7342.0,12/23/2020 23:49,Morse St / Eastern Ave - SMART,,Casual Member


In [16]:
# Cast bike_id in every file to pandas' nullable integer dtype 'Int64' so that
# files with missing ids can share one integer dtype with the rest.
for df in dat2020.get_data_files():
    df['bike_id'] = df['bike_id'].astype('Int64')
# Rebuild the summary and confirm the dtype is now uniform across files.
missing_vals = dat2020.get_na_and_dtypes(column_order)
missing_vals[['name', 'bike_id', 'bike_id_dtype']]

Unnamed: 0,name,bike_id,bike_id_dtype
0,2020-01.csv,0,Int64
1,2020-02.csv,0,Int64
2,2020-03.csv,0,Int64
3,2020-04.csv,0,Int64
4,2020-05.csv,0,Int64
5,2020-06.csv,0,Int64
6,2020-07.csv,0,Int64
7,2020-08.csv,0,Int64
8,2020-09.csv,0,Int64
9,2020-10.csv,0,Int64


In [17]:
# Dtype tally for every column this time (summarize_all=True); each column
# should now show a single dtype shared by all 12 files.
count_col_dtypes(missing_vals, summarize_all=True)

trip_id_dtype
int64    12
Name: count, dtype: int64

trip_duration_dtype
int64    12
Name: count, dtype: int64

start_station_id_dtype
int64    12
Name: count, dtype: int64

start_time_dtype
object    12
Name: count, dtype: int64

start_station_name_dtype
object    12
Name: count, dtype: int64

end_station_id_dtype
float64    12
Name: count, dtype: int64

end_time_dtype
object    12
Name: count, dtype: int64

end_station_name_dtype
object    12
Name: count, dtype: int64

bike_id_dtype
Int64    12
Name: count, dtype: int64

user_type_dtype
object    12
Name: count, dtype: int64

Series([], Name: count, dtype: int64)



In [18]:
# Which start stations are the ones missing a name? Count trips per station id
# among the rows whose start_station_name is empty.
dat2020_10.loc[dat2020_10['start_station_name'].isna(), 'start_station_id'].value_counts()

start_station_id
7660    107
7659     57
Name: count, dtype: int64

In [19]:
# Same question for the destination side: station ids among the rows whose
# end_station_name is empty (rows without an end_station_id are not counted).
dat2020_10.loc[dat2020_10['end_station_name'].isna(), 'end_station_id'].value_counts()

end_station_id
7660.0    104
7659.0     38
Name: count, dtype: int64

In [20]:
# Total trips ending at station 7660, to compare with the number of those
# trips that lack a station name.
len(dat2020_10.loc[dat2020_10['end_station_id'].eq(7660)])

104

In [21]:
# Total trips ending at station 7659, to compare with the number of those
# trips that lack a station name.
len(dat2020_10.loc[dat2020_10['end_station_id'].eq(7659)])

38

In [22]:
# Trips that never really happened: zero duration and no destination at all
# (neither an end station id nor an end station name).
anomaly = dat2020_10.loc[
    dat2020_10['trip_duration'].eq(0)
    & dat2020_10['end_station_id'].isna()
    & dat2020_10['end_station_name'].isna()
]
len(anomaly)

54

In [23]:
# Remove the zero-duration, no-destination rows selected in `anomaly`. The drop
# is done in place so the change lands on this same DataFrame object.
# Row labels are kept as-is, so the index is no longer a contiguous 0..n-1
# range after this cell; label-based (.loc) access stays correct, positional
# access using old labels does not.
dat2020_10.drop(anomaly.index, inplace=True)
# What is left without an end station: a handful of trips lasting several days.
dat2020_10[dat2020_10['end_station_id'].isna()]

Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type
23032,9997148,182515,7168,10/03/2020 10:30,Queens Quay / Yonge St,,10/05/2020 13:12,,4635,Annual Member
77947,10060142,587862,7458,10/08/2020 22:33,Church St / Lombard St,,10/15/2020 17:51,,5236,Casual Member
107684,10093544,775107,7557,10/11/2020 14:48,The Queensway / High St - SMART,,10/20/2020 14:07,,4142,Casual Member
116974,10104392,686918,7055,10/12/2020 15:28,Jarvis St / Carlton St,,10/20/2020 14:17,,6543,Casual Member
184246,10184376,273235,7038,10/20/2020 09:36,Dundas St W / Yonge St,,10/23/2020 13:30,,5888,Casual Member
186918,10187557,252375,7228,10/20/2020 15:34,Queen St W / Roncesvalles Ave,,10/23/2020 13:40,,6092,Casual Member


In [24]:
# Longest trips first, to see how the no-destination trips compare with other
# extreme durations. sort_values returns a new frame; dat2020_10 keeps its order.
dat2020_10_sorted = dat2020_10.sort_values(by='trip_duration', ascending=False)
dat2020_10_sorted.head()

Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type
107684,10093544,775107,7557,10/11/2020 14:48,The Queensway / High St - SMART,,10/20/2020 14:07,,4142,Casual Member
116974,10104392,686918,7055,10/12/2020 15:28,Jarvis St / Carlton St,,10/20/2020 14:17,,6543,Casual Member
77947,10060142,587862,7458,10/08/2020 22:33,Church St / Lombard St,,10/15/2020 17:51,,5236,Casual Member
236434,10245681,564811,7156,10/26/2020 02:56,Salem Ave / Bloor St W,7156.0,11/01/2020 14:50,Salem Ave / Bloor St W,2091,Casual Member
94847,10079118,446163,7450,10/10/2020 13:31,Carlaw Ave / Danforth Ave - SMART,7387.0,10/15/2020 17:27,Mortimer Ave / Carlaw Ave SMART,4758,Annual Member


In [28]:
# Most recent trips of bike 6092, one of the bikes whose last trip has no end
# station, to see what preceded the multi-day trip.
dat2020_10[dat2020_10['bike_id'] == 6092].tail()

Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type
175740,10173768,671,7536,10/18/2020 16:38,Palmerston Ave / Dundas St W - SMART,7038.0,10/18/2020 16:49,Dundas St W / Yonge St,6092,Casual Member
176097,10174142,834,7038,10/18/2020 17:04,Dundas St W / Yonge St,7059.0,10/18/2020 17:18,Front St W / Blue Jays Way,6092,Annual Member
176904,10175040,600,7059,10/18/2020 18:08,Front St W / Blue Jays Way,7657.0,10/18/2020 18:18,1 Market St,6092,Annual Member
185563,10185989,2856,7657,10/20/2020 12:59,1 Market St,7228.0,10/20/2020 13:46,Queen St W / Roncesvalles Ave,6092,Annual Member
186918,10187557,252375,7228,10/20/2020 15:34,Queen St W / Roncesvalles Ave,,10/23/2020 13:40,,6092,Casual Member


In [None]:
# Refresh the per-file summary after the row removals and show only the
# missing-value counts (`dtype_cols` comes from the earlier get_dtype_cols cell).
missing_vals = dat2020.get_na_and_dtypes(column_order)
missing_vals.drop(columns=dtype_cols)