In [1]:
from utils.path import get_git_root
from utils.load_data import LoadData
from utils.preprocessing import normalize_column_names, normalize_datetimes, calc_trip_duration
import pandas as pd
import numpy as np

data_path = get_git_root() / "data" / "raw"
data = LoadData(data_path)
data.get_data_dirs()

['bikeshare-ridership-2014-2015',
 'bikeshare-ridership-2016',
 'bikeshare-ridership-2017',
 'bikeshare-ridership-2018',
 'bikeshare-ridership-2019',
 'bikeshare-ridership-2020',
 'bikeshare-ridership-2021',
 'bikeshare-ridership-2022',
 'bikeshare-ridership-2023',
 'bikeshare-ridership-2024',
 'bikeshare-ridership-readme']

In [2]:
avoid_dirs = ['bikeshare-ridership-2014-2015', 'bikeshare-ridership-2016', 'bikeshare-ridership-readme']
data.load_datasets(omit=avoid_dirs)

Dataset 'bikeshare-ridership-2017' loaded.
Dataset 'bikeshare-ridership-2018' loaded.
Dataset 'bikeshare-ridership-2019' loaded.
Dataset 'bikeshare-ridership-2020' loaded.
Dataset 'bikeshare-ridership-2021' loaded.
Dataset 'bikeshare-ridership-2022' loaded.
Dataset 'bikeshare-ridership-2023' loaded.
Dataset 'bikeshare-ridership-2024' loaded.
All datasets loaded


In [3]:
data.list_datasets()

{'bikeshare-ridership-2017': <utils.load_data.Dataset at 0x22e7f355150>,
 'bikeshare-ridership-2018': <utils.load_data.Dataset at 0x22e7e235590>,
 'bikeshare-ridership-2019': <utils.load_data.Dataset at 0x22e7f354e90>,
 'bikeshare-ridership-2020': <utils.load_data.Dataset at 0x22e0a53f5d0>,
 'bikeshare-ridership-2021': <utils.load_data.Dataset at 0x22e2203a990>,
 'bikeshare-ridership-2022': <utils.load_data.Dataset at 0x22e7ebb4890>,
 'bikeshare-ridership-2023': <utils.load_data.Dataset at 0x22e30bd4490>,
 'bikeshare-ridership-2024': <utils.load_data.Dataset at 0x22e646aaa10>}

In [7]:
for name, ds in data.list_datasets().items():
    print(name)
    # optimize so it doesnt rerun multiple loops
    ds_cols = [set(df.columns.tolist()) for df in ds.get_data_files()]
    
    union_cols = ds.get_unique_cols()
    intersection_cols = set.intersection(*ds_cols)
    difference_cols = union_cols - intersection_cols
    files_missing_col = [key for key, df in ds.list_data_files().items() if not difference_cols.issubset(df.columns)]
    
    print(f"union: {union_cols}")
    print(f"intersection: {intersection_cols}")
    print(f"difference: {difference_cols}")
    print(f"missing in: {files_missing_col}\n")

bikeshare-ridership-2017
union: {'trip_start_time', 'trip_stop_time', 'to_station_id', 'to_station_name', 'trip_duration_seconds', 'user_type', 'trip_id', 'from_station_id', 'from_station_name'}
intersection: {'trip_start_time', 'trip_stop_time', 'to_station_name', 'user_type', 'trip_duration_seconds', 'trip_id', 'from_station_name'}
difference: {'to_station_id', 'from_station_id'}
missing in: ['Bikeshare Ridership (2017 Q3).csv', 'Bikeshare Ridership (2017 Q4).csv']

bikeshare-ridership-2018
union: {'trip_start_time', 'trip_stop_time', 'to_station_id', 'to_station_name', 'trip_duration_seconds', 'user_type', 'trip_id', 'from_station_id', 'from_station_name'}
intersection: {'trip_start_time', 'to_station_id', 'trip_stop_time', 'to_station_name', 'trip_duration_seconds', 'user_type', 'trip_id', 'from_station_id', 'from_station_name'}
difference: set()
missing in: []

bikeshare-ridership-2019
union: {'Bike Id', 'End Time', 'End Station Name', 'Trip  Duration', 'End Station Id', 'Trip Id'

In [17]:
# rename first, reorder at the end
column_mapping = {
    'trip_id': 'Trip Id', 
    'trip_duration_seconds': 'Trip  Duration', 
    'from_station_id': 'Start Station Id', 
    'trip_start_time': 'Start Time', 
    'from_station_name': 'Start Station Name', 
    'to_station_id': 'End Station Id', 
    'trip_stop_time': 'End Time', 
    'to_station_name': 'End Station Name',
    'bike_id': 'Bike Id', 
    'user_type': 'User Type',
    'model': 'Model'}

column_order = normalize_column_names(pd.Index(list(column_mapping.values())))

for ds in data.get_datasets():
    for df in ds.get_data_files():
        df.rename(columns=column_mapping, inplace=True)
        df.columns = normalize_column_names(df.columns)
    print(ds.get_unique_cols())

{'end_time', 'start_time', 'end_station_name', 'start_station_id', 'user_type', 'trip_id', 'trip_duration', 'end_station_id', 'start_station_name'}
{'end_time', 'start_time', 'end_station_name', 'start_station_id', 'user_type', 'trip_id', 'trip_duration', 'end_station_id', 'start_station_name'}
{'end_time', 'start_time', 'bike_id', 'end_station_name', 'start_station_id', 'user_type', 'trip_id', 'trip_duration', 'end_station_id', 'start_station_name'}
{'end_time', 'start_time', 'bike_id', 'end_station_name', 'start_station_id', 'user_type', 'trip_id', 'trip_duration', 'end_station_id', 'start_station_name'}
{'end_time', 'start_time', 'bike_id', 'end_station_name', 'start_station_id', 'user_type', 'trip_id', 'trip_duration', 'end_station_id', 'start_station_name'}
{'end_time', 'start_time', 'bike_id', 'end_station_name', 'start_station_id', 'user_type', 'trip_id', 'trip_duration', 'end_station_id', 'start_station_name'}
{'end_time', 'start_time', 'bike_id', 'end_station_name', 'start_sta

In [19]:
# check dtype consistency
from collections import defaultdict

ds_col_dtypes = {}
for ds in data.get_datasets():    
    col_dtypes = defaultdict(set)
    for df in ds.get_data_files():
        for col, dtype in df.dtypes.items():
            col_dtypes[col].add(dtype)
    col_dtypes = dict(col_dtypes)
    
    print(f"{ds.get_name()}")
    print(col_dtypes)
    print()
    ds_col_dtypes[ds.get_name()] = col_dtypes

summary_dtypes = {k: set().union(*(col.get(k, set()) for col in ds_col_dtypes.values())) for k in {k for v in ds_col_dtypes.values() for k in v}}

print(f"Summary\n{summary_dtypes}\n")

summary_conflicts = {k: v for k, v in summary_dtypes.items() if len(summary_dtypes[k]) > 1}
print(f"Conflicting Column\n{summary_conflicts}")

bikeshare-ridership-2017
{'trip_id': {dtype('int64')}, 'start_time': {dtype('O')}, 'end_time': {dtype('O')}, 'trip_duration': {dtype('int64')}, 'start_station_id': {dtype('int64')}, 'start_station_name': {dtype('O')}, 'end_station_id': {dtype('int64')}, 'end_station_name': {dtype('O')}, 'user_type': {dtype('O')}}

bikeshare-ridership-2018
{'trip_id': {dtype('int64')}, 'trip_duration': {dtype('int64')}, 'start_station_id': {dtype('int64')}, 'start_time': {dtype('O')}, 'start_station_name': {dtype('O')}, 'end_time': {dtype('O')}, 'end_station_id': {dtype('int64')}, 'end_station_name': {dtype('O')}, 'user_type': {dtype('O')}}

bikeshare-ridership-2019
{'trip_id': {dtype('int64')}, 'trip_duration': {dtype('float64')}, 'start_station_id': {dtype('int64')}, 'start_time': {dtype('O')}, 'start_station_name': {dtype('O')}, 'end_station_id': {dtype('int64'), dtype('float64')}, 'end_time': {dtype('O')}, 'end_station_name': {dtype('O')}, 'bike_id': {dtype('int64')}, 'user_type': {dtype('O')}}

bik

In [21]:
map_conflicts = defaultdict(lambda: defaultdict(set))

for name, col_dtypes in ds_col_dtypes.items():
    for col, dtypes in col_dtypes.items():
        if col in summary_conflicts:
            for dtype in dtypes:
                if dtype in summary_conflicts[col]:
                    map_conflicts[col][dtype].add(name)
map_conflicts = dict(map_conflicts)
map_conflicts

{'trip_duration': defaultdict(set,
             {dtype('int64'): {'bikeshare-ridership-2017',
               'bikeshare-ridership-2018',
               'bikeshare-ridership-2020',
               'bikeshare-ridership-2021',
               'bikeshare-ridership-2022',
               'bikeshare-ridership-2023',
               'bikeshare-ridership-2024'},
              dtype('float64'): {'bikeshare-ridership-2019'}}),
 'start_station_id': defaultdict(set,
             {dtype('int64'): {'bikeshare-ridership-2017',
               'bikeshare-ridership-2018',
               'bikeshare-ridership-2019',
               'bikeshare-ridership-2020',
               'bikeshare-ridership-2021',
               'bikeshare-ridership-2022',
               'bikeshare-ridership-2023',
               'bikeshare-ridership-2024'},
              dtype('O'): {'bikeshare-ridership-2020'}}),
 'end_station_id': defaultdict(set,
             {dtype('int64'): {'bikeshare-ridership-2017',
               'bikeshare-rider

In [23]:
# check for missing values
for ds in data.get_datasets():
    for name, df in ds.list_data_files().items():
        num_missing = set(df.isnull().sum().tolist())
        num_missing.discard(0)
        if len(num_missing) > 0:
            print(name)
            print(df.isnull().sum())
            print()

Bikeshare Ridership (2017 Q4).csv
trip_id               0
start_time            0
end_time              0
trip_duration         0
start_station_name    0
end_station_name      1
user_type             0
dtype: int64

2019-Q1.csv
trip_id               0
trip_duration         7
start_station_id      0
start_time            0
start_station_name    0
end_station_id        0
end_time              0
end_station_name      0
bike_id               0
user_type             0
dtype: int64

2019-Q2.csv
trip_id               0
trip_duration         4
start_station_id      0
start_time            0
start_station_name    0
end_station_id        1
end_time              0
end_station_name      1
bike_id               0
user_type             0
dtype: int64

2019-Q3.csv
trip_id                0
trip_duration          3
start_station_id       0
start_time             0
start_station_name     0
end_station_id        19
end_time               0
end_station_name      19
bike_id                0
user_type      

In [25]:
dat2017, dat2018, dat2019, dat2020, dat2021, dat2022, dat2023, dat2024 = data.unpack()

In [27]:
# 2017 Q4
dat2017_q4 = dat2017.get_data_files('Bikeshare Ridership (2017 Q4).csv')
dat2017_q4[dat2017_q4['end_station_name'].isnull()]

Unnamed: 0,trip_id,start_time,end_time,trip_duration,start_station_name,end_station_name,user_type
295638,2302635,11/29/17 05:53:54,NULLNULL,0,Seaton St / Dundas St E,,Casual


In [29]:
dat2017_q4['end_time'] = pd.to_datetime(dat2017_q4['end_time'], format='%m/%d/%y %H:%M:%S', errors='coerce')
# dat2017_q4['end_time'] = pd.to_datetime(dat2017_q4['end_time'], errors='coerce')
dat2017_q4[dat2017_q4['end_time'].isna()]

Unnamed: 0,trip_id,start_time,end_time,trip_duration,start_station_name,end_station_name,user_type
295638,2302635,11/29/17 05:53:54,NaT,0,Seaton St / Dundas St E,,Casual


In [31]:
anomaly_entries = pd.DataFrame(columns=column_order)
anomaly_entries = pd.concat([anomaly_entries, dat2017_q4[dat2017_q4['end_time'].isna()]], ignore_index=True)
dat2017_q4.dropna(subset=['end_time'], inplace=True)
anomaly_entries.head()

Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type,model
0,2302635,0,,11/29/17 05:53:54,Seaton St / Dundas St E,,,,,Casual,


In [35]:
for df in dat2017.get_data_files():
    print(df[df['trip_duration'] == 0])

Empty DataFrame
Columns: [trip_id, start_time, end_time, trip_duration, start_station_id, start_station_name, end_station_id, end_station_name, user_type]
Index: []
Empty DataFrame
Columns: [trip_id, start_time, end_time, trip_duration, start_station_id, start_station_name, end_station_id, end_station_name, user_type]
Index: []
Empty DataFrame
Columns: [trip_id, start_time, end_time, trip_duration, start_station_name, end_station_name, user_type]
Index: []
Empty DataFrame
Columns: [trip_id, start_time, end_time, trip_duration, start_station_name, end_station_name, user_type]
Index: []


In [37]:
print("trip_duration")
for fname, df in dat2019.list_data_files().items():
    print(f"\n{fname}\nNum Missing Values: {df['trip_duration'].isna().sum()}")
    print(df[df['trip_duration'].isna()].head())

trip_duration

2019-Q1.csv
Num Missing Values: 7
59771    NaN
74933    NaN
78219    NaN
86634    NaN
120157   NaN
Name: trip_duration, dtype: float64

2019-Q2.csv
Num Missing Values: 4
123305   NaN
206892   NaN
218836   NaN
303891   NaN
Name: trip_duration, dtype: float64

2019-Q3.csv
Num Missing Values: 3
449577   NaN
834139   NaN
908535   NaN
Name: trip_duration, dtype: float64

2019-Q4.csv
Num Missing Values: 2
227766   NaN
444309   NaN
Name: trip_duration, dtype: float64


In [39]:
for fname, df in dat2019.list_data_files().items():
    # adjust to fill with end - start time
    df['trip_duration'] = df['trip_duration'].fillna(0)
    print(f"{fname}\nNum Missing Values: {df['trip_duration'].isna().sum()}")
    print(f"Can convert all: {(df['trip_duration'] % 1 == 0).all()}\n")

2019-Q1.csv
Num Missing Values: 0
Can convert all: True

2019-Q2.csv
Num Missing Values: 0
Can convert all: True

2019-Q3.csv
Num Missing Values: 0
Can convert all: True

2019-Q4.csv
Num Missing Values: 0
Can convert all: True



In [41]:
for df in dat2019.get_data_files():
    df['trip_duration'] = df['trip_duration'].astype('int64')
    print(f"Trip Duration: {df['trip_duration'].dtype}")

Trip Duration: int64
Trip Duration: int64
Trip Duration: int64
Trip Duration: int64


In [43]:
print("start_station_id dtype")
for fname, df in dat2020.list_data_files().items():
    print(f"{fname}: {df['start_station_id'].dtype}")

start_station_id dtype
2020-01.csv: int64
2020-02.csv: int64
2020-03.csv: int64
2020-04.csv: int64
2020-05.csv: int64
2020-06.csv: int64
2020-07.csv: int64
2020-08.csv: int64
2020-09.csv: int64
2020-10.csv: object
2020-11.csv: int64
2020-12.csv: int64


In [45]:
df_2020_10 = dat2020.get_data_files("2020-10.csv")
df_2020_10.dtypes

trip_id                int64
trip_duration          int64
start_station_id      object
start_time            object
start_station_name    object
end_station_id        object
end_time              object
end_station_name      object
bike_id               object
user_type             object
dtype: object

In [47]:
df_2020_10 = dat2020.get_data_files("2020-10.csv")
df_2020_10_copy = df_2020_10[['start_station_id']].copy()
df_2020_10_copy['numeric'] = pd.to_numeric(df_2020_10_copy['start_station_id'], errors='coerce')

anomaly = df_2020_10_copy[df_2020_10_copy['numeric'].isna()]
anomaly[['start_station_id']]

Unnamed: 0,start_station_id
25640,10/03/2020 13:28
25837,10/03/2020 13:38
26029,10/03/2020 13:48
26248,10/03/2020 13:58
26560,10/03/2020 14:12
...,...
266879,10/30/2020 17:01
268551,10/30/2020 19:42
275974,10/31/2020 18:00
277412,10/31/2020 21:27


In [49]:
df_2020_10.head()

Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type
0,9970495,918,7418,10/01/2020 00:00,College Park - Yonge St Entrance,7007,10/01/2020 00:15,College St / Huron St,5677,Annual Member
1,9970496,662,7061,10/01/2020 00:00,Dalton Rd / Bloor St W,7143,10/01/2020 00:11,Kendal Ave / Bernard Ave,6634,Annual Member
2,9970497,525,7051,10/01/2020 00:01,Wellesley St E / Yonge St (Green P),7009,10/01/2020 00:09,King St E / Jarvis St,4560,Annual Member
3,9970498,382,7004,10/01/2020 00:01,University Ave / Elm St,7050,10/01/2020 00:07,Richmond St E / Jarvis St Green P,4948,Annual Member
4,9970499,417,7051,10/01/2020 00:01,Wellesley St E / Yonge St (Green P),7292,10/01/2020 00:08,Granby St / Church St - SMART,5400,Annual Member


In [51]:
anomaly_subset = df_2020_10.loc[anomaly.index]
anomaly_subset

Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type
25640,10000084625,7120,10/03/2020 13:28,Gerrard St E / River St,7120,10/03/2020 13:38,Gerrard St E / River St,5250,Annual Member,
25837,10000306555,7120,10/03/2020 13:38,Gerrard St E / River St,7576,10/03/2020 13:48,Front St E / Bayview Avenue,5250,Annual Member,
26029,10000519608,7576,10/03/2020 13:48,Front St E / Bayview Avenue,7357,10/03/2020 13:58,Lake Shore Blvd E / Leslie St,5250,Annual Member,
26248,10000755851,7357,10/03/2020 13:58,Lake Shore Blvd E / Leslie St,7313,10/03/2020 14:12,Coxwell Ave / Lake Shore Blvd E,5250,Annual Member,
26560,10001076784,7313,10/03/2020 14:12,Coxwell Ave / Lake Shore Blvd E,7317,10/03/2020 14:26,Hubbard Blvd / Balsam Av,5250,Annual Member,
...,...,...,...,...,...,...,...,...,...,...
266879,10281521317,7417,10/30/2020 17:01,King St W / Jordan St,7253,10/30/2020 17:06,John St / Mercer St - SMART,4888,Annual Member,
268551,10283383488,7017,10/30/2020 19:42,Widmer St / Adelaide St W,7417,10/30/2020 19:50,King St W / Jordan St,5835,Annual Member,
275974,10291748450,7417,10/31/2020 18:00,King St W / Jordan St,7474,10/31/2020 18:07,Clarence Square,3814,Annual Member,
277412,10293410950,7474,10/31/2020 21:27,Clarence Square,7015,10/31/2020 21:42,King St W / Bay St (West Side),3771,Annual Member,


In [53]:
# double check consistency
anomaly_subset['user_type'].isnull().sum() == df_2020_10['user_type'].isnull().sum()

np.True_

In [55]:
# fill with 0 instead of NaN to maintain column type int64, we will be mapping to these indexes directly
df_2020_10.iloc[anomaly.index, 1:] = df_2020_10.iloc[anomaly.index, 1:].shift(periods=1, fill_value=0, axis='columns')
df_2020_10.iloc[anomaly.index]

Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type
25640,10000084625,0,7120,10/03/2020 13:28,Gerrard St E / River St,7120,10/03/2020 13:38,Gerrard St E / River St,5250,Annual Member
25837,10000306555,0,7120,10/03/2020 13:38,Gerrard St E / River St,7576,10/03/2020 13:48,Front St E / Bayview Avenue,5250,Annual Member
26029,10000519608,0,7576,10/03/2020 13:48,Front St E / Bayview Avenue,7357,10/03/2020 13:58,Lake Shore Blvd E / Leslie St,5250,Annual Member
26248,10000755851,0,7357,10/03/2020 13:58,Lake Shore Blvd E / Leslie St,7313,10/03/2020 14:12,Coxwell Ave / Lake Shore Blvd E,5250,Annual Member
26560,10001076784,0,7313,10/03/2020 14:12,Coxwell Ave / Lake Shore Blvd E,7317,10/03/2020 14:26,Hubbard Blvd / Balsam Av,5250,Annual Member
...,...,...,...,...,...,...,...,...,...,...
266879,10281521317,0,7417,10/30/2020 17:01,King St W / Jordan St,7253,10/30/2020 17:06,John St / Mercer St - SMART,4888,Annual Member
268551,10283383488,0,7017,10/30/2020 19:42,Widmer St / Adelaide St W,7417,10/30/2020 19:50,King St W / Jordan St,5835,Annual Member
275974,10291748450,0,7417,10/31/2020 18:00,King St W / Jordan St,7474,10/31/2020 18:07,Clarence Square,3814,Annual Member
277412,10293410950,0,7474,10/31/2020 21:27,Clarence Square,7015,10/31/2020 21:42,King St W / Bay St (West Side),3771,Annual Member


In [23]:
start_time = pd.to_datetime(df_2020_10.loc[anomaly.index, 'start_time'])
end_time = pd.to_datetime(df_2020_10.loc[anomaly.index, 'end_time'])
trip_dur = (end_time - start_time).dt.total_seconds().astype('int64')

df_2020_10.loc[trip_dur.index, 'trip_duration'] = trip_dur
df_2020_10.loc[trip_dur.index]

Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type
25640,10000084625,600,7120,10/03/2020 13:28,Gerrard St E / River St,7120,10/03/2020 13:38,Gerrard St E / River St,5250,Annual Member
25837,10000306555,600,7120,10/03/2020 13:38,Gerrard St E / River St,7576,10/03/2020 13:48,Front St E / Bayview Avenue,5250,Annual Member
26029,10000519608,600,7576,10/03/2020 13:48,Front St E / Bayview Avenue,7357,10/03/2020 13:58,Lake Shore Blvd E / Leslie St,5250,Annual Member
26248,10000755851,840,7357,10/03/2020 13:58,Lake Shore Blvd E / Leslie St,7313,10/03/2020 14:12,Coxwell Ave / Lake Shore Blvd E,5250,Annual Member
26560,10001076784,840,7313,10/03/2020 14:12,Coxwell Ave / Lake Shore Blvd E,7317,10/03/2020 14:26,Hubbard Blvd / Balsam Av,5250,Annual Member
...,...,...,...,...,...,...,...,...,...,...
266879,10281521317,300,7417,10/30/2020 17:01,King St W / Jordan St,7253,10/30/2020 17:06,John St / Mercer St - SMART,4888,Annual Member
268551,10283383488,480,7017,10/30/2020 19:42,Widmer St / Adelaide St W,7417,10/30/2020 19:50,King St W / Jordan St,5835,Annual Member
275974,10291748450,420,7417,10/31/2020 18:00,King St W / Jordan St,7474,10/31/2020 18:07,Clarence Square,3814,Annual Member
277412,10293410950,900,7474,10/31/2020 21:27,Clarence Square,7015,10/31/2020 21:42,King St W / Bay St (West Side),3771,Annual Member


In [24]:
df_2020_10.dtypes

trip_id                int64
trip_duration          int64
start_station_id      object
start_time            object
start_station_name    object
end_station_id        object
end_time              object
end_station_name      object
bike_id               object
user_type             object
dtype: object

In [25]:
df_2020_10.isnull().sum()

trip_id                 0
trip_duration           0
start_station_id        0
start_time              0
start_station_name    164
end_station_id         60
end_time                0
end_station_name      202
bike_id                 0
user_type               0
dtype: int64

In [26]:
df_2020_10[df_2020_10['end_station_id'].isnull()].head()

Unnamed: 0,trip_id,trip_duration,start_station_id,start_time,start_station_name,end_station_id,end_time,end_station_name,bike_id,user_type
18189,9991564,0,7235,10/02/2020 17:41,Bay St / College St (West Side) - SMART,,10/02/2020 17:41,,6677,Annual Member
18629,9992026,0,7129,10/02/2020 18:15,Davenport Rd / Avenue Rd,,10/02/2020 18:15,,1031,Annual Member
23032,9997148,182515,7168,10/03/2020 10:30,Queens Quay / Yonge St,,10/05/2020 13:12,,4635,Annual Member
23614,9997816,0,7391,10/03/2020 11:15,Yonge St / Dundas Sq,,10/03/2020 11:15,,2977,Annual Member
26512,10001030,0,7056,10/03/2020 14:10,Parliament St / Gerrard St,,10/03/2020 14:10,,6288,Annual Member


In [27]:
df_2020_10['trip_duration'].loc[df_2020_10['end_station_id'].isnull()].value_counts()

trip_duration
0         54
182515     1
587862     1
775107     1
686918     1
273235     1
252375     1
Name: count, dtype: int64

In [28]:
df_2020_10[['end_station_id', 'end_station_name']].loc[df_2020_10['end_station_id'].isnull()].isnull().sum()

end_station_id      60
end_station_name    60
dtype: int64

In [29]:
df_2020_10.loc[df_2020_10['start_station_id'] == df_2020_10['end_station_id']].shape[0]

12017

In [30]:
df_2020_10.loc[df_2020_10['trip_duration'] == 0].shape[0]

133

In [31]:
# adjust dtype of Start Station Id, End Station Id, Bike Id
# datetimes will be adjusted later with all data files, first adjustment found at [11]


Identify inconsistent column names across files within each year.

Detect presence of non-standard headers, placeholder rows, or encoding issues.

Check if date/time ranges are coherent (e.g., all monthly files span the full year

Column renaming/standardization

Missing column insertion

Type enforcement

Datetime parsing

Concate all df per year

Validate schema consistency

Save