In [2]:
import pandas as pd
import numpy as np
from glob import glob

In [None]:
# Gather every CSV under processed_data/ and order the paths alphabetically
data_files = glob('processed_data/*.csv')
data_files.sort()
data_files

In [None]:
# Read each listed file and stack the resulting frames into one frame
# with a fresh 0..n-1 index
data = pd.concat(map(pd.read_csv, data_files), ignore_index=True)

In [None]:
# List the column names of the merged frame
data.columns

In [None]:
# Drop the three 'Unnamed' columns and 'station_id', then preview the first rows
unwanted_columns = ['Unnamed: 11', 'station_id', 'Unnamed: 12', 'Unnamed: 13']
df = data.drop(columns=unwanted_columns)
df.head()

In [None]:
# (rows, columns) of the frame after the column drop
df.shape

In [None]:
# Share of missing (NaN) entries per column, expressed as a percentage
missing_values = (
    df.isna()
    .mean()
    .mul(100)
    .rename_axis("column_name")
    .reset_index(name="percentage")
)
missing_values

In [None]:
# Absolute number of missing entries in each column
df.isna().sum()

In [None]:
# Remove, in place, every row that contains at least one missing value
df.dropna(axis='index', how='any', inplace=True)

In [None]:
# Re-count missing entries per column after the dropna step
df.isna().sum()

In [None]:
# Show rows that exactly repeat an earlier row (the first occurrence is not flagged).
# This only displays them; nothing is removed from df here.
is_repeat = df.duplicated()
duplicateRows = df.loc[is_repeat]
duplicateRows

In [None]:
# (rows, columns) after rows with missing values were dropped
# (the duplicate rows above were only displayed, not removed)
df.shape

In [None]:
# Print per-column dtypes and non-null counts of the cleaned frame
df.info()

In [None]:
# Persist the cleaned, merged frame as CSV without writing the row index
df.to_csv(path_or_buf='data/data_merged.csv', index=False)

## Data Cleaning and Feature Engineering

In [3]:
# Read the merged dataset from the path the merge section writes it to
# ('data/data_merged.csv'). Reading it from 'processed_data/' would load a
# different (possibly stale) file, and a merged file placed in that folder
# would also be matched by the 'processed_data/*.csv' glob and merged into
# itself on the next run.
#
# The station id columns otherwise load as object dtype even though many ids
# look numeric, so they are read explicitly as strings to give every id one
# consistent type.
# NOTE(review): assumes station ids are labels, not quantities — confirm that
# no later cell compares them against integers.
df_merged = pd.read_csv(
    'data/data_merged.csv',
    dtype={'start_station_id': str, 'end_station_id': str},
)
df_merged.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,member_casual,day_of_week,start_station_id,end_station_id
0,CFA86D4455AA1030,classic_bike,3/16/2021 8:32,3/16/2021 8:36,Humboldt Blvd & Armitage Ave,Stave St & Armitage Ave,casual,3,15651,13266
1,30D9DC61227D1AF3,classic_bike,3/28/2021 1:26,3/28/2021 1:36,Humboldt Blvd & Armitage Ave,Central Park Ave & Bloomingdale Ave,casual,1,15651,18017
2,846D87A15682A284,classic_bike,3/11/2021 21:17,3/11/2021 21:33,Shields Ave & 28th Pl,Halsted St & 35th St,casual,5,15443,13080
3,994D05AA75A168F2,classic_bike,3/11/2021 13:26,3/11/2021 13:55,Winthrop Ave & Lawrence Ave,Broadway & Sheridan Rd,casual,5,13080,13323
4,DF7464FBE92D8308,classic_bike,3/21/2021 9:09,3/21/2021 9:27,Glenwood Ave & Touhy Ave,Chicago Ave & Sheridan Rd,casual,1,525,8


In [4]:
# Column dtypes as parsed from the CSV, before any conversion
df_merged.dtypes

ride_id               object
rideable_type         object
started_at            object
ended_at              object
start_station_name    object
end_station_name      object
member_casual         object
day_of_week            int64
start_station_id      object
end_station_id        object
dtype: object

### Change data types and compute ride duration

In [6]:
# Timestamps in the merged CSV look like '3/16/2021 8:32' (month/day/year,
# 24-hour clock, no seconds). An explicit format replaces
# `infer_datetime_format=True`, which is deprecated in pandas 2.x, and removes
# any month/day ambiguity. A row that does not match the format raises
# instead of being silently parsed some other way.
DATETIME_FORMAT = '%m/%d/%Y %H:%M'

df_merged['day_of_week'] = pd.to_numeric(df_merged['day_of_week'])
df_merged['started_at'] = pd.to_datetime(df_merged['started_at'], format=DATETIME_FORMAT)
df_merged['ended_at'] = pd.to_datetime(df_merged['ended_at'], format=DATETIME_FORMAT)

# Ride length as a timedelta; negative when a ride ends before it starts
df_merged['ride_duration'] = df_merged['ended_at'] - df_merged['started_at']

In [7]:
# Preview the first five rows, now including the parsed timestamps and ride_duration
df_merged.head(n=5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,member_casual,day_of_week,start_station_id,end_station_id,ride_duration
0,CFA86D4455AA1030,classic_bike,2021-03-16 08:32:00,2021-03-16 08:36:00,Humboldt Blvd & Armitage Ave,Stave St & Armitage Ave,casual,3,15651,13266,0 days 00:04:00
1,30D9DC61227D1AF3,classic_bike,2021-03-28 01:26:00,2021-03-28 01:36:00,Humboldt Blvd & Armitage Ave,Central Park Ave & Bloomingdale Ave,casual,1,15651,18017,0 days 00:10:00
2,846D87A15682A284,classic_bike,2021-03-11 21:17:00,2021-03-11 21:33:00,Shields Ave & 28th Pl,Halsted St & 35th St,casual,5,15443,13080,0 days 00:16:00
3,994D05AA75A168F2,classic_bike,2021-03-11 13:26:00,2021-03-11 13:55:00,Winthrop Ave & Lawrence Ave,Broadway & Sheridan Rd,casual,5,13080,13323,0 days 00:29:00
4,DF7464FBE92D8308,classic_bike,2021-03-21 09:09:00,2021-03-21 09:27:00,Glenwood Ave & Touhy Ave,Chicago Ave & Sheridan Rd,casual,1,525,8,0 days 00:18:00


In [9]:
# Print per-column dtypes, non-null counts and memory usage after the conversions
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype          
---  ------              --------------    -----          
 0   ride_id             1048575 non-null  object         
 1   rideable_type       1048575 non-null  object         
 2   started_at          1048575 non-null  datetime64[ns] 
 3   ended_at            1048575 non-null  datetime64[ns] 
 4   start_station_name  1048575 non-null  object         
 5   end_station_name    1048575 non-null  object         
 6   member_casual       1048575 non-null  object         
 7   day_of_week         1048575 non-null  int64          
 8   start_station_id    1048575 non-null  object         
 9   end_station_id      1048575 non-null  object         
 10  ride_duration       1048575 non-null  timedelta64[ns]
dtypes: datetime64[ns](2), int64(1), object(7), timedelta64[ns](1)
memory usage: 88.0+ MB


In [10]:
# (rows, columns) before rides with a negative duration are removed
df_merged.shape

(1048575, 11)

In [17]:
# Remove, in place, rides with a negative ride_duration,
# i.e. rides whose end time is earlier than their start time
ends_before_start = df_merged['ended_at'] < df_merged['started_at']
df_merged.drop(index=df_merged.loc[ends_before_start].index, inplace=True)


In [18]:
# (rows, columns) after removing rides that end before they start
df_merged.shape

(1048573, 11)