In [45]:
import os
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# Load data
In this step I load the first CSV file (July 2022, `202207`) and inspect it to understand the format of the data.

In [46]:
# Directory holding the monthly Divvy trip CSVs. It defaults to the original
# location so existing runs are unaffected; set the DIVVY_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "DIVVY_DATA_DIR",
        "/Users/kangchieh/Project/divvy_bike/data/divvy_202207-202306/csv",
    )
)

# Load the July 2022 file and preview the first rows to see the column layout.
df = pd.read_csv(DATA_DIR / "202207-divvy-tripdata.csv")
df.head(10)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,954144C2F67B1932,classic_bike,2022-07-05 08:12:47,2022-07-05 08:24:32,Ashland Ave & Blackhawk St,13224,Kingsbury St & Kinzie St,KA1503000043,41.907066,-87.667252,41.889177,-87.638506,member
1,292E027607D218B6,classic_bike,2022-07-26 12:53:38,2022-07-26 12:55:31,Buckingham Fountain (Temp),15541,Michigan Ave & 8th St,623,41.869621,-87.623981,41.872773,-87.623981,casual
2,57765852588AD6E0,classic_bike,2022-07-03 13:58:49,2022-07-03 14:06:32,Buckingham Fountain (Temp),15541,Michigan Ave & 8th St,623,41.869621,-87.623981,41.872773,-87.623981,casual
3,B5B6BE44314590E6,classic_bike,2022-07-31 17:44:21,2022-07-31 18:42:50,Buckingham Fountain (Temp),15541,Woodlawn Ave & 55th St,TA1307000164,41.869621,-87.623981,41.795264,-87.596471,casual
4,A4C331F2A00E79E0,classic_bike,2022-07-13 19:49:06,2022-07-13 20:15:24,Wabash Ave & Grand Ave,TA1307000117,Sheffield Ave & Wellington Ave,TA1307000052,41.891466,-87.626761,41.936253,-87.652662,member
5,579D73BE2ED880B3,electric_bike,2022-07-01 17:04:35,2022-07-01 17:13:18,Desplaines St & Randolph St,15535,Clinton St & Roosevelt Rd,WL-008,41.884614,-87.644564,41.867118,-87.641088,member
6,EFE518CCEE333669,classic_bike,2022-07-18 18:11:01,2022-07-18 18:22:30,Marquette Ave & 89th St,20239,East End Ave & 87th St,20231,41.733669,-87.558342,41.736815,-87.582801,member
7,315FEBB7B3F6D2EA,classic_bike,2022-07-28 20:38:18,2022-07-28 21:09:11,Wabash Ave & Grand Ave,TA1307000117,Dearborn Pkwy & Delaware Pl,TA1307000128,41.891466,-87.626761,41.898969,-87.629912,casual
8,EE3C4A1E66766B56,classic_bike,2022-07-10 22:55:59,2022-07-10 23:01:32,Wabash Ave & Grand Ave,TA1307000117,Dearborn Pkwy & Delaware Pl,TA1307000128,41.891466,-87.626761,41.898969,-87.629912,member
9,1EE6C93A547A187C,electric_bike,2022-07-10 09:35:58,2022-07-10 09:47:25,Ashland Ave & Blackhawk St,13224,Orleans St & Merchandise Mart Plaza,TA1305000022,41.907093,-87.667247,41.888243,-87.63639,member


# Missing values

In [47]:
# Show the non-null count and dtype of every column to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 823488 entries, 0 to 823487
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ride_id             823488 non-null  object 
 1   rideable_type       823488 non-null  object 
 2   started_at          823488 non-null  object 
 3   ended_at            823488 non-null  object 
 4   start_station_name  711457 non-null  object 
 5   start_station_id    711457 non-null  object 
 6   end_station_name    702537 non-null  object 
 7   end_station_id      702537 non-null  object 
 8   start_lat           823488 non-null  float64
 9   start_lng           823488 non-null  float64
 10  end_lat             822541 non-null  float64
 11  end_lng             822541 non-null  float64
 12  member_casual       823488 non-null  object 
dtypes: float64(4), object(9)
memory usage: 81.7+ MB


## Conclusion
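As we can see, out of 823,488 rows there are missing values in the following columns: start_station_name and start_station_id (112,031 missing each), end_station_name and end_station_id (120,951 missing each), and end_lat and end_lng (947 missing each).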

# Data validity

## Check if the length of ride id is equal to 16

In [48]:
def len_rideid(id):
    """Return True when a ride id is a string of exactly 16 characters.

    Parameters
    ----------
    id : object
        A single value taken from the ``ride_id`` column. The parameter name
        is kept for backward compatibility even though it shadows the builtin.

    Returns
    -------
    bool
        True if ``id`` is a ``str`` of length 16, False otherwise.
    """
    # A non-string value (e.g. the float NaN pandas uses for a missing cell)
    # has no len(); report it as invalid instead of raising TypeError.
    return isinstance(id, str) and len(id) == 16

# Count the ride ids for which the length check returned False.
invalid_id_count = df['ride_id'].apply(len_rideid).eq(False).sum()
print(f"There are {invalid_id_count} id does not have length of 16")

There are 0 id does not have length of 16


## Check the categories of bikes 

In [49]:
# Collect the distinct values of rideable_type to confirm the expected categories.
bike_types = df['rideable_type'].unique()
print(f"Types of bikes: {bike_types}")

Types of bikes: ['classic_bike' 'electric_bike' 'docked_bike']


## Check that started_at and ended_at are valid datetimes

In [57]:
# Parse started_at as datetimes; errors='coerce' turns any value that cannot
# be parsed into NaT instead of raising, so bad values can be counted below.
df["started_at"] = pd.to_datetime(df["started_at"], errors='coerce')

# NaT marks a value that failed to parse (or was missing to begin with).
# isna() tests for it directly, avoiding unary minus on a boolean mask and
# the per-row Python time objects that .dt.time would build.
print(f"There are {df['started_at'].isna().sum()} values in started_at that are not in time format")

There are 0 values in started_at that are not in time format


In [58]:
# Parse ended_at as datetimes; errors='coerce' turns any value that cannot
# be parsed into NaT instead of raising, so bad values can be counted below.
df["ended_at"] = pd.to_datetime(df["ended_at"], errors='coerce')

# NaT marks a value that failed to parse (or was missing to begin with).
# isna() tests for it directly, avoiding unary minus on a boolean mask and
# the per-row Python time objects that .dt.time would build.
print(f"There are {df['ended_at'].isna().sum()} values in ended_at that are not in time format")

There are 0 values in ended_at that are not in time format


## Check latitude and longitude

In [66]:
# Inclusive bounds each coordinate column is checked against:
# latitude in [-90, 90], longitude in [-180, 180].
coordinate_bounds = {
    'start_lat': (-90, 90),
    'start_lng': (-180, 180),
    'end_lat': (-90, 90),
    'end_lng': (-180, 180),
}

for column, (lower, upper) in coordinate_bounds.items():
    # between() is False for NaN, so missing coordinates are included in this
    # count along with any value outside the bounds. `~` is the boolean
    # negation operator for a mask (rather than unary minus).
    outside_bounds = ~df[column].between(lower, upper)
    print(f"There are {outside_bounds.sum()} values in {column} that do not match the format")

There are 0 values in start_lat that do not match the format
There are 0 values in start_lng that do not match the format
There are 947 values in end_lat that do not match the format
There are 947 values in end_lng that do not match the format


In [65]:
# Show the end_lng entries that failed the range check. A single .loc call
# with a `~`-negated mask replaces chained indexing with unary minus.
df.loc[~df['end_lng'].between(-180, 180), 'end_lng'] # the values that do not match the format are nan values

36126    NaN
36267    NaN
36298    NaN
36335    NaN
36357    NaN
          ..
812854   NaN
814289   NaN
814952   NaN
815137   NaN
819961   NaN
Name: end_lng, Length: 947, dtype: float64

## Check the categories of member_casual

In [67]:
# Collect the distinct values of member_casual to confirm the expected categories.
rider_categories = df['member_casual'].unique()
print(f"Categories of member_casual: {rider_categories}")

Categories of member_casual: ['member' 'casual']
