In [87]:
import pandas as pd

from datetime import datetime
from geopy import distance

# Load data
In this step I load the first CSV file (July 2022) and inspect it to understand the format of the data.

In [2]:
# Read the July 2022 trip file and preview its first ten rows.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
trips_csv = "/Users/kangchieh/Project/divvy_bike/data/divvy_202207-202306/csv/202207-divvy-tripdata.csv"
df = pd.read_csv(trips_csv)
df.head(10)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,954144C2F67B1932,classic_bike,2022-07-05 08:12:47,2022-07-05 08:24:32,Ashland Ave & Blackhawk St,13224,Kingsbury St & Kinzie St,KA1503000043,41.907066,-87.667252,41.889177,-87.638506,member
1,292E027607D218B6,classic_bike,2022-07-26 12:53:38,2022-07-26 12:55:31,Buckingham Fountain (Temp),15541,Michigan Ave & 8th St,623,41.869621,-87.623981,41.872773,-87.623981,casual
2,57765852588AD6E0,classic_bike,2022-07-03 13:58:49,2022-07-03 14:06:32,Buckingham Fountain (Temp),15541,Michigan Ave & 8th St,623,41.869621,-87.623981,41.872773,-87.623981,casual
3,B5B6BE44314590E6,classic_bike,2022-07-31 17:44:21,2022-07-31 18:42:50,Buckingham Fountain (Temp),15541,Woodlawn Ave & 55th St,TA1307000164,41.869621,-87.623981,41.795264,-87.596471,casual
4,A4C331F2A00E79E0,classic_bike,2022-07-13 19:49:06,2022-07-13 20:15:24,Wabash Ave & Grand Ave,TA1307000117,Sheffield Ave & Wellington Ave,TA1307000052,41.891466,-87.626761,41.936253,-87.652662,member
5,579D73BE2ED880B3,electric_bike,2022-07-01 17:04:35,2022-07-01 17:13:18,Desplaines St & Randolph St,15535,Clinton St & Roosevelt Rd,WL-008,41.884614,-87.644564,41.867118,-87.641088,member
6,EFE518CCEE333669,classic_bike,2022-07-18 18:11:01,2022-07-18 18:22:30,Marquette Ave & 89th St,20239,East End Ave & 87th St,20231,41.733669,-87.558342,41.736815,-87.582801,member
7,315FEBB7B3F6D2EA,classic_bike,2022-07-28 20:38:18,2022-07-28 21:09:11,Wabash Ave & Grand Ave,TA1307000117,Dearborn Pkwy & Delaware Pl,TA1307000128,41.891466,-87.626761,41.898969,-87.629912,casual
8,EE3C4A1E66766B56,classic_bike,2022-07-10 22:55:59,2022-07-10 23:01:32,Wabash Ave & Grand Ave,TA1307000117,Dearborn Pkwy & Delaware Pl,TA1307000128,41.891466,-87.626761,41.898969,-87.629912,member
9,1EE6C93A547A187C,electric_bike,2022-07-10 09:35:58,2022-07-10 09:47:25,Ashland Ave & Blackhawk St,13224,Orleans St & Merchandise Mart Plaza,TA1305000022,41.907093,-87.667247,41.888243,-87.63639,member


# Missing values

In [3]:
# Print the per-column summary of df (non-null counts and dtypes) to see
# how many nulls each column has.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 823488 entries, 0 to 823487
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ride_id             823488 non-null  object 
 1   rideable_type       823488 non-null  object 
 2   started_at          823488 non-null  object 
 3   ended_at            823488 non-null  object 
 4   start_station_name  711457 non-null  object 
 5   start_station_id    711457 non-null  object 
 6   end_station_name    702537 non-null  object 
 7   end_station_id      702537 non-null  object 
 8   start_lat           823488 non-null  float64
 9   start_lng           823488 non-null  float64
 10  end_lat             822541 non-null  float64
 11  end_lng             822541 non-null  float64
 12  member_casual       823488 non-null  object 
dtypes: float64(4), object(9)
memory usage: 81.7+ MB


## Conclusion
As we can see, there are missing values in the following columns: start_station_name, start_station_id, end_station_name, end_station_id, end_lat, and end_lng.

# Data validity

## Check if the length of ride id is equal to 16

In [74]:
def len_rideid(id):
    """Return True when ``id`` is a string of exactly 16 characters.

    Non-string values (for example the float NaN that pandas uses for a
    missing entry) are reported as invalid instead of raising ``TypeError``
    from ``len()``.
    """
    return isinstance(id, str) and len(id) == 16

# Count the ride ids that fail the length check. `~` is the documented way to
# negate a boolean mask; unary minus is rejected by plain NumPy boolean arrays.
print(f"There are {(~df['ride_id'].apply(len_rideid)).sum()} id does not have length of 16")

There are 0 id does not have length of 16


## Check the categories of bikes 

In [5]:
# List the distinct rideable_type values that occur in the data.
bike_types = df['rideable_type'].unique()
print(f"Types of bikes: {bike_types}")

Types of bikes: ['classic_bike' 'electric_bike' 'docked_bike']


## Check if started_at and ended_at are in datetime format

In [6]:
# Parse started_at as datetimes. errors='coerce' turns anything that cannot be
# parsed into NaT, so counting the missing values after the conversion gives
# the number of entries that are not valid timestamps (entries that were
# already missing before parsing are counted as well).
df["started_at"] = pd.to_datetime(df["started_at"], errors='coerce')
print(f"There are {df['started_at'].isna().sum()} values in started_at that are not in time format")

There are 0 values in started_at that are not in time format


In [7]:
# Parse ended_at as datetimes. errors='coerce' turns anything that cannot be
# parsed into NaT, so counting the missing values after the conversion gives
# the number of entries that are not valid timestamps (entries that were
# already missing before parsing are counted as well).
df["ended_at"] = pd.to_datetime(df["ended_at"], errors='coerce')
print(f"There are {df['ended_at'].isna().sum()} values in ended_at that are not in time format")

There are 0 values in ended_at that are not in time format


## Check latitude and longitude

In [8]:
# Valid ranges: latitude in [-90, 90], longitude in [-180, 180].
# between() is False for NaN, so missing coordinates are counted as
# "not matching" too.
coordinate_bounds = {
    "start_lat": (-90, 90),
    "start_lng": (-180, 180),
    "end_lat": (-90, 90),
    "end_lng": (-180, 180),
}
for column, (lower, upper) in coordinate_bounds.items():
    # `~` is the documented boolean negation for a pandas mask.
    out_of_range = ~df[column].between(lower, upper)
    print(f"There are {out_of_range.sum()} values in {column} that do not match the format")

There are 0 values in start_lat that do not match the format
There are 0 values in start_lng that do not match the format
There are 947 values in end_lat that do not match the format
There are 947 values in end_lng that do not match the format


In [86]:
# The values that do not match the format are NaN values: among the rows
# flagged as out of range, count how many are actually non-null.
# Latitude is checked against [-90, 90] and longitude against [-180, 180],
# the same bounds used for the range check itself.
print(f"There are {df['end_lat'][~df['end_lat'].between(-90, 90)].notnull().sum()} not null end_lat values")
print(f"There are {df['end_lng'][~df['end_lng'].between(-180, 180)].notnull().sum()} not null end_lng valuest")


There are 0 not null end_lat values
There are 0 not null end_lng valuest


## Check the categories of member_casual

In [10]:
# List the distinct rider categories that occur in the data.
rider_categories = df['member_casual'].unique()
print(f"Categories of member_casual: {rider_categories}")

Categories of member_casual: ['member' 'casual']


## Check if there are two stations with the same id but different names (e.g. misspellings)

In [73]:
# Build one long table of station observations (name, id, lat, lng) by
# stacking the start-station and the end-station columns of every trip.
start = df.loc[:, ["start_station_name", "start_station_id", "start_lat", "start_lng"]]
end = df.loc[:, ["end_station_name", "end_station_id", "end_lat", "end_lng"]]
stations_df = pd.concat([
    start.rename(columns={'start_station_name': 'name', 'start_station_id': 'id', 'start_lat': 'lat', 'start_lng': 'lng'}),
    end.rename(columns={'end_station_name': 'name', 'end_station_id': 'id', 'end_lat': 'lat', 'end_lng': 'lng'}),
])

# Find the ids that are attached to more than one distinct station name.
result = stations_df.groupby('id')['name'].nunique() > 1
rows_with_different_names = stations_df[stations_df['id'].isin(result[result].index)]

# Keep only one instance of each (name, id) pair.
stations_df_drop = rows_with_different_names[['name', 'id']].drop_duplicates()

# Representative coordinates per (name, id): the most frequent lat and lng.
# The columns are selected with a list; tuple-style ['lat', 'lng'] on a
# groupby raises a FutureWarning and is rejected by pandas 2.x.
stations_df_lat_lng = stations_df.groupby(['name', 'id'])[['lat', 'lng']].agg(lambda x: x.mode().iloc[0])

# Get the list of distinct station ids.
id_list = stations_df['id'].unique()

  stations_df_lat_lng = stations_df.groupby(['name', 'id'])['lat', 'lng'].agg(lambda x: pd.Series.mode(x)[0])


In [88]:
# Decide whether two station names sharing an id are the same station by
# measuring the distance between their representative coordinates.
#   change_name: the two are at most 50 meters apart, so only the name needs
#                to be unified (id -> name to use).
#   change_id:   the two are more than 50 meters apart, so the first name
#                receives a newly created id (name -> new id).
# Only the first two names found for an id are compared.
SAME_STATION_MAX_KM = 0.05  # 50 meters

modify = {"change_name": {}, "change_id": {}}
# The loop variable is named station_id so the builtin `id` is not
# overwritten in the notebook namespace.
for station_id, names in stations_df_drop.groupby('id')['name']:
    first_name, second_name = names.values[0], names.values[1]
    gap_km = distance.distance(
        stations_df_lat_lng.loc[(first_name, station_id)].values,
        stations_df_lat_lng.loc[(second_name, station_id)].values,
    ).km
    if gap_km <= SAME_STATION_MAX_KM:
        modify["change_name"][station_id] = first_name
    else:
        modify["change_id"][first_name] = "A" + station_id + "-1"  # create new id

In [89]:
# Display the planned corrections: 'change_name' maps a station id to the
# name to use, 'change_id' maps a station name to its newly created id.
modify

{'change_name': {'13053': 'Green St & Randolph St',
  'TA1306000015': 'Morgan St & Lake St'},
 'change_id': {'Bissell St & Armitage Ave': 'A13059-1',
  'Ridge Blvd & Howard St': 'A514-1',
  'Paulina St & Howard St': 'A515-1',
  'Clark St & Jarvis Ave': 'A517-1',
  'Public Rack - Keystone Ave & North Ave': 'A518-1',
  'Wolcott Ave & Fargo Ave': 'A519-1',
  'Greenview Ave & Jarvis Ave': 'A520-1',
  'Eastlake Ter & Rogers Ave': 'A523-1',
  'Glenwood Ave & Touhy Ave': 'A525-1',
  'Western Ave & Howard St': 'A527-1',
  'Pulaski Rd & Lake St': 'A528-1',
  'Pulaski Rd & Congress Pkwy': 'A535-1',
  'Kostner Ave & Lake St': 'A536-1',
  'Laramie Ave & Madison St': 'A540-1',
  'Public Rack - Cicero Ave & Roscoe St': 'A543-1',
  'Kostner Ave & Adams St': 'A545-1',
  'Public Rack - Cicero Ave & Wellington Ave': 'A546-1',
  'Marshfield Ave & 44th St': 'A549-1',
  'Elizabeth St & 47th St': 'A553-1',
  'Damen Ave & 51st St': 'A554-1',
  'Throop St & 52nd St': 'A556-1',
  'Racine Ave & Garfield Blvd': 

In [98]:
# For every id whose names were judged to be the same station, write the
# chosen name onto each trip row whose start or end station id matches.
# NOTE(review): df1 is not created in any cell visible here; confirm it is
# defined (presumably derived from df) before this cell runs.
for station_id, canonical_name in modify['change_name'].items():
    starts_here = df1['start_station_id'] == station_id
    ends_here = df1['end_station_id'] == station_id
    df1.loc[starts_here, 'start_station_name'] = canonical_name
    df1.loc[ends_here, 'end_station_name'] = canonical_name

In [106]:
# For every station name that was assigned a new id, write that id onto each
# trip row whose start or end station name matches.
# NOTE(review): df1 is not created in any cell visible here; confirm it is
# defined (presumably derived from df) before this cell runs.
for station_name, new_station_id in modify['change_id'].items():
    starts_here = df1['start_station_name'] == station_name
    ends_here = df1['end_station_name'] == station_name
    df1.loc[starts_here, 'start_station_id'] = new_station_id
    df1.loc[ends_here, 'end_station_id'] = new_station_id