table of contents
1. data acquisition
2. data wrangling and cleaning
3. data exploration and visualization
4. recommendations and conclusions

## 1. Data Acquisition

### 1.1 Load Packages and Data

In [1]:
# import libraries
import os

import pandas as pd
import numpy as np
from datetime import datetime
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
# read data (website: https://www.lyft.com/bikes/bay-wheels/system-data)

# Walk through all directories and subdirectories
dfs = []
for root, dirs, files in os.walk('data/2022/'):
    for file in files:
        # Check if the file is a CSV file
        if file.endswith('.csv'):
            # Read the file into a data frame and append it to the list of data frames
            df = pd.read_csv(os.path.join(root, file))
            dfs.append(df)

# concat all data frames into a single data frame
df0 = pd.concat(dfs, ignore_index=True)

### 1.2 Inspect Data

In [3]:
# make a working copy
df1 = df0.copy()

In [4]:
# get preview
df1.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,0DD008BC62836D27,classic_bike,2022-01-13 19:12:23,2022-01-13 19:22:13,Washington St at Van Ness Ave,SF-E22,Natoma St at New Montgomery St,SF-G28-2,37.79298,-122.423302,37.786456,-122.399749,member


In [5]:
# get info about data
df1.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2614800 entries, 0 to 2614799
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   ride_id             2614800 non-null  object 
 1   rideable_type       2614800 non-null  object 
 2   started_at          2614800 non-null  object 
 3   ended_at            2614800 non-null  object 
 4   start_station_name  2230302 non-null  object 
 5   start_station_id    2227601 non-null  object 
 6   end_station_name    2174934 non-null  object 
 7   end_station_id      2172199 non-null  object 
 8   start_lat           2614800 non-null  float64
 9   start_lng           2614800 non-null  float64
 10  end_lat             2612230 non-null  float64
 11  end_lng             2612230 non-null  float64
 12  member_casual       2614800 non-null  object 
dtypes: float64(4), object(9)
memory usage: 259.3+ MB


***

## 2. Data Cleaning, Wrangling and Pre-processing

### 2.1 Update Data Types

In [6]:
# update data types
dtype= {'started_at': 'datetime64', 
        'ended_at': 'datetime64'}

df1 = df1.astype(dtype)

# Test
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2614800 entries, 0 to 2614799
Data columns (total 13 columns):
 #   Column              Dtype         
---  ------              -----         
 0   ride_id             object        
 1   rideable_type       object        
 2   started_at          datetime64[ns]
 3   ended_at            datetime64[ns]
 4   start_station_name  object        
 5   start_station_id    object        
 6   end_station_name    object        
 7   end_station_id      object        
 8   start_lat           object        
 9   start_lng           object        
 10  end_lat             object        
 11  end_lng             object        
 12  member_casual       object        
dtypes: datetime64[ns](2), object(11)
memory usage: 259.3+ MB


### 2.2 Check Duplicate and Missing Values

In [7]:
# check duplicates
df1.duplicated().sum()

0

In [8]:
# check missing values
df1.isnull().sum()

ride_id                    0
rideable_type              0
started_at                 0
ended_at                   0
start_station_name    384498
start_station_id      387199
end_station_name      439866
end_station_id        442601
start_lat                  0
start_lng                  0
end_lat                 2570
end_lng                 2570
member_casual              0
dtype: int64

In [9]:
# drop all rides missing end latitude
df1 = df1.dropna(subset=['end_lat'])

# check update
df1.isnull().sum()

ride_id                    0
rideable_type              0
started_at                 0
ended_at                   0
start_station_name    384498
start_station_id      387195
end_station_name      437296
end_station_id        440031
start_lat                  0
start_lng                  0
end_lat                    0
end_lng                    0
member_casual              0
dtype: int64

### 2.3 Handle Missing Values Using Station Name And Station ID

**Missing values in `start_station_name` and `start_station_id`**

In [11]:
# scope out rows with missing data
df1[df1['start_station_name'].isnull()].head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
1091,95B9A7847720F560,electric_bike,2022-01-18 20:14:43,2022-01-18 20:19:39,,,18th St at Noe St,SF-O19,37.76,-122.42,37.761047,-122.432642,member
1092,2A653EF4CDCE1342,electric_bike,2022-01-10 08:54:10,2022-01-10 09:13:52,,,Terry Francois Blvd at Mission Bay Blvd N,SF-L31-1,37.81,-122.42,37.771767,-122.386689,member
1093,0665F2FA09B5C666,electric_bike,2022-01-08 20:33:19,2022-01-08 20:36:18,,,Terry Francois Blvd at Mission Bay Blvd N,SF-L31-1,37.77,-122.39,37.771767,-122.386689,casual
1547,6CAE891EE8D630A5,electric_bike,2022-01-30 10:59:49,2022-01-30 11:28:04,,,48th Ave at Cabrillo St,SF-J1,37.8,-122.43,37.772954,-122.509071,casual
1548,39568245D8151714,electric_bike,2022-01-20 17:49:45,2022-01-20 17:55:13,,,Laguna St at Hayes St,SF-J21,37.78,-122.42,37.776247,-122.426203,casual


In [13]:
# rows with station id, but without station name
df1[(~df1['start_station_id'].isnull())
   &(df1['start_station_name'].isnull())]['start_station_id'].value_counts()

Series([], Name: start_station_id, dtype: int64)

In [14]:
# rows with station name, but without station id
df1[(~df1['start_station_name'].isnull())
   &(df1['start_station_id'].isnull())]['start_station_name'].value_counts()

Howard St at Grace St           1524
Berry St at 4th St Station 2     960
Howard Internal Monolith         119
Howard Grace #2                   91
Berry St at 4th St*                3
Name: start_station_name, dtype: int64

In [16]:
# get list of station names that are missing station id
stns = df1[(~df1['start_station_name'].isnull())
          &(df1['start_station_id'].isnull())]['start_station_name'] \
          .value_counts().index.tolist()

# the check how many station names are missing station id
for idx,stn in enumerate(stns):
    stn_id = df1[df1['start_station_name']==stn]['start_station_id'].value_counts().index
    print(stn, stn_id)

In [19]:
# since there is only 1, update missing station id
df1.loc[(df1['start_station_id'].isna()) 
       &(df1['start_station_name']=='Howard St at Grace St'), 'start_station_id'] = 'SF-K24-2'

# then update the rest of the station_id with the station name
for idx,stn in enumerate(stns):
    df1.loc[(df1['start_station_id'].isna()) 
       &(df1['start_station_name']==stn), 'start_station_id'] = stn

In [21]:
# check update
df1[(~df1['start_station_name'].isnull())
   &(df1['start_station_id'].isnull())]['start_station_name'].value_counts()

Series([], Name: start_station_name, dtype: int64)

**Missing values in `end_station_name` and `end_station_id`**

In [24]:
# rows with station id, but without station name
df1[(~df1['end_station_id'].isnull())
   &(df1['end_station_name'].isnull())]['end_station_name'].value_counts()

Series([], Name: end_station_name, dtype: int64)

In [25]:
# rows with station name, but without station id
df1[(~df1['end_station_name'].isnull())
   &(df1['end_station_id'].isnull())]['end_station_name'].value_counts()

Howard St at Grace St           1529
Berry St at 4th St Station 2     986
Howard Internal Monolith         129
Howard Grace #2                   88
Berry St at 4th St*                3
Name: end_station_name, dtype: int64

In [26]:
# apply the same method as we did for the start stations
# get list of station names that are missing station id
stns = df1[(~df1['end_station_name'].isnull())
          &(df1['end_station_id'].isnull())]['end_station_name'].value_counts().index.tolist()

# since there is only 1, update missing station id
df1.loc[(df1['end_station_id'].isna()) 
       &(df1['end_station_name']=='Howard St at Grace St'), 'end_station_id'] = 'SF-K24-2'

# then update the rest of the station_id with the station name
for idx,stn in enumerate(stns):
    df1.loc[(df1['end_station_id'].isna()) &(df1['end_station_name']==stn), 'end_station_id'] = stn

In [27]:
# check update
df1.isnull().sum()

ride_id                    0
rideable_type              0
started_at                 0
ended_at                   0
start_station_name    384498
start_station_id      384498
end_station_name      437296
end_station_id        437296
start_lat                  0
start_lng                  0
end_lat                    0
end_lng                    0
member_casual              0
dtype: int64

In [28]:
# now, all staions missing names are also missing id
# from here on, we will be working solely with station name

In [29]:
# let's drop station id columns
df1 = df1.drop(columns=['start_station_id', 'end_station_id'])

# check update
df1.head()

### 2.4 Handling Missing Values Using Latitude and Longitude

In [31]:
# create new columns for full coordinates of stations
df1['start_coord'] = df1['start_lat'].astype('str') + ', ' + df1['start_lng'].astype('str')
df1['end_coord'] = df1['end_lat'].astype('str') + ', ' + df1['end_lng'].astype('str')

**Missing values in `start_station_name`**

In [32]:
# check coordinates for missing start station names
df1[df1['start_station_name'].isnull()]['start_coord'].value_counts()

37.79, -122.4     22812
37.79, -122.41    14006
37.76, -122.42    12579
37.78, -122.42    12326
37.78, -122.41    11930
                  ...  
37.3, -121.83         1
37.3, -121.96         1
37.31, -121.82        1
37.26, -121.81        1
37.28, -121.86        1
Name: start_coord, Length: 331, dtype: int64

In [33]:
# get list of coordinates for missing stations
start_coords = df1[df1['start_station_name'].isnull()]['start_coord'].value_counts().index.tolist()

# create a dictionary for coordinates, look up corresponding station names from other sections of the data
stn_names = {}
for coords in start_coords: # loop over coords in start_coords list
    stn = df1[(df1['start_coord']==coords) & (~df1['start_station_name'].isnull())]
    
    #if not empty, append station id; else continue
    if len(stn): stn_names[coords] = stn['start_station_name'].value_counts().index[0] 
    else: continue

# check output        
stn_names

{'37.78, -122.42': 'Howard St at Grace St',
 '37.77, -122.43': 'Howard Internal Monolith',
 '37.77, -122.42': 'Howard St at Grace St',
 '37.77, -122.41': 'Howard St at Grace St',
 '37.78, -122.39': 'Berry St at 4th St Station 2'}

In [34]:
# some of these coordinates have more than 1 station associated with them, but that means they are all very near each other and 
# it doesnt make a huge difference if the station name is wrong bc it's being replaced by a station name that is also close by

In [35]:
# using the dictionary, fill in rows missing station name
for coord in stn_names:
    df1.loc[(df1['start_coord']==coord) & (df1['start_station_name'].isnull()), 'start_station_name'] = stn_names[coord]

# check update
df1.isnull().sum()

**Missing values in `end_station_name`**

In [38]:
# check coordinates for missing end station names
df1[df1['end_station_name'].isnull()]['end_coord'].value_counts()

37.79, -122.4     27175
37.79, -122.41    15825
37.78, -122.42    14456
37.78, -122.41    14421
37.76, -122.42    13930
                  ...  
37.84, -122.26        1
37.39, -121.98        1
37.32, -122.01        1
37.26, -121.95        1
37.55, -122.31        1
Name: end_coord, Length: 461, dtype: int64

In [39]:
# get list of coordinates for missing stations
start_coords = df1[df1['end_station_name'].isnull()]['end_coord'].value_counts().index.tolist()

# create a dictionary for coordinates, look up corresponding station names from other sections of the data
stn_names = {}
for coords in start_coords: # loop over coords in start_coords list
    stn = df1[(df1['end_coord']==coords) & (~df1['end_station_name'].isnull())]
    
    #if not empty, append station id; else continue
    if len(stn): stn_names[coords] = stn['end_station_name'].value_counts().index[0] #if not empty, append station id
    else: continue

# check output         
stn_names

{'37.77, -122.41': 'Howard St at Grace St',
 '37.78, -122.39': 'Berry St at 4th St Station 2'}

In [40]:
# using the dictionary, fill in rows missing station name
for coord in stn_names:
    df1.loc[(df1['end_coord']==coord) & (df1['end_station_name'].isnull()), 'end_station_name'] = stn_names[coord]

# check update
df1.isnull().sum()

**Rows Still Missing Data in `start_station_name` And `end_station_name`**

In [42]:
# check % of data missing both start and end stations
len(df1[(df1['start_station_name'].isnull()) & (df1['end_station_name'].isnull())]) / 2614799 * 100

5.843661405714168

In [43]:
# drop rows
df1 = df1.dropna(subset=['start_station_name', 'end_station_name'], how='all')

# check update
df1.isnull().sum()

ride_id                    0
rideable_type              0
started_at                 0
ended_at                   0
start_station_name    181135
end_station_name      265558
start_lat                  0
start_lng                  0
end_lat                    0
end_lng                    0
member_casual              0
start_coord                0
end_coord                  0
dtype: int64

In [44]:
# not ideal but acceptable

### 2.5 Create Time and Datetime Columns

In [84]:
# create `ride_duration_m` column - ride duration in minutes
df1['ride_duration_m'] = (df1['ended_at'] - df1['started_at']).astype('timedelta64[m]')

# check update
df1.head()

In [86]:
# create month, day and hour columns
df1['month'] = df1['started_at'].dt.month_name()
df1['start_day'] = df1['started_at'].dt.day_name()
df1['hour'] = df1['started_at'].dt.hour

# check update
df1.head()

### 2.6 Create Dataframe of Station Locations

In [88]:
# get the average coordinates of each start station
start_stations_coords = df1.groupby('start_station_name')[['start_lat', 'start_lng']] \
                           .mean().reset_index().rename(columns={'start_station_name': 'station_name',
                                                                 'start_lat': 'lat',
                                                                 'start_lng': 'lng'})

# check dataframe
start_stations_coords.head()

In [89]:
# check that no 2 stations have the same coordinates
start_stations_coords.duplicated(subset=['start_lat', 'start_lng']).sum()

0

In [92]:
# get the average coordinates of each end station
end_stations_coords = df1.groupby('end_station_name')[['end_lat', 'end_lng']] \
                         .mean().reset_index().rename(columns={'end_station_name': 'station_name',
                                                               'end_lat': 'lat',
                                                               'end_lng': 'lng'})

# check dataframe
end_stations_coords.head()

0

In [None]:
# check that no 2 stations have the same coordinates
end_stations_coords.duplicated(subset=['end_lat', 'end_lng']).sum()

In [95]:
# concat start and end stations into a long dataframe and get average coordinates again
stations = pd.concat([start_stations_coords, end_stations_coords], ignore_index=True) \
             .groupby('station_name')[['lat', 'lng']].mean().reset_index()

# check dataframe
stations.head()

In [99]:
# check that no 2 stations have the same coordinates
stations.duplicated(subset=['lat', 'lng']).sum()

0

In [None]:
# summarize main features of the data at this point

In [83]:
# create a copy so we can work without having to restart the entire notebook
df_clean = df1.copy()

## 3. Data Exploration and Visualization

In [108]:
# groupby start station
# groupby end station
start_count = df_clean.groupby('start_station_name')[['ride_id']].count() \
                      .sort_values(by='ride_id', ascending=False).reset_index()

In [109]:
end_count = df_clean.groupby('end_station_name')[['ride_id']].count().sort_values(by='ride_id', ascending=False).reset_index()

In [116]:
ride_count_diff = start_count.merge(end_count, 
                                    left_on='start_station_name', 
                                    right_on='end_station_name', 
                                    suffixes=('_start', '_end')).drop(columns=['end_station_name']) \
                             .rename(columns={'start_station_name' : 'station_name',
                                              'ride_id_start': 'start_count',
                                              'ride_id_end': 'end_count'})

In [117]:
ride_count_diff

Unnamed: 0,station_name,start_count,end_count
0,Howard St at Grace St,35740,13991
1,Market St at 10th St,30934,30900
2,Market St at Steuart St,28825,30129
3,Powell St BART Station (Market St at 4th St),26315,27451
4,Powell St BART Station (Market St at 5th St),21831,22639
...,...,...,...
520,West at Brockhurst,10,10
521,Battery Street at California St,8,11
522,Berry St at 4th St*,3,3
523,X-DEV-MTL-01,3,3


In [None]:
# now we have 2 extra tables: stations and ride_count_diff

In [50]:
'''
explore by
- ride duration
- customer/member
- station
- time series
'''

'\nexplore by\n- ride duration\n- customer/member\n- station\n- time series\n'

In [102]:
# groupby `member_casual`
df_clean['member_casual'].value_counts(normalize=True)

member    0.589057
casual    0.410943
Name: member_casual, dtype: float64

In [118]:
df_clean['ride_duration_m'].describe() #why are there negative ride durations?????

count    2.459430e+06
mean     1.434708e+01
std      5.113678e+01
min     -3.810000e+02
25%      5.000000e+00
50%      9.000000e+00
75%      1.600000e+01
max      2.454000e+04
Name: ride_duration_m, dtype: float64