In [1]:
import time
import os
from glob2 import glob
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
def _format_hms(seconds):
    """Format a duration given in seconds as a zero-padded ``HH:MM:SS`` string."""
    # Truncate to whole seconds; clamp at zero so a negative duration
    # never renders as a negative clock value.
    whole_seconds = max(int(seconds), 0)
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"


def calc_progress(processed_files, total_files, start_time):
    """
    Calculate and print the progress information.

    Prints three lines: the elapsed time, the average time per processed
    file, and the estimated remaining time, each formatted as ``HH:MM:SS``.
    The last line is followed by a blank line.

    Args:
        processed_files (int): Number of files processed so far.
        total_files (int): Total number of files being processed.
        start_time (float): Start time of the file processing, as returned
            by ``time.time()``.

    Returns:
        None. The function only prints.
    """
    elapsed_time = time.time() - start_time
    print(f"Elapsed Time:             {_format_hms(elapsed_time)}")

    if processed_files <= 0:
        # Nothing has been processed yet, so an average (and therefore an
        # estimate) cannot be computed; avoid dividing by zero.
        print("Avg. Time per File:       --:--:--")
        print("Estimated Remaining Time: --:--:--\n")
        return

    avg_time_per_file = elapsed_time / processed_files
    # Never report a negative number of files left.
    remaining_files = max(total_files - processed_files, 0)
    estimated_remaining_time = avg_time_per_file * remaining_files

    print(f"Avg. Time per File:       {_format_hms(avg_time_per_file)}")
    print(f"Estimated Remaining Time: {_format_hms(estimated_remaining_time)}\n")

In [3]:
# Collect the raw bikeshare parquet files, ordered by path
parquet_pattern = os.path.join('../raw_data/bikeshare', '*.parquet')
files = glob(parquet_pattern)
files.sort()

In [4]:
# Load the cleaned station table
stations_path = '../clean_data/bike_stations.parquet'
stations = pd.read_parquet(stations_path)

In [5]:
# Preview the station table (station id, name, coordinates, borough)
stations

Unnamed: 0,station_id,station_name,latitude,longitude,borough
0,1000,1 Ave & E 110 St,40.78125,-73.9375,Manhattan
1,1001,1 Ave & E 16 St,40.71875,-74.0000,Manhattan
2,1002,1 Ave & E 18 St,40.71875,-74.0000,Manhattan
3,1003,1 Ave & E 30 St,40.75000,-74.0000,Manhattan
4,1004,1 Ave & E 39 St,40.75000,-74.0000,Manhattan
...,...,...,...,...,...
1803,2803,Wyckoff Ave & Gates Ave,40.68750,-73.9375,Brooklyn
1804,2804,Wyckoff St & 3 Ave,40.68750,-74.0000,Brooklyn
1805,2805,Wythe Ave & Metropolitan Ave,40.71875,-73.9375,Brooklyn
1806,2806,Wythe Ave & N 13 St,40.71875,-73.9375,Brooklyn


In [6]:
# Lookup table: station name -> station id
map_station_id = dict(zip(stations['station_name'], stations['station_id']))

In [7]:
# Lookup table: station id -> borough
map_borough = dict(zip(stations['station_id'], stations['borough']))

In [8]:
# Collect each cleaned file here and concatenate once after the loop.
# Calling pd.concat on every iteration re-copies all previously compiled rows.
frames = []

# Initialize the start time
start_time = time.time()

# Iterate over each file
for i, file in enumerate(files):
    print(f"Processing file {i+1}/{len(files)}")
    df = pd.read_parquet(file)

    # Standardize names and drop segmented columns.
    # The raw files come in two schemas, told apart by their start-time column.
    if 'starttime' in df.columns:
        # 'starttime' schema: rename the space-separated station columns
        df = df.rename(columns={
            'start station id': 'start_station_id',
            'start station name': 'start_station_name',
            'start station latitude': 'start_lat',
            'start station longitude': 'start_lng',
            'end station id': 'end_station_id',
            'end station name': 'end_station_name',
            'end station latitude': 'end_lat',
            'end station longitude': 'end_lng'
        })

        # Drop columns that are not carried into the compiled data
        df = df.drop(['stoptime', 'tripduration', 'bikeid', 'birth year', 'gender'], axis=1)

        # Map usertype values onto the labels used by the other schema;
        # any value outside this mapping becomes NaN
        df['usertype'] = df['usertype'].map({'Subscriber': 'member', 'Customer': 'casual'})

    elif 'started_at' in df.columns:
        # 'started_at' schema: align column names with the 'starttime' schema
        df = df.rename(columns={
            'started_at': 'starttime',
            'member_casual': 'usertype'
        })

        # Drop columns that are not carried into the compiled data
        df = df.drop(['ride_id', 'rideable_type', 'ended_at'], axis=1)

    # Re-derive station ids from station names, then look up each borough.
    # Names missing from the station table yield NaN ids and NaN boroughs.
    df['start_station_id'] = df['start_station_name'].map(map_station_id)
    df['end_station_id'] = df['end_station_name'].map(map_station_id)
    df['start_borough'] = df['start_station_id'].map(map_borough)
    df['end_borough'] = df['end_station_id'].map(map_borough)

    # Drop station name and spatial columns
    df = df.drop(['start_station_id', 'end_station_id', 'start_station_name', 'end_station_name',
                  'start_lat', 'start_lng', 'end_lat', 'end_lng'], axis=1)

    # Convert to datetime
    df['starttime'] = pd.to_datetime(df['starttime'])

    # Set a datetime index
    df = df.set_index('starttime')

    # Keep the cleaned frame for the single concatenation below
    frames.append(df)

    # Report progress once this file is fully processed, so the average
    # time per file includes the cleaning work and not only the read
    calc_progress(i+1, len(files), start_time)

# Concatenate the clean data into tripdata; pd.concat raises on an empty
# list, so fall back to an empty frame when no files were found
tripdata = pd.concat(frames) if frames else pd.DataFrame()

# Release the per-file frames so they do not stay alive next to tripdata
del frames

Processing file 1/60
Elapsed Time:             00:00:01
Avg. Time per File:       00:00:01
Estimated Remaining Time: 00:01:45

Processing file 2/60
Elapsed Time:             00:00:04
Avg. Time per File:       00:00:02
Estimated Remaining Time: 00:02:23

Processing file 3/60
Elapsed Time:             00:00:07
Avg. Time per File:       00:00:02
Estimated Remaining Time: 00:02:21

Processing file 4/60
Elapsed Time:             00:00:10
Avg. Time per File:       00:00:02
Estimated Remaining Time: 00:02:23

Processing file 5/60
Elapsed Time:             00:00:12
Avg. Time per File:       00:00:02
Estimated Remaining Time: 00:02:20

Processing file 6/60
Elapsed Time:             00:00:15
Avg. Time per File:       00:00:02
Estimated Remaining Time: 00:02:16

Processing file 7/60
Elapsed Time:             00:00:16
Avg. Time per File:       00:00:02
Estimated Remaining Time: 00:02:06

Processing file 8/60
Elapsed Time:             00:00:18
Avg. Time per File:       00:00:02
Estimated Remaining 

In [10]:
# Label missing boroughs as 'Unknown', then store both columns as categoricals
for col in ('start_borough', 'end_borough'):
    tripdata[col] = tripdata[col].fillna('Unknown').astype('category')

In [11]:
# Store the rider type as a categorical as well
tripdata['usertype'] = tripdata['usertype'].astype(pd.CategoricalDtype())

In [12]:
# Record the row count before any observations are filtered out
og_rows = tripdata.shape[0]
print(f"Total Rows: {og_rows:,.0f}")

Total Rows: 122,523,911


In [13]:
# Exclude observations originating or ending in the Bronx, New Jersey, or Unknown
excluded_boroughs = ['Bronx', 'New Jersey', 'Unknown']
starts_in_scope = ~tripdata['start_borough'].isin(excluded_boroughs)
ends_in_scope = ~tripdata['end_borough'].isin(excluded_boroughs)
tripdata = tripdata[starts_in_scope & ends_in_scope]

In [14]:
# Remove the Bronx, Unknown, and New Jersey categories.
# remove_categories returns a new Series (its `inplace` option is gone in
# pandas 2.0), so assign the result back to the column.
dropped_categories = ['Bronx', 'Unknown', 'New Jersey']
for col in ['start', 'end']:
    borough_col = f'{col}_borough'
    # remove_categories raises if asked to remove a category that does not
    # exist, so only pass the ones this column actually has
    present = [cat for cat in dropped_categories if cat in tripdata[borough_col].cat.categories]
    tripdata[borough_col] = tripdata[borough_col].cat.remove_categories(present)

In [15]:
# Get the number of observations remaining after the borough filter.
# Count the compiled `tripdata`; `df` is the per-file variable of the compile
# loop and does not hold the filtered data set.
no_nj = len(tripdata)

# Calculate the number of observations removed
nj_dropped = og_rows - no_nj

print(f"Rows Removed: {nj_dropped:,.0f}")
print(f"Rows Remaining: {no_nj:,.0f}")

Rows Removed: 28,072,547
Rows Remaining: 94,451,364


In [16]:
# Share of the original observations removed by the borough filter
percent_loss = (nj_dropped/og_rows)*100
print(f"Percent Loss: {percent_loss:.2f}%")

Percent Loss: 22.91%


In [17]:
# Count the missing values remaining in each column
tripdata.isna().sum()

usertype         0
start_borough    0
end_borough      0
dtype: int64

In [18]:
# Summarize each column: distinct values, dtype, and deep memory usage
# (bytes divided by 1024 twice, rounded to two decimals)
column_summary = []
for column_name in tripdata.columns:
    column = tripdata[column_name]
    memory_mb = round((column.memory_usage(deep=True)/1024)/1024, 2)
    column_summary.append((column_name, column.nunique(), column.dtype, memory_mb))

pd.DataFrame.from_records(
    column_summary,
    columns=['Column Name', 'Unique', 'Data Type','Memory Usage']
)

Unnamed: 0,Column Name,Unique,Data Type,Memory Usage
0,usertype,2,category,810.68
1,start_borough,3,category,810.68
2,end_borough,3,category,810.68


In [19]:
# Display the cleaned trip data (pandas truncates the rendered rows)
tripdata

Unnamed: 0_level_0,usertype,start_borough,end_borough
starttime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-06-01 00:52:16.289,member,Manhattan,Manhattan
2018-06-01 01:09:49.458,member,Manhattan,Brooklyn
2018-06-01 05:27:20.142,member,Manhattan,Manhattan
2018-06-01 07:05:02.579,member,Manhattan,Manhattan
2018-06-01 08:18:14.582,member,Manhattan,Manhattan
...,...,...,...
2023-05-11 17:37:54.000,member,Brooklyn,Brooklyn
2023-05-21 12:36:04.000,member,Queens,Manhattan
2023-05-08 09:37:54.000,member,Brooklyn,Brooklyn
2023-05-27 19:55:19.000,member,Manhattan,Manhattan


In [20]:
# Write the cleaned trip data to disk
clean_tripdata_path = '../clean_data/clean_tripdata.parquet'
tripdata.to_parquet(clean_tripdata_path)