## Initial Data Cleaning

- Convert the separate date and time columns into a single datetime column
- Add a column for the day of the week of checkout
- Add a column for the time-of-day bins we defined for the checkout time
  -  0: 12:00AM-5:00AM
  -  1: 5:00AM-10:00AM
  -  2: 10:00AM-2:00PM
  -  3: 2:00PM-6:00PM
  -  4: 6:00PM-10:30PM
  -  5: 10:30PM-12:00AM


In [4]:
import datetime
import os
import dotenv
import pandas as pd
import numpy as np

# Let pandas display up to 500 columns before truncating wide frames.
pd.set_option('display.max_columns', 500)

# Load variables from a .env file into the process environment; later cells
# read MOHAMMAD_SHARED_PATH and OUTPUT_PATH from os.environ.
dotenv.load_dotenv()

True

In [5]:
# Path to the raw trips workbook; the shared-folder root comes from the environment.
shared_root = os.environ['MOHAMMAD_SHARED_PATH']
bike_trips = os.path.join(shared_root, 'Backup', 'Bike_Trips.xlsx')

In [6]:
# Read the reformatted trips sheet from the workbook.
df_bike_share = pd.read_excel(bike_trips, sheet_name="Data_Reformatted")

# Keep trips whose recorded duration is not exactly 0.
# NOTE(review): a missing duration also passes this test (NaN != 0 is True) and
# is then turned into 0 by the fillna below - confirm that this is intended.
nonzero_duration = df_bike_share['Duration (Minutes)'].ne(0)

# Replace every remaining missing value, in any column, with 0.
df_bike_share = df_bike_share.loc[nonzero_duration].fillna(0)

#### 2 of the rows in the df were misaligned

In [7]:
def _is_time_of_day(value):
    """Return True when *value* is a ``datetime.time`` instance."""
    return isinstance(value, datetime.time)


# A row counts as well-formed when its 'Checkout Time' cell holds a datetime.time.
time_mask_ok = df_bike_share['Checkout Time'].apply(_is_time_of_day)
time_mask_notOk = ~time_mask_ok

time_df_ok = df_bike_share.loc[time_mask_ok].copy()

# The remaining rows are treated as misaligned: move every value one column to
# the right so the cells line up with their headers again.
time_df_notOk = df_bike_share.loc[time_mask_notOk].shift(axis=1)

# Repaired rows are appended after the well-formed ones (original index labels are kept).
df_bike_share = pd.concat([time_df_ok, time_df_notOk], axis=0)

# Some trips have no usable return time; keep only rows whose 'Return Time'
# is a datetime.time.
mask = df_bike_share['Return Time'].apply(_is_time_of_day)
df_bike_share = df_bike_share.loc[mask].copy()

#### Convert the date and time columns to a single datetime column
`type(object), type(object)` -> `type(datetime)`

In [8]:
# Parse the calendar-date columns into pandas datetimes.
checkout_date = pd.to_datetime(df_bike_share['Checkout Date'])
return_date = pd.to_datetime(df_bike_share['Return Date'])

# Render each datetime.time as 'HH:MM:SS' text so pd.to_timedelta can parse it.
time_component_checkout = df_bike_share['Checkout Time'].apply(lambda t: t.strftime('%H:%M:%S'))
time_component_return = df_bike_share['Return Time'].apply(lambda t: t.strftime('%H:%M:%S'))

df_bike_share = (
    df_bike_share
    .assign(**{
        # date + time-of-day offset = full timestamp
        'Checkout DateTime': checkout_date + pd.to_timedelta(time_component_checkout),
        'Return DateTime': return_date + pd.to_timedelta(time_component_return),
    })
    # The separate date/time columns are superseded by the combined ones.
    .drop(columns=['Checkout Date', 'Checkout Time', 'Return Date', 'Return Time'])
)

### Add day of the week (pandas convention: Monday=0 ... Sunday=6)
df_bike_share['Day_of_week'] = df_bike_share['Checkout DateTime'].dt.day_of_week

##### Break down the checkout time into bins

In [9]:
# Time-of-day bins for the checkout time. Each bin includes its start and
# excludes its end, so a checkout at exactly 5:00AM belongs to bin 1, not bin 0.
# 0: 12:00AM-5:00AM
# 1: 5AM-10AM
# 2: 10AM-2PM
# 3: 2PM-6PM
# 4: 6PM-10:30PM
# 5: 10:30PM-12:00AM

# Bin edges in minutes since midnight: 0:00, 5:00, 10:00, 14:00, 18:00, 22:30, 24:00.
bins = [0, 300, 600, 840, 1080, 1350, 1440]
labels = [0, 1, 2, 3, 4, 5]  # One label per consecutive pair of edges

# Convert 'Checkout DateTime' to whole minutes since midnight (0..1439).
checkout_minutes = df_bike_share['Checkout DateTime'].dt.hour * 60 + df_bike_share['Checkout DateTime'].dt.minute

# right=False makes every interval [start, end). pd.cut's default is (start, end],
# which would push checkouts made exactly on a boundary (5:00AM, 10:00AM, ...)
# into the previous bin and requires a -1 lower edge just to capture midnight.
df_bike_share['checkout_time_bin'] = pd.cut(checkout_minutes, bins=bins, labels=labels, right=False)

# Order the trips by bin; reassign instead of mutating in place so re-running
# the cell does not depend on hidden in-place state.
df_bike_share = df_bike_share.sort_values(by=['checkout_time_bin'])


In [10]:
# Columns written to the cleaned parquet file, in output order.
needed_columns = [
    'Membership Type',
    'Bike',
    'Checkout DateTime',
    'Checkout Kiosk',
    'Return DateTime',
    'Return Kiosk',
    'Duration (Minutes)',
    'Adjusted Duration (Minutes)',
    'Day_of_week',
    'checkout_time_bin',
]

# Cast the identifier-like columns to str and the adjusted duration to int in one pass.
df_bike_share = df_bike_share.astype({
    'Membership Type': str,
    'Bike': str,
    'Checkout Kiosk': str,
    'Return Kiosk': str,
    'Adjusted Duration (Minutes)': int,
})

# Write only the selected columns to the output directory named by OUTPUT_PATH.
output_file = os.path.join(os.environ['OUTPUT_PATH'], 'Cleaned_Bike_Trips.parquet')
df_bike_share[needed_columns].to_parquet(output_file)