### Core analysis comparing behaviours (eg. trip length, day-of-week patterns, start/end stations heatmaps)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys

In [None]:
sys.path.append('/Users/kanghong/Documents/Developer/Google-Capstone/cyclist-case-study/src/')

from trip_analysis import season_match, filter_unfeasible_rides, 
from data_viz import generate_prop_plot, generate_pie, generate_hist_plot, generate_stack_hist
from data_cleaning import time_to_minutes, extract_hour, extract_month

In [None]:
cyclistic_data = pd.read_csv('/Users/kanghong/Documents/Developer/Google-Capstone/cyclist-case-study/data/processed/cyclistic_data_cleaned.csv')

cyclistic_data.head()

In [None]:
cyclistic_data['membership_status'].value_counts()

In [None]:
generate_pie(cyclistic_data, 'membership_status')


In [None]:
generate_pie(cyclistic_data, 'rideable_type')


In [None]:
'''casual_cyclistic_data = cyclistic_data.loc[cyclistic_data['membership_status'] == 'casual']
member_cyclistic_data = cyclistic_data.loc[cyclistic_data['membership_status'] == 'member']'''

#### Compare rideables, ride length and ride distance

In [None]:
cyclistic_data['rideable_type'].value_counts()

In [108]:
cyclistic_data['ride_length(min)'] = cyclistic_data['ride_length'].apply(time_to_minutes)

In [7]:
cyclistic_data = filter_unfeasible_rides(cyclistic_data)

In [None]:
generate_stack_hist(cyclistic_data, 'membership_status', 'rideable_type')

In [None]:
sns.boxplot(x='membership_status', y='ride_length(min)', data=cyclistic_data, palette='viridis')

# Customize the plot
plt.title('Categorized Box Plot')
plt.xlabel('Category')
plt.ylabel('Value')

# Show the plot
plt.show()

In [None]:
sns.kdeplot(data=cyclistic_data, x='ride_length(min)', hue='membership_status', palette='viridis', fill=True, common_norm=False)

# Customize the plot
plt.title('Distribution Curve by Category')
plt.xlabel('Value')
plt.ylabel('Density')

# Show the plot
plt.show()

In [None]:
sns.kdeplot(data=cyclistic_data, x='ride_distance(km)', hue='membership_status', palette='viridis', fill=True, common_norm=False)

# Customize the plot
plt.title('Distribution Curve by Category')
plt.xlabel('Value')
plt.ylabel('Density')

# Show the plot
plt.show()

In [None]:
sns.violinplot(x='membership_status', y='ride_length(min)', data=cyclistic_data, palette='viridis')

# Customize the plot
plt.title('Violin Plot of Ride Length by Category')
plt.xlabel('Category')
plt.ylabel('Ride Length')

# Show the plot
plt.show()

### Compare days of riding

##### Members ride more throughout the week, casuals ride more on the weekends

In [None]:
generate_hist_plot(cyclistic_data, 'membership_status', 'start_day')

#### Casual rides longer than members, especially on the weekends

In [None]:

pivot_table = cyclistic_data.pivot_table(
    values='ride_length(min)', 
    index='membership_status', 
    columns='start_day', 
    aggfunc='median')


print(pivot_table)


In [None]:
pivot_table.plot(kind='bar', figsize=(10, 6))

# Customize the plot
plt.title('Average Ride Length by Day of Week and Membership Status')
plt.xlabel('Day of Week')
plt.ylabel('Average Ride Length (minutes)')
plt.legend(title='Membership Status')

# Show the plot
plt.show()

In [127]:
generate_stack_hist(cyclistic_data, 'membership_status', 'ride_length(min)')

NameError: name 'generate_stack_hist' is not defined

### Number of rides for users by day

In [None]:
pivot_table = cyclistic_data.pivot_table(
    index='start_day', 
    columns='membership_status', 
    aggfunc='size')


print(pivot_table)

##### Members ride more frequently compared to members

In [None]:
pivot_table.plot(kind='bar', figsize=(10, 6))

# Customize the plot
plt.title('No.of Rides by Day of Week and Membership Status')
plt.xlabel('Day of Week')
plt.ylabel('No. of Rides')
plt.legend(title='Membership Status')

# Show the plot
plt.show()

In [None]:
'''
map_center = [cyclistic_data['start_lat'].mean(), cyclistic_data['start_lng'].mean()]
mymap = folium.Map(location=map_center, zoom_start=2)

# Add markers for each location
for _, row in cyclistic_data.iterrows():
    folium.Marker([row['start_lat'], row['start_lng']]).add_to(mymap)

# Save the map to an HTML file
mymap.save('testmap.html')
'''

### Explore seasonal trends, holidays


In [None]:
cyclistic_data.shape

In [None]:
'''sampled_data = cyclistic_data.sample(frac=0.01)
sampled_data.shape'''


In [None]:
'''import geopandas as gpd

gdf = gpd.GeoDataFrame(sampled_data, geometry=gpd.points_from_xy(sampled_data.start_lng, sampled_data.start_lat))'''

In [None]:
'''from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5)
gdf['cluster'] = kmeans.fit_predict(gdf[['start_lat', 'start_lng']])'''

In [None]:
'''from folium.plugins import HeatMap

map_center = [gdf['start_lat'].mean(), gdf['start_lng'].mean()]
mymap = folium.Map(location=map_center, zoom_start=12)

# Create a heatmap layer
heat_data = [[row['start_lat'], row['start_lng']] for _, row in sampled_data.iterrows()]
HeatMap(heat_data).add_to(mymap)

# Save the map to an HTML file
mymap.save('bike-sharing-heatmap.html')'''

In [None]:
cyclistic_data.head()

In [None]:
cyclistic_data = extract_month(cyclistic_data)
cyclistic_data.head()

In [30]:
spring_cyclistic_data = season_match(cyclistic_data, 'spring')
summer_cyclistic_data = season_match(cyclistic_data, 'summer')
autumn_cyclistic_data = season_match(cyclistic_data, 'autumn')
winter_cyclistic_data = season_match(cyclistic_data, 'winter')

##### No electric scooters in the spring

In [None]:
generate_prop_plot(spring_cyclistic_data, 'membership_status', 'rideable_type', 'spring')

In [None]:
generate_prop_plot(spring_cyclistic_data, 'membership_status', 'start_day', 'spring')

In [None]:
generate_prop_plot(summer_cyclistic_data, 'membership_status', 'rideable_type', 'summer')

In [None]:
generate_prop_plot(summer_cyclistic_data, 'membership_status', 'start_day', 'summer')

##### Casuals enjoy riding electric scooters in autumn

In [None]:
generate_prop_plot(autumn_cyclistic_data, 'membership_status', 'rideable_type', 'autumn')

In [None]:
generate_prop_plot(autumn_cyclistic_data, 'membership_status', 'start_day', 'autumn')

In [None]:
generate_prop_plot(winter_cyclistic_data, 'membership_status', 'rideable_type', 'winter')

In [None]:
generate_prop_plot(winter_cyclistic_data, 'membership_status', 'start_day', 'winter')

### Explore time of rides, peak hours

In [None]:
'''Morning Rush Hour
Traffic congestion typically builds between 6 a.m. and 9 a.m., with the heaviest delays occurring from 7 a.m. to 9 a.m..
Lunch Hour Rush
Chicago experiences a midday traffic surge between 1 p.m. and 2 p.m. due to lunch breaks, particularly in the city center.
Evening Rush Hour
The evening rush extends from 3 p.m. to 7 p.m., peaking between 4 p.m. and 5 p.m..
Busiest Days of the Week
Fridays: Traffic volumes increase on Fridays, especially in the evenings, as residents and visitors head into the city for events and nightlife.
Thursdays: Traffic data indicates that more drivers are on the road on Thursdays, contributing to increased congestion.'''

In [None]:
cyclistic_data = extract_hour(cyclistic_data)
cyclistic_data.head()

#### Morning Rush Hour Analysis

In [41]:
morning_rush_hours = [7,8,9]
morning_rush_data = cyclistic_data.loc[cyclistic_data['start_hour'].isin(morning_rush_hours)]

###### Casual riders utilize electric scooters more often in morning rush hours

In [None]:
generate_prop_plot(morning_rush_data, 'membership_status', 'rideable_type', 'Morning Rush Hours')

###### Casual riders ride longer, starting from 09:00

In [None]:
pivot_table = morning_rush_data.pivot_table(
    values='ride_length(min)', 
    index='membership_status', 
    columns='start_hour', 
    aggfunc='median')


print(pivot_table)

In [None]:
pivot_table.plot(kind='bar', figsize=(10, 6))

# Customize the plot
plt.title('Median Riding Time by Hour of Morning Rush and Membership Status')
plt.xlabel('Morning Hour')
plt.ylabel('Median Riding Time')
plt.legend(title='Membership Status')

# Show the plot
plt.show()

###### No significant difference in riding distances

In [None]:
pivot_table = morning_rush_data.pivot_table(
    values='ride_distance(km)', 
    index='membership_status', 
    columns='start_hour', 
    aggfunc='median')


print(pivot_table)

In [None]:
pivot_table.plot(kind='bar', figsize=(10, 6))

# Customize the plot
plt.title('Median Riding Distance by Hour of Morning Rush and Membership Status')
plt.xlabel('Morning Hour')
plt.ylabel('Median Riding Distance(km)')
plt.legend(title='Membership Status')

# Show the plot
plt.show()

###### Riders in general, have higher riding frequencies, from 08:00 to 09:00

In [None]:
generate_prop_plot(morning_rush_data, 'membership_status', 'start_hour', 'Morning Rush Hour')

#### Midday Traffic Surge

In [None]:
midday_rush_hours = [12,13,14]
midday_rush_data = cyclistic_data.loc[cyclistic_data['start_hour'].isin(midday_rush_hours)]
midday_rush_data.head()

###### Casual members tend to utilize electric scooters more often than members do, in midday rush hours

In [None]:
generate_prop_plot(midday_rush_data, 'membership_status', 'rideable_type', 'Midday Rush Hours')

###### Casual riders ride A LOT more longer in the midday hours

In [None]:
pivot_table = midday_rush_data.pivot_table(
    values='ride_length(min)', 
    index='membership_status', 
    columns='start_hour', 
    aggfunc='median')


print(pivot_table)

In [None]:
pivot_table.plot(kind='bar', figsize=(10, 6))

# Customize the plot
plt.title('Median Riding Time by Hour of Morning Rush and Membership Status')
plt.xlabel('Midday Hour')
plt.ylabel('Median Riding Time')
plt.legend(title='Membership Status')

# Show the plot
plt.show()

###### Casual riders ride for longer distances in the midday

In [None]:
pivot_table = midday_rush_data.pivot_table(
    values='ride_distance(km)', 
    index='membership_status', 
    columns='start_hour', 
    aggfunc='median')


print(pivot_table)

In [None]:
pivot_table.plot(kind='bar', figsize=(10, 6))

# Customize the plot
plt.title('Median Riding Distance by Hour of Morning Rush and Membership Status')
plt.xlabel('Midday Hour')
plt.ylabel('Median Riding Distance(km)')
plt.legend(title='Membership Status')

# Show the plot
plt.show()

###### No observable difference in riding frequencies in midday

In [None]:
generate_prop_plot(midday_rush_data, 'membership_status', 'start_hour', 'Midday Rush Hour')

#### Evening Rush Hour Analysis

In [None]:
evening_rush_hours = [16, 17, 18]
evening_rush_data = cyclistic_data.loc[cyclistic_data['start_hour'].isin(evening_rush_hours)]
evening_rush_data.head()

###### Casuals utilize electric scooters more often than members, in evening rush hours

In [None]:
generate_prop_plot(evening_rush_data, 'membership_status', 'rideable_type', 'Evening Rush Hour')

###### Casuals ride A LOT more longer in evening rush hours

In [None]:
pivot_table = evening_rush_data.pivot_table(
    values='ride_length(min)', 
    index='membership_status', 
    columns='start_hour', 
    aggfunc='median')


print(pivot_table)

In [None]:
pivot_table.plot(kind='bar', figsize=(10, 6))

# Customize the plot
plt.title('Median Riding Time by Hour of Morning Rush and Membership Status')
plt.xlabel('Midday Hour')
plt.ylabel('Median Riding Time')
plt.legend(title='Membership Status')

# Show the plot
plt.show()

###### Ride Distances, no observable differences

In [None]:
pivot_table = evening_rush_data.pivot_table(
    values='ride_distance(km)', 
    index='membership_status', 
    columns='start_hour', 
    aggfunc='median')


print(pivot_table)

In [None]:
pivot_table.plot(kind='bar', figsize=(10, 6))

# Customize the plot
plt.title('Median Riding Distance by Hour of Morning Rush and Membership Status')
plt.xlabel('Midday Hour')
plt.ylabel('Median Riding Distance(km)')
plt.legend(title='Membership Status')

# Show the plot
plt.show()

###### Casual riders ride most frequently 17:00

In [None]:
generate_prop_plot(evening_rush_data, 'membership_status', 'start_hour', 'Evening rush hour')

#### Busiest Days of the Week (Thursdays and Fridays)

In [None]:
busyDays = ['Thursday', 'Friday']

busy_day_data = cyclistic_data.loc[cyclistic_data['start_day'].isin(busyDays)]
busy_day_data.head()

###### Compare rideable types

In [None]:
generate_prop_plot(busy_day_data, 'membership_status', 'rideable_type', 'Busiest days')

###### Casual riders ride more on Fridays

In [None]:
generate_prop_plot(busy_day_data, 'membership_status', 'start_day', 'Busiest days')

###### Casuals ride for longer periods on busy days of the week

In [None]:
pivot_table = busy_day_data.pivot_table(
    values='ride_length(min)', 
    index='membership_status', 
    columns='start_day', 
    aggfunc='median')


print(pivot_table)

In [None]:
pivot_table.plot(kind='bar', figsize=(10, 6))

# Customize the plot
plt.title('Average Ride Length by Day of Week and Membership Status')
plt.xlabel('Day of Week')
plt.ylabel('Average Ride Length (minutes)')
plt.legend(title='Membership Status')

# Show the plot
plt.show()

###### No observable differences in ride distances on busy days of the week

In [None]:
pivot_table = busy_day_data.pivot_table(
    values='ride_distance(km)', 
    index='membership_status', 
    columns='start_day', 
    aggfunc='median')


print(pivot_table)

In [None]:
pivot_table.plot(kind='bar', figsize=(10, 6))

# Customize the plot
plt.title('Median Ride Distance by Day of Week and Membership Status')
plt.xlabel('Day of Week')
plt.ylabel('Average Ride Distance (km)')
plt.legend(title='Membership Status')

# Show the plot
plt.show()

### Explore monthly trends


In [None]:
cyclistic_data['month'].value_counts()

###### General trends

In [None]:
values = cyclistic_data['month'].value_counts()

plt.bar(cyclistic_data['month'].unique(), values, color='skyblue')

plt.xticks(values.index, values.index)  

# Add titles and labels
plt.title('Distribution of Ride Frequency')
plt.xlabel('Month')
plt.ylabel('Frequency')

# Show the plot
plt.show()

###### Member vs Casual Trend

In [None]:
pivot_table = cyclistic_data.pivot_table(
    index='month', 
    columns='membership_status', 
    aggfunc='size')


print(pivot_table)

###### Grossing periods: June to September

In [None]:
pivot_table.plot(kind='bar', figsize=(10, 6))

# Customize the plot
plt.title('No.of Rides by Day of Week and Membership Status')
plt.xlabel('Day of Week')
plt.ylabel('No. of Rides')
plt.legend(title='Membership Status')

# Show the plot
plt.show()

In [None]:
cyclistic_data.loc[cyclistic_data['month']== 'January']

In [None]:
# List of months
months = ['January', 'February', 'March', 'April', 'May', 'June', 
          'July', 'August', 'September', 'October', 'November', 'December']

# Create a 3x4 grid of subplots
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(15, 10))
axes = axes.flatten()  # Flatten the 2D array of axes to 1D for easier iteration

# Create a histogram for each month in a separate subplot
for i, month in enumerate(months):
    month_data = cyclistic_data[cyclistic_data['month'] == month]
    axes[i].hist(month_data['ride_distance(km)'], bins=30, color='skyblue', edgecolor='black')
    axes[i].set_title(month)
    axes[i].set_xlabel('Ride Distance (km)')
    axes[i].set_ylabel('Frequency')

# Adjust layout to prevent overlap
plt.tight_layout()


In [None]:
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(15, 10))
axes = axes.flatten()  # Flatten the 2D array of axes to 1D for easier iteration

# Create a histogram for each month in a separate subplot
for i, month in enumerate(months):
    month_data = cyclistic_data[cyclistic_data['month'] == month]
    axes[i].hist(month_data['ride_length(min)'], bins=30, color='skyblue', edgecolor='black')
    axes[i].set_title(month)
    axes[i].set_xlabel('Ride Length (min)')
    axes[i].set_ylabel('Frequency')

# Adjust layout to prevent overlap
plt.tight_layout()

### Explore hotspots, and geographic clustering

In [79]:
from geocode import generate_heatmap

casual_cyclistic_data = cyclistic_data.loc[cyclistic_data['membership_status'] == 'casual']

generate_heatmap(casual_cyclistic_data, 'Casual')

In [80]:
member_cyclistic_data = cyclistic_data.loc[cyclistic_data['membership_status'] == 'member']

generate_heatmap(member_cyclistic_data, 'Member')