# Table of Contents

## 01. Library and Data Import
## 02. Top 20 stations bar chart
## 03. Bike Trips & Temperature Dual-Axis Line Chart
## 04. Reducing the Dataset
        Creating seasonal columns
        Creating a random split for df_1

### 01. Library and Data Import

In [1]:
# Streamlit: Main framework for creating interactive dashboards
import streamlit as st

# Data Handling: Pandas for data manipulation, NumPy for numerical operations
import pandas as pd
import numpy as np

# Plotly for Interactive Visualizations
import plotly.express as px  # High-level interface for Plotly visualizations
from plotly.subplots import make_subplots  # Allows creating subplots
import plotly.graph_objects as go  # Low-level interface for detailed chart customization

# Matplotlib for Static Visualizations
import matplotlib.pyplot as plt  # Used for additional plotting functionalities

# Date and Time Handling
from datetime import datetime as dt  # Helps with date and time manipulations

# Kepler.gl for Interactive Geospatial Mapping in Streamlit
from streamlit_keplergl import keplergl_static  # Allows embedding Kepler.gl maps in Streamlit

In [2]:
# Location of the pickled, cleaned New York trip dataset
file_path = r"C:\Users\HP\Citi_Bike_Dashboard\02 Data\Prepared Data\newyork_data_cleaned_v3.pkl"

# Read the pickle into a DataFrame
df = pd.read_pickle(filepath_or_buffer=file_path)

# Preview the first five rows as a quick check that the load worked
df.head(n=5)

Unnamed: 0_level_0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,avgTemp,bike_rides_daily,trip_duration
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2022-01-01,88237B34DAFDB069,electric_bike,2022-01-01 00:00:13.532,2022-01-01 00:30:28.482,Broadway & W 160 St,8157.06,Broadway & W 160 St,8157.06,40.835773,-73.943398,40.835773,-73.943398,casual,11.6,20198,30.249167
2022-01-01,4545E384F601A7E9,classic_bike,2022-01-01 00:00:32.146,2022-01-01 00:02:15.504,Forsyth St & Broome St,5453.05,Forsyth St & Broome St,5453.05,40.718941,-73.992661,40.718941,-73.992661,member,11.6,20198,1.722633
2022-01-01,7CE9B56332A37463,electric_bike,2022-01-01 00:00:42.801,2022-01-01 00:11:38.969,Myrtle Ave & Marcy Ave,4707.03,Willoughby St & Fleet St,4628.05,40.695396,-73.949547,40.691967,-73.9813,member,11.6,20198,10.936133
2022-01-01,AEAF2FFEBB00845B,electric_bike,2022-01-01 00:00:50.604,2022-01-01 02:39:37.688,W 15 St & 7 Ave,6030.06,W 33 St & 7 Ave,6407.07,40.739357,-73.999321,40.750198,-73.990929,casual,11.6,20198,158.784733
2022-01-01,9B01074C8CABA851,classic_bike,2022-01-01 00:01:05.031,2022-01-01 00:11:53.032,W 15 St & 7 Ave,6030.06,W 36 St & 7 Ave,6483.06,40.739357,-73.999321,40.752148,-73.98954,member,11.6,20198,10.800017


In [3]:
# Move the index back into a regular column
df = df.reset_index()

# If the index had no name, pandas calls the new column "index"; make sure it ends up as "date"
df = df.rename(columns={"index": "date"})

In [4]:
# Review the shape of the dataset as a (rows, columns) tuple
df.shape

(29767925, 17)

### 02. Top 20 stations bar chart

In [7]:
# Parse the 'date' column as datetime (YYYY-MM-DD).
# With errors='coerce', values that do not parse become NaT instead of raising.
parsed_dates = pd.to_datetime(df['date'], errors='coerce', format="%Y-%m-%d")
df['date'] = parsed_dates

In [8]:
# Helper column: every row counts as one trip
df['value'] = 1

# Trips per start station. observed=False is passed explicitly so the grouping
# behaviour does not depend on the pandas default.
trips_by_station = df.groupby('start_station_name', as_index=False, observed=False)
df_groupby_bar = trips_by_station['value'].sum()

# The 20 stations with the highest trip counts
top20 = df_groupby_bar.nlargest(n=20, columns='value')

In [None]:
# Bar chart of the 20 busiest start stations

# One bar per station; bar height and colour both encode the trip count
bar_trace = go.Bar(
    x=top20['start_station_name'],
    y=top20['value'],
    marker=dict(color=top20['value'], colorscale='Blues'),
)
fig = go.Figure(bar_trace)

# Titles, axis formatting and figure size
fig.update_layout(
    title="Top 20 most popular bike stations in New York City",
    xaxis={
        'title': {
            'text': "Start Stations",
            'standoff': 20,  # gap between the axis title and the tick labels
        },
        'tickangle': -45,  # slant the station names so they do not overlap
    },
    yaxis_title="Sum of trips",
    width=900,
    height=600,
)

# Render the chart
fig.show()

In [9]:
# Write the top-20 station table to disk as CSV (the DataFrame index is written too)
top20.to_csv(path_or_buf='top20.csv')

### 03. Bike Trips & Temperature Dual-Axis Line Chart

In [None]:
# Dual-axis line chart: daily bike rides (primary y-axis) against the average
# daily temperature (secondary y-axis).

# Plot one point per distinct (date, rides, temperature) combination instead of
# one point per trip row. In the df.head() preview both measures repeat on every
# row of the same date, so exact duplicates add nothing to the drawn lines.
# NOTE(review): assumes df is ordered by date, as the preview suggests - confirm.
df_daily = df[['date', 'bike_rides_daily', 'avgTemp']].drop_duplicates()

# Create a figure with a secondary Y-axis
fig_2 = make_subplots(specs=[[{"secondary_y": True}]])

# First trace: daily bike rides (primary Y-axis)
fig_2.add_trace(
    go.Scatter(
        x=df_daily['date'],  # X-axis: Date
        y=df_daily['bike_rides_daily'],  # Y-axis: Number of bike rides
        name='Daily bike rides',  # Legend label
        marker={'color': 'blue'},  # Single fixed colour (the 'color' key was duplicated before)
    ),
    secondary_y=False,  # Assign to primary Y-axis
)

# Second trace: daily temperature (secondary Y-axis)
fig_2.add_trace(
    go.Scatter(
        x=df_daily['date'],  # X-axis: Date
        y=df_daily['avgTemp'],  # Y-axis: Average daily temperature
        name='Daily temperature',  # Legend label
        marker={'color': 'red'},  # Single fixed colour (the 'color' key was duplicated before)
    ),
    secondary_y=True,  # Assign to secondary Y-axis
)

fig_2.update_layout(
    title='Daily bike trips and temperatures in 2022',
    height=600,
)

# Render through Streamlit, stretched to the container width
st.plotly_chart(fig_2, use_container_width=True)

### 04. Reducing the Dataset

#### Creating seasonal columns

In [None]:
# # Creating a `month` column

# # Convert the 'date' column to a DateTime format
# df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# # Extract the month from the 'date' column and create a new 'month' column
# df['month'] = df['date'].dt.month

# # Ensure the 'month' column is stored as an integer type
# df['month'] = df['month'].astype(int)

In [None]:
# # Create the 'season' column based on the 'month' column

# df['season'] = [
#     "winter" if (month == 12 or 1 <= month <= 4)  # December to April → Winter
#     else "spring" if (4 < month <= 5)  # May → Spring
#     else "summer" if (6 <= month <= 9)  # June to September → Summer
#     else "fall"  # October & November → Fall
#     for month in df['month']  # Iterate over each value in the 'month' column
# ]

In [4]:
# View the column labels of df before trimming it down
df.columns

Index(['date', 'ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual', 'avgTemp', 'bike_rides_daily', 'trip_duration'],
      dtype='object')

In [5]:
# Create a copy with only the columns needed for the rides / temperature chart.
# Selecting the columns by name (instead of dropping all the others) keeps the
# result at exactly these three columns even when df carries extra helper
# columns, such as the 'value' counter added for the top-20 aggregation.
df_1 = df[['date', 'avgTemp', 'bike_rides_daily']].copy()

In [6]:
# Confirm which columns remain in the reduced frame
df_1.columns

Index(['date', 'avgTemp', 'bike_rides_daily'], dtype='object')

#### Creating a random split for df_1

In [7]:
# Fix the seed so the same rows are selected on every run
np.random.seed(32)

# One uniform draw in [0, 1) per row of df_1
row_count = len(df_1)
uniform_draws = np.random.rand(row_count)

# Boolean mask: True for ~75% of rows, False for the remaining ~25%.
# The threshold was lowered from 0.92 to 0.75 because the larger dataset was
# difficult to load via Streamlit.
red = uniform_draws <= 0.75

In [8]:
# Invert the mask so the ~25% of rows flagged False in 'red' are the ones kept
keep_mask = ~red

# Smaller dataset made up of just those rows
small = df_1.loc[keep_mask]

In [9]:
# Check the (rows, columns) size of the sampled subset
small.shape

(7441694, 3)

In [10]:
# Export the sampled subset to CSV without the index column
small.to_csv(path_or_buf='reduced_data_lect_2.6.csv', index=False)

In [11]:
# Export the full three-column frame to CSV (the index is written as the first column)
df_1.to_csv(path_or_buf='reduced_data_to_plot_2.6.csv')