# Table of Contents

## 01. Library and Data Import
## 02. Top 20 stations bar chart
## 03. Creating a random sample of `df`

### 01. Library and Data Import

In [1]:
# Streamlit: Main framework for creating interactive dashboards
import streamlit as st

# Data Handling: Pandas for data manipulation, NumPy for numerical operations
import pandas as pd
import numpy as np

# Plotly for Interactive Visualizations
import plotly.express as px  # High-level interface for Plotly visualizations
from plotly.subplots import make_subplots  # Allows creating subplots
import plotly.graph_objects as go  # Low-level interface for detailed chart customization

# Matplotlib for Static Visualizations
import matplotlib.pyplot as plt  # Used for additional plotting functionalities

# Date and Time Handling
from datetime import datetime as dt  # Helps with date and time manipulations

# Kepler.gl for Interactive Geospatial Mapping in Streamlit
from streamlit_keplergl import keplergl_static  # Allows embedding Kepler.gl maps in Streamlit

In [2]:
# Define the file path for the 'newyork_data_cleaned_v3.pkl' dataset
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs as-is on a machine with this exact folder layout.
file_path = r"C:\Users\HP\Citi_Bike_Dashboard\02 Data\Prepared Data\newyork_data_cleaned_v3.pkl"

# Load the dataset into a Pandas DataFrame
# NOTE(review): unpickling can execute arbitrary code, so this file must only
# ever come from a trusted source.
df = pd.read_pickle(file_path)

# Display the first few rows to confirm successful loading (optional)
df.head()

Unnamed: 0_level_0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,avgTemp,bike_rides_daily,trip_duration
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2022-01-01,88237B34DAFDB069,electric_bike,2022-01-01 00:00:13.532,2022-01-01 00:30:28.482,Broadway & W 160 St,8157.06,Broadway & W 160 St,8157.06,40.835773,-73.943398,40.835773,-73.943398,casual,11.6,20198,30.249167
2022-01-01,4545E384F601A7E9,classic_bike,2022-01-01 00:00:32.146,2022-01-01 00:02:15.504,Forsyth St & Broome St,5453.05,Forsyth St & Broome St,5453.05,40.718941,-73.992661,40.718941,-73.992661,member,11.6,20198,1.722633
2022-01-01,7CE9B56332A37463,electric_bike,2022-01-01 00:00:42.801,2022-01-01 00:11:38.969,Myrtle Ave & Marcy Ave,4707.03,Willoughby St & Fleet St,4628.05,40.695396,-73.949547,40.691967,-73.9813,member,11.6,20198,10.936133
2022-01-01,AEAF2FFEBB00845B,electric_bike,2022-01-01 00:00:50.604,2022-01-01 02:39:37.688,W 15 St & 7 Ave,6030.06,W 33 St & 7 Ave,6407.07,40.739357,-73.999321,40.750198,-73.990929,casual,11.6,20198,158.784733
2022-01-01,9B01074C8CABA851,classic_bike,2022-01-01 00:01:05.031,2022-01-01 00:11:53.032,W 15 St & 7 Ave,6030.06,W 36 St & 7 Ave,6483.06,40.739357,-73.999321,40.752148,-73.98954,member,11.6,20198,10.800017


In [3]:
# Move the index into a regular column; if that column comes out with the
# default name "index", rename it to "date"
df = (
    df
    .reset_index()
    .rename(columns={"index": "date"})
)

In [4]:
# Review the shape of the dataset as (number of rows, number of columns)
df.shape

(29767925, 17)

### 02. Top 20 stations bar chart

In [5]:
# # Create a bar chart

# fig = go.Figure(
#     go.Bar(
#         x=top20['start_station_name'],  # X-axis: Station names
#         y=top20['value'],  # Y-axis: Number of trips
#         marker={'color': top20['value'], 'colorscale': 'Blues'}  # Apply color scale
#     )
# )

# # Update the layout with correct x-axis title spacing
# fig.update_layout(
#     title="Top 20 most popular bike stations in New York City",  # Chart title
#     xaxis=dict(
#         title=dict(
#             text="Start Stations",  # X-axis title
#             standoff=20  # Adds space between x-axis title and tick labels
#         ),
#         tickangle=-45  # Rotates x-axis labels to avoid overlap
#     ),
#     yaxis_title="Sum of trips",  # Y-axis title
#     width=900, 
#     height=600
# )

# # Display the figure
# fig.show()

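# NOTE: The bar-chart code above is kept commented out for reference only; it relies on a `top20` DataFrame that is not created in the cells above, and it may not be used.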

### 03. Creating a random sample of `df`

In [6]:
# Parse 'date' as a datetime and derive the calendar month as an integer column

df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['month'] = df['date'].dt.month.astype('int')

In [7]:
# Create the season column
#
# Season boundaries (unchanged from the original cell):
#   winter: December, and January through April
#   spring: May
#   summer: June through September
#   fall:   every remaining month (October, November)
# NOTE(review): these boundaries are unconventional (April counts as winter and
# spring is May only) - confirm they are intended before relying on 'season'.
#
# The labels are assigned with vectorized comparisons instead of a Python-level
# loop over every row. np.select uses the first condition that matches, which
# mirrors the order of the original if/else chain; rows matching no condition
# receive the default "fall".

df['season'] = np.select(
    [
        df['month'].eq(12) | (df['month'].ge(1) & df['month'].le(4)),
        df['month'].gt(4) & df['month'].le(5),
        df['month'].ge(6) & df['month'].le(9),
    ],
    ["winter", "spring", "summer"],
    default="fall",
)

In [8]:
# List the column names to confirm that 'month' and 'season' were added
df.columns

Index(['date', 'ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual', 'avgTemp', 'bike_rides_daily', 'trip_duration',
       'month', 'season'],
      dtype='object')

In [9]:
# Build a slimmer copy of df that omits the columns listed below

df_1 = df.drop(
    columns=[
        'ride_id', 'rideable_type', 'ended_at',
        'start_station_id', 'end_station_id',
        'start_lat', 'start_lng', 'end_lat', 'end_lng',
        'member_casual', 'trip_duration', 'month',
    ]
)

In [10]:
# List the remaining column names to verify the result of the drop
df_1.columns

Index(['date', 'started_at', 'start_station_name', 'end_station_name',
       'avgTemp', 'bike_rides_daily', 'season'],
      dtype='object')

In [11]:
# Fix NumPy's global random seed so the same mask is produced on every run
np.random.seed(32)

# Draw one uniform value in [0, 1) per row of df_1 and compare it with 0.92:
# the mask is True for roughly 92% of the rows and False for roughly 8%
red = np.less_equal(np.random.rand(df_1.shape[0]), 0.92)

In [12]:
# Invert the mask with '~' and keep only the rows of df_1 where 'red' is False,
# i.e. the ~8% of rows that make up the smaller dataset
small = df_1.loc[~red]

In [13]:
# Check the size of the sample as (number of rows, number of columns)
small.shape

(2379383, 7)

In [14]:
# Save the cleaned and reduced dataset to a CSV file
# This file will be used for visualization in the Streamlit dashboard
# The file name is relative, so it is written to the current working directory;
# index=False leaves the DataFrame's row index out of the file.
small.to_csv('reduced_data_to_plot_7.csv', index=False)