# Prepare Bike Dataset
Source: https://www.kaggle.com/datasets/lakshmi25npathi/bike-sharing-dataset

In [1]:
import great_expectations as ge
import pandas as pd
import urllib.request
from zipfile import ZipFile
import os

metric column.standard_deviation.aggregate_fn is being registered with different metric_provider; overwriting metric_provider


In [2]:
# Better table visalization
# pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Adjust width to avoid cutting off columns
pd.set_option('display.max_colwidth', None)  # Show full content in each column

## Dataset and Explanations for the properties
https://www.kaggle.com/datasets/lakshmi25npathi/bike-sharing-dataset

In [3]:
# Download and extract the dataset => First Download only
# data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip"
# data_path = "data/Bike-Sharing-Dataset.zip"

# if not os.path.exists("day.csv"):
#    urllib.request.urlretrieve(data_url, data_path)
#    with ZipFile(data_path, 'r') as zip_ref:
#        zip_ref.extractall()



In [5]:
# Load the dataset
df = pd.read_csv("data/base_bike_data/hour.csv")

In [6]:
# Convert normalized values back to real-world values
def denormalize_temp(temp):
    return temp * 47 - 8

def denormalize_atemp(atemp):
    return atemp * 66 - 16

def denormalize_hum(hum):
    return hum * 100

def denormalize_windspeed(windspeed):
    return windspeed * 67

# Apply conversions
df['temp'] = df['temp'].apply(denormalize_temp)
df['felt_temp'] = df['atemp'].apply(denormalize_atemp)
df['humidity'] = df['hum'].apply(denormalize_hum)
df['windspeed'] = df['windspeed'].apply(denormalize_windspeed)

# Map season values to labels and years to actual year and weather
df['season'] = df['season'].map({1: 'Spring', 2: 'Summer', 3: 'Fall', 4: 'Winter'})
df['year'] = df['yr'].map({0: 2011, 1: 2012})
df['weather'] = df['weathersit'].map({1: 'Good', 2: 'Okay', 3: 'Bad', 4: 'Super Bad'})
df['weather'] = df['weekday'].map({0: 'Sunday', 1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday', 5: 'Friday', 6: 'Saturday'})

# Rename Columns
df = df.rename(columns={"instant" : "id", "cnt" : "total", 'hr':'hour'})

In [7]:
# Get cleaned columns only
df_cleaned = df[['id', 'dteday', 'season', 'year', 'mnth', 'hour', 'holiday', 'weekday', 'workingday', 'weather', 'temp', 'felt_temp', 'humidity', 'windspeed', 'casual', 'registered', 'total']]

In [8]:
# Take a look at the Dataset
df_cleaned

Unnamed: 0,id,dteday,season,year,mnth,hour,holiday,weekday,workingday,weather,temp,felt_temp,humidity,windspeed,casual,registered,total
0,1,2011-01-01,Spring,2011,1,0,0,6,0,Saturday,3.28,3.0014,81.0,0.0000,3,13,16
1,2,2011-01-01,Spring,2011,1,1,0,6,0,Saturday,2.34,1.9982,80.0,0.0000,8,32,40
2,3,2011-01-01,Spring,2011,1,2,0,6,0,Saturday,2.34,1.9982,80.0,0.0000,5,27,32
3,4,2011-01-01,Spring,2011,1,3,0,6,0,Saturday,3.28,3.0014,75.0,0.0000,3,10,13
4,5,2011-01-01,Spring,2011,1,4,0,6,0,Saturday,3.28,3.0014,75.0,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,2012-12-31,Spring,2012,12,19,0,1,1,Monday,4.22,1.0016,60.0,11.0014,11,108,119
17375,17376,2012-12-31,Spring,2012,12,20,0,1,1,Monday,4.22,1.0016,60.0,11.0014,8,81,89
17376,17377,2012-12-31,Spring,2012,12,21,0,1,1,Monday,4.22,1.0016,60.0,11.0014,7,83,90
17377,17378,2012-12-31,Spring,2012,12,22,0,1,1,Monday,4.22,1.9982,56.0,8.9981,13,48,61


In [9]:
for (year, season), sub_df in df_cleaned.groupby(['year', 'season']):
    filename = f"data/bike_rental_{year}_{season}.csv"
    sub_df.to_csv(filename, index=False)
    print(f"Saved: {filename}")

Saved: data/bike_rental_2011_Fall.csv
Saved: data/bike_rental_2011_Spring.csv
Saved: data/bike_rental_2011_Summer.csv
Saved: data/bike_rental_2011_Winter.csv
Saved: data/bike_rental_2012_Fall.csv
Saved: data/bike_rental_2012_Spring.csv
Saved: data/bike_rental_2012_Summer.csv
Saved: data/bike_rental_2012_Winter.csv
