In [7]:
# Bike Sharing Demand Prediction
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
#import mlflow
#import mlflow.sklearn


In [8]:
# Load the data
data = pd.read_csv('hour.csv')

In [9]:
# 1. Data Preprocessing and Cleaning
print("1. Data Preprocessing and Cleaning")

1. Data Preprocessing and Cleaning


In [10]:
# Check for missing values
print(data.isnull().sum())

instant       0
dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64


In [12]:
# Convert datetime
data['datetime'] = pd.to_datetime(data['dteday'])

In [13]:
# Extract additional time-based features
data['hour'] = data['datetime'].dt.hour
data['day'] = data['datetime'].dt.day
data['month'] = data['datetime'].dt.month
data['year'] = data['datetime'].dt.year
data['dayofweek'] = data['datetime'].dt.dayofweek

In [14]:
# 2. Feature Engineering and Selection
print("\n2. Feature Engineering and Selection")


2. Feature Engineering and Selection


In [15]:
#Create interaction features
data['temp_feel_diff'] = data['atemp'] - data['temp']
data['humidity_wind_interaction'] = data['hum'] * data['windspeed']

In [None]:
# Data preprocessing function

def preprocess_hourly_dataset(hourly_dataset):
    """
    Preprocess the hourly dataset by renaming columns, adding target variable, and checking for missing values.

    Parameters:
    - hourly_dataset (pandas.DataFrame): The input hourly dataset containing features and targets.

    Returns:
    - pandas.DataFrame: The preprocessed hourly dataset.
    - pandas.Series: A summary of missing values in the dataset.
    """
    # Accessing the features and target
    features = hourly_dataset.data.features
    targets = hourly_dataset.data.targets
    
    # Renaming column names
    hourly_df = features.rename(columns={'weathersit':'weather',
                                           'yr':'year',
                                           'mnth':'month',
                                           'hr':'hour',
                                           'hum':'humidity',
                                           'temp': 'temperature'})
    
    # Adding 'count' column from targets
    hourly_df['count'] = targets['cnt']
    
    # Checking for missing values
    missing_values_summary = hourly_df.isnull().sum()
    
    return hourly_df, missing_values_summary



In [16]:
# Select features for modeling
features = ['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'hum', 'windspeed',
            'hour', 'day', 'month', 'year', 'dayofweek', 'temp_feel_diff', 'humidity_wind_interaction']

In [17]:
X = data[features]
y = data['cnt']

KeyError: "['weather'] not in index"

In [None]:
# One-hot encode categorical variables
X = pd.get_dummies(X, columns=['season', 'weather'], drop_first=True)

In [4]:
# 3. Model Training and Validation
print("\n3. Model Training and Validation")



3. Model Training and Validation


In [None]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Initialize and train the model
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train_scaled, y_train)

In [None]:
# Make predictions
y_pred = model.predict(X_test_scaled)

In [None]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [None]:
print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")