In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from scipy.stats import describe
from scipy.stats import yeojohnson
from scipy.stats import boxcox

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

import warnings
warnings.filterwarnings('ignore')

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Read the train and test splits of the bike-sharing competition data.
# NOTE(review): parse_dates=True only asks pandas to try parsing the index; no
# index_col is given here, so 'datetime' is presumably still loaded as plain
# strings (a later cell converts it explicitly) — confirm.
bike_train = pd.read_csv(
    "/kaggle/input/bike-sharing-demand/train.csv",
    parse_dates=True,
)
bike_test = pd.read_csv(
    "/kaggle/input/bike-sharing-demand/test.csv",
    parse_dates=True,
)

# Show five random training rows as a quick sanity check.
bike_train.sample(5)

In [None]:
def get_info(dataframe: pd.DataFrame) -> pd.DataFrame:

    """
    Summarise basic information for every column of the dataframe.

    For each column the result holds its name, dtype, number of missing
    values, percentage of missing values (on a 0-100 scale, two decimals)
    and, for numeric columns only, the mean, variance, skewness and kurtosis
    of the non-missing values (each rounded to one decimal).

    Parameters:
    dataframe (pd.DataFrame): The input dataframe containing features.

    Returns:
    pd.DataFrame: One row per input column with the columns 'Columns',
        'Data Type', 'Missing Values', 'Missing Percentage', 'Mean',
        'Variance', 'Skewness' and 'Kurtosis'. The four statistics are NaN
        for non-numeric columns and for numeric columns that contain no
        valid value.
    """

    n_rows = len(dataframe)
    missing_counts = dataframe.isnull().sum().values

    info = pd.DataFrame()
    info['Columns'] = dataframe.columns.values
    info['Data Type'] = dataframe.dtypes.values
    info['Missing Values'] = missing_counts

    # Scale to 0-100 before rounding: rounding the raw fraction collapses
    # every value to 0 or 1. An empty frame has nothing missing, so report 0
    # instead of dividing by zero.
    info['Missing Percentage'] = (
        np.round(100 * missing_counts / n_rows, 2) if n_rows else 0.0
    )

    stats = {'Mean': [], 'Variance': [], 'Skewness': [], 'Kurtosis': []}

    # Iterate positionally over (name, Series) pairs so duplicated column
    # names still yield one Series each.
    for _, series in dataframe.items():

        valid = series.dropna()

        # Only genuinely numeric columns are described; datetime, category
        # and string columns cannot be summarised by scipy's describe.
        if pd.api.types.is_numeric_dtype(series) and not valid.empty:
            desc = describe(valid.to_numpy(dtype=float))
            values = (desc.mean, desc.variance, desc.skewness, desc.kurtosis)
        else:
            values = (np.nan, np.nan, np.nan, np.nan)

        for name, value in zip(stats, values):
            stats[name].append(float(np.round(value, 1)))

    for name, column_values in stats.items():
        info[name] = column_values

    return info

In [None]:
# Per-column overview of the raw training data.
get_info(dataframe=bike_train)

In [None]:
# Turn the 'datetime' column of both splits into real timestamps.
# `combine` holds references to the two frames, so assigning a column inside
# the loop updates bike_train and bike_test themselves.
combine = [bike_train, bike_test]

for frame in combine:
    parsed = pd.to_datetime(frame['datetime'])
    frame['datetime'] = parsed

# Confirm the resulting dtype on the training split.
bike_train['datetime'].dtype

In [None]:
# Derive calendar features from the timestamp for both splits.
for frame in combine:
    stamps = frame['datetime'].dt
    frame['day_of_week'] = stamps.dayofweek
    frame['month'] = stamps.month
    frame['hour'] = stamps.hour

In [None]:
# Work on a copy so the label replacement below leaves bike_train untouched.
bike_copy = bike_train.copy()

# Readable labels for the integer codes of 'season'.
season_dict = {
    1: "spring",
    2: "summer",
    3: "fall",
    4: "winter"
}

# Readable labels for the integer codes of 'weather'.
weather_dict = {
    1: "Clear",
    2: "Mist",
    3: "Light Snow",
    4: "Heavy Rain"
}

# Replacing numerical values.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `bike_copy[col]`: that form operates on a
# column selection and, with pandas Copy-on-Write, never updates bike_copy.
bike_copy['season'] = bike_copy['season'].replace(season_dict)
bike_copy['weather'] = bike_copy['weather'].replace(weather_dict)

bike_copy.sample(5)

### Total Bike Rentals

In [None]:
# Total number of rentals recorded in the training data.
bike_copy['count'].sum()

In [None]:
# Total rentals per category, one bar chart per categorical feature (2 x 2 grid).
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 7))

for i, column_name in enumerate(['season', 'holiday', 'workingday', 'weather']):
    # aggfunc='sum' (the string) replaces aggfunc=np.sum, which newer pandas
    # versions flag as deprecated for pivot/groupby aggregation.
    rentals = bike_copy.pivot_table(values='count', index=column_name, aggfunc='sum').reset_index()
    # Share of all rentals per category, in percent (kept in `rentals` for
    # inspection; it is not drawn on the chart).
    rentals['percentage'] = np.round(100 * rentals['count'] / rentals['count'].sum())

    # Map the running index onto the grid position.
    row, col = divmod(i, 2)
    sb.barplot(data=rentals, x=column_name, y='count', ax=ax[row, col])

    ax[row, col].set_title(f"Bike Rentals by {column_name}")

# Remove the top/right spines of every subplot in the current figure.
sb.despine()

plt.tight_layout()
plt.show()

### Observations:

1. **Fall** season witnessed the most bike rentals.
2. It seems that people mostly rent bikes during **holidays**.
3. **Working days** witness the most bike rentals.
4. People prefer **Clear Weather** as the most suitable weather for renting bikes.

In [None]:
# Box plots of the continuous columns, laid out on a 2 x 3 grid.
fig, ax = plt.subplots(nrows = 2, ncols = 3, figsize = (10, 8))

numeric_columns = ['temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered']

# ax.flat walks the grid row by row, matching divmod(i, 3) positions.
for panel, column in zip(ax.flat, numeric_columns):
    sb.boxplot(data = bike_train, x = column, ax = panel)

# Strip the top/right spines of all subplots in the current figure.
sb.despine()

plt.tight_layout()
plt.show()

### Observations:

1. The columns **windspeed, casual and registered** have many extreme outliers.

In [None]:
# Density estimates of the continuous columns, laid out on a 2 x 3 grid.
fig, ax = plt.subplots(nrows = 2, ncols = 3, figsize = (10, 8))

numeric_columns = ['temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered']

# ax.flat walks the grid row by row, matching divmod(i, 3) positions.
for panel, column in zip(ax.flat, numeric_columns):
    sb.kdeplot(data = bike_train, x = column, ax = panel)

# Strip the top/right spines of all subplots in the current figure.
sb.despine()

plt.tight_layout()
plt.show()

In [None]:
# Log-transform the skewed columns.
# log1p (log(1 + x)) is used instead of log: np.log maps 0 to -inf, which
# would corrupt the quantiles, correlations and scaling computed further down
# if any of these columns contains a zero.
# NOTE: this overwrites bike_train in place, so re-running the cell applies
# the transform a second time.
for col in ['windspeed', 'casual', 'registered']:
    bike_train[col] = np.log1p(bike_train[col])

In [None]:
# Same density plots as before, drawn again from the current bike_train
# (i.e. after the preceding transformation cell has run).
fig, ax = plt.subplots(nrows = 2, ncols = 3, figsize = (10, 8))

numeric_columns = ['temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered']

# ax.flat walks the grid row by row, matching divmod(i, 3) positions.
for panel, column in zip(ax.flat, numeric_columns):
    sb.kdeplot(data = bike_train, x = column, ax = panel)

# Strip the top/right spines of all subplots in the current figure.
sb.despine()

plt.tight_layout()
plt.show()

## Data Cleaning

### Missing Values

In [None]:
# Number of missing entries per column.
bike_train.isna().sum()

### Observations:

There are no missing values in the data.

### Outlier Treatment

In [None]:
def remove_outliers(
    dataframe: pd.DataFrame, 
    column_name: str, 
    inplace=False, threshold=0.3
):
    """
    Iteratively drop rows whose value in `column_name` is an IQR outlier.

    On every pass the quartiles are recomputed and rows outside
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are dropped; the loop ends once a pass
    finds no outliers.

    Parameters:
    dataframe (pd.DataFrame): Frame to clean.
    column_name (str): Column whose values are checked.
    inplace (bool): If True, rows are dropped from `dataframe` itself.
        If False (default), `dataframe` is left untouched and a cleaned
        copy is returned.
    threshold (float): Largest fraction of rows (rounded to two decimals)
        that one pass may flag as outliers. When exceeded, a message is
        printed and removal stops.

    Returns:
    pd.DataFrame or None: The cleaned copy when `inplace` is False. When
        `inplace` is True, None after a normal run. In the two early-exit
        cases (unknown column, threshold exceeded) the frame is returned
        as it stands at that point, regardless of `inplace`.
    """
    if column_name not in dataframe.columns:
        print("Column doesn't exist")
        return dataframe

    # Only mutate the caller's frame when explicitly asked to.
    frame = dataframe if inplace else dataframe.copy()

    # An empty frame has nothing to remove (and would divide by zero below).
    while len(frame) > 0:
        values = frame[column_name]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1

        is_outlier = (values < (Q1 - 1.5 * IQR)) | (values > (Q3 + 1.5 * IQR))
        n_outliers = int(is_outlier.sum())

        percentage_outliers = np.round(n_outliers / len(frame), 2)

        if percentage_outliers > threshold:
            print("Percentage of outliers exceeds the threshold value! Can't remove outliers")
            return frame

        if n_outliers == 0:
            break

        # Dropping rows shifts the quartiles, so the next pass re-evaluates.
        frame.drop(frame.index[is_outlier], inplace=True)

    if not inplace:
        return frame

In [None]:
# Drop IQR outliers from bike_train itself, one skewed column at a time.
columns_with_outliers = ['windspeed', 'casual', 'registered']

for outlier_col in columns_with_outliers:
    remove_outliers(
        dataframe = bike_train,
        column_name = outlier_col,
        inplace = True,
    )

In [None]:
# Count fully duplicated rows (every repeat after the first occurrence).
bike_train.duplicated(keep='first').sum()

In [None]:
# Correlation heatmap of the numeric columns.
# numeric_only=True makes the column selection explicit: since pandas 2.0
# corr() no longer drops non-numeric columns by default, so the 'datetime'
# column would otherwise be fed into the computation.
plt.figure(figsize = (10, 9))
sb.heatmap(bike_train.corr(numeric_only = True), annot = True)

In [None]:
# Select the features whose absolute correlation with 'count' exceeds the threshold.
# numeric_only=True restricts the matrix to numeric columns explicitly (the
# 'datetime' column is not dropped by default since pandas 2.0).
corr_matrix = bike_train.corr(numeric_only = True)
threshold = 0.6

# Correlation of every other numeric column with the target.
count_corr = corr_matrix['count'].drop('count')
high_corr_features = count_corr[count_corr.abs() > threshold].index.tolist()

# The message reports the threshold actually used instead of a hard-coded value.
print(f"Features highly correlated with 'count' (threshold > {threshold}):")
print(high_corr_features)

In [None]:
# Feature matrix: the columns selected by correlation; target: the rental count.
# NOTE(review): if 'casual' / 'registered' are among the selected features they
# presumably add up to 'count' (target leakage) and may be absent from the
# test split — confirm before trusting the scores below.
x = bike_train.loc[:, high_corr_features]
y = bike_train.loc[:, 'count']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.2,
    random_state = 22,
)

In [None]:
# Candidate regressors, keyed by display name.
# The tree-based models get the same fixed seed as the train/test split so
# their scores are reproducible from run to run.
models = {
    'Linear Regression': LinearRegression(),
    'Support Vector Regressor': SVR(),
    "Decision Tree": DecisionTreeRegressor(random_state = 22),
    "Random Forest": RandomForestRegressor(random_state = 22)
}

for key, model in models.items():

    # Scale the features, then fit the regressor; the scaler is fitted on the
    # training data only because it lives inside the pipeline.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    pipeline.fit(x_train, y_train)

    y_pred = pipeline.predict(x_test)

    # Hold-out metrics, rounded to two decimals for display.
    mse = np.round(mean_squared_error(y_test, y_pred), 2)
    print(f'Mean Squared Error: {mse}')

    r2score = np.round(r2_score(y_test, y_pred), 2)

    print(key,r2score)
    print("---------------------------------------------------------------------")
    print()