## Required Libraries

In [4]:
import warnings

import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as sm
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pylab as plt
import dmba
from dmba import regressionSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score
import seaborn as sns

%matplotlib inline

no display found. Using non-interactive Agg backend


### Import first dataset = day.csv

In [3]:
# Load the daily data set from day.csv (path is relative to the notebook's working directory)
df_day = pd.read_csv('day.csv')

# Preview the first five rows to confirm the file parsed as expected
df_day.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


### Import second dataset = hour.csv

In [6]:
# Load the hourly data set from hour.csv (path is relative to the notebook's working directory)
df_hour = pd.read_csv('hour.csv')

# Preview the first five rows to confirm the file parsed as expected
df_hour.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [7]:
# Summary statistics for the numeric columns of the daily data,
# transposed so that each column is shown as one row
df_day.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
instant,731.0,366.0,211.165812,1.0,183.5,366.0,548.5,731.0
season,731.0,2.49658,1.110807,1.0,2.0,3.0,3.0,4.0
yr,731.0,0.500684,0.500342,0.0,0.0,1.0,1.0,1.0
mnth,731.0,6.519836,3.451913,1.0,4.0,7.0,10.0,12.0
holiday,731.0,0.028728,0.167155,0.0,0.0,0.0,0.0,1.0
weekday,731.0,2.997264,2.004787,0.0,1.0,3.0,5.0,6.0
workingday,731.0,0.683995,0.465233,0.0,0.0,1.0,1.0,1.0
weathersit,731.0,1.395349,0.544894,1.0,1.0,1.0,2.0,3.0
temp,731.0,0.495385,0.183051,0.05913,0.337083,0.498333,0.655417,0.861667
atemp,731.0,0.474354,0.162961,0.07907,0.337842,0.486733,0.608602,0.840896


### Inspect the structure of the day and hour datasets

In [None]:
# Column names, non-null counts and dtypes of the daily data
df_day.info()

In [8]:
# Column names, non-null counts and dtypes of the hourly data
df_hour.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


In [None]:
# Summary statistics for the numeric columns of the hourly data,
# transposed so that each column is shown as one row
df_hour.describe().T

Need to add cost data to get expected revenue. Casual use will be per 1 hour and will cost the user \\$3.50. 

Registered users operate the bike at a discounted rate. A 24 hour pass is \\$5.00, a 30 day pass is \\$17.00, and a 365 day pass is \\$150.00. 

We cannot determine which registered passes are being used so we will use an average cost per hour using all three. 

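Hourly cost of each pass:

- 24 hour pass: 5.00 / 24 hours ≈ 0.2083
- 30 day pass: 17.00 / (30 days × 24 hours) ≈ 0.0236
- 365 day pass: 150.00 / (365 days × 24 hours) ≈ 0.0171

Average: (0.2083 + 0.0236 + 0.0171) / 3 ≈ \\$0.08/hour for registered users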

Data Source: https://bikeshare.metro.net/ 

## EDA and Preprocessing

In [None]:
# Count the missing values in each column of the daily data
df_day.isna().sum()

In [None]:
# Count the missing values in each column of the hourly data
df_hour.isna().sum()

In [None]:
# (rows, columns) of the daily data
df_day.shape

In [None]:
# (rows, columns) of the hourly data
df_hour.shape

In [None]:
# Human-readable names for the integer season codes
season_mapping = {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'}

# Work on a copy so df_day keeps its original numeric season codes
new_day = df_day.copy()

# Swap the codes for their names and store them as an ordered categorical,
# which fixes the bar order in the plot below
new_day['season'] = pd.Categorical(
    new_day['season'].map(season_mapping),
    categories=['spring', 'summer', 'fall', 'winter'],
    ordered=True,
)

# Count plot: number of daily rows per season
sns.set(style="darkgrid")
sns.countplot(x=new_day['season'])
plt.title('Season Distribution')
plt.show()

In [None]:
# Human-readable names for the integer season codes
season_mapping = {
    1: 'spring',
    2: 'summer',
    3: 'fall',
    4: 'winter'
}

# Work on a copy so df_hour keeps its original numeric season codes
new_hour = df_hour.copy()

# Replace the season codes with their names
new_hour['season'] = new_hour['season'].map(season_mapping)

# Store the names as an ordered categorical so the bars follow the calendar order
new_hour['season'] = pd.Categorical(new_hour['season'], categories=['spring', 'summer', 'fall', 'winter'], ordered=True)

# Count plot: number of hourly rows per season.
# This cell prepares new_hour, so it must plot new_hour (it previously re-plotted
# the daily frame new_day, duplicating the chart from the preceding cell).
sns.set(style="darkgrid")
sns.countplot(x=new_hour['season'])
plt.title('Season Distribution')
plt.show()

In [None]:
def print_box_hist(data, data1, data2, figsize=(12, 4)):
    """Show a box plot above a histogram for two columns, side by side.

    Parameters
    ----------
    data : DataFrame
        Table that contains both columns.
    data1 : str
        Column drawn in the left-hand pair of panels.
    data2 : str
        Column drawn in the right-hand pair of panels.
    figsize : tuple, optional
        Figure size passed to ``plt.subplots``; defaults to the original (12, 4).

    The figure is displayed with ``plt.show()``; nothing is returned.
    Calling this also applies the seaborn "darkgrid" style globally.
    """
    sns.set(style="darkgrid")
    # Top row (box plots) is kept short relative to the bottom row (histograms)
    fig, axes = plt.subplots(2, 2, figsize=figsize, gridspec_kw={"height_ratios": (.15, .85)})

    for panel_col, column in enumerate((data1, data2)):
        # Pass the frame by keyword: seaborn versions differ in how they
        # treat positional arguments, `data=` is accepted by all of them
        sns.boxplot(data=data, x=column, orient="h", ax=axes[0, panel_col])
        sns.histplot(data=data, x=column, ax=axes[1, panel_col])
        # The histogram underneath already carries the x-axis label
        axes[0, panel_col].set(xlabel='')

    plt.show()

# Daily data: box plot + histogram for each pair of columns of interest
for left_column, right_column in [
    ("season", "cnt"),
    ("registered", "cnt"),
    ("temp", "cnt"),
    ("registered", "season"),
]:
    print_box_hist(df_day, left_column, right_column)


In [None]:
# NOTE(review): a helper with this same name and body is already defined in the
# preceding cell; this definition replaces it when the cell runs.
def print_box_hist(data, data1, data2, figsize=(12, 4)):
    """Show a box plot above a histogram for two columns, side by side.

    Parameters
    ----------
    data : DataFrame
        Table that contains both columns.
    data1 : str
        Column drawn in the left-hand pair of panels.
    data2 : str
        Column drawn in the right-hand pair of panels.
    figsize : tuple, optional
        Figure size passed to ``plt.subplots``; defaults to the original (12, 4).

    The figure is displayed with ``plt.show()``; nothing is returned.
    Calling this also applies the seaborn "darkgrid" style globally.
    """
    sns.set(style="darkgrid")
    # Top row (box plots) is kept short relative to the bottom row (histograms)
    fig, axes = plt.subplots(2, 2, figsize=figsize, gridspec_kw={"height_ratios": (.15, .85)})

    for panel_col, column in enumerate((data1, data2)):
        # Pass the frame by keyword: seaborn versions differ in how they
        # treat positional arguments, `data=` is accepted by all of them
        sns.boxplot(data=data, x=column, orient="h", ax=axes[0, panel_col])
        sns.histplot(data=data, x=column, ax=axes[1, panel_col])
        # The histogram underneath already carries the x-axis label
        axes[0, panel_col].set(xlabel='')

    plt.show()

# Hourly data: box plot + histogram for each pair of columns of interest
for left_column, right_column in [
    ("season", "cnt"),
    ("registered", "cnt"),
    ("temp", "cnt"),
    ("registered", "season"),
]:
    print_box_hist(df_hour, left_column, right_column)

In [None]:
# Correlations are only computed on numeric data, so keep just the
# numeric columns of the hourly frame
numerical_columns = df_hour.select_dtypes(include=[np.number])

# Pairwise correlation between every pair of those columns
correlation_matrix = numerical_columns.corr()

# Annotated heatmap of the correlations (values shown to two decimals)
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
).set_title("Correlation Matrix Heatmap")
plt.show()

In [None]:
# Correlations are only computed on numeric data, so keep just the
# numeric columns of the daily frame
numerical_columns = df_day.select_dtypes(include=[np.number])

# Pairwise correlation between every pair of those columns
correlation_matrix = numerical_columns.corr()

# Annotated heatmap of the correlations (values shown to two decimals)
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
).set_title("Correlation Matrix Heatmap")
plt.show()

In [None]:
# Build histograms and normal QQ plots to inspect the distributions of
# selected daily columns. (All imports live in the notebook's first cell.)

# Suppress specific Seaborn warnings
warnings.filterwarnings("ignore", category=UserWarning, module="seaborn")

# One entry per column: (column, histogram title, QQ-plot title)
distribution_panels = [
    ('registered', "Histogram of Registered people", 'Registered QQ Plot'),
    ('mnth', "Histogram of Month", 'Month QQ Plot'),
    ('temp', "Histogram of Temperature", 'Temperature QQ Plot'),
]

# 2 x 3 grid filled row by row: histogram, QQ plot, histogram, QQ plot, ...
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
grid_slots = axes.ravel()

for position, (column, hist_title, qq_title) in enumerate(distribution_panels):
    hist_ax = grid_slots[2 * position]
    qq_ax = grid_slots[2 * position + 1]

    sns.histplot(df_day[column], kde=True, ax=hist_ax)
    # NOTE(review): this axis shows the number of rows per bin, not the
    # 'cnt' column -- confirm "cnt" is the intended label
    hist_ax.set_ylabel("cnt")
    hist_ax.set_title(hist_title)

    # probplot draws the sample against normal quantiles on the given axes
    stats.probplot(df_day[column], dist="norm", plot=qq_ax)
    qq_ax.set_title(qq_title)

fig.tight_layout()
plt.show()


In [None]:
# Check the column data types and non-null counts of the daily data
df_day.info()

In [None]:
# Check the column data types and non-null counts of the hourly data
df_hour.info()

In [None]:
# Mean rentals for every (weekday, hour-of-day) combination in the hourly data
heatmap_data = df_hour.pivot_table(values='cnt', index='weekday', columns='hr', aggfunc='mean')

# Annotated heatmap: one row per weekday, one column per hour
plt.figure(figsize=(20, 10))
sns.heatmap(
    data=heatmap_data,
    annot=True,
    fmt='.1f',
    cmap='magma',
).set_title('Hourly Bike Rentals by Day of the Week')
plt.show()

In [None]:
# Mean hourly rentals for every (season, weekday) combination
heatmap_data = df_hour.pivot_table(index='season', columns='weekday', values='cnt', aggfunc='mean')

# Define season labels (position i holds the name for season code i + 1)
season_labels = ['Spring', 'Summer', 'Fall', 'Winter']

# Name each heatmap row from its own season code so labels cannot drift
# out of step with the pivot table's row order
row_labels = [season_labels[code - 1] for code in heatmap_data.index]

# Create the heatmap.
# Row labels are handed to seaborn via `yticklabels` so they sit at the cell
# centres; setting ticks at 1..4 afterwards placed them on the cell boundaries.
plt.figure(figsize=(20, 10))
sns.heatmap(heatmap_data, cmap='magma', annot=True, fmt='.1f',
            xticklabels=range(7), yticklabels=row_labels)
plt.title('Hourly Bike Rentals by Season')
plt.xlabel('Day of Week')
plt.ylabel('Season')
plt.xticks(rotation=0)
plt.yticks(rotation=45)
plt.show()

In [None]:
# Mean daily rentals for every (weekday, season) combination
heatmap_data = df_day.pivot_table(values='cnt', index='weekday', columns='season', aggfunc='mean')

# Annotated heatmap: one row per weekday, one column per season code
plt.figure(figsize=(12, 6))
sns.heatmap(
    data=heatmap_data,
    annot=True,
    fmt='.1f',
    cmap='magma',
).set_title('Daily Bike Rentals by Season')
plt.show()

In [None]:
# Mean hourly rentals for every (weekday, season) combination
heatmap_data = df_hour.pivot_table(values='cnt', index='weekday', columns='season', aggfunc='mean')

# Annotated heatmap: one row per weekday, one column per season code
plt.figure(figsize=(12, 6))
sns.heatmap(
    data=heatmap_data,
    annot=True,
    fmt='.1f',
    cmap='magma',
).set_title('Hourly Bike Rentals by Season')
plt.show()

In [None]:
# Line chart of the mean rental count for each hour of the day

# Average 'cnt' over all rows that share the same hour
hourly_average = df_hour['cnt'].groupby(df_hour['hr']).mean()

plt.figure(figsize=(10, 6))
plt.plot(hourly_average.index, hourly_average.to_numpy(), marker='o', linestyle='-')
plt.gca().set(
    xlabel='Hour of the Day',
    ylabel='Average Bike Rentals (cnt)',
    title='Average Hourly Bike Rentals by Time of Day',
)
plt.grid(True)
plt.show()

In [None]:
# Plot average hourly bike rentals for each recorded temperature value

# Average 'cnt' over all hourly rows that share the same 'temp' value
hourly_average = df_hour.groupby('temp')['cnt'].mean()

plt.figure(figsize=(10, 6))
plt.plot(hourly_average.index, hourly_average.values, marker='o', linestyle='-')
# 'temp' holds scaled values between 0 and 1 in this data set, not degrees
# Celsius, so the axis is labelled accordingly
plt.xlabel('Temperature (normalized)')
plt.ylabel('Average Bike Rentals (cnt)')
plt.title('Average Hourly Bike Rentals by Temperature')
plt.grid(True)
plt.show()

In [None]:
# Line chart of the mean hourly rental count for each season
season_names = ['Spring', 'Summer', 'Fall', 'Winter']

# Average 'cnt' over all hourly rows that share the same season code
hourly_average = df_hour['cnt'].groupby(df_hour['season']).mean()

plt.figure(figsize=(10, 6))
plt.plot(
    hourly_average.index,
    hourly_average.to_numpy(),
    color='b',
    marker='o',
    markersize=8,
    linestyle='-',
    label='Average Rentals',
)
plt.gca().set(
    xlabel='Season',
    ylabel='Average Bike Rentals (cnt)',
    title='Average Hourly Bike Rentals by Season',
)
# Show season names instead of the numeric season codes on the x-axis
plt.xticks(hourly_average.index, season_names)
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Line chart of the mean daily rental count for each season
season_names = ['Spring', 'Summer', 'Fall', 'Winter']

# Average 'cnt' over all daily rows that share the same season code
# (the variable keeps the name used by the hourly cells, but holds daily averages)
hourly_average = df_day['cnt'].groupby(df_day['season']).mean()

plt.figure(figsize=(10, 6))
plt.plot(
    hourly_average.index,
    hourly_average.to_numpy(),
    color='b',
    marker='o',
    markersize=8,
    linestyle='-',
    label='Average Rentals',
)
plt.gca().set(
    xlabel='Season',
    ylabel='Average Bike Rentals (cnt)',
    title='Average Daily Bike Rentals by Season',
)
# Show season names instead of the numeric season codes on the x-axis
plt.xticks(hourly_average.index, season_names)
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Tag every hourly row as 'Holiday' or 'Non-Holiday' based on the holiday flag
# (this adds a 'rental_type' column to df_hour itself)
df_hour['rental_type'] = [
    'Holiday' if flag == 1 else 'Non-Holiday'
    for flag in df_hour['holiday']
]

# Mean rental count within each of the two groups
rental_type_average = df_hour.groupby('rental_type')['cnt'].mean()

# Bar chart comparing the two averages
plt.figure(figsize=(10, 6))
rental_type_average.plot.bar(color=['red', 'lightblue'])
plt.gca().set(
    xlabel='Rental Type',
    ylabel='Average Bike Rentals (cnt)',
    title='Average Bike Rentals on Holidays vs. Non-Holidays',
)
plt.xticks(rotation=0)  # keep the group names horizontal
plt.show()


In [None]:

# Parse the 'dteday' strings into a datetime column and put the daily rows in
# chronological order (both steps modify df_day itself)
df_day['date'] = pd.to_datetime(df_day['dteday'])
df_day.sort_values(by='date', inplace=True)

# Running totals of rentals for each rider type
casual_running_total = df_day['casual'].cumsum()
registered_running_total = df_day['registered'].cumsum()

fig, ax = plt.subplots(figsize=(14, 6))

# Casual: area from zero up to the casual running total
ax.fill_between(df_day['date'], casual_running_total, label='Casual', alpha=0.7)
# Registered: area between the casual and the registered running totals.
# NOTE(review): this is not a true stack -- the upper edge is the registered
# total, not casual + registered. Confirm which reading is intended.
ax.fill_between(df_day['date'], registered_running_total, casual_running_total, label='Registered', alpha=0.7)

ax.set(
    xlabel='Date',
    ylabel='Cumulative Count of Rentals',
    title='Cumulative Count of Rentals Over Time',
)

ax.legend()
plt.grid(True)

plt.show()
