In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the training CSV of the bike-sharing-demand dataset into `df0`.
train_csv_path = "/kaggle/input/bike-sharing-demand/train.csv"
df0 = pd.read_csv(train_csv_path)

In [None]:
# Preview the first five rows of the training frame.
df0.head(n=5)

In [None]:
# Summary statistics for the columns of the training frame.
df0.describe()

In [None]:
# (rows, columns) of the training frame.
df0.shape

In [None]:
# Number of missing values in each column of the training frame.
df0.isna().sum()

In [None]:
# Parse the `datetime` column into pandas datetimes so the `.dt` accessor can be used on it.
df0['datetime'] = pd.to_datetime(df0['datetime'])

def extract_feature(df):
    """Add calendar columns derived from ``df['datetime']`` to ``df`` in place.

    The columns ``year``, ``day``, ``month``, ``hour`` and ``dayofweek`` are
    written, in that order, from the matching ``.dt`` accessor attributes of
    the ``datetime`` column. Nothing is returned; the frame passed in is
    modified.
    """
    stamps = df["datetime"].dt
    for part in ("year", "day", "month", "hour", "dayofweek"):
        df[part] = getattr(stamps, part)


# Add the year/day/month/hour/dayofweek columns to the training frame (in place).
extract_feature(df0)

In [None]:
# Correlate only the numeric columns: `df0` still holds the parsed `datetime`
# column at this point, and recent pandas releases no longer drop non-numeric
# columns silently inside `corr()` (they raise instead).
df_corr = df0.select_dtypes(include="number").corr()
plt.figure(figsize=(12,8))
sns.heatmap(df_corr, annot=True, fmt=".2f", cmap="turbo")
plt.show()

In [None]:
# Daily rental totals for January-February 2011, drawn as a bar chart.
# The upper bound is exclusive at 1 March: a bare '2011-02-28' means midnight
# of that day, so `<= '2011-02-28'` would cut off every later hour of
# 28 February.
in_jan_feb = (df0['datetime'] >= '2011-01-01') & (df0['datetime'] < '2011-03-01')
filtered_df = df0[in_jan_feb]
daily_counts = filtered_df.groupby(filtered_df['datetime'].dt.date)['count'].sum()
dates = daily_counts.index
counts = daily_counts.values
plt.figure(figsize=(8,6))
plt.bar(dates, counts, color='green')
plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Daily Counts for Jan-Feb 2011')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Daily rental totals for 2011, drawn as a bar chart.
df0['datetime'] = pd.to_datetime(df0['datetime'])
is_2011 = df0['datetime'].dt.year.eq(2011)
df_2011 = df0.loc[is_2011]
daily_counts = df_2011['count'].groupby(df_2011['datetime'].dt.date).sum()
dates, counts = daily_counts.index, daily_counts.values
plt.figure(figsize=(8,6))
plt.bar(dates, counts, color='crimson')
plt.title('Daily Rentals for 2011')
plt.xlabel('Date')
plt.ylabel('Rental Count')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Mean rental count for each hour of the day, drawn as a line chart.
group_hour = df0.groupby('hour')
average_hour = group_hour['count'].mean()
plt.figure(figsize=(8,6))
# style.use() changes matplotlib's global settings, so the dark theme also
# applies to figures drawn by later cells until another style is selected.
plt.style.use("dark_background")
plt.plot(average_hour.index, average_hour, color="darkorange")

plt.title('Average Hourly Rental Count')
plt.xlabel('Hour')
plt.ylabel('Rental Count')
plt.xticks(np.arange(0, 24))
plt.show()

In [None]:
# Hourly rental counts over time, one thick line per year.
plt.figure(figsize=(8,6))
rental_year = df0['datetime'].dt.year
df_2011 = df0[rental_year == 2011]
df_2012 = df0[rental_year == 2012]

for yearly_frame, legend_label, line_color in (
    (df_2011, '2011', "navy"),
    (df_2012, '2012', "brown"),
):
    plt.plot(yearly_frame['datetime'], yearly_frame['count'],
             label=legend_label, color=line_color, lw=4)
plt.title('2011 and 2012 Rentals')
plt.xlabel('Date')
plt.ylabel('Rental Count')
plt.xticks(fontsize=14, rotation=45)
plt.legend()
plt.show()

In [None]:
# Mean rental count for every (year, month) pair; shown as the cell output.
group_year_month = df0.groupby(by=['year', 'month'])
average_year_month = group_year_month['count'].agg('mean')
average_year_month

In [None]:
# Mean monthly rental count, one line per year, coloured from the Set1 palette.
plt.figure(figsize=(8,6))
# style.use() is global: it replaces whichever style an earlier cell selected.
plt.style.use("classic")

import matplotlib.cm as cm

colors = cm.Set1.colors

years_present = average_year_month.index.levels[0]
for position, year in enumerate(years_present):
    monthly_mean = average_year_month[year]
    plt.plot(monthly_mean.index, monthly_mean, label=year, color=colors[position])

plt.title('Average Monthly Rental Count for 2011, 2012')
plt.xlabel('Month')
plt.ylabel('Count')
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# Mean rental count per hour of day, one line for each year.
plt.figure(figsize=(8,6))
group_year_hour = df0.groupby(['year', 'hour'])
average_year_hour = group_year_hour['count'].mean()
for year in average_year_hour.index.levels[0]:
    hourly_mean = average_year_hour.loc[year]
    plt.plot(hourly_mean.index, hourly_mean, label=year)

plt.title('Average Hourly Rental Count - 2011, 2012')
plt.xlabel('Hour')
plt.ylabel('Count')
plt.xticks(np.arange(0, 24))
plt.legend()
plt.show()

In [None]:
# Mean rental count per hour of day, one line for each `workingday` value.
plt.figure(figsize=(8,6))
group_workingday_hour = df0.groupby(['workingday', 'hour'])
average_workingday_hour = group_workingday_hour['count'].mean()
for workingday in average_workingday_hour.index.levels[0]:
    hourly_mean = average_workingday_hour.loc[workingday]
    plt.plot(hourly_mean.index, hourly_mean, label=workingday)

plt.title('Average Hourly Rental Count by Working Day')
plt.xlabel('Hour')
plt.ylabel('Count')
plt.xticks(np.arange(0, 24))
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Rental count against `atemp`.
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(data=df0, x="atemp", y="count", color="navy", ax=ax)
plt.show()

In [None]:
# Rental count against `temp`.
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(data=df0, x="temp", y="count", color="green", ax=ax)
plt.show()

In [None]:
# Rental count against `humidity`.
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(data=df0, x="humidity", y="count", color="brown", ax=ax)
plt.show()

In [None]:
# Horizontal box plots comparing the spread of temp, atemp and humidity.
weather_columns = ["temp", "atemp", "humidity"]
fig, ax = plt.subplots(figsize=(8,6))
sns.boxplot(data=df0[weather_columns], orient="h", ax=ax)
plt.show()

In [None]:
# Distribution of `temp` in 50 bins, with the variable on the y axis.
fig, ax = plt.subplots(figsize=(8,6))
sns.histplot(data=df0, y="temp", bins=50, color="red", ax=ax)
plt.show()

In [None]:
# Distribution of `registered` in 50 bins.
fig, ax = plt.subplots(figsize=(8,6))
sns.histplot(data=df0, x="registered", bins=50, color="green", ax=ax)
plt.show()

In [None]:
# Drop the raw timestamp before modelling; its parts were already copied into
# separate columns by extract_feature().
# Rebinding with errors="ignore" (instead of inplace=True) keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
df0 = df0.drop(columns="datetime", errors="ignore")

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target (`count`) from the predictor columns.
label = df0["count"]
train = df0.drop(columns="count")
# NOTE(review): `casual` and `registered` remain among the predictors. If they
# add up to `count` (as the column names suggest) the model is handed the
# answer -- confirm, and consider excluding them here and at prediction time.

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    label,
    test_size=0.2,
    random_state=42,
)

In [None]:
import xgboost as xgb

# XGBoost regressor configured with 150 estimators and a maximum tree depth of 5.
regressor = xgb.XGBRegressor(n_estimators=150, max_depth=5)

In [None]:
# Fit on the training split while tracking the error on both splits:
# the first eval set is the training data, the second the held-out data.
evaluation_sets = [(X_train, y_train), (X_test, y_test)]
regressor.fit(X_train, y_train, eval_set=evaluation_sets)

In [None]:
# Per-round evaluation history recorded during fit(); one x position per round.
eval_result = regressor.evals_result()
train_rmse_history = eval_result['validation_0']['rmse']
training_rounds = range(len(train_rmse_history))

In [None]:
# RMSE per boosting round: `validation_0` is the training split and
# `validation_1` the held-out split (the order passed as eval_set to fit()).
error_curves = (
    ('validation_0', 'Training Error', "red"),
    ('validation_1', 'Validation Error', "green"),
)
for eval_key, curve_label, curve_color in error_curves:
    plt.scatter(x=training_rounds, y=eval_result[eval_key]['rmse'],
                label=curve_label, color=curve_color)
plt.title('Training Vs Validation Error')
plt.xlabel('Iteration')
plt.ylabel('RMSE')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# plot_importance() opens a figure of its own unless it is handed an Axes, so
# create the 8x6 figure explicitly and pass its Axes in. Calling plt.figure()
# first would only leave an empty figure behind and draw the chart at the
# default size.
fig, ax = plt.subplots(figsize=(8,6))
xgb.plot_importance(regressor, ax=ax)
plt.show()

In [None]:
# Predict the target for the held-out rows.
y_pred=regressor.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

In [None]:
# mean_squared_error() returns the MSE; take its square root so that `rmse`
# really is the root-mean-squared error it is reported as.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [None]:
# Print the three validation metrics between banner lines.
report_lines = [
    "*********************************************************",
    f"Root Mean Squared Error : {rmse!s}",
    "=========================================================",
    f"R2 Score : {r2!s}",
    "=========================================================",
    f"Mean Absolute Error : {mae!s}",
    "##########################################################",
]
print("\n".join(report_lines))

# Test Data Analysis

In [None]:
# Load the test CSV of the bike-sharing-demand dataset into `df1`.
test_csv_path = "/kaggle/input/bike-sharing-demand/test.csv"
df1 = pd.read_csv(test_csv_path)

In [None]:
# Preview the first five rows of the test frame.
df1.head(n=5)

In [None]:
# Parse the timestamps, then add the same calendar columns as for the
# training frame.
df1['datetime'] = pd.to_datetime(df1['datetime'])

# Reuse the `extract_feature` helper defined earlier in the notebook instead
# of redefining an identical copy here, so the train and test feature logic
# cannot drift apart.
extract_feature(df1)

In [None]:
# (rows, columns) of the test frame.
df1.shape

In [None]:
# Number of missing values in each column of the test frame.
df1.isna().sum()

In [None]:
# Summary statistics for the columns of the test frame.
df1.describe()

In [None]:
# Column dtypes and non-null counts of the test frame.
df1.info()

In [None]:
# Keep the test timestamps aside; they are needed again for the submission file.
datetime = df1["datetime"]

In [None]:
# Drop the raw timestamp from the test frame.
# Rebinding with errors="ignore" (instead of inplace=True) keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
df1 = df1.drop(columns="datetime", errors="ignore")

In [None]:
# Line the test frame up with exactly the columns (and column order) the
# model was fitted on.
# The test data supplies no `casual`/`registered` values. Copying them from
# the training frame by row index would pair every test row with the rental
# figures of an unrelated training row, so any fitted column the test data
# cannot provide is left missing (NaN) instead.
# NOTE(review): predictions with those columns missing are only a stopgap --
# presumably they should be excluded from the training features too; confirm.
feature_columns = list(X_train.columns)
df1 = df1.reindex(columns=feature_columns)

In [None]:
# Preview the first five rows of the prepared test features.
df1.head(n=5)

In [None]:
# Predict on the test features and write datetime/count pairs to submission.csv.
my_prediction = regressor.predict(df1)

submission_columns = {'datetime': datetime, 'count': my_prediction}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first twenty rows of the submission.
my_submission.head(n=20)

# References 
**https://sophiesu.net/prj-bike-sharing-demand-prediction-kaggle/**

<center><img src="https://www.icegif.com/wp-content/uploads/2023/01/icegif-405.gif" alt="Thank You GIF" width="800"></center>
