## Total count of bikes rented during each hour

# Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os

# loading dataset

In [None]:
path="../bikes/train_bikes.csv"
dataset=pd.read_csv(path)
df=dataset.copy()
df.head()

# Basic statistics

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe().T

# Numerical features

In [None]:
numerical_features=[feature for feature in df.columns if df[feature].dtype!='O']

In [None]:
df[numerical_features]

In [None]:
df['season'].unique()<25

## Discrete features

In [None]:
discreate_feature=[feature for feature in numerical_features if len(df[feature].unique())<25]

In [None]:
# For every discrete feature, show the median rental count of each of its values.
for feature in discreate_feature:
    # Work on a fresh copy of df on each iteration.
    data=df.copy()
    # One bar per distinct feature value; bar height is the median of 'count'.
    data.groupby(feature)['count'].median().plot.bar()
    plt.title(feature)
    plt.show()

## Continuous features

In [None]:
continous_feature=[feature for feature in numerical_features if len(df[feature].unique())>=25]

In [None]:
df[continous_feature]

In [None]:
# Scatter each continuous feature (x axis) against the rental count (y axis).
for feature in continous_feature:
    # Work on a fresh copy of df on each iteration.
    data=df.copy()
    plt.scatter(data[feature],data['count'])
    plt.title(feature)
    plt.xlabel(feature)
    plt.ylabel('count')
    plt.show()

In [None]:
# Histogram (25 bins) of every continuous feature, one figure per feature.
for feature in continous_feature:
    # Work on a fresh copy of df on each iteration.
    data=df.copy()
    data[feature].hist(bins=25)
    plt.title(feature)
    plt.show()

# Logarithmic transformation

In [None]:
# Scatter log(feature) against log(count) for every continuous feature that
# contains no zeros (np.log is undefined at 0, so those features are skipped).
for feature in continous_feature:
    if 0 in df[feature].unique():
        continue
    data=df.copy()
    data['count']=np.log(data['count'])
    # When the feature is 'count' itself it has just been transformed above;
    # applying the log a second time would produce log(log(count)).
    if feature!='count':
        data[feature]=np.log(data[feature])
    # Plot the transformed copy, not the untouched df.
    plt.scatter(data[feature],data['count'])
    plt.title(feature)
    plt.xlabel('log({})'.format(feature))
    plt.ylabel('log(count)')
    plt.show()

# Outliers

In [None]:
# Box plot of every numerical feature, one figure per feature, to eyeball outliers.
for feature in numerical_features:
    # Work on a fresh copy of df on each iteration.
    data=df.copy()
    data.boxplot(feature)
    plt.title(feature)
    plt.show()

# Missing values

In [None]:
df.isnull().sum()

In [None]:
# Report the percentage of missing values for every column that has any.
features_with_na=[feature for feature in df.columns if df[feature].isnull().any()]
if features_with_na:
    for feature in features_with_na:
        # Convert the missing ratio to a percentage first, then round to 4 decimals.
        missing_pct=np.round(df[feature].isnull().mean()*100,4)
        print("missing value of {} is {} % missing".format(feature,missing_pct))
else:
    # Printed only when no column has missing values (a for/else would always print it).
    print('there is no missing value')

In [None]:
df.head()

# Converting datetime to Year, Month, Day, Hours

In [None]:
df['datetime_new']=pd.to_datetime(df['datetime'])

In [None]:
df.head()

In [None]:
# Split the parsed timestamp into calendar components using the vectorized
# .dt accessor instead of calling a Python lambda once per row.
df['Year']=df['datetime_new'].dt.year
df['Month']=df['datetime_new'].dt.month
df['Day']=df['datetime_new'].dt.day
df['Hour']=df['datetime_new'].dt.hour

In [None]:
df.sample(4)

In [None]:
df['Day'].unique()

## Hourly plot for working and non-working days over the whole dataset
 Here, 1: working day
       0: non-working day

In [None]:
df.groupby(['Hour','workingday'])['count'].sum().unstack().plot.bar(figsize=(15,4), width=0.9)

In [None]:
#df[df.Year==2011]

## Define hourly plot for working day and non working day in 2011 and 2012

In [None]:
def define_hourly_work_plot(df, year=None,agg='sum',ylim=(0,110000)):
    """Bar-plot the aggregated rental count per hour, split by 'workingday'.

    Parameters
    ----------
    df : DataFrame
        Must provide 'Year', 'Hour', 'workingday' and 'count' columns.
    year : optional
        Value matched against df.Year. None (the default) keeps every row
        instead of filtering down to an empty frame that cannot be plotted.
    agg : str or callable
        Aggregation forwarded to GroupBy.agg for the 'count' column.
    ylim : tuple or None
        y-axis limits forwarded to DataFrame.plot. The default matches the
        previously hard-coded range; pass None to let matplotlib autoscale
        (useful when agg is not 'sum').

    Returns
    -------
    The object returned by DataFrame.plot (the bar-chart axes).
    """
    data=df if year is None else df[df.Year==year]
    # Rows: hour of day; columns: one per distinct 'workingday' value.
    hourly_data=data.groupby(['Hour','workingday'])['count'].agg(agg).unstack()
    title='All years' if year is None else 'Year={}'.format(year)
    hourly_plot=hourly_data.plot(kind='bar', ylim=ylim,
                                 figsize=(15,5),
                                 width=0.9,
                                 title=title)

    return hourly_plot
    

In [None]:
define_hourly_work_plot(df,year=2011)
define_hourly_work_plot(df,year=2012)

# Define hourly plot in 2011 and 2012

In [None]:
df.groupby(['Hour'])['count'].sum().plot.bar()

In [None]:
def define_hourly_plot(df, year=None,agg='sum',ylim=(0,140000)):
    """Bar-plot the aggregated rental count for each hour of the day.

    Parameters
    ----------
    df : DataFrame
        Must provide 'Year', 'Hour' and 'count' columns.
    year : optional
        Value matched against df.Year. None (the default) keeps every row
        instead of filtering down to an empty frame that cannot be plotted.
    agg : str or callable
        Aggregation forwarded to GroupBy.agg for the 'count' column.
    ylim : tuple or None
        y-axis limits forwarded to Series.plot. The default matches the
        previously hard-coded range; pass None to let matplotlib autoscale
        (useful when agg is not 'sum').

    Returns
    -------
    The object returned by Series.plot (the bar-chart axes).
    """
    data=df if year is None else df[df.Year==year]
    hourly_data=data.groupby(['Hour'])['count'].agg(agg)
    title='All years' if year is None else 'Year={}'.format(year)
    hourly_plot=hourly_data.plot(kind='bar', ylim=ylim,
                                 figsize=(15,5),
                                 width=0.9,
                                 title=title)

    return hourly_plot

In [None]:
define_hourly_plot(df,year=2011)

In [None]:
define_hourly_plot(df,year=2012)

# Comparison Year to Year data

In [None]:
def houly_plot_years(attr,title):
    """Bar-plot total rentals grouped by `attr`, with one bar series per year.

    Reads the notebook-level `df`. `attr` is the column to group by
    (for example 'Hour', 'Day' or 'Month') and `title` is the figure title.
    Returns the object produced by DataFrame.plot.
    """
    # Sum 'count' per (attr, Year) pair, then pivot the years into columns.
    yearly_totals=df.groupby([attr,'Year'])['count'].sum().unstack('Year')
    return yearly_totals.plot(kind='bar',title=title,width=0.9,figsize=(15,5))

In [None]:
df['Hour'].unique()

In [None]:
houly_plot_years('Hour',"Rent bikes per hour in 2011 and 2012")
houly_plot_years('Day',"Rent bikes per Day in 2011 and 2012")
houly_plot_years('Month',"Rent bikes per Month in 2011 and 2012")

# Daywise count for specific hour

In [None]:
df[(df['Year']==2011) & (df['Month']==1)& (df['Day']==1)&(df['Hour']==0)]['count'].values

In [None]:
data=df.copy()
data1=data[(data['Year']==2011) & (data['Month']==1)]
data1[(data1.Day==1)&(data1.Hour==0)]['count'].values

In [None]:
data1[(data1.Day==2)&(data1.Hour==0)]['count'].values

In [None]:
data1[(data1.Day==19)&(data1.Hour==0)]['count'].values

In [None]:
data1[(data1.Day==20)&(data1.Hour==0)]['count'].values

In [None]:
# Rentals recorded at hour 0 on each day of January 2011, one bar per day.
days={}
data=df.copy()
data1=data[(data['Year']==2011) & (data['Month']==1)]

# NOTE(review): days 1-19 only — presumably the data set has no later days
# in a month; confirm with df['Day'].unique().
for day in range(1,20):
    days[day]=data1[(data1.Day==day)&(data1.Hour==0)]['count'].values

plt.figure(figsize=(15,5))
for key,val in days.items():
    # One call per day keeps a distinct colour for every bar.
    plt.bar(key, val)

# Axis decoration does not depend on the day, so it is applied once.
plt.xticks(range(0,20))
plt.xlabel('Day')
plt.ylabel('count')
plt.title('bikes rent at 1st hour of January, 2011')

In [None]:
# For each hour 1..23, draw a separate figure with the rentals recorded at
# that hour on each day of January 2011 (hour 0 is not included in this loop).
days={}
data=df.copy()
data1=data[(data['Year']==2011) & (data['Month']==1)]
for i in range(1,24):
    # NOTE(review): days 1-19 only — presumably the data set has no later
    # days in a month; confirm with df['Day'].unique().
    for day in range(1,20):
        days[day]=data1[(data1.Day==day)&(data1.Hour==i)]['count'].values

    plt.figure(figsize=(15,5))
    for key,val in days.items():
        # One call per day keeps a distinct colour for every bar.
        plt.bar(key, val)

    # Axis decoration does not depend on the day, so it is applied once per figure.
    plt.xticks(range(0,20))
    plt.xlabel('Day')
    plt.ylabel('count')
    plt.title('bikes rent at {} hour of January, 2011'.format(i))

# Using Box Plot

# Yearwise hourly count

In [None]:
def box_year_hourly(df,message=''):
    """Draw one box plot of 'count' for each hour of the day (0-23).

    Parameters
    ----------
    df : DataFrame
        Must provide 'Hour' and 'count' columns.
    message : str
        Text appended to the figure title on a second line.

    The figure is drawn through pyplot; nothing is returned.
    """
    # One array of rental counts per hour, ordered 0..23.
    counts_by_hour=[df.loc[df.Hour==hour,'count'].values for hour in range(24)]

    plt.figure(figsize=(20,10))
    plt.xlabel('Hours')
    plt.ylabel('Count rent')
    plt.title('count vs hours\n'+message)
    plt.boxplot(counts_by_hour)
    # boxplot places the 24 boxes at positions 1..24 by default; relabel the
    # ticks so they show the actual hour (0..23) instead of being off by one.
    plt.xticks(range(1,25),range(24))

    axis=plt.gca()
    axis.set_ylim([1,1100])

In [None]:
# box plot for hourly count for the mentioned year
box_year_hourly(df[df['Year']==2011],'year 2011')
box_year_hourly(df[df['Year']==2012],'year 2012')

In [None]:

box_year_hourly( df[df.workingday == 1], 'working day') # hourly distribution of rented bikes on working days (no year filter applied)
box_year_hourly( df[df.workingday == 0], 'non working day') # hourly distribution of rented bikes on non-working days (no year filter applied)

In [None]:
# Hour vs. count graph depicting the average bike demand for each hour
figure,axes = plt.subplots(figsize = (10, 5))
# Select 'count' before aggregating: averaging every column of df first is
# wasted work and fails on the string 'datetime' column in pandas >= 2.0.
hours = df.groupby("Hour")["count"].mean()
hours.plot(kind="line", ax=axes)
plt.title('Hours VS Counts')
axes.set_xlabel('Time in Hours')
axes.set_ylabel('Average of the Bike Demand')
plt.show()

# Define ranges of hour values

In [None]:
def range_to_group(x):
    """Map an hour value to its time-of-day group.

    Groups: 0 for [0, 6), 1 for [6, 13), 2 for [13, 19), 3 for [19, 24).
    Any value outside [0, 24) yields None.
    """
    if not 0<=x<24:
        return None
    # Upper bounds are exclusive and ascending, so the first one that
    # exceeds x identifies the group.
    for group,upper in enumerate((6,13,19,24)):
        if x<upper:
            return group
    return None

In [None]:
df['Hour2']=df['Hour'].apply(range_to_group)

In [None]:
# Hour-group vs. count graph depicting the average bike demand for each hour group
figure,axes = plt.subplots(figsize = (10, 5))
# Select 'count' before aggregating: averaging every column of df first is
# wasted work and fails on the string 'datetime' column in pandas >= 2.0.
hours = df.groupby("Hour2")["count"].mean()
hours.plot(kind="line", ax=axes)
plt.title('Hours VS Counts')
# The x axis shows the group index stored in 'Hour2', not the raw hour.
axes.set_xlabel('Hour group (0: 0-5, 1: 6-12, 2: 13-18, 3: 19-23)')
axes.set_ylabel('Average of the Bike Demand')
plt.show()

In [None]:
df.sample(5)

# What is the relationship between temp and count?

In [None]:
df['temp'].hist()

In [None]:
df.groupby(['temp'])['count'].mean().plot()
plt.show()

In [None]:
df['atemp'].hist()

In [None]:
df.groupby(['atemp'])['count'].mean().plot()
plt.show()

In [None]:
df.groupby(['humidity'])['count'].mean().plot()
plt.ylabel('count')
plt.show()

In [None]:
df.groupby(['windspeed'])['count'].mean().plot()
plt.ylabel('count')
plt.show()

In [None]:
df.groupby(['casual'])['count'].mean().plot()
plt.ylabel('count')
plt.show()

In [None]:
df.groupby(['registered'])['count'].mean().plot()
plt.ylabel('count')
plt.show()

In [None]:
# Remaining features examined below: season, holiday, workingday, weather

In [None]:
df.groupby(['season'])['count'].mean().plot()
plt.ylabel('count')
plt.show()

In [None]:
df.groupby(['holiday'])['count'].mean().plot()
plt.ylabel('count')
plt.show()

In [None]:
df.groupby(['workingday'])['count'].mean().plot()
plt.ylabel('count')
plt.show()

In [None]:
df.groupby(['weather'])['count'].mean().plot()
plt.ylabel('count')
plt.show()

In [None]:
df['holiday'].unique()

In [None]:
sns.pairplot(df)