# Bike Sharing Demand

## 1. Understanding the Competition

## 2. EDA (Exploratory Data Analysis)

### 2.1.  Exploring the data

In [2]:
# Get data (train data/test data/sample submission data)
import numpy as np
import pandas as pd

# Kaggle input directory that holds the competition CSV files
data_path = '/kaggle/input/bike-sharing-demand/'

# Read train / test / sample-submission files, in that order
csv_names = ['train.csv', 'test.csv', 'sampleSubmission.csv']
train, test, submission = [pd.read_csv(data_path + csv_name)
                           for csv_name in csv_names]

In [3]:
# Show (rows, columns) of the train set followed by the test set
tuple(frame.shape for frame in (train, test))

In [4]:
# Preview the first five rows of the train data
train.head(n=5)

In [5]:
# Preview the first five rows of the test data
test.head(n=5)

---
- Remove the **'casual'** and **'registered'** features (they exist in the train data but not in the test data)
---

In [6]:
# Preview the first five rows of the sample submission
submission.head(n=5)

---
- Remove the **'datetime'** feature (it only serves as an ID column that identifies each row, so the raw value does not contribute to predicting the target)
---

In [7]:
# Print each column's non-null count and dtype for the train data
# (a non-null count below the row count would reveal missing values)
train.info()

In [8]:
# Print each column's non-null count and dtype for the test data
# (a non-null count below the row count would reveal missing values)
test.info()

### 2.2. Feature Engineering for effective visualization

#### 1) Feature 'datetime'
- Difficult to visualize the raw data because of 'datetime' feature
- Split 'datetime' feature and make new features like **'date', 'year', 'month', 'day', 'hour', 'minute', 'second'** and **'weekday'**

In [9]:
# Demonstrate, on one sample row, how a 'datetime' string breaks into parts
sample_stamp = train['datetime'][30]

# Whitespace split -> [date, time]
date_time = sample_stamp.split()
print(date_time)
print(date_time[0]) # date
print(date_time[1]) # time

# Date part split on '-' -> year, month, day (printed in that order)
for position in range(3):
    print(date_time[0].split('-')[position])

# Time part split on ':' -> hour, minute, second (printed in that order)
for position in range(3):
    print(date_time[1].split(':')[position])

In [10]:
# Make new features (date/year/month/day/hour/minute/second) from 'datetime'.
# All new columns hold the raw substrings, so they stay strings.
train['date'] = train['datetime'].apply(lambda stamp: stamp.split()[0])

# (new column, index of the date/time half, separator, position inside that half)
piece_specs = [
    ('year', 0, '-', 0),
    ('month', 0, '-', 1),
    ('day', 0, '-', 2),
    ('hour', 1, ':', 0),
    ('minute', 1, ':', 1),
    ('second', 1, ':', 2),
]
for column, half, separator, position in piece_specs:
    # apply() runs immediately, so the lambda sees this iteration's values
    train[column] = train['datetime'].apply(
        lambda stamp: stamp.split()[half].split(separator)[position])

In [11]:
# How to get weekday from date
    # Import library
from datetime import datetime
import calendar

    # Change datatype from object to datetime
sample_date = train['date'][30]
parsed_date = datetime.strptime(sample_date, '%Y-%m-%d')
print(sample_date)
print(parsed_date)

# Weekday as an integer
weekday_number = parsed_date.weekday()
print(weekday_number)

# Weekday as a name, looked up by that integer
print(calendar.day_name[weekday_number])

In [12]:
# Make new feature (weekday): the day name of every 'date' string
def _weekday_name(date_str):
    """Return the calendar day name for a 'YYYY-MM-DD' date string."""
    parsed = datetime.strptime(date_str, '%Y-%m-%d')
    return calendar.day_name[parsed.weekday()]

train['weekday'] = train['date'].apply(_weekday_name)

#### 2) Feature 'season' and 'weather'
- Feature 'season' and 'weather' are composed of numbers
- Difficult to understand the meaning of each number
- Change the number into real meaning

In [13]:
# Replace the numeric codes of 'season' and 'weather' with readable labels.
# replace() is used instead of map() so the cell is safe to re-run: map()
# turns every value that is not a key of the dict into NaN, which would wipe
# the labels on a second execution, whereas replace() leaves values that are
# not keys (including labels that were already applied) untouched.
season_labels = {1: 'Spring',
                 2: 'Summer',
                 3: 'Fall',
                 4: 'Winter'}
weather_labels = {1: 'Clear',
                  2: 'Mist, Few clouds',
                  3: 'Light Snow, Rain, Thunderstorm',
                  4: 'Heavy Rain, Thunderstorm, Snow, Fog'}

train['season'] = train['season'].replace(season_labels)
train['weather'] = train['weather'].replace(weather_labels)

In [14]:
# Preview the first five rows of the train data with the new columns
train.head(n=5)

---
- Remove features with duplicate meanings when training model
    + **'date'** because of 'year'/'month'/'day'
    + **'month'** because of 'season'
---

### 2.3. Data visualization

In [15]:
# Import library
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

#### 1) Distribution plot

In [16]:
# Plot how the target value 'count' is distributed
mpl.rcParams['font.size'] = 15
sns.displot(data=train, x='count');

---
- Transform target values **'count'**
    - y → log(y)
        + The target distribution is right-skewed (most values sit near 0, with a long tail to the right)
        + <u>Regression models tend to perform better when the target is close to normally distributed</u>
        + Use <u>log transformation</u> to bring the right-skewed data closer to a normal distribution
    - log(y) → y
        + But the log-transformed predictions must be restored by <u>exponential transformation at the end</u>
---

In [17]:
# Plot the distribution of the target after a natural-log transform
log_count = np.log(train['count'])
sns.displot(log_count);

#### 2) Bar plot
- Matplotlib
    + Figure : the whole canvas that holds the m x n grid of subplots
    + Axes : each individual subplot inside the figure
    + [matplotlib.pyplot.tick_params](https://bit.ly/3DBCQaQ)

In [18]:
# Bar plots of the average rental amounts by year/month/day/hour/minute/second

# Step 1 : Prepare m x n figure
mpl.rc('font', size=4)                        # Set font size
mpl.rc('axes', titlesize=15)                  # Set title size of each axes
figure, axes = plt.subplots(nrows=3, ncols=2) # Make rows(3) x cols(2) figure
figure.set_size_inches(10, 9)                 # Set figure size width(10) x height(9)

# Step 2 : Draw one titled bar plot per time feature
# axes.flat walks the grid row by row, so the features land on
# [0, 0], [0, 1], [1, 0], [1, 1], [2, 0], [2, 1] in list order.
time_features = ['year', 'month', 'day', 'hour', 'minute', 'second']
for axis, feature in zip(axes.flat, time_features):
    sns.barplot(x=feature, y='count', data=train, ax=axis)
    axis.set(title='Rental amounts by ' + feature)

# Step 3 : Set details
    # Rotate labels of x axis (day and hour panels)
axes[1, 0].tick_params(axis='x', labelrotation=90)
axes[1, 1].tick_params(axis='x', labelrotation=90)

# Step 4 : Add space between graphs
# tight_layout() only accounts for what exists when it is called and is not
# re-run automatically, so it goes last: after the figure has its final size
# and every title and rotated tick label is in place.
figure.tight_layout()

---
- Remove **'day', 'minute', 'second'** feature
    + year : 2011 amounts < 2012 amounts
    + month : Most amounts in June, Least amounts in January
    + day : No common values between train data and test data  
        - <u>To use the 'day' column as a feature, the train and test sets must share common values</u>
    + hour : Most amounts at 8 am and 5-6 pm
    + minute : No information in this column
    + second : No information in this column
---

#### 4) Box plot

In [19]:
# Box plots on count across season/weather/holiday/working day
# No font size is set in this cell, so the matplotlib rc settings that are
# active when it runs still apply.

# Step 1 : Prepare m x n figure
figure, axes = plt.subplots(nrows=2, ncols=2)
figure.set_size_inches(10, 10)

# Step 2 : Draw one titled box plot per categorical feature
# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
panels = [
    ('season', 'Box Plot On Count Across Season'),
    ('weather', 'Box Plot On Count Across Weather'),
    ('holiday', 'Box Plot On Count Across Holiday'),
    ('workingday', 'Box Plot On Count Across Workingday'),
]
for axis, (feature, title) in zip(axes.flat, panels):
    sns.boxplot(x=feature, y='count', data=train, ax=axis)
    axis.set(title=title)

# Step 3 : Set details
    # Tilt the x tick labels of the weather panel
axes[0, 1].tick_params(axis='x', labelrotation=10)

# Step 4 : Add space between graphs
# tight_layout() only accounts for what exists when it is called and is not
# re-run automatically, so it goes last: after the figure has its final size
# and the titles and tilted tick labels are in place.
figure.tight_layout()

---
- Understanding graphs
    + Non-holidays show many outliers
    + Working days show many outliers
---

#### 5) Point plot

In [20]:
# Point plots of average rental amounts per hour, split by
# workingday/holiday/weekday/season/weather

# Step 1 : Prepare a single column of five axes
mpl.rc('font', size=11)
figure, axes = plt.subplots(nrows=5)
figure.set_size_inches(12, 18)

# Step 2 : Allocate one point plot to each axes
def _hourly_pointplot(hue_column, axis):
    """Draw 'count' against 'hour' on `axis`, grouped by `hue_column`."""
    return sns.pointplot(x='hour', y='count', hue=hue_column, data=train, ax=axis)

_hourly_pointplot('workingday', axes[0])
_hourly_pointplot('holiday', axes[1])
_hourly_pointplot('weekday', axes[2])
_hourly_pointplot('season', axes[3])
_hourly_pointplot('weather', axes[4])

---
- Understanding graphs
    + Workingday/Holiday/Weekday
        - Lots of rentals during commuting time on workingday
        - Lots of rentals at 12-2pm on days off
    + Season
        - Most in autumn and least in spring
    + Weather
        - Lots of rentals when good weather
        - **Delete outlier data whose weather is 4**(heavy snow, rain, thunder)
---

#### 6) Scatter plot with regression line
- Identify correlations between numeric data
- Parameters
    + scatter_kws={'alpha': 0.2} : Set the opacity of points
    + line_kws={'color': 'blue'} : Set the color of regression line

In [22]:
# Scatter plot of rental amounts by temp/atemp/windspeed/humidity

# Step 1 : Prepare m x n figure
mpl.rc('font', size=15)
figure, axes = plt.subplots(nrows=2, ncols=2)
figure.set_size_inches(7, 6)

# Step 2 : Draw one scatter plot with regression line per numeric feature
# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
numeric_features = ['temp', 'atemp', 'windspeed', 'humidity']
for axis, feature in zip(axes.flat, numeric_features):
    sns.regplot(x=feature, y='count', data=train, ax=axis,
                scatter_kws={'alpha': 0.2},    # opacity of the points
                line_kws={'color': 'blue'})    # color of the regression line

# Step 3 : Add space between graphs
# tight_layout() only accounts for what exists when it is called and is not
# re-run automatically, so it goes last: after the figure has its final size
# and the axis labels and tick labels are in place.
figure.tight_layout()

---
- Understanding graphs
    + temp : Higher temperature, more rental
    + atemp : Higher temperature, more rental
    + humidity : Lower humidity, more rental
    + windspeed
        - Higher windspeed, more rental?
        - Too many 0 values most likely to have been recorded due to an error
        - Hard to identify correlation between windspeed and rentals because of missing values
        - Remove **'windspeed'** feature
---

#### 7) Heatmap
- Identify correlations between numeric data and visualize with heatmap
- Numeric data : temp, atemp, humidity, windspeed, count

In [23]:
# Pairwise correlation coefficients of the numeric columns and the target
numeric_columns = ['temp', 'atemp', 'humidity', 'windspeed', 'count']
train[numeric_columns].corr()

In [24]:
# Correlation matrix of the numeric columns and the target
corrMat = train.loc[:, ['temp', 'atemp', 'humidity', 'windspeed', 'count']].corr()

# Draw the matrix as a heatmap on a 10 x 10 inch figure
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corrMat, annot=True, ax=ax) # annot=True : write each coefficient in its cell
ax.set_title('Heatmap of Numerical Data');

---
- Understanding graphs
    + temp : Positive (Higher temperature, more rental)
    + atemp : Positive (Higher temperature, more rental)
    + humidity : Negative (Lower humidity, more rental)
    + windspeed
        - Positive but weak correlation
        - Remove **'windspeed'** feature
---

- When training an ML model, the feature data types should be int/float