In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
%cd drive/MyDrive/kaggle/SeoulBikeDemandForecasting/notebook

/content/drive/MyDrive/kaggle/SeoulBikeDemandForecasting/notebook


In [3]:
import pandas as pd
from sklearn.preprocessing import  LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV,  cross_val_score

In [4]:
data = pd.read_csv("../data/SeoulBikeData.csv", encoding= 'unicode_escape')

## (1) Problem Description

This is the data that stores the number of bicycle rentals in Seoul every hour from January 12, 2017 to November 30, 2018. Our goal is to predict the number of bicycle rentals on the last 28 days given the data.

## (2) Exploratory Data Analysis

In [5]:
data_new = pd.DataFrame(index = data.columns)
data_new['DataType'] = data.dtypes
data_new['num_data'] = data.count()
data_new['NaN'] = data.isnull().sum()

data_new

Unnamed: 0,DataType,num_data,NaN
Date,object,8760,0
Rented Bike Count,int64,8760,0
Hour,int64,8760,0
Temperature(°C),float64,8760,0
Humidity(%),int64,8760,0
Wind speed (m/s),float64,8760,0
Visibility (10m),int64,8760,0
Dew point temperature(°C),float64,8760,0
Solar Radiation (MJ/m2),float64,8760,0
Rainfall(mm),float64,8760,0


(1) There are 8,760 data with 14 features in the table. The features associated with time are Date, Hour, and Seasons.

In [6]:
data['Date'] = pd.to_datetime(data['Date'])
data['day'] = data['Date'].dt.day
data['month'] = data['Date'].dt.month
data['year'] = data['Date'].dt.year
data.drop(['Date'], axis = 1, inplace = True)

In [7]:
data.groupby("Functioning Day")['Rented Bike Count'].sum()

Functioning Day
No           0
Yes    6172314
Name: Rented Bike Count, dtype: int64

As the number of bikes is zero on a non-functioning day, the corresponding data is removed from the dataframe

In [8]:
data = data.drop(data[data['Functioning Day'] == 'No'].index)

(1) Apply LabelEncoder to the categorical data.

(2) The last 28 days and 24 hours data is used as test data and the rest of the data is used as learning data.

In [9]:
data_ = data.apply(LabelEncoder().fit_transform)
X = data_.drop('Rented Bike Count',axis=1)
y = data_['Rented Bike Count']

train, test = data_.iloc[:-678], data_.iloc[-678:]
x_train, y_train = train.drop(['Rented Bike Count'], axis = 1), train['Rented Bike Count']
x_test, y_test   = test.drop(['Rented Bike Count'], axis = 1), test['Rented Bike Count']

Set the LightGBM Regressor as the baseline and select the features through recursive feature elimination (RFE). 

And use GridSearchCV to tune the hyperparameters.

In [10]:
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor

param_grid = [{'estimator__n_estimators': [50, 100, 150, 200], 
               'estimator__learning_rate': [0.1, 0.5, 0.01, 0.05]}] 

clf = LGBMRegressor(random_state = 42, n_estimators = 100, learning_rate = 0.01)
selector = RFECV(clf, step = 1, cv = 5, min_features_to_select = 5)
model = GridSearchCV(selector, param_grid, cv = 5)
#model = LGBMRegressor()
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.7019152245826251

In [14]:
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go

estimator = model.best_estimator_
y_pred = estimator.predict(x_test)

fig = make_subplots(rows = 1, cols = 1)

fig.add_trace(
    go.Scatter(x = np.arange(678), mode = 'lines', y = y_pred,
               marker = dict(color='dodgerblue'), 
               name = 'Prediction'), row = 1, col = 1)

fig.add_trace(
    go.Scatter(x = np.arange(678), mode = 'lines', y = y_test,
               marker = dict(color="seagreen"), 
               name = 'Ground Truth'), row = 1, col = 1)

fig.show()

In [13]:
from sklearn.metrics import r2_score

print(f"Model R-Square : {r2_score(y_test,y_pred)*100:.2f}%")

Model R-Square : 70.19%
