# Bike sharing Demand

[캐글 자전거 대여수요](https://www.kaggle.com/c/bike-sharing-demand/overview)

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

In [3]:
# Load the bike-sharing training data from the local CSV file into a DataFrame.
bike = pd.read_csv('data/bike_train.csv')

In [4]:
# Preview the first rows to check the available columns and sample values.
bike.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [5]:
# Inspect per-column dtypes and non-null counts.
bike.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


* datetime : hourly date + timestamp
* season : 1 = 봄 , 2= 여름, 3= 가을 4= 겨울
* holiday : 1 = 주말을 제외한 국경일 등의 휴일, 0 = 휴일이 아닌 날 
* workingday : 1 = 주말/휴일이 아닌 주중, 0 = 주말 및 휴일 
* weather : 1 = 맑음, 2 = 안개 / 흐림 , 3 = 가벼운 비/눈 + 천둥 , 4 = 심한 눈/비 + 천둥/번개
* temp : 온도(섭씨)
* atemp : 체감온도(섭씨)
* humidity : 상대습도
* windspeed : 풍속
* casual : 사전에 등록되지 않은 사용자가 대여한 횟수
* registered : 사전에 등록한 사용자가 대여한 횟수 
* count : 전체 대여 횟수

In [7]:
# Parse the string column into datetime64 with one vectorized call;
# Series.apply(pd.to_datetime) would invoke the parser once per row.
bike['datetime'] = pd.to_datetime(bike['datetime'])

In [10]:
# Split the timestamp into calendar features using the vectorized .dt
# accessor instead of a Python-level lambda per row.
# NOTE(review): recent pandas versions return int32 for .dt fields where the
# lambda version produced int64 -- confirm nothing downstream depends on dtype.
timestamps = bike['datetime'].dt
bike['year'] = timestamps.year
bike['month'] = timestamps.month
bike['day'] = timestamps.day
bike['hour'] = timestamps.hour

bike.head(3)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,1,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,1,2


In [11]:
# Remove, in place, the raw timestamp (already split into year/month/day/hour)
# and the two rental sub-counts that add up to the target column `count`.
bike.drop(columns=['datetime', 'casual', 'registered'], inplace=True)

count = casual + registered 이므로 두 컬럼은 타깃(count)과 상관성이 매우 높아, 학습에 포함하면 예측을 저해할 우려가 있어 제거함.  

In [12]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [13]:
# log1p (= log(1 + x)) is used instead of log so that zero counts map to 0
# rather than -inf.

def rmsle(y, pred):
    """Return the root mean squared logarithmic error (RMSLE).

    Computes sqrt(mean((log1p(y) - log1p(pred)) ** 2)).

    Args:
        y: Actual target values (scalar or array-like).
        pred: Predicted values, broadcast-compatible with ``y``.

    Returns:
        A single scalar RMSLE value.
    """
    log_y = np.log1p(y)
    log_pred = np.log1p(pred)

    squared_error = (log_y - log_pred) ** 2
    # Average the squared errors before taking the root; without the mean the
    # result is a per-element array of absolute log errors, not one metric.
    return np.sqrt(np.mean(squared_error))

def rmse(y, pred):
    """Return the root mean squared error between ``y`` and ``pred``.

    Takes the square root of ``mean_squared_error(y, pred)``.
    """
    mse = mean_squared_error(y, pred)
    return np.sqrt(mse)

def evaluate_regr(y, pred):
    """Print RMSLE, RMSE and MAE for a set of regression predictions.

    Args:
        y: Actual target values.
        pred: Predicted values for the same samples.

    Returns:
        None. The three metrics are printed with three decimal places, so
        each metric must be a scalar for the format spec to apply.
    """
    rmsle_val = rmsle(y, pred)
    rmse_val = rmse(y, pred)

    # mean_absolute_error yields the MAE, so the value is named and labelled
    # MAE rather than MSE.
    mae_val = mean_absolute_error(y, pred)
    print('RMSLE : {0:.3f}, RMSE : {1:.3f}, MAE :{2:.3f}'.format(rmsle_val, rmse_val, mae_val))