In [1]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library warnings (e.g. deprecations) raised by
# the cells below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
# The CSV is read straight from this remote URL, so the cell needs network access.
url = 'https://raw.githubusercontent.com/rusita-ai/pyData/master/Bike_Sharing_Demand.csv'
DF = pd.read_csv(url)

In [3]:
# Column names, non-null counts and dtypes of the freshly loaded frame.
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [4]:
# Preview the first five rows.
DF.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [5]:
# Parse the timestamp strings in one vectorised call instead of invoking
# pd.to_datetime once per row through .apply.
DF['datetime'] = pd.to_datetime(DF['datetime'])

# Expose the calendar parts as integer columns via the .dt accessor
# (vectorised equivalent of the per-row lambdas).
DF['year'] = DF['datetime'].dt.year
DF['month'] = DF['datetime'].dt.month
DF['day'] = DF['datetime'].dt.day
DF['hour'] = DF['datetime'].dt.hour


In [6]:
# After year/month/day/hour have been split out, remove the raw timestamp and
# the casual/registered columns.
# NOTE(review): in the sample rows shown, casual + registered equals count, so
# keeping them would presumably leak the target — confirm on the full data.
drop_columns = ['datetime', 'casual', 'registered']

# Rebind instead of mutating in place, so the frame displayed by earlier cells
# is not silently changed underneath them.
DF = DF.drop(columns=drop_columns)

In [7]:
# Keep only the modelling columns, in a fixed order. 'windspeed', 'year' and
# 'day' are not part of this selection and are therefore discarded.
selected_columns = [
    'temp', 'atemp', 'humidity', 'count',
    'season', 'holiday', 'workingday', 'weather',
    'month', 'hour',
]
DF = DF.loc[:, selected_columns]
DF.head()

Unnamed: 0,temp,atemp,humidity,count,season,holiday,workingday,weather,month,hour
0,9.84,14.395,81,16,1,0,0,1,1,0
1,9.02,13.635,80,40,1,0,0,1,1,1
2,9.02,13.635,80,32,1,0,0,1,1,2
3,9.84,14.395,75,13,1,0,0,1,1,3
4,9.84,14.395,75,1,1,0,0,1,1,4


# Train & Test Split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Feature matrix (nine predictor columns) and target vector ('count').
feature_columns = ['temp', 'atemp', 'humidity', 'season', 'holiday',
                   'workingday', 'weather', 'month', 'hour']
target_column = 'count'

X = DF[feature_columns]
y = DF[target_column]

In [10]:
# 70 % train / 30 % test hold-out split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2045
)

In [11]:
# Report the (rows, columns) of each feature split next to its target shape.
for label, features, target in (('Train Data : ', X_train, y_train),
                                ('Test Data : ', X_test, y_test)):
    print(label, features.shape, target.shape)

Train Data :  (7620, 9) (7620,)
Test Data :  (3266, 9) (3266,)


# Gradient Boosting Machine Regressor 사용

In [54]:
%%time

from sklearn.ensemble import GradientBoostingRegressor
GBR = GradientBoostingRegressor(loss = 'ls',
                                n_estimators = 9000,
                                learning_rate = 0.0001,
                                criterion = 'mse',
                                max_features = 3,
                                max_depth = 1)

GBR.fit(X_train, y_train)

CPU times: user 11 s, sys: 18.4 ms, total: 11 s
Wall time: 11.1 s


# 1. Linear Regression

In [55]:
from sklearn.linear_model import LinearRegression

In [56]:
# Re-create the hold-out split. The arguments and seed match the split made
# earlier in the notebook, so the resulting partitions are the same.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2045
)

In [57]:
# Ordinary least-squares baseline, fitted on the training split.
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [58]:
# Predictions of the linear model on the held-out test split.
# NOTE(review): y_hat is not referenced again in the visible cells; the MSE
# cells below call model_lr.predict(X_test) again instead of reusing it.
y_hat = model_lr.predict(X_test)

In [59]:
# Fitted coefficients (one per feature column) followed by the intercept.
print(model_lr.coef_)
print(model_lr.intercept_)

[  0.73929596   5.81229507  -2.3932283  -10.39178529   4.17815303
  -2.02075546  -1.75752214  10.62521953   7.66216618]
59.18916672341018


In [60]:
# Score of the linear model on the training and test splits.
# Six decimals are requested explicitly with '.6f'; a bare '2f' is only a
# minimum field width and yields six decimals merely by default.
print("훈련 세트 점수: {:.6f}".format(model_lr.score(X_train, y_train)))
print("테스트 세트 점수: {:.6f}".format(model_lr.score(X_test, y_test)))

훈련 세트 점수: 0.341658
테스트 세트 점수: 0.326219


In [61]:
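# With a one-dimensional dataset the model is very simple, so overfitting is not a concern.
# With high-dimensional data a linear model becomes much more flexible, so it may overfit.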

In [62]:
from sklearn.metrics import mean_squared_error

In [63]:
# Test-set mean squared error of the linear model, shown as the cell output.
mean_squared_error(y_test, model_lr.predict(X_test))


21925.35925091442

In [64]:
# Test-set MSE of the linear model, kept in lrMSE for the later comparison.
lr_test_predictions = model_lr.predict(X_test)
lrMSE = mean_squared_error(y_test, lr_test_predictions)
print(lrMSE)

21925.35925091442


# 2. Ridge Regression

In [65]:
from sklearn.linear_model import Ridge

In [66]:
# Re-create the hold-out split. The arguments and seed match the splits made
# earlier in the notebook, so the resulting partitions are the same.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2045
)

In [67]:
# Ridge regression with alpha = 0.1 and the 'cholesky' solver, fitted on the
# training split.
# NOTE(review): `normalize=True` was deprecated in scikit-learn 1.0 and removed
# in 1.2; on newer versions this call needs an explicit scaler in a Pipeline —
# confirm the target library version.
model_rd = Ridge(normalize = True,
                 alpha=0.1,
                 solver = 'cholesky').fit(X_train, y_train)

# Six decimals are requested explicitly with '.6f'; a bare '2f' is only a
# minimum field width and yields six decimals merely by default.
print("훈련 세트 점수: {:.6f}".format(model_rd.score(X_train, y_train)))
print("테스트 세트 점수: {:.6f}".format(model_rd.score(X_test, y_test)))

훈련 세트 점수: 0.339538
테스트 세트 점수: 0.327341


In [68]:
from sklearn.metrics import mean_squared_error

In [69]:
# Test-set MSE of the ridge model, kept in rdMSE for the later comparison.
rd_test_predictions = model_rd.predict(X_test)
rdMSE = mean_squared_error(y_test, rd_test_predictions)
print(rdMSE)

21888.854159887644


# 3. Lasso Regression

In [70]:
from sklearn.linear_model import Lasso
import numpy as np

In [71]:
# Lasso regression with alpha = 0.01, fitted on the training split.
model_ls = Lasso(alpha=0.01).fit(X_train, y_train)

# Six decimals are requested explicitly with '.6f'; a bare '2f' is only a
# minimum field width and yields six decimals merely by default.
print("훈련 세트 점수: {:.6f}".format(model_ls.score(X_train, y_train)))
print("테스트 세트 점수: {:.6f}".format(model_ls.score(X_test, y_test)))

# Number of features whose coefficient was not shrunk to exactly zero.
print("사용한 특성의 개수: ", np.sum(model_ls.coef_ != 0))

훈련 세트 점수: 0.341658
테스트 세트 점수: 0.326250
사용한 특성의 개수:  9


In [72]:
from sklearn.metrics import mean_squared_error

In [73]:
# Test-set MSE of the lasso model, kept in lsMSE for the later comparison.
ls_test_predictions = model_ls.predict(X_test)
lsMSE = mean_squared_error(y_test, ls_test_predictions)
print(lsMSE)

21924.35401912829


# MSE 비교

In [74]:
# Print the test MSEs one per line, in the order: linear, ridge, lasso.
for model_mse in (lrMSE, rdMSE, lsMSE):
    print(model_mse)

21925.35925091442
21888.854159887644
21924.35401912829


  * Linear Regression의 MSE가 21925.35925로 가장 높고(오차가 가장 큼), Ridge Regression이 21888.85416으로 가장 낮다. 다만 세 모델 간 MSE 차이는 0.2% 미만으로 매우 작다.