In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
import plotly.express as px

# 그래프 타이틀 한글 출력
matplotlib.rcParams['font.family'] ='Malgun Gothic'

# 그래프 사이즈 지정
plt.rcParams['figure.figsize'] = [12, 8]

import warnings
warnings.filterwarnings('ignore')

### 1. 데이터 불러오기 : 자전거 대여량 수요 예측

In [4]:
df = pd.read_csv('./datasets/bike_train.csv')
pd.set_option('display.max_columns', len(df.columns))
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


주요 컬럼 설명
- datetime: hourly date + timestamp
- season: 1 = 봄, 2 = 여름, 3 = 가을, 4 = 겨울
- holiday: 1 = 토, 일요일의 주말을 제외한 국경일 등의 휴일, 0 = 휴일이 아닌 날
- workingday: 1 = 토, 일요일의 주말 및 휴일이 아닌 주중, 0 = 주말 및 휴일
- weather: 1 = 맑음, 약간 구름 낀 흐림 2 = 안개, 안개 + 흐림 3 = 가벼운 눈, 가벼운 비 + 천둥 4 = 심한 눈/비, 천둥/번개
- temp: 온도(섭씨)
- atemp: 체감온도(섭씨)
- humidity: 상대습도
- windspeed: 풍속
- casual: 사전에 등록되지 않는 사용자가 대여한 횟수
- registered: 사전에 등록된 사용자가 대여한 횟수
- count: 대여 횟수

자전거 대여 횟수(Y) 예측을 위해 X를 만들어 모델에 적용해보세요!

### 2. 전처리

- casual, registered 컬럼은 대여 횟수이므로 이용할 수 없음

In [5]:
df_p = df.copy()

df_p = df.drop(columns = ['casual', 'registered'])

- 결측치는 없음. datetime은 object type인 상태.

In [6]:
df_p.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   count       10886 non-null  int64  
dtypes: float64(3), int64(6), object(1)
memory usage: 850.6+ KB


- 이상한 값 없나 확인

In [7]:
df_p.season.unique()

array([1, 2, 3, 4], dtype=int64)

In [8]:
df_p.holiday.unique()

array([0, 1], dtype=int64)

In [9]:
df_p.workingday.unique()

array([0, 1], dtype=int64)

In [10]:
df_p.weather.unique()

array([1, 2, 3, 4], dtype=int64)

- datetime 컬럼에서 연도, 월, 시간 뽑아오기

In [11]:
df_p['datetime'] = df_p['datetime'].apply(pd.to_datetime)

In [12]:
df_p['year'] = df_p['datetime'].apply(lambda x: x.year)
df_p['month'] = df_p['datetime'].apply(lambda x: x.month)
df_p['hour'] = df_p['datetime'].apply(lambda x: x.hour)

In [13]:
df_p

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,...,humidity,windspeed,count,year,month,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,...,81,0.0000,16,2011,1,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,...,80,0.0000,40,2011,1,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,...,80,0.0000,32,2011,1,2
3,2011-01-01 03:00:00,1,0,0,1,9.84,...,75,0.0000,13,2011,1,3
4,2011-01-01 04:00:00,1,0,0,1,9.84,...,75,0.0000,1,2011,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012-12-19 19:00:00,4,0,1,1,15.58,...,50,26.0027,336,2012,12,19
10882,2012-12-19 20:00:00,4,0,1,1,14.76,...,57,15.0013,241,2012,12,20
10883,2012-12-19 21:00:00,4,0,1,1,13.94,...,61,15.0013,168,2012,12,21
10884,2012-12-19 22:00:00,4,0,1,1,13.94,...,61,6.0032,129,2012,12,22


### 3. 시각화

- season에 따른 대여량. 
    - season 1의 경우 컬럼 설명은 봄이지만 사실상 겨울에 해당하는 1월과 2월이 끼어있어 대여량이 낮다
    - 유의미한 차이가 있어 예측에 도움이 될 듯 하다

In [15]:
df_s = df_p.groupby(['season']).mean().reset_index()
df_s

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,year,month,hour
0,1,0.026433,0.680566,1.424423,12.530491,15.228956,56.297841,14.636978,116.343261,2011.507446,2.006329,11.642591
1,2,0.017563,0.692645,1.422978,22.823483,26.647098,60.852909,13.405607,215.251372,2011.499817,5.001098,11.508965
2,3,0.035126,0.675082,1.36663,28.789111,32.540783,64.123674,11.508862,234.417124,2011.500549,7.998902,11.510428
3,4,0.035113,0.675201,1.459766,16.649239,20.059909,66.173738,11.678147,198.988296,2011.5,11.000366,11.506218


In [16]:
fig = px.histogram(df_p, x="season", text_auto=True,
                 y="count", nbins = 10, width=600, height=400)

fig.show()

In [17]:
fig = px.bar(df_s, x="season", text_auto=True,
                 y="count", width=600, height=400)

fig.show()

- holiday에 따른 평균 대여량
    - 거의 차이가 없다

In [18]:
df_h = df_p.groupby(['holiday']).mean().reset_index()
df_h

Unnamed: 0,holiday,season,workingday,weather,temp,atemp,humidity,windspeed,count,year,month,hour
0,0,2.500993,0.700898,1.419196,20.230466,23.662662,61.880095,12.787623,191.741655,2011.500898,6.520473,11.542033
1,1,2.697749,0.0,1.392283,20.244244,23.397395,62.102894,13.199711,185.877814,2011.536977,6.55627,11.527331


In [19]:
fig = px.bar(df_h, x="holiday", text_auto=True,
                 y="count", width=600, height=400)

fig.update_layout(bargap=0.2)

fig.show()

- workingday 에 따른 평균 대여량
    - 역시 큰 차이가 없다

In [20]:
df_w = df_p.groupby(['workingday']).mean().reset_index()

In [21]:
fig = px.histogram(df_w, x="workingday", text_auto=True,
                 y="count", width=600, height=400)

fig.update_layout(bargap=0.2)

fig.show()

- 시간에 따른 holiday, workingday 별 count 그래프
    - 시간으로 나눠서 보니 의미가 있다
    - 근무일에는 출퇴근 시간에 대여량이 늘어남

In [152]:
df_hh = df_p.groupby(['hour', 'holiday']).mean().reset_index()
df_hh.head()

Unnamed: 0,hour,holiday,season,workingday,weather,temp,atemp,humidity,windspeed,count,year,month
0,0,0,2.497738,0.701357,1.39819,19.002851,22.454796,68.097285,10.676935,54.79638,2011.5,6.511312
1,0,1,2.692308,0.0,1.230769,19.364615,22.727308,67.461538,11.538946,66.769231,2011.538462,6.538462
2,1,0,2.496599,0.70068,1.437642,18.633152,22.009252,69.623583,10.408488,33.582766,2011.501134,6.505669
3,1,1,2.692308,0.0,1.230769,18.86,22.086923,68.153846,10.769992,43.230769,2011.538462,6.538462
4,2,0,2.517241,0.701149,1.409195,18.454713,21.828655,70.666667,10.12447,22.747126,2011.505747,6.56092


In [153]:
fig = px.line(df_hh, x = 'hour', y = 'count', color = 'holiday', width = 1000, height = 400)

fig.show()

In [150]:
df_hw = df_p.groupby(['hour', 'workingday']).mean().reset_index()
df_hw.head()

Unnamed: 0,hour,workingday,season,holiday,weather,temp,atemp,humidity,windspeed,count,year,month
0,0,0,2.517241,0.089655,1.337931,18.68469,22.121138,66.972414,10.510775,94.489655,2011.503448,6.531034
1,0,1,2.496774,0.0,1.419355,19.166839,22.62229,68.596774,10.790804,36.732258,2011.5,6.503226
2,1,0,2.517241,0.089655,1.413793,18.305793,21.630276,68.606897,10.172633,71.910345,2011.503448,6.531034
3,1,1,2.495146,0.0,1.440129,18.796311,22.190356,70.038835,10.534373,16.003236,2011.501618,6.495146
4,2,0,2.538462,0.090909,1.384615,18.12028,21.466503,69.79021,10.000429,53.748252,2011.503497,6.58042


In [151]:
fig = px.line(df_hw, x = 'hour', y = 'count', color = 'workingday', width = 1000, height = 400)

fig.show()

- 연도별 대여량
    - 2011년과 2012년 사이에 유의미한 차이가 난다.
    - 연도는 반복되지 않는데 예측에 쓰는게 맞나..?

In [22]:
fig = px.histogram(df_p, x="year", text_auto=True,
                 y="count", width=600, height=400)

fig.update_layout(bargap=0.2)

fig.show()

- 월별 대여량

In [23]:
fig = px.histogram(df_p, x="month", text_auto=True,
                 y="count", nbins = 50, width=1000, height=400)

fig.show()

- 시간별 대여량
    - 1~5시 새벽시간에는 사용이 너무 적어서 모두 합치는 걸 고려해볼 수도..

In [24]:
fig = px.histogram(df_p, x="hour", text_auto=True,
                 y="count", nbins = 50, width=1000, height=400)

fig.show()

- 바람 세기 분포
    - 바람 세기가 0인 날이 많다.
    - 결측치이거나 일정 이하의 숫자면 0으로 몰아넣은 듯

In [84]:
px.histogram(df_p, x = 'windspeed', y = df_p.index, width = 800, height = 400)

### 4. Encoding / Scaling

- Encoding

In [155]:
df_pp = df_p.copy()

In [18]:
df_pp = pd.get_dummies(df_pp, columns=['season', 'holiday', 'workingday', 'weather', 'year', 'month', 'hour'])

- train / test split

In [19]:
X = df_pp.drop(columns = ['datetime', 'count'])
y = df_pp['count']

In [20]:
X

Unnamed: 0,temp,atemp,humidity,windspeed,season_1,season_2,...,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,9.84,14.395,81,0.0000,1,0,...,0,0,0,0,0,0
1,9.02,13.635,80,0.0000,1,0,...,0,0,0,0,0,0
2,9.02,13.635,80,0.0000,1,0,...,0,0,0,0,0,0
3,9.84,14.395,75,0.0000,1,0,...,0,0,0,0,0,0
4,9.84,14.395,75,0.0000,1,0,...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,15.58,19.695,50,26.0027,0,0,...,0,1,0,0,0,0
10882,14.76,17.425,57,15.0013,0,0,...,0,0,1,0,0,0
10883,13.94,15.910,61,15.0013,0,0,...,0,0,0,1,0,0
10884,13.94,17.425,61,6.0032,0,0,...,0,0,0,0,1,0


In [21]:
X = X.to_numpy()
y = y.to_numpy()

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 7)

- Scaling

In [23]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

- atemp 지워보기

In [204]:
X = df_pp.drop(columns = ['datetime', 'atemp', 'count'])
y = df_pp['count']

- 1~5시 통합

In [16]:
# df_pp = df_p.copy()

df_pp['hour'] = df_pp['hour'].apply(lambda x: 5 if x in [1,2,3,4,5] else x)

In [17]:
df_pp

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,...,humidity,windspeed,count,year,month,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,...,81,0.0000,2.772589,2011,1,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,...,80,0.0000,3.688879,2011,1,5
2,2011-01-01 02:00:00,1,0,0,1,9.02,...,80,0.0000,3.465736,2011,1,5
3,2011-01-01 03:00:00,1,0,0,1,9.84,...,75,0.0000,2.564949,2011,1,5
4,2011-01-01 04:00:00,1,0,0,1,9.84,...,75,0.0000,0.000000,2011,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012-12-19 19:00:00,4,0,1,1,15.58,...,50,26.0027,5.817111,2012,12,19
10882,2012-12-19 20:00:00,4,0,1,1,14.76,...,57,15.0013,5.484797,2012,12,20
10883,2012-12-19 21:00:00,4,0,1,1,13.94,...,61,15.0013,5.123964,2012,12,21
10884,2012-12-19 22:00:00,4,0,1,1,13.94,...,61,6.0032,4.859812,2012,12,22


- 연도 빼기

In [120]:
df_pp = pd.get_dummies(df_pp, columns=['season', 'holiday', 'workingday', 'weather', 'month', 'hour'])

In [121]:
X = df_pp.drop(columns = ['datetime', 'atemp', 'count', 'year'])
y = df_pp['count']

### 5. 모델 적용 / 평가

In [24]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 사이킷런의 mean_square_error() 를 이용하여 RMSE 계산
def rmse(y,pred):
    return np.sqrt(mean_squared_error(y,pred))

# MSE, RMSE, RMSLE 를 모두 계산
def evaluate_regr(y,pred):
    rmse_val = rmse(y,pred)
    # MAE 는 scikit learn의 mean_absolute_error() 로 계산
    mae_val = mean_absolute_error(y,pred)
    print(f'RMSE: {rmse_val:.3F}, MAE: {mae_val:.3F}')

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor


lr_reg = LinearRegression()
lgbm_reg = LGBMRegressor()


In [189]:
# 1
lr_reg.fit(X_train, y_train)
pred = lr_reg.predict(X_test)

evaluate_regr(y_test ,pred)

RMSE: 147.217, MAE: 108.096


In [210]:
# 2. atemp 지워보기

lr_reg.fit(X_train, y_train)
pred = lr_reg.predict(X_test)

evaluate_regr(y_test ,pred)

RMSE: 147.309, MAE: 108.217


In [283]:
# 3. year, month 컬럼 추가

lr_reg.fit(X_train, y_train)
pred = lr_reg.predict(X_test)

evaluate_regr(y_test ,pred)

RMSE: 139.978, MAE: 105.029


In [34]:
# 4. OneHot Encoding

lr_reg.fit(X_train, y_train)
pred = lr_reg.predict(X_test)

evaluate_regr(y_test ,pred)

RMSE: 104.044, MAE: 75.559


In [35]:
# 5. LGBMRegressor

lgbm_reg.fit(X_train, y_train)
pred = lgbm_reg.predict(X_test)

evaluate_regr(y_test ,pred)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 310
[LightGBM] [Info] Number of data points in the train set: 8708, number of used features: 53
[LightGBM] [Info] Start training from score 192.077745
RMSE: 49.573, MAE: 31.859


In [171]:
# 6. 1시부터 5시 통합.

lgbm_reg.fit(X_train, y_train)
pred = lgbm_reg.predict(X_test)

evaluate_regr(y_test ,pred)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 8708, number of used features: 49
[LightGBM] [Info] Start training from score 192.077745
RMSE: 48.005, MAE: 31.141


In [180]:
# 7. 0 미만으로 예측한 것은 0으로 변경

lgbm_reg.fit(X_train, y_train)
pred = lgbm_reg.predict(X_test)

for i in range(len(pred)):
    if pred[i] < 0:
        pred[i] = 0
        
evaluate_regr(y_test ,pred)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 302
[LightGBM] [Info] Number of data points in the train set: 8708, number of used features: 49
[LightGBM] [Info] Start training from score 192.077745
RMSE: 47.975, MAE: 31.050


In [126]:
# 8. 연도 빼고

lgbm_reg.fit(X_train, y_train)
pred = lgbm_reg.predict(X_test)

evaluate_regr(y_test ,pred)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 240
[LightGBM] [Info] Number of data points in the train set: 8708, number of used features: 46
[LightGBM] [Info] Start training from score 192.077745
RMSE: 69.024, MAE: 47.638


### 6. 추가) Linear Regression / LGBM Regressor 차이 시각화

In [36]:
pd.DataFrame(data=[y_test, pred])

Unnamed: 0,0,1,2,3,4,5,...,2172,2173,2174,2175,2176,2177
0,285.0,138.0,215.0,184.0,24.0,153.0,...,270.0,469.0,253.0,58.0,76.0,4.0
1,256.609859,140.785171,206.829941,212.559278,3.20207,175.158484,...,218.907411,569.931774,303.265579,66.509515,83.166625,52.755901


In [37]:
df_plot = pd.DataFrame({'y_test': y_test, 'pred': pred})

In [38]:
df_plot

Unnamed: 0,y_test,pred
0,285,256.609859
1,138,140.785171
2,215,206.829941
3,184,212.559278
4,24,3.202070
...,...,...
2173,469,569.931774
2174,253,303.265579
2175,58,66.509515
2176,76,83.166625


- Linear Regression 예측 결과 Scatter Plot

In [387]:
import plotly.express as px
import plotly.graph_objects as go


fig = px.scatter(df_plot, x = y_test, y = df_plot.index, width=1000, height=600)

fig.add_trace(go.Scatter(x = df_plot.pred, y = df_plot.index, mode='markers'))

fig.show()

- LGBM Regressor 예측 결과 Scatter Plot

In [85]:
import plotly.express as px
import plotly.graph_objects as go


fig = px.scatter(df_plot, x = y_test, y = df_plot.index, width=1000, height=600)

fig.add_trace(go.Scatter(x = df_plot.pred, y = df_plot.index, mode='markers'))

fig.show()