In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the training data from the relative path 'train.csv'.
bicycle = pd.read_csv('train.csv')
# Preview the first rows of the loaded frame.
bicycle.head()

Unnamed: 0,date_time,wind_direction,sky_condition,precipitation_form,wind_speed,humidity,low_temp,high_temp,Precipitation_Probability,number_of_rentals
0,2018-04-01,207.5,4.0,0.0,3.05,75.0,12.6,21.0,30.0,22994
1,2018-04-02,208.317,2.95,0.0,3.278,69.833,12.812,19.0,19.5,28139
2,2018-04-03,213.516,2.911,0.0,2.69,74.879,10.312,15.316,19.113,26817
3,2018-04-04,143.836,3.692,0.425,3.138,71.849,8.312,12.368,43.493,26034
4,2018-04-05,95.905,4.0,0.723,3.186,73.784,5.875,10.421,63.378,2833


In [2]:
def seperate_datetime(dataframe, base_year=2017, base_month=3):
    """Split the ``date_time`` column into offset year, offset month and day lists.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``date_time`` column holding 'YYYY-MM-DD' strings or
        datetime-like values.
    base_year : int, default 2017
        Value subtracted from each calendar year (2018 -> 1 with the default).
    base_month : int, default 3
        Value subtracted from each calendar month (April -> 1 with the default).

    Returns
    -------
    tuple of (list of int, list of int, list of int)
        ``(year, month, day)`` with one entry per row, in row order.

    Raises
    ------
    ValueError
        If any ``date_time`` entry is missing or cannot be parsed as a date.
    """
    # Let pandas parse the dates instead of splitting on '-' by hand, so that
    # datetime64 columns and strings with a time part ('2018-04-01 00:00:00')
    # are handled as well as plain 'YYYY-MM-DD' strings.
    parsed = pd.to_datetime(dataframe['date_time'])
    if parsed.isna().any():
        raise ValueError('date_time contains missing or unparseable values')

    # .tolist() converts to plain Python ints, matching the original return type.
    year = (parsed.dt.year - base_year).tolist()
    month = (parsed.dt.month - base_month).tolist()
    day = parsed.dt.day.tolist()
    return year, month, day

# Derive the calendar features from date_time and attach them as new columns.
year, month, day = seperate_datetime(bicycle)
bicycle = bicycle.assign(year=year, month=month, day=day)

bicycle.head()

Unnamed: 0,date_time,wind_direction,sky_condition,precipitation_form,wind_speed,humidity,low_temp,high_temp,Precipitation_Probability,number_of_rentals,year,month,day
0,2018-04-01,207.5,4.0,0.0,3.05,75.0,12.6,21.0,30.0,22994,1,1,1
1,2018-04-02,208.317,2.95,0.0,3.278,69.833,12.812,19.0,19.5,28139,1,1,2
2,2018-04-03,213.516,2.911,0.0,2.69,74.879,10.312,15.316,19.113,26817,1,1,3
3,2018-04-04,143.836,3.692,0.425,3.138,71.849,8.312,12.368,43.493,26034,1,1,4
4,2018-04-05,95.905,4.0,0.723,3.186,73.784,5.875,10.421,63.378,2833,1,1,5


# 예측

In [3]:
# Features: every column except the raw date string and the target.
X = bicycle.drop(columns=['date_time', 'number_of_rentals'])
# Target: the rental count.
y = bicycle['number_of_rentals']
X

Unnamed: 0,wind_direction,sky_condition,precipitation_form,wind_speed,humidity,low_temp,high_temp,Precipitation_Probability,year,month,day
0,207.500,4.000,0.000,3.050,75.000,12.600,21.000,30.000,1,1,1
1,208.317,2.950,0.000,3.278,69.833,12.812,19.000,19.500,1,1,2
2,213.516,2.911,0.000,2.690,74.879,10.312,15.316,19.113,1,1,3
3,143.836,3.692,0.425,3.138,71.849,8.312,12.368,43.493,1,1,4
4,95.905,4.000,0.723,3.186,73.784,5.875,10.421,63.378,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...
268,228.662,3.980,0.223,2.271,78.378,20.500,27.526,36.486,3,3,26
269,207.770,2.865,0.081,1.794,78.412,20.812,28.842,21.081,3,3,27
270,282.568,1.730,0.000,1.820,72.736,21.000,29.053,7.297,3,3,28
271,137.027,2.257,0.088,2.043,70.473,19.625,26.000,15.541,3,3,29


In [4]:
# metric 정의

import numpy as np

def NMAE(true, pred):
    """Return the normalized mean absolute error, ``mean(|true - pred| / true)``.

    Parameters
    ----------
    true : array-like
        Observed values; also used as the denominator, so a zero entry
        produces ``inf``/``nan`` in the result.
    pred : array-like
        Predicted values, matched to ``true`` by position.

    Returns
    -------
    numpy.float64
        The mean of the element-wise relative absolute errors.
    """
    # Convert to plain arrays first: two pandas Series with different indexes
    # would otherwise be aligned by label (not position) when subtracted, and
    # Python lists would not support the arithmetic at all.
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    score = np.mean(np.abs(true - pred) / true)
    return score

In [5]:
from sklearn.linear_model import LinearRegression

# Baseline: define and fit a linear regression on the raw features.
model = LinearRegression().fit(X, y)

# Predict on the same rows the model was fitted on, so the score below is in-sample.
y_hat = model.predict(X)
score = NMAE(y, y_hat)

print(f'모델 NMAE: {score}')

모델 NMAE: 0.32347975212014557


In [6]:
# Approach 1 uses hand-crafted (human-designed) features.
X_human = X.copy()

In [7]:
from sklearn.preprocessing import LabelEncoder

# Add the day of the week as an integer-coded feature.
# NOTE(review): LabelEncoder numbers its classes in sorted order, so the day
# names are coded alphabetically rather than in calendar order.
week_day = pd.to_datetime(bicycle['date_time']).dt.day_name()
le = LabelEncoder()
X_human['week_day'] = le.fit_transform(week_day)
X_human.head()

Unnamed: 0,wind_direction,sky_condition,precipitation_form,wind_speed,humidity,low_temp,high_temp,Precipitation_Probability,year,month,day,week_day
0,207.5,4.0,0.0,3.05,75.0,12.6,21.0,30.0,1,1,1,3
1,208.317,2.95,0.0,3.278,69.833,12.812,19.0,19.5,1,1,2,1
2,213.516,2.911,0.0,2.69,74.879,10.312,15.316,19.113,1,1,3,5
3,143.836,3.692,0.425,3.138,71.849,8.312,12.368,43.493,1,1,4,6
4,95.905,4.0,0.723,3.186,73.784,5.875,10.421,63.378,1,1,5,4


In [8]:
X_human = X_human.assign(
    # 1. Signal for days with a very large daily temperature range.
    temp_diff_info=X_human['high_temp'] - X_human['low_temp'],
    # 2. Signal for hot and humid weather.
    sweat_info=X_human['high_temp'] * X_human['humidity'],
    # 3. Signal for cold and windy weather.
    cold_info=X_human['low_temp'] * X_human['wind_speed'],
)

X_human.head()

Unnamed: 0,wind_direction,sky_condition,precipitation_form,wind_speed,humidity,low_temp,high_temp,Precipitation_Probability,year,month,day,week_day,temp_diff_info,sweat_info,cold_info
0,207.5,4.0,0.0,3.05,75.0,12.6,21.0,30.0,1,1,1,3,8.4,1575.0,38.43
1,208.317,2.95,0.0,3.278,69.833,12.812,19.0,19.5,1,1,2,1,6.188,1326.827,41.997736
2,213.516,2.911,0.0,2.69,74.879,10.312,15.316,19.113,1,1,3,5,5.004,1146.846764,27.73928
3,143.836,3.692,0.425,3.138,71.849,8.312,12.368,43.493,1,1,4,6,4.056,888.628432,26.083056
4,95.905,4.0,0.723,3.186,73.784,5.875,10.421,63.378,1,1,5,4,4.546,768.903064,18.71775


In [9]:
## Check the performance of model 1 (hand-crafted features)

# Define and fit the model in one step; fit() returns the fitted estimator.
model = LinearRegression().fit(X_human, y)

# Predict on the same rows the model was fitted on, so the score below is in-sample.
y_hat = model.predict(X_human)
score = NMAE(y, y_hat)

print(f'모델 NMAE: {score}')

모델 NMAE: 0.28426261278332005


# 컴퓨터가 연산

In [10]:
# Approach 2 uses machine-generated features.
X_computer = X.copy()
X_computer.head()

Unnamed: 0,wind_direction,sky_condition,precipitation_form,wind_speed,humidity,low_temp,high_temp,Precipitation_Probability,year,month,day
0,207.5,4.0,0.0,3.05,75.0,12.6,21.0,30.0,1,1,1
1,208.317,2.95,0.0,3.278,69.833,12.812,19.0,19.5,1,1,2
2,213.516,2.911,0.0,2.69,74.879,10.312,15.316,19.113,1,1,3
3,143.836,3.692,0.425,3.138,71.849,8.312,12.368,43.493,1,1,4
4,95.905,4.0,0.723,3.186,73.784,5.875,10.421,63.378,1,1,5


In [11]:
# Work from the base (non-product) columns only. Product columns created by this
# cell always contain '*', so filtering on it makes the cell safe to re-run:
# it rebuilds the same frame instead of multiplying already-expanded columns.
col_list = pd.Index([col for col in X_computer.columns if '*' not in str(col)])
base_features = X_computer[col_list]

# For every pair of features (including a feature with itself, which gives its
# square) add their product as a new feature. All products are collected first
# and attached with a single concat, rather than inserting columns one at a time.
products = {
    f'{left}*{right}': base_features[left] * base_features[right]
    for pos, left in enumerate(col_list)
    for right in col_list[pos:]
}
X_computer = pd.concat(
    [base_features, pd.DataFrame(products, index=base_features.index)],
    axis=1,
)

X_computer

Unnamed: 0,wind_direction,sky_condition,precipitation_form,wind_speed,humidity,low_temp,high_temp,Precipitation_Probability,year,month,...,Precipitation_Probability*Precipitation_Probability,Precipitation_Probability*year,Precipitation_Probability*month,Precipitation_Probability*day,year*year,year*month,year*day,month*month,month*day,day*day
0,207.500,4.000,0.000,3.050,75.000,12.600,21.000,30.000,1,1,...,900.000000,30.000,30.000,30.000,1,1,1,1,1,1
1,208.317,2.950,0.000,3.278,69.833,12.812,19.000,19.500,1,1,...,380.250000,19.500,19.500,39.000,1,1,2,1,2,4
2,213.516,2.911,0.000,2.690,74.879,10.312,15.316,19.113,1,1,...,365.306769,19.113,19.113,57.339,1,1,3,1,3,9
3,143.836,3.692,0.425,3.138,71.849,8.312,12.368,43.493,1,1,...,1891.641049,43.493,43.493,173.972,1,1,4,1,4,16
4,95.905,4.000,0.723,3.186,73.784,5.875,10.421,63.378,1,1,...,4016.770884,63.378,63.378,316.890,1,1,5,1,5,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268,228.662,3.980,0.223,2.271,78.378,20.500,27.526,36.486,3,3,...,1331.228196,109.458,109.458,948.636,9,9,78,9,78,676
269,207.770,2.865,0.081,1.794,78.412,20.812,28.842,21.081,3,3,...,444.408561,63.243,63.243,569.187,9,9,81,9,81,729
270,282.568,1.730,0.000,1.820,72.736,21.000,29.053,7.297,3,3,...,53.246209,21.891,21.891,204.316,9,9,84,9,84,784
271,137.027,2.257,0.088,2.043,70.473,19.625,26.000,15.541,3,3,...,241.522681,46.623,46.623,450.689,9,9,87,9,87,841


In [12]:
## Check the performance of model 2 (machine-generated features)

# Define and fit the model in one step; fit() returns the fitted estimator.
model = LinearRegression().fit(X_computer, y)

# Predict on the same rows the model was fitted on, so the score below is in-sample.
y_hat = model.predict(X_computer)
score = NMAE(y, y_hat)

print(f'모델 NMAE: {score}')

모델 NMAE: 0.1401335572563925


# 컴퓨터 + 사람

In [13]:
# Approach 3 is approach 2 with human domain knowledge added on top.
X_combination = X.copy()

In [14]:
from sklearn.preprocessing import LabelEncoder

# Add the day of the week as an integer-coded feature.
# NOTE(review): LabelEncoder numbers its classes in sorted order, so the day
# names are coded alphabetically rather than in calendar order.
week_day = pd.to_datetime(bicycle['date_time']).dt.day_name()
le = LabelEncoder()
X_combination['week_day'] = le.fit_transform(week_day)

X_combination = X_combination.assign(
    # 1. Signal for days with a very large daily temperature range.
    temp_diff_info=X_combination['high_temp'] - X_combination['low_temp'],
    # 2. Signal for hot and humid weather.
    sweat_info=X_combination['high_temp'] * X_combination['humidity'],
    # 3. Signal for cold and windy weather.
    cold_info=X_combination['low_temp'] * X_combination['wind_speed'],
)

In [15]:
# Work from the base (non-product) columns only. Product columns created by this
# cell always contain '*', so filtering on it makes the cell safe to re-run:
# it rebuilds the same frame instead of multiplying already-expanded columns.
col_list = pd.Index([col for col in X_combination.columns if '*' not in str(col)])
base_features = X_combination[col_list]

# For every pair of variables (including a variable with itself, which gives its
# square) add their product as a new variable. All products are collected first
# and attached with a single concat, rather than inserting columns one at a time.
products = {
    f'{left}*{right}': base_features[left] * base_features[right]
    for pos, left in enumerate(col_list)
    for right in col_list[pos:]
}
X_combination = pd.concat(
    [base_features, pd.DataFrame(products, index=base_features.index)],
    axis=1,
)

X_combination

Unnamed: 0,wind_direction,sky_condition,precipitation_form,wind_speed,humidity,low_temp,high_temp,Precipitation_Probability,year,month,...,week_day*week_day,week_day*temp_diff_info,week_day*sweat_info,week_day*cold_info,temp_diff_info*temp_diff_info,temp_diff_info*sweat_info,temp_diff_info*cold_info,sweat_info*sweat_info,sweat_info*cold_info,cold_info*cold_info
0,207.500,4.000,0.000,3.050,75.000,12.600,21.000,30.000,1,1,...,9,25.200,4725.000000,115.290000,70.560000,13230.000000,322.812000,2.480625e+06,60527.250000,1476.864900
1,208.317,2.950,0.000,3.278,69.833,12.812,19.000,19.500,1,1,...,1,6.188,1326.827000,41.997736,38.291344,8210.405476,259.881990,1.760470e+06,55723.730064,1763.809829
2,213.516,2.911,0.000,2.690,74.879,10.312,15.316,19.113,1,1,...,25,25.020,5734.233820,138.696400,25.040016,5738.821207,138.807357,1.315258e+06,31812.703504,769.467655
3,143.836,3.692,0.425,3.138,71.849,8.312,12.368,43.493,1,1,...,36,24.336,5331.770592,156.498336,16.451136,3604.276920,105.792875,7.896605e+05,23178.145155,680.325810
4,95.905,4.000,0.723,3.186,73.784,5.875,10.421,63.378,1,1,...,16,18.184,3075.612256,74.871000,20.666116,3495.433329,85.090891,5.912119e+05,14392.135326,350.354165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268,228.662,3.980,0.223,2.271,78.378,20.500,27.526,36.486,3,3,...,0,0.000,0.000000,0.000000,49.364676,15158.123050,327.098943,4.654516e+06,100440.364024,2167.414580
269,207.770,2.865,0.081,1.794,78.412,20.812,28.842,21.081,3,3,...,4,16.060,4523.117808,74.673456,64.480900,18160.317999,299.813926,5.114649e+06,84439.209655,1394.031258
270,282.568,1.730,0.000,1.820,72.736,21.000,29.053,7.297,3,3,...,9,24.159,6339.597024,114.660000,64.850809,17017.591611,307.785660,4.465610e+06,80766.466086,1460.768400
271,137.027,2.257,0.088,2.043,70.473,19.625,26.000,15.541,3,3,...,1,6.375,1832.298000,40.093875,40.640625,11680.899750,255.598453,3.357316e+06,73463.926975,1607.518813


In [16]:
## Check the performance of model 3 (machine-generated + hand-crafted features)

# Define and fit the model in one step; fit() returns the fitted estimator.
model = LinearRegression().fit(X_combination, y)

# Predict on the same rows the model was fitted on, so the score below is in-sample.
y_hat = model.predict(X_combination)
score = NMAE(y, y_hat)

print(f'모델 NMAE: {score}')

모델 NMAE: 0.10278949917570272
