## 자전거 대여 수요 예측
### 경진대회 이해
- https://www.kaggle.com/competitions/bike-sharing-demand
- 경진대회명 : Bike Sharing Demand (자전거 대여 수요 예측)
- 미션 : 날짜, 계절, 근무일 여부, 날씨, 온도, 체감 온도, 풍속 데이터를 활용하여 자전거 대여 수량 예측
- 문제 유형 : 회귀
- 평가지표 : RMSLE

#### 6.3 탐색적 데이터 분석

##### 6.3.2 데이터 둘러보기
- datetime - hourly date + timestamp  (1시간 간격)
- season 
    - 1 = spring
    - 2 = summer
    - 3 = fall
    - 4 = winter 
- holiday - whether the day is considered a holiday (0: 공휴일x, 1: 공휴일)
- workingday - whether the day is neither a weekend nor holiday (0: 근무일x, 1: 근무일)
- weather
    - 1: Clear, Few clouds, Partly cloudy, Partly cloudy (맑음)
    - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist (옅은 안개, 약간 흐림)
    - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds (약간의 눈, 약간의 비)
    - 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog (폭우, 천둥 번개)
- temp - temperature in Celsius
- atemp - "feels like" temperature in Celsius
- humidity - relative humidity
- windspeed - wind speed
- casual - number of non-registered user rentals initiated (비회원 수)
- registered - number of registered user rentals initiated (회원 수)
- count - number of total rentals

In [4]:
import numpy as np
import pandas as pd

data_path = 'C:/Users/hyebin.hyebin/Desktop/python/kaggle_dataset/bike-sharing-demand/'

train = pd.read_csv(data_path + 'train.csv')
test = pd.read_csv(data_path + 'test.csv')
submission = pd.read_csv(data_path + 'sampleSubmission.csv')

In [5]:
train.shape, test.shape

((10886, 12), (6493, 9))

In [6]:
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [7]:
test.head() # 테스트 데이터에는 casual, registered 피처 없음 -> 모델 훈련 시에도 두 피처는 제외해야함

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [8]:
submission.head() # datetime은 ID역할만 하고, 타깃값을 예측하는 데에는 도움x, 모델 훈련 시 해당 피처 제거

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,0
1,2011-01-20 01:00:00,0
2,2011-01-20 02:00:00,0
3,2011-01-20 03:00:00,0
4,2011-01-20 04:00:00,0


In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [10]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6493 entries, 0 to 6492
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    6493 non-null   object 
 1   season      6493 non-null   int64  
 2   holiday     6493 non-null   int64  
 3   workingday  6493 non-null   int64  
 4   weather     6493 non-null   int64  
 5   temp        6493 non-null   float64
 6   atemp       6493 non-null   float64
 7   humidity    6493 non-null   int64  
 8   windspeed   6493 non-null   float64
dtypes: float64(3), int64(5), object(1)
memory usage: 456.7+ KB


##### 6.3.3 더 효과적인 분석을 위한 피처 엔지니어링
- 시각화하기에 적합하지 않은 형태의 데이터를 변환

In [13]:
# datetime 확인
print(train['datetime'][100])
print(train['datetime'][100].split())
print(train['datetime'][100].split()[0]) # 일자
print(train['datetime'][100].split()[1]) # 시간

2011-01-05 09:00:00
['2011-01-05', '09:00:00']
2011-01-05
09:00:00


In [16]:
print(train['datetime'][100].split()[0]) # 일자
print(train['datetime'][100].split()[0].split('-')) 
print(train['datetime'][100].split()[0].split('-')[0]) # 연도
print(train['datetime'][100].split()[0].split('-')[1]) # 월
print(train['datetime'][100].split()[0].split('-')[2]) # 일

2011-01-05
['2011', '01', '05']
2011
01
05


In [21]:
print(train['datetime'][100].split()[1]) # 시간
print(train['datetime'][100].split()[1].split(':')) 
print(train['datetime'][100].split()[1].split(':')[0]) # 시간
print(train['datetime'][100].split()[1].split(':')[1]) # 분
print(train['datetime'][100].split()[1].split(':')[2]) # 초

09:00:00
['09', '00', '00']
09
00
00


In [25]:
# 파생 피처, 파생 변수 생성

train['date'] = train['datetime'].apply(lambda x: x.split()[0]) # 일자

train['year'] = train['datetime'].apply(lambda x: x.split()[0].split('-')[0]) # 연도
train['month'] = train['datetime'].apply(lambda x: x.split()[0].split('-')[1]) # 월
train['day'] = train['datetime'].apply(lambda x: x.split()[0].split('-')[2]) # 일

train['hour'] = train['datetime'].apply(lambda x: x.split()[1].split(':')[0]) # 시간
train['minute'] = train['datetime'].apply(lambda x: x.split()[1].split(':')[1]) # 분
train['second'] = train['datetime'].apply(lambda x: x.split()[1].split(':')[2]) # 초

In [26]:
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,date,year,month,day,hour,minute,second
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011-01-01,2011,1,1,0,0,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011-01-01,2011,1,1,1,0,0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011-01-01,2011,1,1,2,0,0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011-01-01,2011,1,1,3,0,0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011-01-01,2011,1,1,4,0,0


In [30]:
from datetime import datetime
import calendar

print(train['date'][100]) # 일자
print(datetime.strptime(train['date'][100], '%Y-%m-%d')) # datetime 타입으로 변경

# 정수로 요일 반환. 0: 월, 1: 화, 2: 수, ...
print(datetime.strptime(train['date'][100], '%Y-%m-%d').weekday()) 

# 문자열로 요일 반환
print(calendar.day_name[datetime.strptime(train['date'][100], '%Y-%m-%d').weekday()]) 

2011-01-05
2011-01-05 00:00:00
2
Wednesday


In [31]:
train['weekday'] = train['date'].apply(lambda dateString: calendar.day_name[datetime.strptime(train['date'][100], '%Y-%m-%d').weekday()])

In [32]:
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,date,year,month,day,hour,minute,second,weekday
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011-01-01,2011,1,1,0,0,0,Wednesday
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011-01-01,2011,1,1,1,0,0,Wednesday
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011-01-01,2011,1,1,2,0,0,Wednesday
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011-01-01,2011,1,1,3,0,0,Wednesday
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011-01-01,2011,1,1,4,0,0,Wednesday


In [33]:
train['season'] = train['season'].map({1: 'Spring', 2: 'Summer', 3: 'Fall', 4: 'Winter'})
train['weather'] = train['weather'].map({1: 'Clear', 2: 'Mist, Few clouds', 3: 'Light Snow, Rain, Thunderstorm', 4: 'Heavy Rain, Thunderstorm, Snow, Fog'})

In [37]:
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,date,year,month,day,hour,minute,second,weekday
0,2011-01-01 00:00:00,Spring,0,0,Clear,9.84,14.395,81,0.0,3,13,16,2011-01-01,2011,1,1,0,0,0,Wednesday
1,2011-01-01 01:00:00,Spring,0,0,Clear,9.02,13.635,80,0.0,8,32,40,2011-01-01,2011,1,1,1,0,0,Wednesday
2,2011-01-01 02:00:00,Spring,0,0,Clear,9.02,13.635,80,0.0,5,27,32,2011-01-01,2011,1,1,2,0,0,Wednesday
3,2011-01-01 03:00:00,Spring,0,0,Clear,9.84,14.395,75,0.0,3,10,13,2011-01-01,2011,1,1,3,0,0,Wednesday
4,2011-01-01 04:00:00,Spring,0,0,Clear,9.84,14.395,75,0.0,0,1,1,2011-01-01,2011,1,1,4,0,0,Wednesday
