In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib

## 데이터 준비

In [None]:
# -------------------------------------------------
# Load the data
# -------------------------------------------------
# The 'datetime' column is parsed into real timestamps at read time so that
# the `.dt` accessor can be used on it later.
df = pd.read_csv(
    filepath_or_buffer='data/bike_sharing_demand.csv',
    parse_dates=['datetime'],
)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,hour,dayofweek
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,0,5
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1,5
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,2,5
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,3,5
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,4,5


## 전처리

### 파생컬럼 추가

In [None]:
# --------------------------------------------
# Derived columns: year, month, hour, day of week
# --------------------------------------------
# Each name below is both an attribute of the `.dt` accessor and the name of
# the column written to `df`. For dayofweek: Monday is 0, Sunday is 6.
timestamps = df['datetime'].dt
for part in ('year', 'month', 'hour', 'dayofweek'):
    df[part] = getattr(timestamps, part)

df.head()

### 변수선택

In [6]:
# --------------------------------------------
# Choose the independent variables (X) and the dependent variable (y)
# --------------------------------------------

# Columns that must not be used as independent variables.
del_cols = ['datetime', 'casual', 'registered', 'count', 'temp']

# Target: the 'count' column.
y = df['count']

# Features: every remaining column, copied so that later edits to X
# do not touch df.
X = df.drop(columns=del_cols).copy()

X.head()

Unnamed: 0,season,holiday,workingday,weather,atemp,humidity,windspeed,year,month,hour,dayofweek
0,1,0,0,1,14.395,81,0.0,2011,1,0,5
1,1,0,0,1,13.635,80,0.0,2011,1,1,5
2,1,0,0,1,13.635,80,0.0,2011,1,2,5
3,1,0,0,1,14.395,75,0.0,2011,1,3,5
4,1,0,0,1,14.395,75,0.0,2011,1,4,5


In [7]:
# ------------------------------------------------
# Independent variables grouped by kind:
# categorical, numeric and cyclic
# ------------------------------------------------
# Categorical columns.
cat_cols = [
    'season',
    'holiday',
    'workingday',
    'weather',
]

# Numeric columns.
num_cols = [
    'atemp',
    'humidity',
    'windspeed',
    'year',
]

# Cyclic columns.
cycle_cols = [
    'month',
    'hour',
    'dayofweek',
]

### 순환형 데이터 인코딩

In [9]:
# Length of one full cycle for each cyclic feature. The key order fixes the
# order in which the new columns are appended (hour, month, dayofweek).
CYCLE_PERIODS = {'hour': 24, 'month': 12, 'dayofweek': 7}


def add_cyclic_features(frame, periods):
    """Return a copy of ``frame`` with sine/cosine encodings appended.

    For every ``column -> period`` entry in ``periods`` two columns are
    added, ``<column>_sin`` and ``<column>_cos``, holding the sine and
    cosine of ``2 * pi * value / period``. The source columns are kept.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain every column named in ``periods``.
    periods : dict
        Maps a column name to the length of its cycle.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is not modified.

    Raises
    ------
    KeyError
        If a column named in ``periods`` is missing from ``frame``.
    """
    encoded = frame.copy()
    for col, period in periods.items():
        angle = 2 * np.pi * encoded[col] / period
        encoded[f'{col}_sin'] = np.sin(angle)
        encoded[f'{col}_cos'] = np.cos(angle)
    return encoded


# Re-running this cell is safe: the same columns are simply recomputed.
X = add_cyclic_features(X, CYCLE_PERIODS)


### 훈련세트/테스트세트 분할

In [10]:
# Split into a training set and a test set.
from sklearn.model_selection import train_test_split

# No test_size is given, so scikit-learn's default split ratio applies;
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# Show the shape of each of the four pieces on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(8164, 17) (2722, 17) (8164,) (2722,)


### 전처리기 구성
* 파이프라인 
    * 데이터 처리 과정을 순차적으로 연결하여 자동화하는 도구
* ColumnTransformer
    * 열별 전처리 파이프라인 구성

In [11]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# -----------------------------
# ColumnTransformer setup
#   - "cat" : categorical columns -> OneHotEncoder
#   - "num" : numeric columns     -> StandardScaler
#   remainder="passthrough" keeps every column that is not listed in
#   `transformers` unchanged; switch it to "drop" to discard them instead.
# -----------------------------
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
numeric_step = ("num", StandardScaler(), num_cols)

preprocessor = ColumnTransformer(
    transformers=[categorical_step, numeric_step],
    remainder="passthrough",
)

### 파이프라인 구성

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# --------------------------------------------
# Define the pipeline: preprocessing followed by the regressor
# --------------------------------------------
# random_state is fixed (same seed as the train/test split) so that the
# fitted forest, the metrics printed below and the saved model are
# reproducible on a fresh kernel. Without it every run gives different numbers.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# --------------------------------------------
# Train
# --------------------------------------------
pipeline.fit(X_train, y_train)

# --------------------------------------------
# Predict on the held-out test set
# --------------------------------------------
pred = pipeline.predict(X_test)

# --------------------------------------------
# Evaluate model performance
# --------------------------------------------
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error
r2 = r2_score(y_test, pred)
rmse = root_mean_squared_error(y_test, pred)
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)

print(f'r2:{r2}')
print(f'rmse:{rmse}')
print(f'mae:{mae}')
print(f'mse:{mse}')

# --------------------------------------------
# Check for overfitting
# --------------------------------------------
# A train score far above the test score points to overfitting.
print('train>>>>>>>>>>>>', pipeline.score(X_train, y_train))
print('test>>>>>>>>>>>>>', pipeline.score(X_test, y_test))

r2:0.9533745773604204
rmse:39.1148116187644
mae:24.450800146950776
mse:1529.9684879714264
train>>>>>>>>>>>> 0.9922214138812906
test>>>>>>>>>>>>> 0.9533745773604204


### 파이프라인 저장 및 예측


In [13]:
import os

import joblib

# ----------------------
# Save the pipeline
# ----------------------
MODEL_PATH = "model/bike_rent_pipe.pkl"

# joblib.dump does not create missing directories, so make sure the target
# folder exists first; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

joblib.dump(pipeline, MODEL_PATH)

['model/bike_rent_pipe.pkl']

In [22]:
# ------------------
# Predict
# ------------------
import joblib

# Load the pipeline saved earlier in this notebook so the cell also works on a
# fresh kernel. joblib files are pickles: only load files you created yourself
# or otherwise trust.
loaded_pipe = joblib.load("model/bike_rent_pipe.pkl")

# NOTE(review): year=2025 is later than the 2011 rows shown in the data
# preview; confirm the model is meant to be used outside its training years.
new_row = {
    "season": 2, "holiday": 0, "workingday": 1, "weather": 3,
    "atemp": 20.5, "humidity": 55, "windspeed": 0.12,
    "year": 2025, "month": 5, "day": 1, "hour": 17, "dayofweek": 3
}

new_df = pd.DataFrame([new_row])

# The pipeline was fitted on a frame that already contained the sine/cosine
# encodings, so they have to be rebuilt here with the same formula and periods.
for col, period in {"hour": 24, "month": 12, "dayofweek": 7}.items():
    angle = 2 * np.pi * new_df[col] / period
    new_df[f"{col}_sin"] = np.sin(angle)
    new_df[f"{col}_cos"] = np.cos(angle)

# Keep exactly the columns seen during fit, in the same order. This drops
# extras such as "day" and raises a KeyError if a required column is missing.
new_df = new_df[list(loaded_pipe.feature_names_in_)]

# Inside the pipeline: one-hot encoding -> scaling -> prediction.
count_pred = loaded_pipe.predict(new_df)[0]
print(f"예상 대여 수: {count_pred:.0f}대")


예상 대여 수: 546대
