# 수치예측 - 자전거 대여 수요

---

아래는 일시(날짜·시간)별 자전거 대여 수요를 기록한 데이터이다.
- 일시별 자전거 대여 수요(count)를 예측하는 모델을 구성하시오.

In [None]:
# Visual Python: Data Analysis > Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Visual Python: Data Analysis > File
# Load the bike-rental dataset from the local ./data directory into a DataFrame.
df = pd.read_csv('./data/bike.csv')
# Bare expression so the notebook renders the frame as the cell output.
df

In [None]:
# Visual Python: Data Analysis > Data Info
# Print a summary of df: column names, dtypes and non-null counts.
df.info()

In [None]:
# Visual Python: Data Analysis > Data Info
# Show descriptive statistics (count, mean, std, min, quartiles, max) for df's columns.
df.describe()

In [None]:
# Visual Python: Data Analysis > Data Info
# Per-column table of missing vs. present values, to check whether any
# imputation is needed before modelling.
pd.DataFrame({'Null Count': df.isnull().sum(), 'Non-Null Count': df.notnull().sum()})

#### 날짜 타입 데이터 생성

In [None]:
# Visual Python: Data Analysis > Frame
# Parse the raw 'datetime' column into real timestamps, then expand it into
# separate calendar/time features that a tree model can split on.
df['datetime'] = pd.to_datetime(df['datetime'])

timestamp_parts = df['datetime'].dt
# Order matters only for the resulting column order of df.
for part_name in ('year', 'month', 'day', 'dayofweek', 'hour'):
    df[part_name] = getattr(timestamp_parts, part_name)

# Bare expression so the notebook renders the updated frame.
df

#### 컬럼 삭제

In [None]:
# Visual Python: Data Analysis > Frame
# Remove columns that must not be used as model inputs. 'datetime' has already
# been expanded into year/month/day/dayofweek/hour above.
# NOTE(review): 'casual' and 'registered' presumably add up to the target
# 'count' (target leakage) -- confirm against the dataset description.
df.drop(columns=['datetime', 'casual', 'registered'], inplace=True)
# Bare expression so the notebook renders the reduced frame.
df

#### 수치형 컬럼: Min-Max Scaling

In [None]:
# Visual Python: Machine Learning > Pipeline
# Rescale the continuous weather columns with a Min-Max scaler.
# `scaler` keeps the fitted min/max; `trans` holds the scaled values as an
# array whose columns follow the selection order below.
# NOTE(review): the scaler is fit on the whole frame, i.e. before the
# train/test split done later in the notebook, so test rows influence the
# scaling parameters.
from sklearn.preprocessing import MinMaxScaler

numeric_frame = df[['temp', 'atemp', 'humidity', 'windspeed']]

scaler = MinMaxScaler()

# Fit and transform in one step on the same data.
trans = scaler.fit_transform(numeric_frame)

In [None]:
# Columns that were passed to the MinMaxScaler in the previous cell; the order
# here must match the column order of `trans`.
cols = ['temp', 'atemp', 'humidity', 'windspeed']

# Overwrite the raw values in df with their scaled counterparts.
df[cols] = trans

#### count 예측

In [None]:
# Visual Python: Machine Learning > Pipeline
# Train a random-forest regressor to predict 'count' and report hold-out
# metrics. Both the split and the forest are seeded so that re-running the
# cell reproduces the same scores and plot.
RANDOM_STATE = 42

# [1] Data Split
from sklearn.model_selection import train_test_split

feature_cols = [
    'season', 'holiday', 'workingday', 'weather',
    'temp', 'atemp', 'humidity', 'windspeed',
    'year', 'month', 'day', 'dayofweek', 'hour',
]

# test_size is left at scikit-learn's default.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols], df['count'], random_state=RANDOM_STATE
)

# [2] Regressor
from sklearn.ensemble import RandomForestRegressor

# Seeded: bootstrap sampling and feature sub-sampling are otherwise random.
model = RandomForestRegressor(random_state=RANDOM_STATE)

# [3] Fit
model.fit(X_train, y_train)

# [4] Predict
pred = model.predict(X_test)

# [5] Evaluation
from sklearn import metrics
from IPython.display import display, Markdown
# R squared
print('R squared: {}'.format(metrics.r2_score(y_test, pred)))
# MAE(Mean Absolute Error)
print('MAE: {}'.format(metrics.mean_absolute_error(y_test, pred)))
# RMSE(Root Mean Squared Error): square root of the MSE
print('RMSE: {}'.format(metrics.mean_squared_error(y_test, pred)**0.5))
# Regression plot: actual vs. predicted values on the hold-out set
display(Markdown('### Regression plot'))
plt.scatter(y_test, pred)
plt.xlabel('y_test')
plt.ylabel('pred')
plt.show()

In [None]:
# Visual Python: Machine Learning > Model Info
def vp_create_feature_importances(model, X_train=None, sort=False):
    """Build a table of a fitted model's feature importances.

    Args:
        model: Object exposing a ``feature_importances_`` sequence.
        X_train: When this is a DataFrame its column labels name the rows;
            for anything else the rows are named ``X0``, ``X1``, ...
        sort: When true, rows are ordered by descending importance.

    Returns:
        DataFrame indexed by feature name with the columns
        ``Feature_importance`` and ``Percentage`` (importance times 100),
        both rounded to two decimals.
    """
    importances = model.feature_importances_

    if isinstance(X_train, pd.DataFrame):
        labels = X_train.columns
    else:
        # No usable column names: fall back to positional placeholders.
        labels = ['X{}'.format(position) for position, _ in enumerate(importances)]

    table = pd.DataFrame(importances, index=labels, columns=['Feature_importance'])
    table['Percentage'] = 100 * table['Feature_importance']

    if sort:
        # Sort on the unrounded values, then round for display.
        table = table.sort_values(by='Feature_importance', ascending=False)

    return table.round(2)
def vp_plot_feature_importances(model, X_train=None, sort=False, top_count=0):
    """Draw a horizontal bar chart of feature-importance percentages.

    Args:
        model: Object exposing a ``feature_importances_`` sequence.
        X_train: Optional DataFrame whose columns label the bars.
        sort: When true, bars are ordered so the largest percentage is
            drawn last (at the top of a horizontal bar chart).
        top_count: With ``sort`` enabled and a positive value, only that many
            of the largest percentages are plotted. Ignored when ``sort`` is
            false.
    """
    percentages = vp_create_feature_importances(model, X_train, sort)['Percentage']

    if sort:
        # Ascending order, so tail() keeps the largest entries.
        percentages = percentages.sort_values()
        if top_count > 0:
            percentages = percentages.tail(top_count)

    percentages.plot(kind='barh')
    plt.xlabel('Feature importance Percentage')
    plt.ylabel('Features')

    plt.show()

In [None]:
# Visual Python: Machine Learning > Model Info
# Plot the 10 largest feature-importance percentages of the fitted model,
# labelled with the training columns.
vp_plot_feature_importances(model, X_train, sort=True, top_count=10)

---

In [None]:
# End of file