# [과제]
- 다중공선성, 과적합, 종속변수의 정규성 등을 고려하여 전처리 후 회귀분석 모델링, 학습 및 평가를 수행하세요.
- 모델링은 하이퍼 파라미터 옵션 포함하여 학습한 모든 방법 고려하여 수행

### imported modules

In [67]:
# basic module
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# sepearte module
from sklearn.model_selection import train_test_split

# machine Learning modules (Regression)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

# machine Learning modules (Regression-trees)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# machine Learning evaluation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Developing
from sklearn.model_selection import GridSearchCV

### data load

In [2]:
from sklearn.datasets import load_boston
house = load_boston()
house.keys()

house_df = pd.DataFrame(house.data, columns= house.feature_names)
house_df.to_pickle('../Data/house_df.pkl')
house_df = pd.read_pickle('../Data/house_df.pkl')
house_df['PRICE'] = house.target
house_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


### EDA

In [3]:
display(house_df.shape)
house_df.info()

(506, 14)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  PRICE    506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [4]:
hc = house_df.corr()
plt.figure(figsize=(10,10))
sns.heatmap(hc,annot=True)

<AxesSubplot:>

In [5]:
X = house_df.iloc[:,:-1]
y = house_df.iloc[:,-1]

### collinearity

In [6]:
import statsmodels.formula.api as smf
model = smf.ols(formula='PRICE ~ ' + '+'.join(house_df.columns),data=house_df).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 1.716e+30
Date:                Tue, 31 Aug 2021   Prob (F-statistic):               0.00
Time:                        10:01:34   Log-Likelihood:                 14873.
No. Observations:                 506   AIC:                        -2.972e+04
Df Residuals:                     491   BIC:                        -2.965e+04
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept  -2.301e-14   4.76e-14     -0.483      0.6

In [29]:
# Strong multicollinerity

In [30]:
X = house_df.drop(['DIS','PRICE'],axis=1)
y = house_df['PRICE']

### sepearte

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)

### Machine Learning models

#### Regression

In [32]:
# Linear_
lr = LinearRegression()

# Regulation_
ridge = Ridge()
lasso = Lasso()
ela = ElasticNet()

#### Regression Tree

In [33]:
dt = DecisionTreeRegressor()
rf = RandomForestRegressor()
gbr = GradientBoostingRegressor()
xgb = XGBRegressor()
lgb = LGBMRegressor()

### Fitting

In [34]:
models = [lr,ridge,lasso,ela,dt,rf,gbr,xgb,lgb]
for model in models:
    model.fit(X_train,y_train)

### predict

In [35]:
lr_pred = lr.predict(X_test)
ridge_pred = ridge.predict(X_test)
lasso_pred = lasso.predict(X_test)
ela_pred = ela.predict(X_test)

dt_pred = dt.predict(X_test)
rf_pred = rf.predict(X_test)
gbr_pred = gbr.predict(X_test)
xgb_pred = xgb.predict(X_test)
lgb_pred = lgb.predict(X_test)


### Evaluation

In [36]:
def evaluation(model_name,y_test,prediction,n_cv):
    mse = mean_squared_error(y_test,prediction)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test,prediction)
    np.set_printoptions(precision=6, suppress=True)
    print('[model]:',model_name,'\n')
    print('[MSE] : {:.4f}, [RMSE] : {:.4f}, [r2] : {:.4f}'.format(mse,rmse,r2))
    print('[Intercept]:', lr.intercept_)
    print('[Coef_]:', np.round(lr.coef_,2),'\n')
    
    neg_mse_score = cross_val_score(model_name, X, y,scoring='neg_mean_squared_error',cv=n_cv)
    rmse_score = np.sqrt(-1*neg_mse_score)
    avg_rmse = np.mean(rmse_score)
    print('5 folds independent Negative MSE scores:', np.round(neg_mse_score,2))
    print('5 folds indenpendent RMSE scores:', np.round(rmse_score,2))
    print('5 folds Mean RMSE: {:.3f}'.format(avg_rmse))
    print('---' * 30)

#### Linear_Regression evaulation

In [37]:
# a = [1,2,3,4,5]
# b = [6,7,8,9,10]
# for x,y in zip(a,b):
#     print(x,y)

In [38]:
models = [lr,ridge,lasso,ela]
predictions = [lr_pred,ridge_pred,lasso_pred,ela_pred]

for a,b in zip(models,predictions):
    evaluation(a, y_test, b, 5)

[model]: LinearRegression() 

[MSE] : 30.1148, [RMSE] : 5.4877, [r2] : 0.6383
[Intercept]: 25.063717601256283
[Coef_]: [-0.09  0.    0.11  2.63 -9.13  4.34  0.02  0.24 -0.01 -1.18  0.01 -0.48] 

5 folds independent Negative MSE scores: [-12.15 -25.62 -38.6  -86.86 -36.76]
5 folds indenpendent RMSE scores: [3.49 5.06 6.21 9.32 6.06]
5 folds Mean RMSE: 6.029
------------------------------------------------------------------------------------------
[model]: Ridge() 

[MSE] : 30.2871, [RMSE] : 5.5034, [r2] : 0.6363
[Intercept]: 25.063717601256283
[Coef_]: [-0.09  0.    0.11  2.63 -9.13  4.34  0.02  0.24 -0.01 -1.18  0.01 -0.48] 

5 folds independent Negative MSE scores: [-11.37 -26.1  -35.16 -86.1  -35.53]
5 folds indenpendent RMSE scores: [3.37 5.11 5.93 9.28 5.96]
5 folds Mean RMSE: 5.930
------------------------------------------------------------------------------------------
[model]: Lasso() 

[MSE] : 33.8173, [RMSE] : 5.8153, [r2] : 0.5939
[Intercept]: 25.063717601256283
[Coef_]: [-0

##### Linear_Regression

In [39]:
evaluation(lr,y_test,lr_pred,5)

[model]: LinearRegression() 

[MSE] : 30.1148, [RMSE] : 5.4877, [r2] : 0.6383
[Intercept]: 25.063717601256283
[Coef_]: [-0.09  0.    0.11  2.63 -9.13  4.34  0.02  0.24 -0.01 -1.18  0.01 -0.48] 

5 folds independent Negative MSE scores: [-12.15 -25.62 -38.6  -86.86 -36.76]
5 folds indenpendent RMSE scores: [3.49 5.06 6.21 9.32 6.06]
5 folds Mean RMSE: 6.029
------------------------------------------------------------------------------------------


##### regulation_Linear_regression

In [40]:
def reg_evaluation(model_name, params=None, X_data_n=None, y_target_n=None,verbose=True):
    coeff_df = pd.DataFrame()
    if verbose : print('[model] :' ,model_name , '\n')
    for param in params:
        if model_name == 'Ridge': model = Ridge(alpha=param)
        elif model_name == 'Lasso': model = Lasso(alpha=param)
        elif model_name == 'ElasticNet': model = ElasticNet(alpha=param, l1_ratio=0.7)
        
        neg_mean_scores = cross_val_score(model,X_data_n,y_target_n, scoring='neg_mean_squared_error',cv=5)
        avg_rmse = np.mean(np.sqrt(-1*neg_mean_scores))
        
        print('alpha : {}  (5 folds) RMSE means : {:.4f}'.format(param,avg_rmse))
        model.fit(X_data_n, y_target_n)
        
        coeff = pd.Series(data=model.coef_, index=X_data_n.columns)
        colname= 'alpha:' + str(param)
        coeff_df[colname] = coeff
    return coeff_df

###### ridge

In [41]:
alphas = [0,0.1,1,10,100]
print('[model]: Ridge() \n')
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    neg_mean_score = cross_val_score(ridge,X,y,scoring="neg_mean_squared_error",cv =5)
    avg_rmse=np.mean(np.sqrt(-1*neg_mean_score))
    print('alpha : {} | (5 folds) RMSE means : {:.4f}'.format(alpha,avg_rmse))
    print('---' * 20)

[model]: Ridge() 

alpha : 0 | (5 folds) RMSE means : 6.0285
------------------------------------------------------------
alpha : 0.1 | (5 folds) RMSE means : 6.0116
------------------------------------------------------------
alpha : 1 | (5 folds) RMSE means : 5.9298
------------------------------------------------------------
alpha : 10 | (5 folds) RMSE means : 5.7884
------------------------------------------------------------
alpha : 100 | (5 folds) RMSE means : 5.6032
------------------------------------------------------------


###### Lasso & ElasticNet

In [42]:
lasso_alphas = [0.07,0.1,0.5,1,3]
coeff_lasso_df = reg_evaluation('Lasso',params=lasso_alphas,X_data_n=X,y_target_n=y)
sort_column ='alpha:' + str(lasso_alphas[0])
coeff_lasso_df.sort_values(by=sort_column, ascending=False)

[model] : Lasso 

alpha : 0.07  (5 folds) RMSE means : 5.8775
alpha : 0.1  (5 folds) RMSE means : 5.8857
alpha : 0.5  (5 folds) RMSE means : 5.9157
alpha : 1  (5 folds) RMSE means : 5.9544
alpha : 3  (5 folds) RMSE means : 6.1888


Unnamed: 0,alpha:0.07,alpha:0.1,alpha:0.5,alpha:1,alpha:3
RM,4.074027,3.982371,2.682478,1.071938,0.0
CHAS,1.596071,1.124633,0.0,0.0,0.0
RAD,0.291803,0.294006,0.278107,0.260734,0.061902
INDUS,0.060858,0.060736,0.014954,0.0,-0.0
AGE,0.019323,0.020587,0.031913,0.041847,0.042495
ZN,0.011932,0.01223,0.016022,0.02428,0.037231
B,0.010241,0.010232,0.009323,0.008107,0.006511
NOX,-0.0,-0.0,-0.0,-0.0,0.0
TAX,-0.013911,-0.014046,-0.013163,-0.013111,-0.008603
CRIM,-0.077231,-0.077264,-0.067989,-0.052772,-0.0


In [43]:
ela_alphas = [0.07,0.1,0.5,1,3]
coeff_lasso_df = reg_evaluation('ElasticNet',params=ela_alphas,X_data_n=X,y_target_n=y)
sort_column ='alpha:' + str(ela_alphas[0])
coeff_lasso_df.sort_values(by=sort_column, ascending=False)

[model] : ElasticNet 

alpha : 0.07  (5 folds) RMSE means : 5.8047
alpha : 0.1  (5 folds) RMSE means : 5.7912
alpha : 0.5  (5 folds) RMSE means : 5.7292
alpha : 1  (5 folds) RMSE means : 5.8026
alpha : 3  (5 folds) RMSE means : 6.0778


Unnamed: 0,alpha:0.07,alpha:0.1,alpha:0.5,alpha:1,alpha:3
RM,3.843115,3.669934,2.052707,1.003266,0.0
CHAS,1.447499,1.089848,0.0,0.0,0.0
RAD,0.301119,0.305222,0.309306,0.286982,0.146801
INDUS,0.061149,0.059867,0.020567,0.0,-0.0
AGE,0.021229,0.022971,0.0367,0.043647,0.044476
ZN,0.013031,0.01372,0.019969,0.025749,0.037166
B,0.010121,0.010045,0.008964,0.008111,0.007008
NOX,-0.0,-0.0,-0.0,-0.0,-0.0
TAX,-0.01429,-0.014478,-0.014515,-0.014043,-0.011329
CRIM,-0.078296,-0.07843,-0.07279,-0.062105,-0.018573


##### Regression_Tree 

In [44]:
models = [dt,rf,gbr,xgb,lgb]
predictions = [dt_pred,rf_pred,gbr_pred,xgb_pred,lgb_pred]

for a,b in zip(models,predictions):
    evaluation(a, y_test, b, 5)

[model]: DecisionTreeRegressor() 

[MSE] : 25.1309, [RMSE] : 5.0131, [r2] : 0.6982
[Intercept]: 25.063717601256283
[Coef_]: [-0.09  0.    0.11  2.63 -9.13  4.34  0.02  0.24 -0.01 -1.18  0.01 -0.48] 

5 folds independent Negative MSE scores: [-11.22 -32.66 -25.99 -55.84 -66.06]
5 folds indenpendent RMSE scores: [3.35 5.72 5.1  7.47 8.13]
5 folds Mean RMSE: 5.953
------------------------------------------------------------------------------------------
[model]: RandomForestRegressor() 

[MSE] : 16.8828, [RMSE] : 4.1089, [r2] : 0.7972
[Intercept]: 25.063717601256283
[Coef_]: [-0.09  0.    0.11  2.63 -9.13  4.34  0.02  0.24 -0.01 -1.18  0.01 -0.48] 

5 folds independent Negative MSE scores: [ -7.48 -13.2  -21.66 -47.72 -20.57]
5 folds indenpendent RMSE scores: [2.74 3.63 4.65 6.91 4.54]
5 folds Mean RMSE: 4.494
------------------------------------------------------------------------------------------
[model]: GradientBoostingRegressor() 

[MSE] : 14.7410, [RMSE] : 3.8394, [r2] : 0.8230
[In

###### lasso / alpha 3 RMSE : 6.0681

### Best_parmas

In [45]:
# alpha = 3
lr_params = Lasso(alpha=3)

lr_params.fit(X_train,y_train)
lr_params_pred = lr_params.predict(X_test)

In [46]:
evaluation(lr_params,y_test,lr_params_pred,5)

[model]: Lasso(alpha=3) 

[MSE] : 38.2387, [RMSE] : 6.1837, [r2] : 0.5408
[Intercept]: 25.063717601256283
[Coef_]: [-0.09  0.    0.11  2.63 -9.13  4.34  0.02  0.24 -0.01 -1.18  0.01 -0.48] 

5 folds independent Negative MSE scores: [-22.75 -36.89 -71.6  -52.54 -19.27]
5 folds indenpendent RMSE scores: [4.77 6.07 8.46 7.25 4.39]
5 folds Mean RMSE: 6.189
------------------------------------------------------------------------------------------


### conclusion

Default Lasso - 6.0681
alpha 3 Lasso - 6.189

# [도전 과제]

캐글의 자전거 대여 수요 예측을 아래와 같이 수행하세요
- 데이터는 https://www.kaggle.com/c/bike-sharing-demand/data 에서 train.csv를 다운로드 받아 이용
- 문자열을 datetime 타입으로 변경
- 종속변수 정규성 개선위한 로그 변환
- 평가지표 : RMSLE, RMSE, MAE
- 'year', 'month', 'day', 'hour' 등의 피러들을 OneHotEncoding 하여 회귀모델의 예측 성능 비교

### module import

In [None]:
# basic module
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# sepearte module
from sklearn.model_selection import train_test_split

# machine Learning modules (Regression)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

# machine Learning modules (Regression-trees)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# machine Learning evaluation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Developing
from sklearn.model_selection import GridSearchCV

### Data load

In [None]:
bike = pd.read_csv('Data/train.csv')
display(bike.head())
print(bike.info())
print()
print(bike.isnull().sum())

### Preprocessing

In [None]:
# str to datetime
bike["datetime"] = pd.to_datetime(bike["datetime"])

# seperate
bike["year"] = bike["datetime"].dt.year
bike["month"] = bike["datetime"].dt.month
bike["day"] = bike["datetime"].dt.day
bike["hour"] = bike["datetime"].dt.hour
bike["minute"] = bike["datetime"].dt.minute
bike["second"] = bike["datetime"].dt.second

# sort
bike[["datetime", "year", "month", "day", "hour", "minute", "second"]].head()

print(bike.info())

### EDA

In [None]:
plt.figure(figsize=(20,20))
sns.heatmap(bike.corr(),annot = True)

In [None]:
import statsmodels.formula.api as smf
model = smf.ols(formula='count ~ ' + '+'.join(bike.columns),data=bike).fit()
print(model.summary())

In [None]:
sns.displot(bike['count'])
sns.displot(np.log1p(bike['count']))