# 파일 읽어서 불러오기

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [2]:
%matplotlib inline

In [3]:
# Load the training set; the trailing expression previews its first rows.
train = pd.read_csv(
    './train_bic.csv',
    encoding='utf-8',
)
train.head()

FileNotFoundError: [Errno 2] No such file or directory: './train_bic.csv'

In [None]:
# Load the test set; the trailing expression previews its first rows.
test = pd.read_csv(
    './test_bic.csv',
    encoding='utf-8',
)
test.head()

# 결측치 채우기

In [None]:
## Check for missing values and fill them (using SimpleImputer)
# For train, then test: print the column summary, the per-column null counts,
# and the rows in which 'hour_bef_pm2.5' is missing.
for frame in (train, test):
    frame.info()
    print(frame.isnull().sum())
    print(frame[frame['hour_bef_pm2.5'].isnull()])

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing training values with the mean of each training column.
si = SimpleImputer(strategy='mean')
imputed_df = si.fit_transform(train)
train = pd.DataFrame(imputed_df, columns = train.columns)

# Fill the test set with the TRAINING means rather than re-fitting on test, so
# both datasets go through the same transformation. Mean imputation leaves a
# column's mean unchanged, so fitting on the already-imputed train frame yields
# the same statistics as the raw one.
# Assumes every test column also exists in train (the modelling cells rely on
# the same thing when they predict on X_test).
si_test = SimpleImputer(strategy='mean')
si_test.fit(train[test.columns])
imputed_df2 = si_test.transform(test)
test = pd.DataFrame(imputed_df2, columns = test.columns)

In [None]:
## Check the correlations between columns
# Compute the correlation matrix once and keep only entries with |r| >= 0.3;
# the boolean mask turns weaker entries into NaN.
train_corr = train.corr()
sns.heatmap(train_corr[np.abs(train_corr) >= 0.3], annot=True)

In [None]:
# Same view for the test set: compute the correlation matrix once and keep
# only entries with |r| >= 0.3 (weaker entries become NaN).
test_corr = test.corr()
sns.heatmap(test_corr[np.abs(test_corr) >= 0.3], annot=True)

train에서는 id가 다른 컬럼과 상관관계가 없으므로 삭제하고 진행한다.
강수량은 train에서는 상관관계가 낮지만 test에서는 상관관계가 나타나므로 삭제하지 않는다.
test에서는 id만 삭제하고 진행한다.

# 모델링을 위한 데이터 분리 및 모델링

In [None]:
# Target is 'count'; features are every other column except the row id.
y_train = train['count']
X_train = train.drop(columns=['id', 'count'])
X_test = test.drop(columns=['id'])

# Sanity-check the shapes of the three pieces.
print(X_train.shape, y_train.shape)
print(X_test.shape)

In [None]:
### Set up the ensemble models
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
# The estimator classes are imported directly because the names `xgb` and
# `lgb` are used for the estimator instances created below.
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# One untuned estimator per algorithm, all with the same random seed.
abc = AdaBoostRegressor(random_state=100)
gbc = GradientBoostingRegressor(random_state=100)
rf = RandomForestRegressor(random_state=100)
xgb = XGBRegressor(random_state=100)
lgb = LGBMRegressor(random_state=100, boosting_type = 'gbdt')

In [None]:
# Hyper-parameter candidates for AdaBoost.
param_grid_abc = dict(
    n_estimators=[1, 10, 50, 100],
    loss=['linear', 'square', 'exponential'],
    learning_rate=[0.1, 0.2, 0.5, 0.8, 1.0],
)

# Exhaustive search over the grid with 10-fold cross-validation.
grid_search_abc = GridSearchCV(estimator=abc, param_grid=param_grid_abc, cv=10, n_jobs=-1)
grid_search_abc.fit(X_train, y_train)

# Result recorded from an earlier run:
#   AdaBoostRegressor(learning_rate=0.1, n_estimators=100, random_state=100)
best_param_abc_gs = grid_search_abc.best_estimator_
print(best_param_abc_gs)

# Predict the test set with the best estimator found.
pred_abc_gs = best_param_abc_gs.predict(X_test)

In [None]:
# Hyper-parameter candidates for the random forest.
# 'criterion' is no longer listed: the grid only ever pinned it to the
# squared-error criterion under its old name 'mse', which is the estimator's
# default anyway and which recent scikit-learn releases no longer accept.
param_grid_rf = {
    'max_depth': [None, 1, 10, 15, 20],
    # NOTE(review): max_leaf_nodes=2 limits every tree to a single split, so
    # the max_depth candidates above cannot change the fitted trees - confirm
    # this restriction is intended.
    'max_leaf_nodes': [2],
    'n_estimators': [1, 10, 50, 100, 150, 200],
    'min_samples_split':[2,3,4,8,10],
}
# Exhaustive search with 10-fold cross-validation. The name param_grid_rf is
# rebound here from the candidate dict to the search object.
param_grid_rf = GridSearchCV(rf, param_grid=param_grid_rf, cv=10, n_jobs=-1)
param_grid_rf.fit(X_train, y_train)
# Result recorded from an earlier run:
#   RandomForestRegressor(max_leaf_nodes=2, n_estimators=150, random_state=100)
print(param_grid_rf.best_estimator_)

# Predict the test set with the best estimator found.
best_param_rf_gs = param_grid_rf.best_estimator_
pred_rf_gs = best_param_rf_gs.predict(X_test)

In [None]:
import sklearn

# scikit-learn 1.0 renamed the 'mse' split criterion to 'squared_error' and a
# later release dropped the old name; use whichever spelling the installed
# version understands so the same criterion is searched everywhere.
gbc_criterion = 'squared_error' if int(sklearn.__version__.split('.')[0]) >= 1 else 'mse'

# Hyper-parameter candidates for gradient boosting.
# learning_rate 0 is not searched: depending on the scikit-learn version it is
# either rejected when fitting or leaves the model at its initial prediction
# (TODO confirm for the installed version), so it could never be selected and
# only cost cross-validation fits.
param_grid_gbc = {
    'n_estimators': [1, 10, 50, 100],
    'learning_rate': [0.1, 0.2, 0.5, 0.8, 1.0],
    'criterion':[gbc_criterion],
    'max_depth':[None, 10, 20, 30, 50],
    'min_samples_split':[2,3,4,8,10],
}

# Exhaustive search with 10-fold cross-validation. The name param_grid_gbc is
# rebound here from the candidate dict to the search object.
param_grid_gbc = GridSearchCV(gbc, param_grid=param_grid_gbc, cv=10, n_jobs=-1)
param_grid_gbc.fit(X_train, y_train)
# Result recorded from an earlier run:
#   GradientBoostingRegressor(criterion='mse', max_depth=10, min_samples_split=10, random_state=100)
print(param_grid_gbc.best_estimator_)

# Predict the test set with the best estimator found.
best_param_gbc_gs = param_grid_gbc.best_estimator_
pred_gbc_gs = best_param_gbc_gs.predict(X_test)

In [None]:
# Hyper-parameter candidates for XGBoost.
# learning_rate 0 is not searched: with a zero step size the boosting rounds
# cannot move the prediction, so that candidate could not be selected and only
# cost cross-validation fits.
# The regularisation strengths 'alpha' and 'lambda' were considered but are
# not part of the search.
param_grid_xgb = {
    'max_depth': [None, 1, 10, 15, 20],
    'n_estimators': [1, 10, 50, 100],
    'learning_rate': [0.1, 0.2, 0.5, 0.8, 1.0],
    'eval_metric': ['rmse'],
    'booster': ['gbtree'],
}
# Exhaustive search with 10-fold cross-validation. The name param_grid_xgb is
# rebound here from the candidate dict to the search object.
param_grid_xgb = GridSearchCV(xgb, param_grid=param_grid_xgb, cv=10, n_jobs=-1)
param_grid_xgb.fit(X_train, y_train)
# Result recorded from an earlier run:
#   XGBRegressor(booster='gbtree', eval_metric='rmse', learning_rate=0.1, max_depth=6,
#                n_estimators=100, random_state=100, reg_alpha=0, reg_lambda=1)
print(param_grid_xgb.best_estimator_)

# Predict the test set with the best estimator found.
best_param_xgb_gs = param_grid_xgb.best_estimator_
pred_xgb_gs = best_param_xgb_gs.predict(X_test)

In [None]:
# Hyper-parameter candidates for LightGBM.
# The regularisation strengths 'alpha' and 'lambda' were considered but are
# not part of the search.
param_grid_lgb = dict(
    max_depth=[-1, 1, 5, 10, 15, 20],
    n_estimators=[1, 9, 10, 50, 100],
    learning_rate=[0.1, 0.2, 0.5, 0.8, 1.0],
)
# Exhaustive search with 10-fold cross-validation. The name param_grid_lgb is
# rebound here from the candidate dict to the search object.
param_grid_lgb = GridSearchCV(estimator=lgb, param_grid=param_grid_lgb, cv=10, n_jobs=-1)
param_grid_lgb.fit(X_train, y_train)

# Result recorded from an earlier run:
#   LGBMRegressor(max_depth=10, random_state=100)
best_param_lgb_gs = param_grid_lgb.best_estimator_
print(best_param_lgb_gs)

# Predict the test set with the best estimator found.
pred_lgb_gs = best_param_lgb_gs.predict(X_test)

# 성능 평가 비교

In [None]:
# Print each tuned model's score, rounded to three decimals.
# NOTE(review): the score is computed on X_train / y_train, i.e. on the data
# the models were fitted with, not on held-out data.
score_reports = [
    ('AdaBoost score: {:.3f}', best_param_abc_gs),
    ('Random Forest score: {:.3f}', best_param_rf_gs),
    ('Gradient Boosting score: {:.3f}', best_param_gbc_gs),
    ('XGBoost score: {:.3f}', best_param_xgb_gs),
    ('LightGBM score: {:.3f}', best_param_lgb_gs),
]
for template, fitted_model in score_reports:
    print(template.format(fitted_model.score(X_train, y_train)))

In [None]:
# Overlay the density of the real training target with the density of each
# tuned model's predictions.
density_curves = [
    (y_train, 'real_y_train'),
    (pred_abc_gs, 'grid_pred_abc'),
    (pred_rf_gs, 'grid_pred_rf'),
    (pred_gbc_gs, 'grid_pred_gbc'),
    (pred_xgb_gs, 'grid_pred_xgb'),
    (pred_lgb_gs, 'grid_pred_lgb'),
]
for values, curve_label in density_curves:
    sns.kdeplot(values, label = curve_label)
plt.title('modeling result comparision', fontsize =16)
plt.legend()
plt.show()

In [None]:
# Empty results table: one row per model will hold its name and its score.
best_model = pd.DataFrame(columns=['name', 'score'])

In [None]:
# Display names and the matching tuned models, kept in the same order.
model_nm = ['Ada', 'Gradient', 'Random Forest', 'XGB', 'LGBM']
tuned_models = (
    best_param_abc_gs,
    best_param_gbc_gs,
    best_param_rf_gs,
    best_param_xgb_gs,
    best_param_lgb_gs,
)
# Score of every model on X_train / y_train.
model_score = [fitted.score(X_train, y_train) for fitted in tuned_models]
best_model['name'] = model_nm
best_model['score'] = model_score

In [None]:
# Display the name/score comparison table.
best_model

In [None]:
# Horizontal bar chart of the model scores, highest score first.
ranked_models = best_model.sort_values(by = 'score', ascending=False)
sns.barplot(data = ranked_models, x='score', y = 'name')
# Remove the left and bottom axis spines and the y-axis label.
sns.despine(left=True, bottom=True)
plt.title('Final Modeling Score Comparison', fontsize = 14)
plt.ylabel(None)
plt.show()

# 컬럼별 중요도 막대그래프

In [None]:
# Feature importances of the tuned AdaBoost model, largest first.
col_imp1 = pd.DataFrame(
    best_param_abc_gs.feature_importances_,
    index = X_train.columns,
    columns = ['value'],
).sort_values(by='value', ascending=False)
plt.figure(figsize=(10,10))
# x and y are passed by keyword: newer seaborn releases do not accept them
# positionally.
sns.barplot(x=col_imp1.index, y=col_imp1['value'])
plt.title('Ada Boosting Features Importance', fontsize=15)
plt.xticks(rotation=45)

In [None]:
# Feature importances of the tuned random forest model, largest first.
col_imp2 = pd.DataFrame(
    best_param_rf_gs.feature_importances_,
    index = X_train.columns,
    columns = ['value'],
).sort_values(by='value', ascending=False)
plt.figure(figsize=(10,10))
# x and y are passed by keyword: newer seaborn releases do not accept them
# positionally.
sns.barplot(x=col_imp2.index, y=col_imp2['value'])
plt.title('Random Forest Features Importance', fontsize=15)
plt.xticks(rotation=45)

In [None]:
# Feature importances of the tuned gradient boosting model, largest first.
col_imp3 = pd.DataFrame(
    best_param_gbc_gs.feature_importances_,
    index = X_train.columns,
    columns = ['value'],
).sort_values(by='value', ascending=False)
plt.figure(figsize=(10,10))
# x and y are passed by keyword: newer seaborn releases do not accept them
# positionally.
sns.barplot(x=col_imp3.index, y=col_imp3['value'])
plt.title('Gradient Boosting Features Importance', fontsize=15)
plt.xticks(rotation=45)

In [None]:
# Feature importances of the tuned XGBoost model, largest first.
col_imp4 = pd.DataFrame(
    best_param_xgb_gs.feature_importances_,
    index = X_train.columns,
    columns = ['value'],
).sort_values(by='value', ascending=False)
plt.figure(figsize=(10,10))
# x and y are passed by keyword: newer seaborn releases do not accept them
# positionally.
sns.barplot(x=col_imp4.index, y=col_imp4['value'])
plt.title('XGBoost Features Importance', fontsize=15)
plt.xticks(rotation=45)

In [None]:
# Feature importances of the tuned LightGBM model, largest first.
col_imp5 = pd.DataFrame(
    best_param_lgb_gs.feature_importances_,
    index = X_train.columns,
    columns = ['value'],
).sort_values(by='value', ascending=False)
plt.figure(figsize=(10,10))
# x and y are passed by keyword: newer seaborn releases do not accept them
# positionally.
sns.barplot(x=col_imp5.index, y=col_imp5['value'])
plt.title('Light GBM Features Importance', fontsize=15)
plt.xticks(rotation=45)