<a href="https://colab.research.google.com/github/hyxxnii/Tave-6th-Project/blob/master/NIPA%20%EB%B3%B8%EC%84%A0%20-%20KB%EC%B0%A8%EC%B0%A8%EC%B0%A8%20%EC%A4%91%EA%B3%A0%EC%B0%A8%20%EB%A7%A4%EB%AC%BC%20%ED%8C%90%EB%A7%A4%EA%B8%B0%EA%B0%81%20%EC%98%88%EC%B8%A1%20%EB%AA%A8%EB%8D%B8%20%EA%B0%9C%EB%B0%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Locations of the train / test / sample-submission CSV files, relative to the notebook.
train_path = '../data/.train/.task147/train.csv'
test_path = '../data/.train/.task147/test.csv'
submission_path = '../data/.sample_submission/.task147/sample_submission.csv'

# Read the three files in order and bind each frame to its own name.
car_df_train, car_df_test, car_df_submission = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path, submission_path)
)

# Work on a copy so the frame read from train.csv is left as loaded.
car_df = car_df_train.copy()
car_df.head()

In [None]:
# Print a summary of the working frame (columns, dtypes, non-null counts).
car_df.info()

In [None]:
# Inspect the distribution of the target column on its original scale.
# The values are read from car_df_train (never modified in this notebook) rather than
# car_df, whose 'ad_periods' column is log-transformed by a later cell; this keeps the
# plot correct when the cell is re-run out of order.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept here to stay
# compatible with whatever seaborn version this notebook was written against.
plt.title('Original Ad Periods Histogram')
sns.distplot(car_df_train['ad_periods'])

In [None]:
# Apply a log1p transform to bring the target closer to a normal distribution,
# then plot the transformed values.
log_Periods = np.log1p(car_df['ad_periods'])
plt.title('Log Transformed Ad Periods Histogram')
sns.distplot(log_Periods)

# At prediction time the values have to be mapped back with expm1().

In [None]:
# Log-transform the target column (ad_periods).
# The raw values are taken from car_df_train, which is never modified, instead of from
# car_df itself: that makes this cell idempotent, so re-running it cannot apply log1p
# to an already-transformed column.
originial_Periods = car_df_train['ad_periods'].copy()
car_df['ad_periods'] = np.log1p(originial_Periods)

# Base Model
### Linear Regression, Ridge, Lasso

In [None]:
def get_rmse(model):
    """Print and return the hold-out RMSE of ``model`` on the original target scale.

    Uses the notebook-level ``X_test`` / ``y_test`` split. The models in this
    notebook are fit on a log1p-transformed target, so both the predictions and
    the true values are mapped back with ``np.expm1`` before the error is
    computed; comparing raw (log-scale) predictions against restored targets
    would mix two different scales.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.

    Returns
    -------
    float
        Root mean squared error between restored predictions and restored targets.
    """
    # Bring predictions back from log1p space to the original scale.
    pred = np.expm1(model.predict(X_test))
    # Restore the true values from the current split instead of relying on a
    # separately maintained global that can go stale after a re-split.
    actual = np.expm1(y_test)
    mse = mean_squared_error(actual, pred)
    rmse = np.sqrt(mse)
    print(model.__class__.__name__, 'RMSE: ', np.round(rmse, 3))
    return rmse

def get_rmses(models):
    """Run ``get_rmse`` on each model in order and return the scores rounded to 3 decimals.

    ``get_rmse`` prints one line per model as a side effect; the return value is
    whatever ``np.round`` produces for the collected list of scores.
    """
    scores = [get_rmse(estimator) for estimator in models]
    return np.round(scores, 3)

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Separate target and features, then hold out 20% of the rows for evaluation.
y_target = car_df['ad_periods']
X_features = car_df.drop(columns='ad_periods')
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_target, test_size=0.2, random_state=156
)

# Fit LinearRegression, Ridge and Lasso with their default settings.
lr_reg = LinearRegression().fit(X_train, y_train)
ridge_reg = Ridge().fit(X_train, y_train)
lasso_reg = Lasso().fit(X_train, y_train)

# The target was log1p-transformed, so use expm1 to return to the original scale.
pred_lr, pred_ridge, pred_lasso = (
    np.expm1(reg.predict(X_test)) for reg in (lr_reg, ridge_reg, lasso_reg)
)
y_test_exp = np.expm1(y_test)

models = [lr_reg, ridge_reg, lasso_reg]
get_rmses(models)

In [None]:
def get_top_bottom_coef(model):
    """Return the 10 largest and 10 smallest coefficients of ``model``.

    The coefficients are labelled with the column names of the notebook-level
    ``X_features`` frame and sorted in descending order; the first returned
    Series is the head of that ordering and the second is its tail.
    """
    ranked = pd.Series(model.coef_, index=X_features.columns).sort_values(ascending=False)
    return ranked.head(10), ranked.tail(10)

In [None]:
def visualize_coefficient(models):
    """Draw one bar chart per model with its top-10 and bottom-10 coefficients.

    The subplot grid is sized from the number of models (one column each, 8 inches
    wide), so any number of models can be passed; for three models this gives the
    same 24x10 figure with three columns as a fixed three-column layout.

    Parameters
    ----------
    models : iterable of fitted linear models exposing ``coef_``.
        Does nothing when the iterable is empty.
    """
    models = list(models)
    if not models:
        # plt.subplots cannot build a grid with zero columns.
        return

    n_models = len(models)
    # squeeze=False keeps axs two-dimensional even for a single model, so row 0
    # can always be iterated.
    fig, axs = plt.subplots(figsize=(8 * n_models, 10), nrows=1, ncols=n_models, squeeze=False)
    fig.tight_layout()

    for ax, model in zip(axs[0], models):
        # Take the top 10 and bottom 10 coefficients and stack them into one Series.
        coef_high, coef_low = get_top_bottom_coef(model)
        coef_concat = pd.concat([coef_high, coef_low])

        # Draw the bars on this model's axes; tick label position and font size are
        # adjusted so that everything fits on one screen.
        ax.set_title(model.__class__.__name__+' Coefficients', size=25)
        ax.tick_params(axis='y', direction='in', pad=-120)
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(22)
        sns.barplot(x=coef_concat.values, y=coef_concat.index, ax=ax)
        
# Plot the coefficients of the three fitted regression models.
models = [lr_reg, ridge_reg, lasso_reg]
visualize_coefficient(models)

In [None]:
from sklearn.model_selection import GridSearchCV

def print_best_params(model, params):
    """Grid-search ``params`` for ``model`` with 5-fold CV and print the best result.

    The search is fit on the notebook-level ``X_features`` / ``y_target`` and
    scored with negative mean squared error. The printed line contains the model
    class name, the RMSE derived from the best score (rounded to 4 decimals) and
    the best parameter dict. Nothing is returned.
    """
    search = GridSearchCV(model, param_grid=params, scoring='neg_mean_squared_error', cv=5)
    search.fit(X_features, y_target)

    # The score is a negated MSE, so flip the sign before taking the square root.
    best_rmse = np.sqrt(-1 * search.best_score_)

    message = '\n{0} 5 CV 시 최적 평균 RMSE 값: {1}, 최적 alpha: {2}'
    print(message.format(model.__class__.__name__, np.round(best_rmse, 4), search.best_params_))

# Candidate alpha values for Ridge and Lasso; run the 5-fold grid search for each model.
ridge_params = {'alpha':[0.05, 0.1, 1, 5, 8, 10, 12, 15, 20]}
lasso_params = {'alpha':[0.001, 0.005, 0.008, 0.05, 0.03, 0.1, 0.5, 1, 5, 10]}
print_best_params(ridge_reg, ridge_params)
print_best_params(lasso_reg, lasso_params)

In [None]:
# Refit the three models on the training split with explicit alphas for Ridge and Lasso.
# NOTE(review): presumably these are the best values reported by the grid search above —
# confirm against that cell's output.
lr_reg = LinearRegression().fit(X_train, y_train)
ridge_reg = Ridge(alpha=20).fit(X_train, y_train)
lasso_reg = Lasso(alpha=0.001).fit(X_train, y_train)

# Print the RMSE of every model, then plot the regression coefficients of every model.
models = [lr_reg, ridge_reg, lasso_reg]
get_rmses(models)
visualize_coefficient(models)

# EDA

In [None]:
# Collect the columns whose names start with 'dum' (treated as one-hot encoded columns
# by the cells below) and show them together with their count.
onehot_list = car_df.filter(regex='^dum', axis=1).columns
onehot_list, len(onehot_list)

In [None]:
# 피처 데이터 세트의 데이터 분포도 확인
from scipy.stats import skew

# Leave out the one-hot encoded (categorical) columns before measuring skewness.
features = X_features.drop(columns=onehot_list)
skew_features = features.apply(skew)

# Keep only the columns whose skewness is greater than 1.
skew_features_top = skew_features.loc[skew_features > 1]
print(skew_features_top.sort_values(ascending=False))

In [None]:
# Log-transform the features that were selected as highly skewed.
# The source values come from car_df_train, which is never modified, instead of from
# car_df itself, so re-running this cell cannot apply log1p to already-transformed columns.
car_df[skew_features_top.index] = np.log1p(car_df_train[skew_features_top.index])

In [None]:
# Some features of car_df were log-transformed, so rebuild the feature/target sets
# and the train/test split.
y_target = car_df['ad_periods']
X_features = car_df.drop(columns='ad_periods')
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_target, test_size=0.2, random_state=156
)

# Search the alpha grids again and print the best parameters and RMSE.
ridge_params = {'alpha':[0.05, 0.1, 1, 5, 8, 10, 12, 15, 20]}
lasso_params = {'alpha':[0.001, 0.005, 0.008, 0.05, 0.03, 0.1, 0.5, 1, 5, 10]}
for estimator, grid in ((ridge_reg, ridge_params), (lasso_reg, lasso_params)):
    print_best_params(estimator, grid)

In [None]:
# Refit the three models on the rebuilt training split with explicit alphas for Ridge and Lasso.
# NOTE(review): presumably these are the best values reported by the preceding grid search —
# confirm against that cell's output.
lr_reg = LinearRegression().fit(X_train, y_train)
ridge_reg = Ridge(alpha=15).fit(X_train, y_train)
lasso_reg = Lasso(alpha=0.001).fit(X_train, y_train)

# Print the RMSE of every model, then plot the regression coefficients of every model.
models = [lr_reg, ridge_reg, lasso_reg]
get_rmses(models)
visualize_coefficient(models)


In [None]:
# Show the column labels of the frame that remains after dropping the one-hot columns.
features.columns

In [None]:
# Training frame as loaded, minus the one-hot encoded columns.
car_rm_dum = car_df_train.drop(columns=onehot_list)
car_rm_dum

In [None]:
# Heatmap of pairwise correlations between the non-dummy columns.
# Passing the raw frame would draw (and annotate) one heatmap row per data row; the
# annotated '.3f' format indicates a correlation matrix is what is meant to be shown.
# NOTE(review): confirm that a correlation heatmap is the intended figure.
plt.figure(figsize=(10,20))
sns.heatmap(car_rm_dum.corr(), fmt='.3f', annot=True, cmap='PuBu')