In [6]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

In [7]:
#NullValueImputer
#여러 종류의 누락된 값을 대체하는 변환기 만들기
#TransformerMixin 클래스: fit과 transform을 연속으로 호출하는 fit_transform 메서드를 제공하는 클래스

from sklearn.base import TransformerMixin 
class NullValueImputer(TransformerMixin):
    """Stateless transformer that fills missing values column by column.

    * object-dtype (categorical) columns are filled with the column's most
      frequent value;
    * every other column is filled with the sentinel ``-999.0``.

    Nothing is learned in ``fit``; the most frequent value is computed from
    whatever frame is handed to ``transform``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` so the transformer can be chained."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values filled.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to impute. It is not modified.
        y : ignored
            Accepted only for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with the same columns and missing values replaced.
        """
        # Work on a copy so the caller's frame (often a slice produced by
        # train_test_split) is never written to.
        X = X.copy()
        for column in X.columns:
            if X[column].dtype == object:
                # Series.mode() returns a Series. Passing that Series to
                # fillna aligns on index labels and fills almost nothing, so
                # take the first mode as a scalar instead.
                modes = X[column].mode()
                if not modes.empty:  # an all-NaN column has no mode
                    X[column] = X[column].fillna(modes.iloc[0])
            else:
                # Sentinel for non-object columns.
                X[column] = X[column].fillna(-999.0)
        return X
    
# Load the semicolon-separated student dataset.
df = pd.read_csv('../data/student-por.csv', sep=';')
# Impute missing values across the whole frame, then preview the first rows.
nvi = NullValueImputer().fit_transform(df)
nvi.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18.0,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,-999.0,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15.0,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15.0,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16.0,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [13]:
#SparseMatrix
#여러 종류의 특성으로 구성된 데이터를 원-핫 인코딩하기
#

from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import OneHotEncoder

class SparseMatrix(TransformerMixin):
    """One-hot encode the object-dtype columns of a frame and stack them,
    side by side, with the remaining columns as a single sparse matrix.
    """

    def __init__(self):
        self.ohe = OneHotEncoder()

    def fit(self, X, y=None):
        """Remember which columns are object dtype and fit the encoder on them."""
        is_object = X.dtypes == object
        self.categorical_columns = X.columns[is_object].tolist()
        self.ohe.fit(X[self.categorical_columns])
        return self

    def transform(self, X, y=None):
        """Return the encoded categorical columns followed by the other columns.

        The column list recorded in ``fit`` selects what gets encoded; all
        columns that are not object dtype are converted to CSR and appended
        to the right of the encoded block.
        """
        encoded = self.ohe.transform(X[self.categorical_columns])
        passthrough = csr_matrix(X.select_dtypes(exclude=["object"]))
        # hstack joins the two sparse blocks horizontally.
        return hstack((encoded, passthrough))
# Fit and apply the encoder to the imputed frame from the earlier cell.
# `sm` is not referenced again in the visible cells.
sm = SparseMatrix().fit_transform(nvi)

In [14]:
# Re-read the raw CSV for the modelling workflow.
df = pd.read_csv('../data/student-por.csv', sep=';')
# Target: the last column of the frame.
y = df.iloc[:, -1]
# Features: every column except the last three.
# NOTE(review): the earlier preview shows an 'Unnamed: 0' column; if that is a
# saved row index it is still part of X here - confirm whether it should be dropped.
X = df.iloc[:, :-3]

from sklearn.model_selection import train_test_split
# Default split proportions; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

from sklearn.pipeline import Pipeline
# Chain the two custom transformers: fill missing values, then one-hot encode
# into a sparse matrix.
data_pipeline = Pipeline([('null_imputer', NullValueImputer()), 
                          ('sparse', SparseMatrix())])
# Densify the pipeline output; later cells use this array as the training features.
# NOTE(review): X_test is not passed through the pipeline in the visible cells.
X_train_transformed = data_pipeline.fit_transform(X_train).toarray()
X_train_transformed

array([[1., 0., 1., ..., 3., 5., 0.],
       [1., 0., 0., ..., 1., 2., 4.],
       [1., 0., 1., ..., 1., 3., 0.],
       ...,
       [0., 1., 1., ..., 5., 5., 5.],
       [0., 1., 0., ..., 2., 3., 0.],
       [1., 0., 0., ..., 1., 1., 0.]])

In [10]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error as MSE
from xgboost import XGBRegressor

# NOTE: this is not the last expression in the cell, so its result is not displayed.
y_train.value_counts()
# Shared 5-fold splitter (shuffled, fixed seed) used by cross_val and grid_search.
kfold = KFold(n_splits=5, shuffle=True, random_state=2)

# Helper that computes a cross-validated score for a model.
def cross_val(model):
    """Return the mean RMSE of ``model`` over the folds of ``kfold``.

    The model is scored on the notebook-level training data
    (``X_train_transformed`` / ``y_train``). scikit-learn reports the metric
    negated, so the sign of the fold average is flipped before returning.
    """
    fold_scores = cross_val_score(
        model,
        X_train_transformed,
        y_train,
        cv=kfold,
        scoring='neg_root_mean_squared_error',
    )
    return -fold_scores.mean()

# Baseline score: default XGBRegressor, with -999.0 (the value NullValueImputer
# writes into non-object columns) declared as the missing-value marker.
cross_val(XGBRegressor(missing=-999.0))

2.9004041754792746

In [15]:
# Split the transformed training data once more; the *_2 test part is used by
# n_estimators as the early-stopping validation set.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train_transformed, 
                                                            y_train, random_state=2)

In [16]:
def n_estimators(model):
    """Fit ``model`` with early stopping and return its validation RMSE.

    The model is trained on ``X_train_2`` / ``y_train_2`` and monitored on
    ``X_test_2`` / ``y_test_2`` using the ``rmse`` metric, stopping after 100
    rounds without improvement. The same validation split is then used to
    compute the returned score.

    Parameters
    ----------
    model : estimator
        An XGBoost scikit-learn style estimator. It is fitted in place.

    Returns
    -------
    float
        Root mean squared error of the fitted model on ``X_test_2``.
    """
    import inspect

    eval_set = [(X_test_2, y_test_2)]
    early_stopping = {"eval_metric": "rmse", "early_stopping_rounds": 100}

    # Older XGBoost releases take these settings as fit() keywords; newer ones
    # removed them from fit() and expect them as estimator parameters. Pick
    # whichever form the installed version supports.
    if "early_stopping_rounds" in inspect.signature(model.fit).parameters:
        model.fit(X_train_2, y_train_2, eval_set=eval_set, **early_stopping)
    else:
        model.set_params(**early_stopping)
        model.fit(X_train_2, y_train_2, eval_set=eval_set)

    y_pred = model.predict(X_test_2)
    # MSE -> RMSE
    return MSE(y_test_2, y_pred) ** 0.5

In [17]:
# Large tree budget; early stopping inside n_estimators decides when training ends.
n_estimators(XGBRegressor(n_estimators=5000, missing=-999.0))


[0]	validation_0-rmse:8.49176
[1]	validation_0-rmse:6.31389
[2]	validation_0-rmse:4.97965
[3]	validation_0-rmse:4.16109
[4]	validation_0-rmse:3.67782
[5]	validation_0-rmse:3.42779
[6]	validation_0-rmse:3.30579
[7]	validation_0-rmse:3.25238
[8]	validation_0-rmse:3.22878
[9]	validation_0-rmse:3.20020
[10]	validation_0-rmse:3.17934
[11]	validation_0-rmse:3.16766
[12]	validation_0-rmse:3.15061
[13]	validation_0-rmse:3.13508
[14]	validation_0-rmse:3.14204
[15]	validation_0-rmse:3.13769
[16]	validation_0-rmse:3.15551
[17]	validation_0-rmse:3.15064
[18]	validation_0-rmse:3.14732
[19]	validation_0-rmse:3.14887
[20]	validation_0-rmse:3.14607
[21]	validation_0-rmse:3.14591
[22]	validation_0-rmse:3.14349
[23]	validation_0-rmse:3.14303
[24]	validation_0-rmse:3.14024
[25]	validation_0-rmse:3.14376
[26]	validation_0-rmse:3.14765
[27]	validation_0-rmse:3.14520
[28]	validation_0-rmse:3.13969
[29]	validation_0-rmse:3.14365
[30]	validation_0-rmse:3.13755
[31]	validation_0-rmse:3.14122
[32]	validation_0-

3.125373597402936

In [18]:
def grid_search(params, reg=None):
    """Run a grid search over ``params`` and print the best setting and score.

    Parameters
    ----------
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    reg : estimator, optional
        Estimator to tune. When omitted, a fresh
        ``XGBRegressor(missing=-999.0)`` is created for this call, rather
        than one instance built at definition time and shared by all calls.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    if reg is None:
        reg = XGBRegressor(missing=-999.0)

    grid_reg = GridSearchCV(reg, params,
                            scoring='neg_mean_squared_error', cv=kfold)
    grid_reg.fit(X_train_transformed, y_train)

    best_params = grid_reg.best_params_
    # Printed label means "best parameters".
    print("최상의 매개변수:", best_params)

    # best_score_ is a negated MSE; flip the sign and take the root to report
    # it on the RMSE scale.
    best_score = np.sqrt(-grid_reg.best_score_)
    # Printed label means "best score".
    print("최상의 점수:", best_score)

In [19]:
# Most parameters are pinned to a single value; only min_child_weight,
# colsample_bylevel and colsample_bynode vary (3 x 5 x 5 = 75 candidates).
grid_search(params={'max_depth':[1],
                    'min_child_weight':[8, 9, 10], 
                    'subsample':[0.8], 
                    'colsample_bytree':[1.0],
                    'colsample_bylevel':[0.6, 0.7, 0.8, 0.9, 1],
                    'colsample_bynode':[0.6, 0.7, 0.8, 0.9, 1],
                    'n_estimators':[40]})

최상의 매개변수: {'colsample_bylevel': 0.6, 'colsample_bynode': 0.6, 'colsample_bytree': 1.0, 'max_depth': 1, 'min_child_weight': 8, 'n_estimators': 40, 'subsample': 0.8}
최상의 점수: 2.6288406162401854
