# 10장. XGBoost 모델 배포

In [1]:
!pip install -q --upgrade xgboost

## 혼합 데이터 인코딩

### 데이터 로딩

In [2]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

df = pd.read_csv('https://raw.githubusercontent.com/rickiepark/handson-gb/main/Chapter10/student-por.csv')
df.head()

Unnamed: 0,school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reason;guardian;traveltime;studytime;failures;schoolsup;famsup;paid;activities;nursery;higher;internet;romantic;famrel;freetime;goout;Dalc;Walc;health;absences;G1;G2;G3
0,"GP;NaN;18;""U"";""GT3"";""A"";4;4;""at_home"";""teacher..."
1,"GP;""F"";NaN;""U"";""GT3"";""T"";1;1;""at_home"";""other""..."
2,"GP;""F"";15;""U"";""LE3"";""T"";1;1;""at_home"";""other"";..."
3,"GP;""F"";15;""U"";""GT3"";""T"";4;2;""health"";""services..."
4,"GP;""F"";16;""U"";""GT3"";""T"";3;3;""other"";""other"";""h..."


In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/rickiepark/handson-gb/main/Chapter10/student-por.csv', sep=';')
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,,18.0,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15.0,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15.0,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16.0,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


### 누락된 값 처리

In [4]:
df.isnull().sum()

school        0
sex           1
age           1
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      1
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64

In [5]:
df[df.isna().any(axis=1)]

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,,18.0,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11


In [6]:
pd.options.display.max_columns = None

In [7]:
df[df.isna().any(axis=1)]

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,,18.0,U,GT3,A,4,4,at_home,teacher,course,,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11
1,GP,F,,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11


In [8]:
df['age'] = df['age'].fillna(-999.0)

In [9]:
df['sex'] = df['sex'].fillna(df['sex'].mode())
df['guardian'] = df['guardian'].fillna(df['guardian'].mode())

In [10]:
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18.0,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11
1,GP,F,-999.0,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11
2,GP,F,15.0,U,LE3,T,1,1,at_home,other,other,mother,1,2,0,yes,no,no,no,yes,yes,yes,no,4,3,2,2,3,3,6,12,13,12
3,GP,F,15.0,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,3,2,2,1,1,5,0,14,14,14
4,GP,F,16.0,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,no,no,yes,yes,no,no,4,3,2,1,2,5,0,11,13,13


### 원-핫 인코딩

In [11]:
categorical_columns = df.columns[df.dtypes==object].tolist()
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
hot = ohe.fit_transform(df[categorical_columns])
hot_df = pd.DataFrame(hot.toarray())
hot_df.head() 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0


In [12]:
print(hot)

  (0, 0)	1.0
  (0, 2)	1.0
  (0, 5)	1.0
  (0, 6)	1.0
  (0, 8)	1.0
  (0, 10)	1.0
  (0, 19)	1.0
  (0, 20)	1.0
  (0, 25)	1.0
  (0, 28)	1.0
  (0, 29)	1.0
  (0, 31)	1.0
  (0, 33)	1.0
  (0, 36)	1.0
  (0, 38)	1.0
  (0, 39)	1.0
  (0, 41)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 5)	1.0
  (1, 6)	1.0
  (1, 9)	1.0
  (1, 10)	1.0
  (1, 17)	1.0
  (1, 20)	1.0
  :	:
  (647, 27)	1.0
  (647, 29)	1.0
  (647, 31)	1.0
  (647, 33)	1.0
  (647, 35)	1.0
  (647, 38)	1.0
  (647, 40)	1.0
  (647, 41)	1.0
  (648, 1)	1.0
  (648, 3)	1.0
  (648, 4)	1.0
  (648, 7)	1.0
  (648, 9)	1.0
  (648, 13)	1.0
  (648, 17)	1.0
  (648, 20)	1.0
  (648, 25)	1.0
  (648, 27)	1.0
  (648, 29)	1.0
  (648, 31)	1.0
  (648, 33)	1.0
  (648, 35)	1.0
  (648, 38)	1.0
  (648, 40)	1.0
  (648, 41)	1.0


In [13]:
hot

<649x43 sparse matrix of type '<class 'numpy.float64'>'
	with 11033 stored elements in Compressed Sparse Row format>

In [14]:
cold_df = df.select_dtypes(exclude=["object"])
cold_df.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,18.0,4,4,2,2,0,4,3,4,1,1,3,4,0,11,11
1,-999.0,1,1,1,2,0,5,3,3,1,1,3,2,9,11,11
2,15.0,1,1,1,2,0,4,3,2,2,3,3,6,12,13,12
3,15.0,4,2,1,3,0,3,2,2,1,1,5,0,14,14,14
4,16.0,3,3,1,2,0,4,3,2,1,2,5,0,11,13,13


In [15]:
from scipy.sparse import csr_matrix
cold = csr_matrix(cold_df)

from scipy.sparse import hstack
final_sparse_matrix = hstack((hot, cold))

final_df = pd.DataFrame(final_sparse_matrix.toarray())
final_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,18.0,4.0,4.0,2.0,2.0,0.0,4.0,3.0,4.0,1.0,1.0,3.0,4.0,0.0,11.0,11.0
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,-999.0,1.0,1.0,1.0,2.0,0.0,5.0,3.0,3.0,1.0,1.0,3.0,2.0,9.0,11.0,11.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,15.0,1.0,1.0,1.0,2.0,0.0,4.0,3.0,2.0,2.0,3.0,3.0,6.0,12.0,13.0,12.0
3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,15.0,4.0,2.0,1.0,3.0,0.0,3.0,2.0,2.0,1.0,1.0,5.0,0.0,14.0,14.0,14.0
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,16.0,3.0,3.0,1.0,2.0,0.0,4.0,3.0,2.0,1.0,2.0,5.0,0.0,11.0,13.0,13.0


## 사용자 정의 사이킷런 변환기

### 사용자 정의 변환기

#### 여러 종류의 누락된 값을 대체하는 변환기 만들기

In [16]:
from sklearn.base import TransformerMixin 

class NullValueImputer(TransformerMixin):
    
    def __init__(self):
        None
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        for column in X.columns.tolist():
            if column in X.columns[X.dtypes==object].tolist():
                X[column] = X[column].fillna(X[column].mode())
            else:
                X[column]=X[column].fillna(-999.0)
        return X

In [17]:
df = pd.read_csv('https://raw.githubusercontent.com/rickiepark/handson-gb/main/Chapter10/student-por.csv', sep=';')
nvi = NullValueImputer().fit_transform(df)
nvi.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18.0,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11
1,GP,F,-999.0,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11
2,GP,F,15.0,U,LE3,T,1,1,at_home,other,other,mother,1,2,0,yes,no,no,no,yes,yes,yes,no,4,3,2,2,3,3,6,12,13,12
3,GP,F,15.0,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,3,2,2,1,1,5,0,14,14,14
4,GP,F,16.0,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,no,no,yes,yes,no,no,4,3,2,1,2,5,0,11,13,13


#### ColumnTransformer 사용하기

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

df = pd.read_csv('https://raw.githubusercontent.com/rickiepark/handson-gb/main/Chapter10/student-por.csv', sep=';')

mode_imputer = SimpleImputer(strategy='most_frequent')
const_imputer = SimpleImputer(strategy='constant', fill_value=-999.0)

numeric_columns = df.columns[df.dtypes!=object].tolist()

ct = ColumnTransformer([('str', mode_imputer, categorical_columns),
                        ('num', const_imputer, numeric_columns)])

new_df = pd.DataFrame(ct.fit_transform(df), 
                      columns=categorical_columns+numeric_columns)
new_df = new_df[df.columns]
new_df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11
1,GP,F,-999,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,0,yes,no,no,no,yes,yes,yes,no,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,no,no,yes,yes,no,no,4,3,2,1,2,5,0,11,13,13


#### 여러 종류의 특성으로 구성된 데이터를 원-핫 인코딩하기

In [19]:
class SparseMatrix(TransformerMixin):
    
    def __init__(self):
        self.ohe = OneHotEncoder()
        
    def fit(self, X, y=None):
        self.categorical_columns= X.columns[X.dtypes==object].tolist()
        self.ohe.fit(X[self.categorical_columns])
        return self
    
    def transform(self, X, y=None):
        hot = self.ohe.transform(X[self.categorical_columns])
        cold_df = X.select_dtypes(exclude=["object"])
        cold = csr_matrix(cold_df)
        final_sparse_matrix = hstack((hot, cold))
        return final_sparse_matrix

In [20]:
sm = SparseMatrix().fit_transform(nvi)
print(sm)

  (0, 0)	1.0
  (0, 2)	1.0
  (0, 5)	1.0
  (0, 6)	1.0
  (0, 8)	1.0
  (0, 10)	1.0
  (0, 19)	1.0
  (0, 20)	1.0
  (0, 25)	1.0
  (0, 28)	1.0
  (0, 29)	1.0
  (0, 31)	1.0
  (0, 33)	1.0
  (0, 36)	1.0
  (0, 38)	1.0
  (0, 39)	1.0
  (0, 41)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 5)	1.0
  (1, 6)	1.0
  (1, 9)	1.0
  (1, 10)	1.0
  (1, 17)	1.0
  (1, 20)	1.0
  :	:
  (647, 49)	2.0
  (647, 50)	4.0
  (647, 51)	5.0
  (647, 52)	3.0
  (647, 53)	4.0
  (647, 54)	2.0
  (647, 55)	6.0
  (647, 56)	10.0
  (647, 57)	10.0
  (647, 58)	10.0
  (648, 43)	18.0
  (648, 44)	3.0
  (648, 45)	2.0
  (648, 46)	3.0
  (648, 47)	1.0
  (648, 49)	4.0
  (648, 50)	4.0
  (648, 51)	1.0
  (648, 52)	3.0
  (648, 53)	4.0
  (648, 54)	5.0
  (648, 55)	4.0
  (648, 56)	10.0
  (648, 57)	11.0
  (648, 58)	11.0


In [21]:
sm_df = pd.DataFrame(sm.toarray())
sm_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,18.0,4.0,4.0,2.0,2.0,0.0,4.0,3.0,4.0,1.0,1.0,3.0,4.0,0.0,11.0,11.0
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,-999.0,1.0,1.0,1.0,2.0,0.0,5.0,3.0,3.0,1.0,1.0,3.0,2.0,9.0,11.0,11.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,15.0,1.0,1.0,1.0,2.0,0.0,4.0,3.0,2.0,2.0,3.0,3.0,6.0,12.0,13.0,12.0
3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,15.0,4.0,2.0,1.0,3.0,0.0,3.0,2.0,2.0,1.0,1.0,5.0,0.0,14.0,14.0,14.0
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,16.0,3.0,3.0,1.0,2.0,0.0,4.0,3.0,2.0,1.0,2.0,5.0,0.0,11.0,13.0,13.0


#### ColumnTransformer 사용하기

In [22]:
ct2 = ColumnTransformer([('ohe', OneHotEncoder(), categorical_columns)],
                       remainder='passthrough')
ct2.fit(new_df)
oh_columns = ct2.named_transformers_['ohe'].get_feature_names_out()

import numpy as np
new_df2 = pd.DataFrame(ct2.transform(new_df), 
                       columns=np.append(oh_columns, numeric_columns))
new_df2.head()

AttributeError: 'OneHotEncoder' object has no attribute 'get_feature_names_out'

### 전처리 파이프라인 만들기

In [23]:
df = pd.read_csv('https://raw.githubusercontent.com/rickiepark/handson-gb/main/Chapter10/student-por.csv', sep=';')

In [24]:
y = df.iloc[:, -1]
X = df.iloc[:, :-3]

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [26]:
from sklearn.pipeline import Pipeline
data_pipeline = Pipeline([('null_imputer', NullValueImputer()), 
                          ('sparse', SparseMatrix())])
X_train_transformed = data_pipeline.fit_transform(X_train).toarray()

## XGBoost 모델 만들기

In [27]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error as MSE
from xgboost import XGBRegressor

In [28]:
y_train.value_counts()

11    82
10    75
13    58
12    53
14    42
15    36
9     29
16    27
8     26
17    24
18    14
0     10
7      7
19     1
6      1
5      1
Name: G3, dtype: int64

In [29]:
kfold = KFold(n_splits=5, shuffle=True, random_state=2)

In [30]:
def cross_val(model):
    scores = cross_val_score(model, X_train_transformed, y_train, 
                             scoring='neg_root_mean_squared_error', cv=kfold)
    rmse = (-scores.mean())
    return rmse

In [31]:
cross_val(XGBRegressor(missing=-999.0))

2.9004041754792746

#### 사이킷런으로 예측 구간 계산하기

In [32]:
from sklearn.ensemble import GradientBoostingRegressor

gbr_lower = GradientBoostingRegressor(loss="quantile", alpha=0.05, 
                                      random_state=2)
y_lower = gbr_lower.fit(X_train_transformed, y_train).predict(X_train_transformed)

gbr_upper = GradientBoostingRegressor(loss="quantile", alpha=0.95, 
                                      random_state=2)
y_upper = gbr_upper.fit(X_train_transformed, y_train).predict(X_train_transformed)

np.logical_and(y_lower <= y_train, 
               y_train <= y_upper).mean()

0.8950617283950617

In [33]:
X_test_clean = data_pipeline.transform(X_test).toarray()

y_lower = gbr_lower.predict(X_test_clean)
y_upper = gbr_upper.predict(X_test_clean)

np.logical_and(y_lower <= y_test, 
               y_test <= y_upper).mean()

0.8466257668711656

In [34]:
!pip -q install mapie

In [35]:
from mapie.regression import MapieRegressor

xgbr = XGBRegressor(missing=-999.0)
mapie = MapieRegressor(xgbr, cv=10, n_jobs=-1)

mapie.fit(X_train_transformed, y_train)

SyntaxError: future feature annotations is not defined (regression.py, line 1)

In [None]:
from mapie.metrics import regression_coverage_score

y_pred, y_pis = mapie.predict(X_test_clean, alpha=0.1)
regression_coverage_score(y_test, y_pis[:, 0], y_pis[:, 1])

In [None]:
from mapie.subsample import Subsample

subs = Subsample(n_resamplings=30, random_state=0)

mapie = MapieRegressor(xgbr, cv=subs, n_jobs=-1)
mapie.fit(X_train_transformed, y_train)

In [None]:
y_pred, y_pis = mapie.predict(X_test_clean, alpha=0.1)
regression_coverage_score(y_test, y_pis[:, 0], y_pis[:, 1])

### XGBoost 하이퍼파라미터 튜닝

In [36]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train_transformed, 
                                                            y_train, random_state=2)

In [37]:
def n_estimators(model):
    eval_set = [(X_test_2, y_test_2)]
    eval_metric="rmse"
    model.fit(X_train_2, y_train_2, eval_metric=eval_metric, 
              eval_set=eval_set, early_stopping_rounds=100)
    y_pred = model.predict(X_test_2)
    rmse = MSE(y_test_2, y_pred)**0.5
    return rmse  

In [38]:
n_estimators(XGBRegressor(n_estimators=5000, missing=-999.0))

[0]	validation_0-rmse:8.49176
[1]	validation_0-rmse:6.31389
[2]	validation_0-rmse:4.97965
[3]	validation_0-rmse:4.16109
[4]	validation_0-rmse:3.67782
[5]	validation_0-rmse:3.42779
[6]	validation_0-rmse:3.30579
[7]	validation_0-rmse:3.25238
[8]	validation_0-rmse:3.22878
[9]	validation_0-rmse:3.20020
[10]	validation_0-rmse:3.17934
[11]	validation_0-rmse:3.16766
[12]	validation_0-rmse:3.15062
[13]	validation_0-rmse:3.13508
[14]	validation_0-rmse:3.14204
[15]	validation_0-rmse:3.13769
[16]	validation_0-rmse:3.15551
[17]	validation_0-rmse:3.15064
[18]	validation_0-rmse:3.14732
[19]	validation_0-rmse:3.14887
[20]	validation_0-rmse:3.14607
[21]	validation_0-rmse:3.14591
[22]	validation_0-rmse:3.14349
[23]	validation_0-rmse:3.14303
[24]	validation_0-rmse:3.14024
[25]	validation_0-rmse:3.14376
[26]	validation_0-rmse:3.14765
[27]	validation_0-rmse:3.14520
[28]	validation_0-rmse:3.13969
[29]	validation_0-rmse:3.14365
[30]	validation_0-rmse:3.13755
[31]	validation_0-rmse:3.14121
[32]	validation_0-

3.125373597402936

In [39]:
def grid_search(params, reg=XGBRegressor(missing=-999.0)):
    grid_reg = GridSearchCV(reg, params, 
                            scoring='neg_mean_squared_error', cv=kfold)
    grid_reg.fit(X_train_transformed, y_train)
    best_params = grid_reg.best_params_
    print("최상의 매개변수:", best_params)
    best_score = np.sqrt(-grid_reg.best_score_)
    print("최상의 점수:", best_score)

In [40]:
grid_search(params={'max_depth':[1, 2, 3, 4, 6, 7, 8], 
                    'n_estimators':[34]})

최상의 매개변수: {'max_depth': 1, 'n_estimators': 34}
최상의 점수: 2.662773659268993


In [41]:
grid_search(params={'max_depth':[1, 2], 
                    'min_child_weight':[1, 2, 3, 4, 5], 
                    'n_estimators':[34]})

최상의 매개변수: {'max_depth': 1, 'min_child_weight': 5, 'n_estimators': 34}
최상의 점수: 2.6619193269068284


In [42]:
grid_search(params={'max_depth':[1],
                    'min_child_weight':[6, 7, 8, 9, 10],
                    'subsample':[0.5, 0.6, 0.7, 0.8, 0.9],
                    'n_estimators':[34, 50]})

최상의 매개변수: {'max_depth': 1, 'min_child_weight': 8, 'n_estimators': 50, 'subsample': 0.8}
최상의 점수: 2.655334578520487


In [43]:
grid_search(params={'max_depth':[1],
                    'min_child_weight':[7, 8, 9, 10], 
                    'subsample':[0.8, 0.9, 1], 
                    'colsample_bytree':[0.5, 0.6, 0.7, 0.8, 0.9, 1],
                    'n_estimators':[40, 50, 60]})

최상의 매개변수: {'colsample_bytree': 0.8, 'max_depth': 1, 'min_child_weight': 8, 'n_estimators': 40, 'subsample': 0.9}
최상의 점수: 2.6398680061869304


In [44]:
grid_search(params={'max_depth':[1],
                    'min_child_weight':[8, 9, 10], 
                    'subsample':[0.8], 
                    'colsample_bytree':[1.0],
                    'colsample_bylevel':[0.6, 0.7, 0.8, 0.9, 1],
                    'colsample_bynode':[0.6, 0.7, 0.8, 0.9, 1],
                    'n_estimators':[40]})

최상의 매개변수: {'colsample_bylevel': 0.9, 'colsample_bynode': 0.7, 'colsample_bytree': 1.0, 'max_depth': 1, 'min_child_weight': 9, 'n_estimators': 40, 'subsample': 0.8}
최상의 점수: 2.6382947884921335


### 모델 테스트

In [45]:
X_test_transformed = data_pipeline.fit_transform(X_test).toarray()

In [46]:
model = XGBRegressor(max_depth=1,
                     min_child_weight=10,
                     subsample=0.8, 
                     colsample_bytree=1.0, 
                     colsample_bylevel=0.7,
                     colsample_bynode=0.6,
                     n_estimators=40,
                     missing=-999.0)
model.fit(X_train_transformed, y_train)
y_pred = model.predict(X_test_transformed)
rmse = MSE(y_pred, y_test)**0.5
rmse

2.8053885817272746

In [47]:
model = XGBRegressor(max_depth=1,
                     min_child_weight=13,
                     subsample=0.6, 
                     colsample_bytree=1.0, 
                     colsample_bylevel=0.7,
                     colsample_bynode=0.6,
                     n_estimators=40,
                     missing=-999.0)
model.fit(X_train_transformed, y_train)
y_pred = model.predict(X_test_transformed)
rmse = MSE(y_pred, y_test)**0.5
rmse

2.83255472458864

## 머신러닝 파이프라인 구성하기

In [48]:
full_pipeline = Pipeline([('null_imputer', NullValueImputer()), 
                          ('sparse', SparseMatrix()), 
                          ('xgb', XGBRegressor(max_depth=1,
                                               min_child_weight=13,
                                               subsample=0.6, 
                                               colsample_bytree=1.0, 
                                               colsample_bylevel=0.7,
                                               colsample_bynode=0.6,
                                               n_estimators=40,
                                               missing=-999.0))])

In [49]:
full_pipeline.fit(X, y)

Pipeline(steps=[('null_imputer',
                 <__main__.NullValueImputer object at 0x7f218ce45c88>),
                ('sparse', <__main__.SparseMatrix object at 0x7f21ddf8ed30>),
                ('xgb',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=0.7, colsample_bynode=0.6,
                              colsample_bytree=1.0, enable_categorical=False,
                              gamma=0, gpu_id=-1, importance_type=None,
                              interaction_constraints='',
                              learning_rate=0.300000012, max_delta_step=0,
                              max_depth=1, min_child_weight=13, missing=-999.0,
                              monotone_constraints='()', n_estimators=40,
                              n_jobs=16, num_parallel_tree=1, predictor='auto',
                              random_state=0, reg_alpha=0, reg_lambda=1,
                              scale_pos_weight=1, subsample=0.6,

In [50]:
new_data = X_test
full_pipeline.predict(new_data)

array([13.410856 ,  9.399757 , 12.532548 , 13.520119 , 12.475722 ,
       11.750607 , 13.652693 , 11.784681 , 11.001605 , 12.69339  ,
       13.486883 ,  9.770833 , 12.657307 , 13.004725 , 13.76239  ,
        8.457308 , 10.29121  , 10.497839 , 14.168394 , 10.651788 ,
       12.1955805, 13.4779005,  7.5282187, 13.280034 ,  7.9106717,
        8.158135 , 10.9469185, 13.22751  , 13.218307 , 13.119164 ,
       11.677392 , 11.86242  , 14.54946  ,  9.172024 , 11.800204 ,
       13.447606 , 12.261321 , 11.484585 ,  9.221175 , 13.043528 ,
       11.527605 , 11.962923 , 11.959569 , 13.274131 , 13.723855 ,
       13.887159 , 12.665446 , 12.721364 , 12.63051  , 11.887977 ,
       13.781497 ,  7.0989876,  6.7631817, 12.5362835, 13.411464 ,
       10.029529 , 13.101118 ,  9.016784 , 13.1871395, 11.997433 ,
       12.707476 ,  6.7228174,  9.060835 , 11.503031 , 14.58472  ,
       11.054872 , 13.224182 , 13.934139 , 12.754667 , 10.923042 ,
       12.5546255, 11.823312 , 12.982801 ,  8.243411 , 13.2445

In [51]:
np.round(full_pipeline.predict(new_data))

array([13.,  9., 13., 14., 12., 12., 14., 12., 11., 13., 13., 10., 13.,
       13., 14.,  8., 10., 10., 14., 11., 12., 13.,  8., 13.,  8.,  8.,
       11., 13., 13., 13., 12., 12., 15.,  9., 12., 13., 12., 11.,  9.,
       13., 12., 12., 12., 13., 14., 14., 13., 13., 13., 12., 14.,  7.,
        7., 13., 13., 10., 13.,  9., 13., 12., 13.,  7.,  9., 12., 15.,
       11., 13., 14., 13., 11., 13., 12., 13.,  8., 13., 13., 13., 14.,
       13., 10., 13.,  8., 10., 11., 14., 11., 10., 12., 14., 14., 12.,
       10., 14., 12., 12., 13., 13.,  9., 14., 14., 13.,  9., 13., 13.,
       12., 14., 13., 11.,  9., 13.,  7., 10., 13., 12., 14., 12., 12.,
       12., 12., 12., 13.,  9., 13., 11., 14., 12., 14., 14., 12., 13.,
       10., 14.,  8., 10., 12., 13.,  9., 11., 14., 13., 10., 11., 12.,
       13., 12., 12., 13., 13., 14., 13.,  9., 11., 12.,  7.,  9., 12.,
       14., 13., 13., 10., 12., 13.,  9.], dtype=float32)

In [52]:
new_df = pd.read_csv('https://raw.githubusercontent.com/rickiepark/handson-gb/main/Chapter10/student-por.csv')
new_X = df.iloc[:, :-3]
new_y = df.iloc[:, -1]
new_model = full_pipeline.fit(new_X, new_y)

In [53]:
more_new_data = X_test[:25]
np.round(new_model.predict(more_new_data))

array([13.,  9., 13., 14., 12., 12., 14., 12., 11., 13., 13., 10., 13.,
       13., 14.,  8., 10., 10., 14., 11., 12., 13.,  8., 13.,  8.],
      dtype=float32)

#### 배포를 위해 모델 직렬화하기

In [54]:
model.save_model('final_xgboost_model.json')

In [55]:
load_xgbr = XGBRegressor()
load_xgbr.load_model('final_xgboost_model.json')
load_xgbr

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
             colsample_bynode=0.6, colsample_bytree=1.0,
             enable_categorical=False, gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=1, min_child_weight=13, missing=-999.0,
             monotone_constraints='()', n_estimators=40, n_jobs=16,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.6,
             tree_method='auto', validate_parameters=1, verbosity=None)

In [56]:
load_xgbr.predict(X_test_transformed[:5])

array([13.337358, 10.78673 , 12.109719, 12.481805, 12.658035],
      dtype=float32)

In [57]:
import pickle

with open('full_pipeline.pickle', 'wb') as f:
    pickle.dump(full_pipeline, f)

In [58]:
with open('full_pipeline.pickle', 'rb') as f:
    load_pipeline = pickle.load(f)

In [59]:
np.round(load_pipeline.predict(more_new_data))

array([13.,  9., 13., 14., 12., 12., 14., 12., 11., 13., 13., 10., 13.,
       13., 14.,  8., 10., 10., 14., 11., 12., 13.,  8., 13.,  8.],
      dtype=float32)

In [60]:
import joblib

joblib.dump(full_pipeline, 'full_pipeline.joblib')

['full_pipeline.joblib']

In [61]:
load_pipeline = joblib.load('full_pipeline.joblib')
np.round(load_pipeline.predict(more_new_data))

array([13.,  9., 13., 14., 12., 12., 14., 12., 11., 13., 13., 10., 13.,
       13., 14.,  8., 10., 10., 14., 11., 12., 13.,  8., 13.,  8.],
      dtype=float32)