# ML Pipeline

* 이제 여러분은 코드를 작성할 때, 두 가지를 고려해야 합니다.
    * 재사용 하려면 어떻게 작성해야 할까?
    * 물 흐르듯이 pipeline을 구성하려면 어떻게 작성해야 할까?

## 0.환경준비 

### 1) 라이브러리 

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVC
from sklearn.metrics import classification_report

### 2) 데이터 불러오기

In [2]:
use_cols = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp' ,'Parch', 'Fare', 'Embarked']
data = pd.read_csv('data/titanic.csv', usecols = use_cols)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


## 2.데이터 전처리

### 1) 불필요한 데이터 처리
처음부터 꼭 필요한 칼럼만 지정하여 불러오는 것이 좋습니다.

### 2) 데이터 분할

#### x, y 분할

In [3]:
target = 'Survived'
x0 = data.drop(target, axis = 1)
y0 = data.loc[:, target]

#### test 분할

여기서는 조금만 떼어 냅시다.

In [4]:
x, x_test, y, y_test = train_test_split(x0, y0, test_size = 5, random_state = 2022)

In [5]:
x_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
106,3,female,21.0,0,0,7.65,S
449,1,male,52.0,0,0,30.5,S
785,3,male,25.0,0,0,7.25,S
268,1,female,58.0,0,1,153.4625,S
608,2,female,22.0,1,2,41.5792,C


#### train, val 분할

In [6]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = .3, random_state = 2022)

### 3) Feature Engineering
* family 변수를 추가하려고 합니다. 가족과 관련된 변수가 SibSp, Parch 입니다. 이 둘을 더하고 자기자신까지 포함시켜서 가족 수 변수를 만듭시다.
* 그리고, SibSp, Parch 는 제거합니다.

In [7]:
x_train['Family'] = x_train['SibSp'].astype('int') + x_train['Parch'].astype('int') + 1
x_train.drop(['SibSp', 'Parch'], axis = 1, inplace = True)
x_train.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Family
767,3,female,30.5,7.75,Q,1
122,2,male,32.5,30.0708,C,2
89,3,male,24.0,8.05,S,1
519,3,male,32.0,7.8958,S,1
444,3,male,,8.1125,S,1


* 재사용을 위해서는 함수로 만드는 것이 좋습니다.

In [8]:
def titanic_fe(df):
    """Return a copy of ``df`` with a ``Family`` column replacing SibSp/Parch.

    ``Family`` is SibSp + Parch + 1 (the passenger is counted too).
    The input DataFrame is not modified.
    """
    out = df.copy()

    # Take the two source columns out of the copy, then combine them.
    siblings = out.pop('SibSp')
    parents = out.pop('Parch')
    out['Family'] = siblings + parents + 1

    # Further engineered features can be added here.
    return out

#### validation set에 적용하기

In [9]:
x_val = titanic_fe(x_val)

x_val.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Family
679,1,male,36.0,512.3292,C,2
302,3,male,19.0,0.0,S,1
418,2,male,30.0,13.0,S,1
346,2,female,40.0,13.0,S,1
838,3,male,32.0,56.4958,S,1


### 4) NaN 조치①

* 먼저 x의 NaN을 조사해 봅시다.

In [10]:
x_train.isna().sum()

Pclass        0
Sex           0
Age         114
Fare          0
Embarked      2
Family        0
dtype: int64

* 어떤 조치 방법을 선택하느냐에 따라 처리 시점이 달라집니다.
    * Embarked는 최빈값으로 **지금** 채우고
    * Age는 KNNImputer로 **가변수화 후에** 채우겠습니다.

* NaN 행 삭제를 결정한다면...
    * 운영에서 NaN이 들어오면 그 역시 버리겠다는 의미 입니다. 
        * 그래도 괜찮다면...
        * 그러나 괜찮은 상황은 별로 없을 겁니다.

#### SimpleImputer 

https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

In [11]:
from sklearn.impute import SimpleImputer

* 최빈값으로 채우기 : 보통 범주형(숫자는 이산형)을 채울 때 사용합니다.
    * strategy = 'most_frequent'

In [12]:
# 대상을 리스트로 선언합시다. 
imputer1_list = ['Embarked']

# 선언하고 fit_transform
imputer1 = SimpleImputer(strategy = 'most_frequent')
x_train[imputer1_list] = imputer1.fit_transform(x_train[imputer1_list])
x_train.isna().sum()

Pclass        0
Sex           0
Age         114
Fare          0
Embarked      0
Family        0
dtype: int64

#### validation set에 적용하기

In [13]:
imputer1_list = ['Embarked']
# Reuse the statistics learned from x_train: call transform(), not
# fit_transform(), so the imputer is not refitted on the validation set.
x_val[imputer1_list] = imputer1.transform(x_val[imputer1_list])

### 5) 가변수화

In [14]:
# Fixed category levels per column, declared up front so the same levels are
# used for every dataset that gets encoded.
cat = {'Sex':["female", "male"]
       , 'Embarked':["C", "Q", "S"]
       ,'Pclass':[1,2,3]}

# Convert each listed column of x_train to a categorical with those levels.
for k, v in cat.items():
    x_train[k] = pd.Categorical(x_train[k], categories=v, ordered=False)

# NOTE(review): this inspects x (the frame before the train/val split), not
# x_train, so the categorical conversion above does not show up in the
# printed output -- presumably x_train.info() was intended; confirm.
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 866 entries, 536 to 220
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    866 non-null    int64  
 1   Sex       866 non-null    object 
 2   Age       691 non-null    float64
 3   SibSp     866 non-null    int64  
 4   Parch     866 non-null    int64  
 5   Fare      866 non-null    float64
 6   Embarked  864 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 54.1+ KB


In [15]:
x_train = pd.get_dummies(x_train, columns =cat.keys(), drop_first = 1)

In [16]:
x_train.head()

Unnamed: 0,Age,Fare,Family,Sex_male,Embarked_Q,Embarked_S,Pclass_2,Pclass_3
767,30.5,7.75,1,0,1,0,0,1
122,32.5,30.0708,2,1,0,0,1,0
89,24.0,8.05,1,1,0,1,0,1
519,32.0,7.8958,1,1,0,1,0,1
444,,8.1125,1,1,0,1,0,1


#### validation set에 적용하기

In [17]:
# 함수로 생성

cat = {'Sex':["female", "male"]
       , 'Embarked':["C", "Q", "S"]
       ,'Pclass':[1,2,3]}

def titanic_dumm(df, cat):
    """One-hot encode the columns listed in ``cat`` on a copy of ``df``.

    ``cat`` maps a column name to its list of category levels. Each such
    column is first cast to a categorical with exactly those levels and is
    then replaced by dummy columns, with the first level dropped.
    """
    encoded = df.copy()
    for column in cat:
        levels = cat[column]
        encoded[column] = pd.Categorical(encoded[column], categories=levels, ordered=False)
    return pd.get_dummies(encoded, columns=list(cat), drop_first=True)
x_val = titanic_dumm(x_val, cat)
x_val.head()

Unnamed: 0,Age,Fare,Family,Sex_male,Embarked_Q,Embarked_S,Pclass_2,Pclass_3
679,36.0,512.3292,2,1,0,0,0,0
302,19.0,0.0,1,1,0,1,0,1
418,30.0,13.0,1,1,0,1,1,0
346,40.0,13.0,1,0,0,1,1,0
838,32.0,56.4958,1,1,0,1,0,1


### 6) 스케일링


In [18]:
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)

#### validation set에 적용하기

In [19]:
# validation 적용
x_val_s = scaler.transform(x_val)

### 7) NaN 조치②

#### KNNImputer
https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html

In [20]:
from sklearn.impute import KNNImputer

In [21]:
imputer2_list = list(x_train)
imputer2_list

['Age',
 'Fare',
 'Family',
 'Sex_male',
 'Embarked_Q',
 'Embarked_S',
 'Pclass_2',
 'Pclass_3']

In [22]:
# 선언하고 fit_transform
imputer2 = KNNImputer()
x_train_s = imputer2.fit_transform(x_train_s)

#### validation set에 적용하기

In [23]:
# validation 적용
x_val_s = imputer2.transform(x_val_s)

## 3.모델링

여기에서는 성능 최적화가 주안점이 아니므로 기본값으로 모델링을 수행합니다.

In [24]:
# SVM으로 모델링 수행
model = SVC()
model.fit(x_train_s, y_train)

SVC()

In [25]:
# validation
pred = model.predict(x_val_s)
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

           0       0.76      0.96      0.85       152
           1       0.91      0.57      0.70       108

    accuracy                           0.80       260
   macro avg       0.84      0.77      0.78       260
weighted avg       0.82      0.80      0.79       260



## 4.Data Pipeline 정리

* 이제 최적의 모델이 생성되어, 운영시스템에 배포되었습니다.
* 운영에서 new data가 주어졌을 때, 어떤 절차로 파이프라인을 구성해야 할까요?

In [26]:
# new data : x_test
x_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
106,3,female,21.0,0,0,7.65,S
449,1,male,52.0,0,0,30.5,S
785,3,male,25.0,0,0,7.25,S
268,1,female,58.0,0,1,153.4625,S
608,2,female,22.0,1,2,41.5792,C


### 1) [validation에 적용하기] 코드들 가져오기

* 함수, 변수 선언

In [27]:
def titanic_fe(df):
    """Return a copy of ``df`` with a ``Family`` column replacing SibSp/Parch.

    ``Family`` is SibSp + Parch + 1 (the passenger is counted too).
    The input DataFrame is not modified.
    """
    out = df.copy()

    # Take the two source columns out of the copy, then combine them.
    siblings = out.pop('SibSp')
    parents = out.pop('Parch')
    out['Family'] = siblings + parents + 1

    # Further engineered features can be added here.
    return out

def titanic_dumm(df, cat):
    """One-hot encode the columns named in ``cat`` using fixed category lists.

    Args:
        df: DataFrame containing every column named in ``cat``.
        cat: dict mapping a column name to the list of its category levels.

    Returns:
        A new DataFrame in which each ``cat`` column is replaced by dummy
        columns (first level dropped). ``df`` itself is left unchanged.
    """
    # Work on a copy: assigning the categorical columns below would otherwise
    # modify the caller's DataFrame in place.
    temp = df.copy()
    for k, v in cat.items():
        # Declaring the levels explicitly keeps the dummy columns identical
        # even when a level is absent from this particular batch.
        temp[k] = pd.Categorical(temp[k], categories=v, ordered=False)
    return pd.get_dummies(temp, columns=list(cat), drop_first=True)

imputer1_list = ['Embarked']

cat = {'Sex':["female", "male"]
       , 'Embarked':["C", "Q", "S"]
       ,'Pclass':[1,2,3]}

* 전처리 실행

In [28]:
temp = x_test.copy()

In [29]:
# Feature engineering
temp = titanic_fe(temp)

# NaN handling (1): SimpleImputer.
# Only transform() here: fit_transform() would refit the imputer on the new
# data and overwrite the fill value it learned earlier.
temp[imputer1_list] = imputer1.transform(temp[imputer1_list])

# One-hot encoding
temp = titanic_dumm(temp, cat)

# Scaling with the already-fitted scaler
temp = scaler.transform(temp)

# NaN handling (2): KNNImputer, already fitted
temp = imputer2.transform(temp)

temp

array([[0.25860769, 0.01493181, 0.        , 0.        , 0.        ,
        1.        , 0.        , 1.        ],
       [0.6481528 , 0.05953204, 0.        , 1.        , 0.        ,
        1.        , 0.        , 0.        ],
       [0.30887158, 0.01415106, 0.        , 1.        , 0.        ,
        1.        , 0.        , 1.        ],
       [0.72354863, 0.29953885, 0.1       , 0.        , 0.        ,
        1.        , 0.        , 0.        ],
       [0.27117366, 0.08115719, 0.3       , 0.        , 0.        ,
        0.        , 1.        , 0.        ]])

### 2) Data Pipeline 함수 만들고 실행하기

In [30]:
def titanic_datapipeline(df, simpleimputer, simple_impute_list, dumm_list, scaler, knnimputer):
    """Apply the already-fitted preprocessing steps to new data.

    Steps, in order: feature engineering (``titanic_fe``), simple imputation,
    one-hot encoding (``titanic_dumm``), scaling, KNN imputation.

    Args:
        df: raw feature DataFrame; it is copied, not modified.
        simpleimputer: fitted imputer applied to ``simple_impute_list``.
        simple_impute_list: list of column names handled by ``simpleimputer``.
        dumm_list: dict of column name -> category levels, passed to
            ``titanic_dumm``.
        scaler: fitted scaler.
        knnimputer: fitted imputer applied after scaling.

    Returns:
        DataFrame of the processed values, with the column names produced by
        the encoding step. The index of ``df`` is not carried over.
    """
    temp = df.copy()

    # Feature engineering
    temp = titanic_fe(temp)

    # NaN handling (1): SimpleImputer.
    # transform() only -- this function runs on new data, so the imputer must
    # not be refitted (fit_transform would replace its learned fill values).
    temp[simple_impute_list] = simpleimputer.transform(temp[simple_impute_list])

    # One-hot encoding
    temp = titanic_dumm(temp, dumm_list)

    # Remember the column names before the frame is handed to the scaler.
    x_cols = list(temp)

    # Scaling
    temp = scaler.transform(temp)

    # NaN handling (2): KNNImputer
    temp = knnimputer.transform(temp)

    return pd.DataFrame(temp, columns = x_cols)


## 5.파이썬 오브젝트 저장하기

* data의 Embarked를 카테고리로 만들고 저장 


In [31]:
data['Embarked'] = pd.Categorical(data['Embarked'], categories=['C','Q','S'], ordered=False)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871 entries, 0 to 870
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Survived  871 non-null    int64   
 1   Pclass    871 non-null    int64   
 2   Sex       871 non-null    object  
 3   Age       696 non-null    float64 
 4   SibSp     871 non-null    int64   
 5   Parch     871 non-null    int64   
 6   Fare      871 non-null    float64 
 7   Embarked  869 non-null    category
dtypes: category(1), float64(2), int64(4), object(1)
memory usage: 48.7+ KB


* csv로 저장하고 불러옵시다.

In [32]:
data.to_csv('data.csv', index = False)

In [33]:
data = pd.read_csv('data.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871 entries, 0 to 870
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  871 non-null    int64  
 1   Pclass    871 non-null    int64  
 2   Sex       871 non-null    object 
 3   Age       696 non-null    float64
 4   SibSp     871 non-null    int64  
 5   Parch     871 non-null    int64  
 6   Fare      871 non-null    float64
 7   Embarked  869 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 54.6+ KB


In [34]:
import joblib

* 파일로 저장

In [35]:
data['Embarked'] = pd.Categorical(data['Embarked'], categories=['C','Q','S'], ordered=False)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871 entries, 0 to 870
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Survived  871 non-null    int64   
 1   Pclass    871 non-null    int64   
 2   Sex       871 non-null    object  
 3   Age       696 non-null    float64 
 4   SibSp     871 non-null    int64   
 5   Parch     871 non-null    int64   
 6   Fare      871 non-null    float64 
 7   Embarked  869 non-null    category
dtypes: category(1), float64(2), int64(4), object(1)
memory usage: 48.7+ KB


In [36]:
joblib.dump(data, 'data_df.pkl')

['data_df.pkl']

* 파일로 부터 읽어 오기

In [37]:
data2 = joblib.load('data_df.pkl')
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871 entries, 0 to 870
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Survived  871 non-null    int64   
 1   Pclass    871 non-null    int64   
 2   Sex       871 non-null    object  
 3   Age       696 non-null    float64 
 4   SibSp     871 non-null    int64   
 5   Parch     871 non-null    int64   
 6   Fare      871 non-null    float64 
 7   Embarked  869 non-null    category
dtypes: category(1), float64(2), int64(4), object(1)
memory usage: 48.6+ KB


* 딕셔너리  
        b = { 'v1':[1,2,3,4,5], 'v2':[6,7,8,9,0] }


In [38]:
b = { 'v1':[1,2,3,4,5], 'v2':[6,7,8,9,0] }
joblib.dump(b, 'b.pkl')
del b

In [39]:
b = joblib.load('b.pkl')
b

{'v1': [1, 2, 3, 4, 5], 'v2': [6, 7, 8, 9, 0]}

* 시리즈  
        data['Fare']

In [40]:
joblib.dump(data['Fare'], 'data_Fare.pkl')
data.drop('Fare', axis = 1, inplace = True)

In [41]:
data['Fare'] = joblib.load('data_Fare.pkl')
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked,Fare
0,0,3,male,22.0,1,0,S,7.25
1,1,1,female,38.0,1,0,C,71.2833
2,1,3,female,26.0,0,0,S,7.925
3,1,1,female,35.0,1,0,S,53.1
4,0,3,male,35.0,0,0,S,8.05


* 저장해야 할 오브젝트는 어떤 것들일까요?

* 자료형 : imputer1_list, cat

In [42]:
joblib.dump(imputer1_list, 'imputer1_list.pkl')
joblib.dump(cat, 'cat.pkl')

['cat.pkl']

* fitting된 객체 : imputer1, imputer2, scaler, model

In [43]:
import os

# joblib.dump writes to the given path as-is and does not create missing
# parent directories, so make sure the target folders exist first.
os.makedirs('preprocess', exist_ok=True)
os.makedirs('model', exist_ok=True)

# Persist every fitted object needed at inference time.
joblib.dump(imputer1, 'preprocess/imputer1_ti1.pkl')
joblib.dump(imputer2, 'preprocess/imputer2_ti1.pkl')
joblib.dump(scaler, 'preprocess/scaler_ti1.pkl')
joblib.dump(model, 'model/model_ti1.pkl')

['model/model_ti1.pkl']

* 커널 재시작

* 환경 및 데이터 준비

In [44]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [45]:
use_cols = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp' ,'Parch', 'Fare', 'Embarked']
data = pd.read_csv('data/titanic.csv', usecols = use_cols)

In [46]:
target = 'Survived'
x0 = data.drop(target, axis = 1)
y0 = data.loc[:, target]

In [47]:
x_train, x_test, y_train, y_test = train_test_split(x0, y0, test_size = 10, random_state = 2022)

* 함수 생성하기

In [49]:
def titanic_fe(df):
    """Return a copy of ``df`` with a ``Family`` column replacing SibSp/Parch.

    ``Family`` is SibSp + Parch + 1 (the passenger is counted too).
    The input DataFrame is not modified.
    """
    out = df.copy()

    # Take the two source columns out of the copy, then combine them.
    siblings = out.pop('SibSp')
    parents = out.pop('Parch')
    out['Family'] = siblings + parents + 1

    # Further engineered features can be added here.
    return out

def titanic_dumm(df, cat):
    """One-hot encode the columns named in ``cat`` using fixed category lists.

    Args:
        df: DataFrame containing every column named in ``cat``.
        cat: dict mapping a column name to the list of its category levels.

    Returns:
        A new DataFrame in which each ``cat`` column is replaced by dummy
        columns (first level dropped). ``df`` itself is left unchanged.
    """
    # Work on a copy: assigning the categorical columns below would otherwise
    # modify the caller's DataFrame in place.
    temp = df.copy()
    for k, v in cat.items():
        # Declaring the levels explicitly keeps the dummy columns identical
        # even when a level is absent from this particular batch.
        temp[k] = pd.Categorical(temp[k], categories=v, ordered=False)
    return pd.get_dummies(temp, columns=list(cat), drop_first=True)

def titanic_datapipeline(df, simpleimputer, simple_impute_list, dumm_list, scaler, knnimputer):
    """Apply the already-fitted preprocessing steps to new data.

    Steps, in order: feature engineering (``titanic_fe``), simple imputation,
    one-hot encoding (``titanic_dumm``), scaling, KNN imputation.

    Args:
        df: raw feature DataFrame; it is copied, not modified.
        simpleimputer: fitted imputer applied to ``simple_impute_list``.
        simple_impute_list: list of column names handled by ``simpleimputer``.
        dumm_list: dict of column name -> category levels, passed to
            ``titanic_dumm``.
        scaler: fitted scaler.
        knnimputer: fitted imputer applied after scaling.

    Returns:
        DataFrame of the processed values, with the column names produced by
        the encoding step. The index of ``df`` is not carried over.
    """
    temp = df.copy()

    # Feature engineering
    temp = titanic_fe(temp)

    # NaN handling (1): SimpleImputer.
    # transform() only -- this function runs on new data, so the imputer must
    # not be refitted (fit_transform would replace its learned fill values).
    temp[simple_impute_list] = simpleimputer.transform(temp[simple_impute_list])

    # One-hot encoding
    temp = titanic_dumm(temp, dumm_list)

    # Remember the column names before the frame is handed to the scaler.
    x_cols = list(temp)

    # Scaling
    temp = scaler.transform(temp)

    # NaN handling (2): KNNImputer
    temp = knnimputer.transform(temp)

    return pd.DataFrame(temp, columns = x_cols)


* 오브젝트들 불러오기

In [51]:
imputer1_list = joblib.load('imputer1_list.pkl')
cat = joblib.load('cat.pkl')

imputer1 = joblib.load('preprocess/imputer1_ti1.pkl')
imputer2 = joblib.load('preprocess/imputer2_ti1.pkl')
scaler = joblib.load('preprocess/scaler_ti1.pkl')
model = joblib.load('model/model_ti1.pkl')

* 적용하기

In [52]:
# 적용
input = titanic_datapipeline(x_test, imputer1, imputer1_list, cat, scaler, imputer2)

In [53]:
input

Unnamed: 0,Age,Fare,Family,Sex_male,Embarked_Q,Embarked_S,Pclass_2,Pclass_3
0,0.258608,0.014932,0.0,0.0,0.0,1.0,0.0,1.0
1,0.648153,0.059532,0.0,1.0,0.0,1.0,0.0,0.0
2,0.308872,0.014151,0.0,1.0,0.0,1.0,0.0,1.0
3,0.723549,0.299539,0.1,0.0,0.0,1.0,0.0,0.0
4,0.271174,0.081157,0.3,0.0,0.0,0.0,1.0,0.0
5,0.560191,0.051822,0.0,1.0,0.0,1.0,0.0,0.0
6,0.208344,0.013907,0.0,1.0,0.0,1.0,0.0,1.0
7,0.447097,0.025374,0.0,1.0,0.0,1.0,1.0,0.0
8,0.648153,0.02635,0.0,1.0,0.0,1.0,1.0,0.0
9,0.377984,0.015713,0.0,1.0,0.0,1.0,0.0,1.0


In [54]:
# 예측
model.predict(input)



array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0], dtype=int64)

* 모델_timestamp.pkl 형식으로 모델에 대한 버전관리를 해 봅시다.

* timestamp 만들기

In [55]:
import datetime

now = datetime.datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")
print(timestamp)

20220401_103218


* 모델 이름에 붙이기

In [56]:
now = datetime.datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")

model_fname = 'model_' + timestamp + '.pkl'
joblib.dump(model, model_fname)

['model_20220401_103218.pkl']

* 모델을 추가해 봅시다.

In [57]:
# Positional arguments must follow the pipeline signature
# (df, simpleimputer, simple_impute_list, dumm_list, scaler, knnimputer):
# the scaler comes before the KNN imputer.
x_train = titanic_datapipeline(x_train, imputer1, imputer1_list, cat, scaler, imputer2)

model = SVC(C=0.1)
model.fit(x_train, y_train)

# Version the saved model by embedding the current timestamp in its file name.
now = datetime.datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")

model_fname = 'model_' + timestamp + '.pkl'
joblib.dump(model, model_fname)



['model_20220401_103219.pkl']