# ML Pipeline

* 이제 여러분은 코드를 작성할 때, 두 가지를 고려해야 합니다.
    * 재사용 하려면 어떻게 작성해야 할까?
    * 물 흐르듯이 pipeline을 구성하려면 어떻게 작성해야 할까?

## 0.환경준비 

### 1) 라이브러리 

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVC
from sklearn.metrics import classification_report

### 2) 데이터 불러오기

In [2]:
# Read only the columns used in this notebook instead of dropping unused ones later.
use_cols = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp' ,'Parch', 'Fare', 'Embarked']
data = pd.read_csv('https://bit.ly/3FsgwkJ', usecols = use_cols)
# Preview the first rows.
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


## 2.데이터 전처리

### 1) 불필요한 데이터 처리
처음부터 꼭 필요한 칼럼만 지정하여 불러오는 것이 좋습니다.

### 2) 데이터 분할

#### x, y 분할

In [3]:
# Separate the label column (y0) from the feature columns (x0).
target = 'Survived'
y0 = data[target]
x0 = data.drop(columns = target)

#### test 분할

여기서는 조금만 떼어 냅시다.

In [4]:
# An integer test_size is an absolute row count: hold out 5 rows to play the
# role of "new data" later; random_state fixes the shuffle.
x, x_test, y, y_test = train_test_split(x0, y0, test_size = 5, random_state = 2022)

In [5]:
x_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
770,3,male,24.0,0,0,9.5,S
178,2,male,30.0,0,0,13.0,S
786,3,female,18.0,0,0,7.4958,S
159,3,male,,8,2,69.55,S
656,3,male,,0,0,7.8958,S


#### train, val 분할

In [6]:
# Split the remaining rows into training (70%) and validation (30%) sets.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = .3, random_state = 2022)

### 3) Feature Engineering
* family 변수를 추가하려고 합니다. 가족과 관련된 변수가 SibSp, Parch 입니다. 이 둘을 더하고 자기자신까지 포함시켜서 가족 수 변수를 만듭시다.
* 그리고, SibSp, Parch 는 제거합니다.

In [7]:
# Family size = siblings/spouses + parents/children + the passenger themself.
x_train['Family'] = x_train['SibSp'] + x_train['Parch'] + 1
# The two source columns are dropped from x_train in place once they are combined.
x_train.drop(['SibSp', 'Parch'], axis = 1, inplace = True)
x_train.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Family
175,3,male,18.0,7.8542,S,3
568,3,male,,7.2292,C,1
762,3,male,20.0,7.2292,C,1
726,2,female,30.0,21.0,S,4
790,3,male,,7.75,Q,1


* 재사용을 위해서는 함수로 만드는 것이 좋습니다.

In [8]:
def titanic_fe(df):
    """Return a copy of ``df`` with a ``Family`` column replacing ``SibSp``/``Parch``.

    ``Family`` is ``SibSp + Parch + 1`` (the passenger counts as one member).
    The input frame is left untouched.
    """
    family_size = df['SibSp'] + df['Parch'] + 1
    with_family = df.assign(Family=family_size)

    # Further engineered features can be added here.
    return with_family.drop(columns=['SibSp', 'Parch'])

#### validation set에 적용하기

In [9]:
x_val = titanic_fe(x_val)

x_val.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Family
598,3,male,,7.225,C,1
247,2,female,24.0,14.5,S,3
830,3,female,15.0,14.4542,C,2
625,1,male,61.0,32.3208,S,1
214,3,male,,7.75,Q,2


### 4) NaN 조치①

* 먼저 x의 NaN을 조사해 봅시다.

In [10]:
x_train.isna().sum()

Pclass        0
Sex           0
Age         112
Fare          0
Embarked      1
Family        0
dtype: int64

* 어떤 조치 방법을 선택하느냐에 따라 처리 시점이 달라집니다.
    * Embarked는 최빈값으로 **지금** 채우고
    * Age는 KNNImputer로 **가변수화 후에** 채우겠습니다.

* NaN 행 삭제를 결정한다면...
    * 운영에서 NaN이 들어오면 그 역시 버리겠다는 의미 입니다. 
        * 그래도 괜찮다면...
        * 그러나 괜찮은 상황은 별로 없을 겁니다.

#### SimpleImputer 

https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

In [11]:
from sklearn.impute import SimpleImputer

* 최빈값으로 채우기 : 보통 범주형(숫자는 이산형)을 채울 때 사용합니다.
    * strategy = 'most_frequent'

In [12]:
# Declare the columns to impute as a list.
imputer1_list = ['Embarked']

# Create the imputer, then fit it on the training data and fill in one step.
imputer1 = SimpleImputer(strategy = 'most_frequent')
x_train[imputer1_list] = imputer1.fit_transform(x_train[imputer1_list])
# Re-check the remaining NaN counts per column.
x_train.isna().sum()

Pclass        0
Sex           0
Age         112
Fare          0
Embarked      0
Family        0
dtype: int64

#### validation set에 적용하기

In [13]:
# Reuse the imputer that was fitted on the training set. Only transform here:
# calling fit_transform would re-fit it on the validation rows and replace the
# fill value learned from the training data.
imputer1_list = ['Embarked']
x_val[imputer1_list] = imputer1.transform(x_val[imputer1_list])

### 5) 가변수화

In [14]:
# Explicit category list for every column that will be dummy-encoded.
cat = {'Sex':["female", "male"]
       , 'Embarked':["C", "Q", "S"]
       ,'Pclass':[1,2,3]}

# Convert each listed x_train column to an unordered categorical with those categories.
for k, v in cat.items():
    x_train[k] = pd.Categorical(x_train[k], categories=v, ordered=False)

# NOTE(review): this inspects x (the frame before the train/val split), not
# x_train, so the categorical dtypes assigned above do not show up in the
# output; presumably x_train.info() was intended - confirm.
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 886 entries, 700 to 220
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    886 non-null    int64  
 1   Sex       886 non-null    object 
 2   Age       711 non-null    float64
 3   SibSp     886 non-null    int64  
 4   Parch     886 non-null    int64  
 5   Fare      886 non-null    float64
 6   Embarked  884 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 55.4+ KB


In [15]:
# One-hot encode the categorical columns, dropping the first level of each.
x_train = pd.get_dummies(x_train, columns = list(cat), drop_first = True)

In [16]:
x_train.head()

Unnamed: 0,Age,Fare,Family,Sex_male,Embarked_Q,Embarked_S,Pclass_2,Pclass_3
175,18.0,7.8542,3,1,0,1,0,1
568,,7.2292,1,1,0,0,0,1
762,20.0,7.2292,1,1,0,0,0,1
726,30.0,21.0,4,0,0,1,1,0
790,,7.75,1,1,1,0,0,1


#### validation set에 적용하기

In [17]:
# 함수로 생성

cat = {'Sex':["female", "male"]
       , 'Embarked':["C", "Q", "S"]
       ,'Pclass':[1,2,3]}

def titanic_dumm(df, cat):
    """Dummy-encode the columns named in ``cat`` on a copy of ``df``.

    ``cat`` maps a column name to its full list of categories. Each such column
    is first cast to an unordered categorical with exactly those categories,
    then one-hot encoded with the first category dropped.
    """
    encoded = df.copy()
    for column, levels in cat.items():
        encoded[column] = pd.Categorical(encoded[column], categories=levels, ordered=False)
    return pd.get_dummies(encoded, columns=list(cat), drop_first=True)
x_val = titanic_dumm(x_val, cat)
x_val.head()

Unnamed: 0,Age,Fare,Family,Sex_male,Embarked_Q,Embarked_S,Pclass_2,Pclass_3
598,,7.225,1,1,0,0,0,1
247,24.0,14.5,3,0,0,1,1,0
830,15.0,14.4542,2,0,0,0,0,1
625,61.0,32.3208,1,1,0,1,0,0
214,,7.75,2,1,1,0,0,1


### 6) 스케일링


In [18]:
# Fit the min-max scaler on the training features and scale them in one step.
# The result is assigned to a separate name (x_train_s); x_train itself is kept.
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)

#### validation set에 적용하기

In [19]:
# Apply to validation: transform only, using the scaler fitted on the training set.
x_val_s = scaler.transform(x_val)

### 7) NaN 조치②

#### KNNImputer
https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html

In [20]:
from sklearn.impute import KNNImputer

In [21]:
imputer2_list = list(x_train)
imputer2_list

['Age',
 'Fare',
 'Family',
 'Sex_male',
 'Embarked_Q',
 'Embarked_S',
 'Pclass_2',
 'Pclass_3']

In [22]:
# Create the KNN imputer, then fit it on the scaled training data and fill the
# remaining NaN values in one step.
imputer2 = KNNImputer()
x_train_s = imputer2.fit_transform(x_train_s)

#### validation set에 적용하기

In [23]:
# Apply to validation: transform only, using the KNN imputer fitted on the training set.
x_val_s = imputer2.transform(x_val_s)

## 3.모델링

여기에서는 성능 최적화가 주안점이 아니므로 기본값으로 모델링을 수행합니다.

In [24]:
# Train an SVM classifier with default hyperparameters on the preprocessed training data.
model = SVC()
model.fit(x_train_s, y_train)

SVC()

In [25]:
# Predict on the preprocessed validation set and print per-class metrics.
pred = model.predict(x_val_s)
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

           0       0.78      0.96      0.86       158
           1       0.92      0.60      0.73       108

    accuracy                           0.82       266
   macro avg       0.85      0.78      0.79       266
weighted avg       0.83      0.82      0.81       266



## 4.Data Pipeline 정리

* 이제 최적의 모델이 생성되어, 운영시스템에 배포되었습니다.
* 운영에서 new data가 주어졌을 때, 어떤 절차로 파이프라인을 구성해야 할까요?

In [26]:
# new data : x_test
x_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
770,3,male,24.0,0,0,9.5,S
178,2,male,30.0,0,0,13.0,S
786,3,female,18.0,0,0,7.4958,S
159,3,male,,8,2,69.55,S
656,3,male,,0,0,7.8958,S


### 1) [validation에 적용하기] 코드들 가져오기

* 함수, 변수 선언

In [27]:
def titanic_fe(df):
    """Feature engineering: build ``Family`` and drop its two source columns.

    Works on a copy, so the caller's frame is not modified.
    """
    result = df.copy()
    source_cols = ['SibSp', 'Parch']

    # Family size counts the passenger as well, hence the +1.
    result['Family'] = result[source_cols[0]] + result[source_cols[1]] + 1
    result = result.drop(columns=source_cols)

    # Additional features go here.
    return result

def titanic_dumm(df, cat):
    """Dummy-encode the columns listed in ``cat`` without touching ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``cat``.
    cat : dict
        Maps a column name to the full list of its categories.

    Returns
    -------
    pandas.DataFrame
        New frame in which each listed column is replaced by its dummy
        columns, with the first category dropped.
    """
    # Work on a copy: assigning the categorical columns directly on ``df``
    # would silently change the caller's frame.
    temp = df.copy()
    for k, v in cat.items():
        temp[k] = pd.Categorical(temp[k], categories=v, ordered=False)
    return pd.get_dummies(temp, columns=list(cat), drop_first=True)

imputer1_list = ['Embarked']

cat = {'Sex':["female", "male"]
       , 'Embarked':["C", "Q", "S"]
       ,'Pclass':[1,2,3]}

* 전처리 실행

In [28]:
temp = x_test.copy()

In [29]:
# Feature engineering
temp = titanic_fe(temp)

# NaN handling (1): SimpleImputer. Only transform here: the imputer was fitted
# on the training data, and re-fitting it on new data would replace the fill
# value it learned.
temp[imputer1_list] = imputer1.transform(temp[imputer1_list])

# Dummy encoding
temp = titanic_dumm(temp, cat)

# Scaling
temp = scaler.transform(temp)

# NaN handling (2): KNNImputer
temp = imputer2.transform(temp)

temp

array([[0.2963056 , 0.01854277, 0.        , 1.        , 0.        ,
        1.        , 0.        , 1.        ],
       [0.37170143, 0.02537431, 0.        , 1.        , 0.        ,
        1.        , 1.        , 0.        ],
       [0.22090978, 0.01463083, 0.        , 0.        , 0.        ,
        1.        , 0.        , 1.        ],
       [0.16059311, 0.13575256, 1.        , 1.        , 0.        ,
        1.        , 0.        , 1.        ],
       [0.32646394, 0.01541158, 0.        , 1.        , 0.        ,
        1.        , 0.        , 1.        ]])

### 2) Data Pipeline 함수 만들고 실행하기

In [30]:
def titanic_datapipeline(df, simpleimputer, simple_impute_list, dumm_list, scaler, knnimputer):
    """Run new data through the already-fitted preprocessing steps.

    Steps, in order: feature engineering, simple imputation, dummy encoding,
    scaling, KNN imputation.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature rows; not modified.
    simpleimputer : fitted imputer
        Applied with ``transform`` to the ``simple_impute_list`` columns.
    simple_impute_list : list
        Column names handled by ``simpleimputer``.
    dumm_list : dict
        Column name -> category list, passed to ``titanic_dumm``.
    scaler : fitted scaler
        Applied with ``transform``.
    knnimputer : fitted imputer
        Applied with ``transform`` after scaling.

    Returns
    -------
    pandas.DataFrame
        Preprocessed features named after the dummy-encoded columns, with a
        default integer index.
    """
    temp = df.copy()

    # Feature engineering
    temp = titanic_fe(temp)

    # NaN handling (1): transform only. Re-fitting on incoming data would
    # overwrite the fill value learned at training time, and the fitted object
    # passed in by the caller would be changed as a side effect.
    temp[simple_impute_list] = simpleimputer.transform(temp[simple_impute_list])

    # Dummy encoding
    temp = titanic_dumm(temp, dumm_list)

    # Remember the column names; the next steps return plain arrays.
    x_cols = list(temp)

    # Scaling
    temp = scaler.transform(temp)

    # NaN handling (2): KNNImputer
    temp = knnimputer.transform(temp)

    return pd.DataFrame(temp, columns = x_cols)


## 5.파이썬 오브젝트 저장하기

### 1) 데이터프레임을 파일로 저장
* 데이터프레임을 파일로 저장하려면 어떻게 해야 할까요? csv?


* data의 Embarked를 카테고리로 만들고 저장 


In [31]:
data['Embarked'] = pd.Categorical(data['Embarked'], categories=['C','Q','S'], ordered=False)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Survived  891 non-null    int64   
 1   Pclass    891 non-null    int64   
 2   Sex       891 non-null    object  
 3   Age       714 non-null    float64 
 4   SibSp     891 non-null    int64   
 5   Parch     891 non-null    int64   
 6   Fare      891 non-null    float64 
 7   Embarked  889 non-null    category
dtypes: category(1), float64(2), int64(4), object(1)
memory usage: 49.9+ KB


* csv로 저장하고 불러옵시다.

In [32]:
data.to_csv('data.csv', index = False)

In [33]:
data = pd.read_csv('data.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


### 2) 파이썬 객체 그대로 저장하기

In [34]:
import joblib

* 파일로 저장

In [35]:
data['Embarked'] = pd.Categorical(data['Embarked'], categories=['C','Q','S'], ordered=False)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Survived  891 non-null    int64   
 1   Pclass    891 non-null    int64   
 2   Sex       891 non-null    object  
 3   Age       714 non-null    float64 
 4   SibSp     891 non-null    int64   
 5   Parch     891 non-null    int64   
 6   Fare      891 non-null    float64 
 7   Embarked  889 non-null    category
dtypes: category(1), float64(2), int64(4), object(1)
memory usage: 49.9+ KB


In [36]:
joblib.dump(data, 'data_df.pkl')

['data_df.pkl']

* 파일로부터 읽어 오기

In [37]:
data2 = joblib.load('data_df.pkl')
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Survived  891 non-null    int64   
 1   Pclass    891 non-null    int64   
 2   Sex       891 non-null    object  
 3   Age       714 non-null    float64 
 4   SibSp     891 non-null    int64   
 5   Parch     891 non-null    int64   
 6   Fare      891 non-null    float64 
 7   Embarked  889 non-null    category
dtypes: category(1), float64(2), int64(4), object(1)
memory usage: 49.7+ KB


#### 실습
* 다음을 저장하고, 삭제하고, 로딩해 봅시다.

* 리스트  
        a = [1,2,3,4,5]  


In [38]:
a=[1,2,3,4,5]
joblib.dump(a, 'a_list.plk')

['a_list.plk']

In [40]:
a = []
a = joblib.load('a_list.plk')
a

[1, 2, 3, 4, 5]

* 딕셔너리  
        b = { 'v1':[1,2,3,4,5], 'v2':[6,7,8,9,0] }


In [41]:
b = { 'v1':[1,2,3,4,5], 'v2':[6,7,8,9,0] }
joblib.dump(b, 'b_dic.plk')

['b_dic.plk']

In [42]:
tmp = joblib.load('b_dic.plk')
tmp

{'v1': [1, 2, 3, 4, 5], 'v2': [6, 7, 8, 9, 0]}

* 시리즈  
        data['Fare']

In [43]:
tmp = data['Fare']
joblib.dump(tmp, 'data_Fare.plk')

['data_Fare.plk']

In [44]:
tmp = joblib.load('data_Fare.plk')
tmp

0       7.2500
1      71.2833
2       7.9250
3      53.1000
4       8.0500
        ...   
886    13.0000
887    30.0000
888    23.4500
889    30.0000
890     7.7500
Name: Fare, Length: 891, dtype: float64

### 3) 실습 : 저장하기

* 저장해야 할 오브젝트는 어떤 것들일까요?

* 자료형 : imputer1_list, cat

In [46]:
joblib.dump(imputer1_list, 'simpleimputer_list.plk')
joblib.dump(cat, 'dumm_list.plk')

['dumm_list.plk']

* fitting된 객체 : imputer1, imputer2, model, scaler

In [48]:
# Persist every fitted object that the prediction pipeline needs, one file each.
joblib.dump(imputer1, 'simpleimputer.plk')
joblib.dump(imputer2, 'knnimputer.plk')
joblib.dump(model, 'model.plk')
joblib.dump(scaler, 'scaler.plk')

['scaler.plk']

### 4) 실습 : 커널 재시작 & 불러온 함수로 New Data 예측하기

* 커널 재시작

* 환경 및 데이터 준비

In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [2]:
use_cols = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp' ,'Parch', 'Fare', 'Embarked']
data = pd.read_csv('https://bit.ly/3FsgwkJ', usecols = use_cols)

In [3]:
target = 'Survived'
x0 = data.drop(target, axis = 1)
y0 = data.loc[:, target]

In [4]:
# Hold out 10 rows as new data; everything else is used as the training set here.
# NOTE(review): the saved objects were fitted after a split with test_size = 5,
# so some of these 10 rows were presumably seen during fitting - confirm whether
# that matters for this exercise.
x_train, x_test, y_train, y_test = train_test_split(x0, y0, test_size = 10, random_state = 2022)

* 함수 생성하기

In [5]:
def titanic_fe(df):
    """Add the ``Family`` size column and remove ``SibSp`` and ``Parch``.

    The caller's frame is not modified; a new frame is returned.
    """
    engineered = df.copy()

    # Family = siblings/spouses + parents/children + the passenger.
    engineered['Family'] = engineered[['SibSp', 'Parch']].sum(axis=1) + 1
    del engineered['SibSp']
    del engineered['Parch']

    # Room for more engineered features.
    return engineered

def titanic_dumm(df, cat):
    """Return a dummy-encoded copy of ``df`` for the columns named in ``cat``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``cat``; left unmodified.
    cat : dict
        Column name -> full list of categories for that column.

    Returns
    -------
    pandas.DataFrame
        Frame with each listed column replaced by dummy columns (first
        category dropped).
    """
    # Copy first so the categorical casts below do not leak into the caller's frame.
    encoded = df.copy()
    for k, v in cat.items():
        encoded[k] = pd.Categorical(encoded[k], categories=v, ordered=False)
    encoded = pd.get_dummies(encoded, columns=list(cat), drop_first=True)
    return encoded

def titanic_datapipeline(df, simpleimputer, simple_impute_list, dumm_list, scaler, knnimputer ):
    """Preprocess new data with the loaded, already-fitted objects.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature rows; not modified.
    simpleimputer : fitted imputer
        Fills the ``simple_impute_list`` columns via ``transform``.
    simple_impute_list : list
        Column names handled by ``simpleimputer``.
    dumm_list : dict
        Column name -> category list, passed to ``titanic_dumm``.
    scaler : fitted scaler
        Applied via ``transform``.
    knnimputer : fitted imputer
        Applied via ``transform`` after scaling.

    Returns
    -------
    pandas.DataFrame
        Preprocessed features named after the dummy-encoded columns, with a
        default integer index.
    """
    temp = df.copy()

    # Feature engineering
    temp = titanic_fe(temp)

    # NaN handling (1): use transform, not fit_transform. The loaded imputer
    # carries the fill value learned at training time, and re-fitting on the
    # incoming rows would discard it.
    temp[simple_impute_list] = simpleimputer.transform(temp[simple_impute_list])

    # Dummy encoding
    temp = titanic_dumm(temp, dumm_list)

    # Keep the column names; the following steps return plain arrays.
    x_cols = list(temp)

    # Scaling
    temp = scaler.transform(temp)

    # NaN handling (2): KNNImputer
    temp = knnimputer.transform(temp)

    return pd.DataFrame(temp, columns = x_cols)


* 오브젝트들 불러오기

In [6]:
# Load the saved plain variables: the imputation column list and the category mapping.
simple_impute_list = joblib.load('simpleimputer_list.plk')
dumm_list = joblib.load('dumm_list.plk')

In [7]:
# Load the fitted preprocessing objects and the trained model.
simpleimputer = joblib.load('simpleimputer.plk')
scaler = joblib.load('scaler.plk')
knnimputer = joblib.load('knnimputer.plk')
model = joblib.load('model.plk')

* 적용하기

In [8]:
# Apply the preprocessing pipeline to the held-out rows using the loaded objects.
input_data = titanic_datapipeline(x_test, simpleimputer, simple_impute_list, dumm_list, scaler, knnimputer)
input_data.head()

Unnamed: 0,Age,Fare,Family,Sex_male,Embarked_Q,Embarked_S,Pclass_2,Pclass_3
0,0.296306,0.018543,0.0,1.0,0.0,1.0,0.0,1.0
1,0.371701,0.025374,0.0,1.0,0.0,1.0,1.0,0.0
2,0.22091,0.014631,0.0,0.0,0.0,1.0,0.0,1.0
3,0.160593,0.135753,1.0,1.0,0.0,1.0,0.0,1.0
4,0.326464,0.015412,0.0,1.0,0.0,1.0,0.0,1.0


In [9]:
# Predict with the loaded model on the preprocessed new data.
model.predict(input_data)



array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0], dtype=int64)

## 6.모델 버전관리

* 모델_timestamp.pkl 형식으로 모델에 대한 버전관리를 해 봅시다.

* timestamp 만들기

In [10]:
import datetime

now = datetime.datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")
print(timestamp)

20220331_154253


* 모델 이름에 붙이기

In [11]:
# Build a file name that embeds the current time, then save the model under it.
now = datetime.datetime.now()
timestamp = f"{now:%Y%m%d_%H%M%S}"

model_fname = f'model_{timestamp}.pkl'
joblib.dump(model, model_fname)

['model_20220331_154323.pkl']

* 모델을 추가해 봅시다.

In [13]:
# Preprocess the training features with the loaded objects. x_train is
# overwritten by the pipeline output, which has a default integer index, while
# y_train keeps its original index.
x_train = titanic_datapipeline(x_train, simpleimputer, simple_impute_list, dumm_list, scaler, knnimputer)

# Train a second model with a different hyperparameter (C=0.1).
model = SVC(C=0.1)
model.fit(x_train, y_train)

# A fresh timestamp gives this model version its own file name.
now = datetime.datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M%S")

model_fname = 'model_' + timestamp + '.pkl'
joblib.dump(model, model_fname)

['model_20220331_154417.pkl']