# Data Pipeline

* 이제 여러분은 코드를 작성할 때, 두 가지를 고려해야 합니다.
    * 재사용 하려면 어떻게 작성해야 할까?
    * 물 흐르듯이 pipeline을 구성하려면 어떻게 작성해야 할까?

## 0.환경준비 

### 1) 라이브러리 

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVC
from sklearn.metrics import classification_report

### 2) 데이터 불러오기

In [2]:
data = pd.read_csv('https://bit.ly/3FsgwkJ')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 1.데이터 이해

## 2.데이터 전처리

* 여러분은 **순서**에 집중하십시오.
* **처리 순서**가 적절한지, 왜 그런지 의문을 갖고 설명을 들으세요.

### 1) 불필요한 데이터 처리
처음부터 꼭 필요한 칼럼만 지정하여 불러오는 것이 좋습니다.

In [3]:
use_cols = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp' ,'Parch', 'Fare', 'Embarked']
data = pd.read_csv('https://bit.ly/3FsgwkJ', usecols = use_cols)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


### 2) 데이터 분할

#### x, y 분할

In [4]:
target = 'Survived'
x0 = data.drop(target, axis = 1)
y0 = data.loc[:, target]

#### test 분할

여기서는 조금만 떼어 냅시다. (test_size에 정수를 지정하면 비율이 아니라 행 개수로 해석되므로, 10건만 test로 분리됩니다.)

In [5]:
x, x_test, y, y_test = train_test_split(x0, y0, test_size = 10, random_state = 2022)

In [6]:
x_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
770,3,male,24.0,0,0,9.5,S
178,2,male,30.0,0,0,13.0,S
786,3,female,18.0,0,0,7.4958,S
159,3,male,,8,2,69.55,S
656,3,male,,0,0,7.8958,S
700,1,female,18.0,1,0,227.525,C
471,3,male,38.0,0,0,8.6625,S
780,3,female,13.0,0,0,7.2292,C
711,1,male,,0,0,26.55,S
231,3,male,29.0,0,0,7.775,S


#### train, val 분할

In [7]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = .3, random_state = 2022)

### 3) Feature Engineering
* family 변수를 추가하려고 합니다. 가족과 관련된 변수가 SibSp, Parch 입니다. 이 둘을 더하고 자기자신까지 포함시켜서 가족 수 변수를 만듭시다.
* 그리고, SibSp, Parch 는 제거합니다.

In [8]:
x_train['Family'] = x_train['SibSp'] + x_train['Parch'] + 1
x_train.drop(['SibSp', 'Parch'], axis = 1, inplace = True)
x_train.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Family
742,1,female,21.0,262.375,C,5
591,1,female,52.0,78.2667,C,2
1,1,female,38.0,71.2833,C,2
152,3,male,55.5,8.05,S,1
636,3,male,32.0,7.925,S,1


* 재사용을 위해서는 함수로 만드는 것이 좋습니다.

In [9]:
def titanic_fe(df):
    """Return a copy of ``df`` with a ``Family`` feature replacing SibSp/Parch.

    ``Family`` is ``SibSp + Parch + 1`` (the ``+ 1`` counts the passenger
    themself). The input frame is not modified, so the function can be
    reused on any data split.
    """
    # Family size: siblings/spouses + parents/children + the passenger
    family_size = df['SibSp'] + df['Parch'] + 1

    # assign() builds a new frame; the two source columns are then dropped
    engineered = df.assign(Family=family_size)

    # Add further engineered features here...
    return engineered.drop(['SibSp', 'Parch'], axis=1)

#### validation set에 적용하기

In [10]:
x_val = titanic_fe(x_val)

x_val.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Family
39,3,female,14.0,11.2417,C,2
336,1,male,29.0,66.6,S,2
623,3,male,21.0,7.8542,S,1
647,1,male,56.0,35.5,C,1
868,3,male,,9.5,S,1


### 4) NaN 조치①

* 먼저 x의 NaN을 조사해 봅시다.

In [11]:
x_train.isna().sum()

Pclass        0
Sex           0
Age         126
Fare          0
Embarked      2
Family        0
dtype: int64

* 조치 방법에 따라 처리 시점이 달라집니다.
    * Embarked는 최빈값으로 **지금** 채우고
    * Age는 KNNImputer로 **가변수화 후에** 채우겠습니다.

* NaN 행 삭제를 결정한다면...
    * 운영에서 NaN이 들어오면 그 역시 버리겠다는 의미입니다.
        * 그래도 괜찮다면...
        * 그러나 괜찮은 상황은 별로 없을 겁니다.

#### SimpleImputer 

https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

In [12]:
from sklearn.impute import SimpleImputer

* 최빈값으로 채우기 : 보통 범주형(숫자는 이산형)을 채울 때 사용합니다.
    * strategy = 'most_frequent'

In [13]:
# Declare the columns to impute as a list.
imputer1_list = ['Embarked']

# Create the imputer (fills with the most frequent value), then fit on
# x_train and transform it in one step.
imputer1 = SimpleImputer(strategy='most_frequent')

x_train[imputer1_list] = imputer1.fit_transform(x_train[imputer1_list])

# Check the remaining NaN counts per column.
x_train.isna().sum()

Pclass        0
Sex           0
Age         126
Fare          0
Embarked      0
Family        0
dtype: int64

#### validation set에 적용하기

In [14]:
x_val[imputer1_list] = imputer1.transform(x_val[imputer1_list])

x_val.isna().sum()

Pclass       0
Sex          0
Age         48
Fare         0
Embarked     0
Family       0
dtype: int64

#### 실습 

* x_train을 temp로 복사한 후에 temp['Age']의 NaN을 평균으로 채워 봅시다.

In [15]:
temp = x_train.copy()

In [17]:
# 대상을 리스트로 선언합시다. 
imputer2_list = ['Age']

# 선언하고 fit_transform
imputer2 = SimpleImputer()

temp[imputer2_list] = imputer2.fit_transform(temp[imputer2_list])

temp.isna().sum()

Pclass      0
Sex         0
Age         0
Fare        0
Embarked    0
Family      0
dtype: int64

### 5) 가변수화

* 가변수화할 때 고려해야 할 점: train과 test에 등장하는 범주가 다르면 get_dummies가 만드는 칼럼이 서로 달라질 수 있습니다.

In [18]:
dict1 = {'x1':['a','b','a','b','c'], 'x2':[2,6,5,3,4]}
dict2 = {'x1':['a','b','b'], 'x2':[7,8,9]}

train = pd.DataFrame(dict1)
test =  pd.DataFrame(dict2)

In [19]:
train

Unnamed: 0,x1,x2
0,a,2
1,b,6
2,a,5
3,b,3
4,c,4


In [20]:
test

Unnamed: 0,x1,x2
0,a,7
1,b,8
2,b,9


In [21]:
# get_dummies로 가변수화를 수행해 봅시다.
pd.get_dummies(train, columns = ['x1'])

Unnamed: 0,x2,x1_a,x1_b,x1_c
0,2,1,0,0
1,6,0,1,0
2,5,1,0,0
3,3,0,1,0
4,4,0,0,1


In [22]:
pd.get_dummies(test, columns = ['x1'])

Unnamed: 0,x2,x1_a,x1_b
0,7,1,0
1,8,0,1
2,9,0,1


#### category type

In [23]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   x1      5 non-null      object
 1   x2      5 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 208.0+ bytes


In [24]:
train['x1'] = pd.Categorical(train['x1'], categories=["a", "b", "c"], ordered=False)
test['x1'] = pd.Categorical(test['x1'], categories=["a", "b", "c"], ordered=False)

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   x1      5 non-null      category
 1   x2      5 non-null      int64   
dtypes: category(1), int64(1)
memory usage: 305.0 bytes


category로 변경 후 get_dummies를 사용하면 지정된 카테고리 기준으로 칼럼이 생성됩니다.

In [25]:
pd.get_dummies(test, columns = ['x1'])

Unnamed: 0,x2,x1_a,x1_b,x1_c
0,7,1,0,0
1,8,0,1,0
2,9,0,1,0


#### 실습
* x_train 범주형 변수에 대해서 category 타입으로 변경합니다.
* 가변수화를 수행합니다.(drop_first=True)

In [29]:
cat = {'Sex':["female", "male"]
       , 'Embarked':["C", "Q", "S"]
       ,'Pclass':[1,2,3]}

for cat_name, cat_item in cat.items() :
    x_train[cat_name] = pd.Categorical(x_train[cat_name], categories=cat_item, ordered=False)

x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 616 entries, 742 to 393
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Pclass    616 non-null    category
 1   Sex       616 non-null    category
 2   Age       490 non-null    float64 
 3   Fare      616 non-null    float64 
 4   Embarked  616 non-null    category
 5   Family    616 non-null    int64   
dtypes: category(3), float64(2), int64(1)
memory usage: 21.4 KB


In [33]:
dum_col = list(cat.keys())
x_train = pd.get_dummies(x_train, columns=dum_col, drop_first=True)

In [34]:
x_train.head()

Unnamed: 0,Age,Fare,Family,Sex_male,Embarked_Q,Embarked_S,Pclass_2,Pclass_3
742,21.0,262.375,5,0,0,0,0,0
591,52.0,78.2667,2,0,0,0,0,0
1,38.0,71.2833,2,0,0,0,0,0
152,55.5,8.05,1,1,0,1,0,1
636,32.0,7.925,1,1,0,1,0,1


In [35]:
def titanic_dumm(df, cat) :
    """One-hot encode the columns named in ``cat`` on a copy of ``df``.

    ``cat`` maps each column name to its full list of categories. Each
    column is first converted to a categorical with exactly those
    categories, so the dummy columns produced depend on ``cat`` rather
    than on which values happen to appear in ``df``. The first category
    of each column is dropped (``drop_first=True``).
    """
    encoded = df.copy()
    for column in cat :
        # Pin the category set before encoding
        encoded[column] = pd.Categorical(
            encoded[column], categories=cat[column], ordered=False
        )
    return pd.get_dummies(encoded, columns=list(cat), drop_first=True)

#### validation set에 적용하기

In [36]:
# 함수로 생성
x_val = titanic_dumm(x_val, cat)
x_val.head()

Unnamed: 0,Age,Fare,Family,Sex_male,Embarked_Q,Embarked_S,Pclass_2,Pclass_3
39,14.0,11.2417,2,0,0,0,0,1
336,29.0,66.6,2,1,0,1,0,0
623,21.0,7.8542,1,1,0,1,0,1
647,56.0,35.5,1,1,0,0,0,0
868,,9.5,1,1,0,1,0,1


### 6) 스케일링


In [37]:
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)

#### validation set에 적용하기

In [38]:
# validation 적용
x_val_s = scaler.transform(x_val)

### 7) NaN 조치②

#### KNNImputer
https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html

In [39]:
from sklearn.impute import KNNImputer

In [41]:
# Column names of x_train, reused below to label the imputed result.
imputer2_list = list(x_train)

# Create the imputer, then fit on the scaled training data and transform it.
imputer2 = KNNImputer()

x_train_s = imputer2.fit_transform(x_train_s)

# Wrap the imputed result in a DataFrame using the saved column names,
# then confirm no NaN remains.
x_train_s = pd.DataFrame(x_train_s, columns=imputer2_list)
x_train_s.isna().sum()

Age           0
Fare          0
Family        0
Sex_male      0
Embarked_Q    0
Embarked_S    0
Pclass_2      0
Pclass_3      0
dtype: int64

#### validation set에 적용하기

In [42]:
# validation 적용
x_val_s = imputer2.transform(x_val_s)

x_val_s = pd.DataFrame(x_val_s, columns=imputer2_list)
x_val_s.isna().sum()

Age           0
Fare          0
Family        0
Sex_male      0
Embarked_Q    0
Embarked_S    0
Pclass_2      0
Pclass_3      0
dtype: int64

## 3.모델링

여기에서는 성능 최적화가 주안점이 아니므로 기본값으로 모델링을 수행합니다.

In [43]:
# SVM으로 모델링 수행
model = SVC()
model.fit(x_train_s, y_train)

SVC()

In [44]:
# validation
pred = model.predict(x_val_s)
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

           0       0.80      0.96      0.87       167
           1       0.90      0.58      0.71        98

    accuracy                           0.82       265
   macro avg       0.85      0.77      0.79       265
weighted avg       0.84      0.82      0.81       265



## 4.Data Pipeline 정리

* 이제 최적의 모델이 생성되어, 운영시스템에 배포되었습니다.
* 운영에서 new data가 주어졌을 때, 어떤 절차로 파이프라인을 구성해야 할까요?

In [45]:
# new data : x_test
x_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
770,3,male,24.0,0,0,9.5,S
178,2,male,30.0,0,0,13.0,S
786,3,female,18.0,0,0,7.4958,S
159,3,male,,8,2,69.55,S
656,3,male,,0,0,7.8958,S


### 1) [validation에 적용하기] 코드들 가져오기

* 함수, 변수 선언

In [46]:
def titanic_fe(df):
    """Feature engineering: add ``Family`` and remove ``SibSp``/``Parch``.

    Works on a copy of ``df`` and returns it; the caller's frame is left
    unchanged. ``Family`` = SibSp + Parch + 1 (the passenger themself).
    """
    source_cols = ['SibSp', 'Parch']
    result = df.copy()

    # Add the Family feature
    result['Family'] = result['SibSp'].add(result['Parch']).add(1)
    result = result.drop(columns=source_cols)

    # Add further engineered features here...
    return result

def titanic_dumm(df, cat):
    """One-hot encode the columns named in ``cat`` without touching ``df``.

    Args:
        df: DataFrame containing every column that is a key of ``cat``.
        cat: dict mapping column name -> list of all categories for that
            column. Fixing the categories up front makes the dummy
            columns depend on ``cat`` rather than on which values happen
            to appear in ``df``.

    Returns:
        A new DataFrame in which each column of ``cat`` is replaced by its
        dummy columns, with the first category dropped.
    """
    # Work on a copy: assigning the categorical columns directly to ``df``
    # would change the caller's DataFrame as a side effect.
    encoded = df.copy()
    for col, categories in cat.items():
        encoded[col] = pd.Categorical(encoded[col], categories=categories, ordered=False)
    return pd.get_dummies(encoded, columns=list(cat), drop_first=True)

imputer1_list = ['Embarked']

cat = {'Sex':["female", "male"]
       , 'Embarked':["C", "Q", "S"]
       ,'Pclass':[1,2,3]}

* 전처리 실행

In [47]:
temp = x_test.copy()

In [48]:
# Feature engineering
temp = titanic_fe(temp)

# NaN handling (1): SimpleImputer.
# Use transform() only -- imputer1 was fitted on x_train, and calling
# fit_transform() here would refit it on the new data and overwrite the
# statistics learned from the training set.
temp[imputer1_list] = imputer1.transform(temp[imputer1_list])

# One-hot encoding
temp = titanic_dumm(temp, cat)

# Scaling (scaler was fitted on x_train)
temp = scaler.transform(temp)

# NaN handling (2): KNNImputer (fitted on the scaled x_train)
temp = imputer2.transform(temp)

temp

array([[0.2963056 , 0.01854277, 0.        , 1.        , 0.        ,
        1.        , 0.        , 1.        ],
       [0.37170143, 0.02537431, 0.        , 1.        , 0.        ,
        1.        , 1.        , 0.        ],
       [0.22090978, 0.01463083, 0.        , 0.        , 0.        ,
        1.        , 0.        , 1.        ],
       [0.07514451, 0.13575256, 1.        , 1.        , 0.        ,
        1.        , 0.        , 1.        ],
       [0.31892435, 0.01541158, 0.        , 1.        , 0.        ,
        1.        , 0.        , 1.        ],
       [0.22090978, 0.44409922, 0.1       , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.4722292 , 0.01690807, 0.        , 1.        , 0.        ,
        1.        , 0.        , 1.        ],
       [0.15807992, 0.01411046, 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.        ],
       [0.54762503, 0.05182215, 0.        , 1.        , 0.        ,
        1.        , 0.      

### 2) Data Pipeline 함수 만들고 실행하기

In [49]:
def titanic_datapipeline(df, simpleimputer, simple_impute_list, dumm_list, scaler, knnimputer):
    """Run new data through the already-fitted preprocessing steps.

    The steps are applied in this order: feature engineering, simple
    imputation, one-hot encoding, scaling, KNN imputation.

    Args:
        df: raw feature DataFrame; it is copied, not modified.
        simpleimputer: imputer already fitted on the training data.
        simple_impute_list: list of column names handled by ``simpleimputer``.
        dumm_list: dict of column name -> category list, passed to
            ``titanic_dumm``.
        scaler: scaler already fitted on the training data.
        knnimputer: imputer already fitted on the scaled training data.

    Returns:
        A DataFrame of the preprocessed values, labelled with the column
        names produced by the one-hot encoding step.
    """
    temp = df.copy()

    # Feature engineering
    temp = titanic_fe(temp)

    # NaN handling (1): SimpleImputer.
    # transform() only -- fit_transform() would refit the imputer on the
    # incoming data and overwrite what it learned from the training set.
    temp[simple_impute_list] = simpleimputer.transform(temp[simple_impute_list])

    # One-hot encoding
    temp = titanic_dumm(temp, dumm_list)

    # Remember the column names before the transformers' output replaces the frame
    x_cols = list(temp)

    # Scaling
    temp = scaler.transform(temp)

    # NaN handling (2): KNNImputer
    temp = knnimputer.transform(temp)

    return pd.DataFrame(temp, columns = x_cols)


In [50]:
x_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
770,3,male,24.0,0,0,9.5,S
178,2,male,30.0,0,0,13.0,S
786,3,female,18.0,0,0,7.4958,S
159,3,male,,8,2,69.55,S
656,3,male,,0,0,7.8958,S
700,1,female,18.0,1,0,227.525,C
471,3,male,38.0,0,0,8.6625,S
780,3,female,13.0,0,0,7.2292,C
711,1,male,,0,0,26.55,S
231,3,male,29.0,0,0,7.775,S


In [51]:
# Apply the pipeline to the new data (x_test).
# NOTE(review): the name `input` shadows Python's built-in input(); consider renaming.
input = titanic_datapipeline(x_test, imputer1, imputer1_list, cat, scaler, imputer2)

In [52]:
# 예측
model.predict(input)

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0], dtype=int64)