In [1]:
import warnings
# Silence every warning category for the rest of the session, so deprecation and
# runtime warnings raised by the libraries used below will not be displayed.
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# from sklearn import ?
# from sklearn.metrics import ?

<br>

## 1. Preparing dataset (2번부터 실습 진행)

In [2]:
# Read the Titanic passenger records from the CSV next to this notebook,
# then preview the first three rows.
data_df = pd.read_csv(filepath_or_buffer='titanic.csv')
data_df.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


## Data info

- **PassengerId** : Unique ID of passenger
- **Survived** : 0 = No, 1 = Yes
- **Pclass** : Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
- **SibSp** : # of siblings & spouses aboard the Titanic
- **Parch** : # of parents / children aboard the Titanic
- **Ticket** : Ticket number
- **Cabin** : Cabin number
- **Embarked** : Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

In [3]:
# Target: select 'Survived' with a list of labels so the result stays a
# one-column DataFrame rather than a Series.
y_data = data_df.loc[:, ['Survived']]
y_data.head(n=3)

Unnamed: 0,Survived
0,0
1,1
2,1


In [4]:
# Features: every column except the target.
# drop() returns a new frame instead of deleting the column from data_df in place,
# so data_df still contains 'Survived' and both this cell and the cell that reads
# data_df[['Survived']] can be re-run without a KeyError.
x_data = data_df.drop(columns=['Survived'])
x_data.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


<br>

## 2. Feature engineering & Feature selection

#### 시도해볼 수 있는 전략들

- 불필요한 열이나 예측에 방해가 되는 열은 아예 지우기 (ex. PassengerId)
- 결측치 채우기 
- Text로 되어있는 Category(Factor)는 숫자로 바꿔주기 (ex. Male/Female -> 0/1)
- 실수 범위를 구간 범위로 바꿔주기 
- 필요한 경우 기존 열을 바탕으로 새로운 열을 계산해 추가하기

## 원하는 전처리

### Pclass, Sex(one-hot encoding), Age(Min-Max? Standardization?), Companion(SibSp+Parch, 같은 티켓), Fare(고민?), Embarked(고민??)

In [10]:
# Print each column's dtype and non-null count.
# NOTE(review): the saved output lists Age with no missing values and Sex as uint8,
# so it appears to have been captured after the imputation/encoding cells further
# down had already run — re-run the notebook top to bottom to confirm.
x_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    uint8  
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(4), uint8(1)
memory usage: 70.6+ KB


In [11]:
# Display the Embarked column (port-of-embarkation codes stored as strings).
x_data.loc[:, 'Embarked']

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

## ['Age']열 결측치 -> 중앙값
---

In [7]:
# Basic missing-value imputation: fill empty ages with one constant.
# NOTE: the variable is named `mean_age`, but the value it holds is the column
# median (NaNs skipped), which is what the section heading asks for.
age_column = x_data['Age']
mean_age = age_column.median(skipna=True)
x_data['Age'] = age_column.fillna(mean_age)  # fillna also accepts any int/float/str scalar

## ['Sex']열 One-hot encoding

In [14]:
# Encode Sex as a single 0/1 column: male -> 0, female -> 1.
# pd.get_dummies() returns one indicator column per category (two columns here), and
# assigning that multi-column frame to the single key 'Sex' is rejected by current
# pandas; an explicit mapping states the intended encoding directly.
# The 0/1 keys let the cell be re-run on an already-encoded column; any unexpected
# value maps to NaN and makes the uint8 cast fail loudly instead of being hidden.
x_data['Sex'] = x_data['Sex'].map({'male': 0, 'female': 1, 0: 0, 1: 1}).astype('uint8')

📌 배우자 형제랑 같이 있으면 같이 탈출 가능성 높아질수도?, 부모자식이면 느려질 수도? 

## ['Companion'] 열 만들기
['SibSp'] 배우자, 형제 + ['Parch'] 부모 자식  
['Ticket'] 중복시 친구?? 일행 가능성?

In [16]:
# Companion = number of relatives travelling with the passenger
# (siblings/spouses + parents/children). The two source columns are removed
# afterwards because their information now lives in the combined column.
x_data['Companion'] = x_data['SibSp'].add(x_data['Parch'])
x_data = x_data.drop(columns=['SibSp', 'Parch'])

In [23]:
# Flag rows whose Ticket value already appeared in an earlier row
# (the first occurrence of each ticket stays False).
x_data.duplicated(subset=['Ticket'])

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888     True
889    False
890    False
Length: 891, dtype: bool

## 필요한 열만 뽑아내는 과정

In [22]:
# Preview the feature columns picked so far; the selection is only displayed,
# it is not assigned back to any variable.
x_data.loc[:, ['Pclass', 'Sex', 'Age']]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,Companion
0,1,3,"Braund, Mr. Owen Harris",0,22.0,A/5 21171,7.2500,,S,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,PC 17599,71.2833,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",1,26.0,STON/O2. 3101282,7.9250,,S,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,113803,53.1000,C123,S,1
4,5,3,"Allen, Mr. William Henry",0,35.0,373450,8.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",0,27.0,211536,13.0000,,S,0
887,888,1,"Graham, Miss. Margaret Edith",1,19.0,112053,30.0000,B42,S,0
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,28.0,W./C. 6607,23.4500,,S,3
889,890,1,"Behr, Mr. Karl Howell",0,26.0,111369,30.0000,C148,C,0


<br>

## 2. Train - Test split (비율을 7:3 으로 유지해주시고, seed는 0을 적용해주세요)

In [19]:
from sklearn import model_selection

In [None]:
# Hold out 30% of the rows for testing (7:3 split) with a fixed seed of 0,
# as required by the section heading, so the split is reproducible.
# NOTE(review): x_data is passed as-is; per the earlier info() output it still holds
# text columns (Name, Ticket, Cabin, Embarked) — presumably these must be dropped or
# encoded before fitting an estimator; confirm the final feature list.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x_data,
    y_data,
    test_size=0.3,
    random_state=0,
)

<br>

## 3. Create model instance variable (동시에 여러 모델을 다른 이름으로 만들 수 있습니다.)

<br>

## 4. Train the model

<br>

## 5. Predict on test data & Check the result with metrics (모델 간 비교가 가능합니다.)