# 1.캐글(Kaggle)
* 전 세계 데이터 사이언티스트들이 다양한 데이터를 분석하고 토론할 수 있는 커뮤니티를 제공
* 데이터 분석 및 머신러닝, 딥러닝 대회를 개최
* 데이터셋, 파이썬 자료, R 자료 
* [캐글 홈페이지](https://kaggle.com)
* [데이콘](https://dacon.io/)

In [35]:
import numpy as np
import pandas as pd

In [36]:
# Load the Titanic passenger data from a remote CSV file into a DataFrame.
df = pd.read_csv('https://bit.ly/fc-ml-titanic')

In [37]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


* PassengerId: 승객 아이디
* Survived: 생존 여부
* Pclass: 좌석등급
* Name: 이름
* Sex: 성별
* Age: 나이
* SibSp: 함께 탑승한 형제, 자매, 배우자 수
* Parch: 함께 탑승한 부모, 자녀 수
* Ticket: 티켓번호
* Fare: 요금
* Cabin: 객실 번호
* Embarked: 탑승 항구

# 2. 데이터 전처리
* 넓은 범위의 데이터 정제 작업을 뜻함
* 필요 없는 데이터를 삭제하고 필요한 데이터만 취하는 것, NULL 값이 있는 행을 삭제하는 것, 정규화, 표준화 등 많은 작업을 포함함
* 머신러닝, 딥러닝 실무에서도 전처리가 50% 이상의 중요도를 차지함

### 2-1. 독립변수와 종속변수 나누기

In [38]:
# Independent variables: the columns used as model inputs.
feature = [
    'Sex',
    'Fare',
    'Age',
    'Pclass',
]
# Dependent variable: the column to be predicted.
label = ['Survived']


In [39]:
df[feature].head()

Unnamed: 0,Sex,Fare,Age,Pclass
0,male,7.25,22.0,3
1,female,71.2833,38.0,1
2,female,7.925,26.0,3
3,female,53.1,35.0,1
4,male,8.05,35.0,3


In [40]:
df[label].head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [41]:
df[label].value_counts()

Survived
0           549
1           342
dtype: int64

### 2-2.결측치 처리

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [43]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [44]:
# Fill the missing 'Age' values with the mean age.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df['Age'] selection: that chained in-place form
# raises a FutureWarning on pandas 2.x and leaves df unchanged once
# Copy-on-Write is enabled (the default in pandas 3.0).
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Age']

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64

### 2-3 라벨 인코딩(Label Encoding)
* 문자(Category)를 수치(numerical)로 변환

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [46]:
df['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [47]:
def conver_sex(data):
    """Map a sex label to an integer code.

    Returns 1 for 'male', 0 for 'female', and None for any other value.
    """
    if data == 'male':
        return 1
    return 0 if data == 'female' else None

In [48]:
# Overwrite the 'Sex' column with the value conver_sex returns for each entry
# (1 for 'male', 0 for 'female').
df['Sex'] = df['Sex'].apply(conver_sex)

In [49]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [50]:
from sklearn.preprocessing import LabelEncoder

In [51]:
le = LabelEncoder()

In [52]:
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [53]:
# Encode the 'Embarked' labels as integers. The result is only displayed, not
# stored. In the recorded output the missing values (NaN) are not dropped;
# they show up as an extra class with code 3.
le.fit_transform(df['Embarked'])

array([2, 0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 2, 2,
       1, 2, 2, 2, 0, 2, 1, 2, 0, 0, 1, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0,
       1, 2, 1, 1, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 3, 2, 2, 0, 0,
       2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       2, 0, 2, 2, 0, 2, 1, 2, 0, 2, 2, 2, 0, 2, 2, 0, 1, 2, 0, 2, 0, 2,
       2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 2,
       2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 0, 0, 1, 2,
       1, 2, 2, 2, 2, 0, 2, 2, 2, 0, 1, 0, 2, 2, 2, 2, 1, 0, 2, 2, 0, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1,
       2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 1, 2, 2, 2,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 0,
       2, 2, 2, 1, 2, 0, 0, 2, 2, 0, 0, 2, 2, 0, 1,

In [54]:
le.classes_  # position in this array is the encoded value: 'C' -> 0, 'Q' -> 1, 'S' -> 2, nan -> 3

array(['C', 'Q', 'S', nan], dtype=object)

### 2-4. 원 핫 인코딩(One hot Encoding)
* 독립적인 데이터는 별도의 컬럼으로 분리하고 각각 컬럼에 해당 값에만 1, 나머지는 0값을 갖게 하는 방법
* 예) 머신러닝 알고리즘은 'C':0, 'Q':1, 'S':2, nan:3으로 인코딩된 값 사이의 수치적 관계를 찾아 'Q + Q = S'(1 + 1 = 2)라고 학습할 수 있음 -> 이러한 관계성을 끊어주기 위해 원 핫 인코딩을 사용

In [55]:
# Store the integer-encoded 'Embarked' values in a new 'Embarked_num' column,
# fitting a fresh LabelEncoder on the column.
df['Embarked_num'] = LabelEncoder().fit_transform(df['Embarked'])

In [56]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Embarked_num
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S,2


In [57]:
# One-hot encode the integer codes: one indicator column per distinct code.
# The result is only displayed here; it is not assigned back to df.
pd.get_dummies(df['Embarked_num'])

Unnamed: 0,0,1,2,3
0,0,0,1,0
1,1,0,0,0
2,0,0,1,0
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
886,0,0,1,0
887,0,0,1,0
888,0,0,1,0
889,1,0,0,0


# 학습데이터와 검증 데이터를 분할
# 학습데이터(80%), 검증데이터(20%)

In [58]:
from sklearn.model_selection import train_test_split

In [59]:
# Split the selected feature columns and the label column into training and
# validation sets. test_size=0.2 holds out 20% of the rows for validation;
# random_state is fixed so that the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature],
    df[label],
    test_size=0.2,
    random_state=10,
)

In [60]:
X_train.shape,y_train.shape

((712, 4), (712, 1))

In [61]:
X_test.shape,y_test.shape 

((179, 4), (179, 1))