In [52]:
import pandas as pd
import numpy as np

In [53]:
titanic_df = pd.read_csv('data/train.csv')
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [54]:
# Null 처리 함수
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

In [55]:
# 머신러닝 알고리즘에 불필요한 피처 제거
def drop_features(df):
    df.drop(['PassengerId','Name','Ticket'], axis=1, inplace=True)
    return df

In [56]:
# 레이블 인코딩 수행
from sklearn.preprocessing import LabelEncoder

def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])

    return df

In [57]:
# 앞에서 설정한 전처리 함수 호출
def transform_features(df):
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    return df

In [58]:
#데이터 전처리
transform_features(titanic_df)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.000000,1,0,7.2500,7,3
1,1,1,0,38.000000,1,0,71.2833,2,0
2,1,3,0,26.000000,0,0,7.9250,7,3
3,1,1,0,35.000000,1,0,53.1000,2,3
4,0,3,1,35.000000,0,0,8.0500,7,3
...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,13.0000,7,3
887,1,1,0,19.000000,0,0,30.0000,1,3
888,0,3,0,29.699118,1,2,23.4500,7,3
889,1,1,1,26.000000,0,0,30.0000,2,0


In [59]:
#결측치 확인 
titanic_df.isnull().sum() #isnull() isna() 똑같은거입니다. 

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [60]:
#원-핫 인코딩
#Pclass는 타입이 int이기 때문에 원핫 인코딩하면 안먹습니다. 원한 다면 타입변경후 원핫 인코딩하셔야 나눌수 있습니다.

titanic_df['Pclass']=titanic_df['Pclass'].astype(str)
titanic_df=pd.get_dummies(titanic_df)
titanic_df.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Pclass_1,Pclass_2,Pclass_3
0,0,1,22.0,1,0,7.25,7,3,False,False,True
1,1,0,38.0,1,0,71.2833,2,0,True,False,False
2,1,0,26.0,0,0,7.925,7,3,False,False,True
3,1,0,35.0,1,0,53.1,2,3,True,False,False
4,0,1,35.0,0,0,8.05,7,3,False,False,True


In [61]:
#변수와 값 나누기
feature=titanic_df.drop('Survived',axis=1)   #생존외 나머지
label=titanic_df['Survived']                 #생존

In [62]:
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score

In [63]:
#하이퍼 파라미터 선언

seed=42 #랜덤 시드 설정

In [64]:
#훈련, 학습 데이터 나누기
X_train, X_test,y_train,y_test=train_test_split(feature, label, test_size=0.25, random_state=seed)  
# x_train,y_train 로 모델을 학습
# x_test 로 예측
# y_test 로 정확도 측정 할겁니다.

In [65]:
#훈련,학습 데이터 크기 
X_train.shape, y_train.shape, X_test.shape, y_test.shape 

((668, 10), (668,), (223, 10), (223,))

In [66]:
# 모델 선언
# 모델 바꿔가면서 해보세요.
model = DecisionTreeClassifier() 

In [67]:
# 결정트리 Classfier 클래스 생성
dt_clf = DecisionTreeClassifier(random_state=11)

# DecisionTreeClassifier 학습/예측
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)

In [68]:
#정확도 평가
accuracy_score(y_test, dt_pred)

0.7488789237668162

In [None]:
# 타이타닉 테스트 데이터 불러오기
test_df = pd.read_csv('data/test.csv')
test_df.info()
#테스트 데이터에는 Survived 컬럼이 없습니다.

In [None]:
# 테스트 데이터 전처리
transform_features(test_df)
test_df.isnull().sum()

In [71]:
# 테스트 데이터 원핫인코딩
test_df['Pclass'] = test_df['Pclass'].astype(str)
test_df = pd.get_dummies(test_df)
test_df.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Pclass_1,Pclass_2,Pclass_3
0,1,34.5,0,0,7.8292,7,1,False,False,True
1,0,47.0,1,0,7.0,7,2,False,False,True
2,1,62.0,0,0,9.6875,7,1,False,True,False
3,1,27.0,0,0,8.6625,7,2,False,False,True
4,0,22.0,1,1,12.2875,7,2,False,False,True


In [None]:
# 컬럼수가 동일한지 확인
test_df.shape

In [73]:
# 모델 예측
pred = dt_clf.predict(test_df)

In [None]:
# gender_submission 파일 불러온 후 예측 결과 합치기
submit = pd.read_csv('data/gender_submission.csv') 
submit['Survived']= pred
submit

In [75]:
# 제출 파일로 내보내기
submit.to_csv('submission01.csv', index=False)