# 타이타닉 : 누가 살아남았을까?

## 1. 데이터 불러오기

In [33]:
!pip install catboost --upgrade

Collecting catboost
  Downloading catboost-1.0.4-cp37-none-manylinux1_x86_64.whl (76.1 MB)
[K     |████████████████████████████████| 76.1 MB 1.1 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.4


In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
path = "/content/drive/MyDrive/Dacon/튜토리얼/data/titanic_data"
os.chdir(path)

In [42]:
train = pd.read_csv("train.csv", index_col=0)
test = pd.read_csv("test.csv", index_col=0)
submission = pd.read_csv("sample_submission.csv")

In [None]:
train.shape, test.shape

((891, 11), (418, 10))

In [None]:
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


- 특성 설명

num---------------

PassengerID : 탑승객의 고유 아이디

Survived : 생존유무(0: 사망, 1: 생존)

Age : 나이

Pclass : 등실의 등급

SibSP : 함께 탑승한 형제자매, 아내, 남편의 수

Parch : 함께 탑승한 부모, 자식의 수

Fare : 요금

cat---------------

Name : 이름

Sex : 성별

Ticket : 티켓 번호

Cabin : 객실번호

Embarked : 배에 탑승한 위치(C = Cherbourg, Q = Queenstown, S = Southampton)

## 2. 데이터 전처리

In [None]:
corr_matrix = train.corr()
corr_matrix['Survived'].sort_values(ascending=False)

Survived    1.000000
Fare        0.257307
Parch       0.081629
SibSp      -0.035322
Age        -0.077221
Pclass     -0.338481
Name: Survived, dtype: float64

In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [None]:
train.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [43]:
# 훈련데이터와 타깃 분리
target = ['Survived']
y_train = train[target]
X_train = train.drop(target, axis=1)

In [5]:
# 숫자형, 범주형 특성 나누기
cat_features = [i for i in X_train.columns if X_train[i].dtype=='object']
num_features  = [i for i in X_train.select_dtypes(exclude='object')]

In [6]:
print(f"범주형 특성: {cat_features}")
print(f"숫자형 특성: {num_features}")

범주형 특성: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
숫자형 특성: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


In [None]:
train['Cabin'].nunique()

147

In [None]:
train['Ticket'].nunique()

681

In [8]:
# 불필요하다고 판단한 특성 제거
cat_features.remove('Name')
cat_features.remove('Ticket')
cat_features.remove('Cabin')

In [9]:
# nan인 특성 찾아보기
X_train.isna().sum()

Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [10]:
test.isna().sum()

Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64

## 3. 모델 생성 및 훈련

In [40]:
# nan값 채워넣기 및 표준화, one-hot 인코딩
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('standard', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ])

model = RandomForestClassifier()

# grid search로 최적의 모델 파라미터 찾기
reg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('grid', GridSearchCV(model,
                                            param_grid={ 'n_estimators' : [10, 100],
                                                         'max_depth' : [6, 8, 10, 12],
                                                         'min_samples_leaf' : [8, 12, 18],
                                                         'min_samples_split' : [8, 16, 20]
                                                       },
                                            cv=3,
                                            refit=True,
                                           n_jobs=-1))
                     ])

# 모델 훈련
reg.fit(X_train, y_train)

RuntimeError: ignored

In [12]:
# 훈련한 모델 파라미터
reg.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('standard',
                                                                   StandardScaler())]),
                                                  ['Pclass', 'Age', 'SibSp',
                                                   'Parch', 'Fare']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEn

## 4. 데이터 예측

In [16]:
mod = submission.copy()

In [17]:
mod['Survived'] = reg.predict(test)

In [18]:
mod.to_csv('RandomForest.csv', index=False)

## 무작위 내 데이터 넣어보기

In [19]:
lee = np.array([3, 'Lee', 'male', 24, 1, 2, 3019301, 10, np.nan, 'S'])

In [20]:
test.shape, np.shape(lee)
lee = lee.reshape(-1,10)

In [21]:
test = np.concatenate((test, lee), axis=0)

In [22]:
test = pd.DataFrame(test)

In [23]:
test.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
414,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
415,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
416,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
417,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C
418,3,Lee,male,24.0,1,2,3019301,10.0,,S


In [24]:
train.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [25]:
test.columns = train.columns[1:]

In [26]:
predict = reg.predict(test)

In [27]:
predict[-1]

0

# CatBoost

In [56]:
X_train.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,male,22.0,1,0,7.25,,S
2,1,female,38.0,1,0,71.2833,C85,C
3,3,female,26.0,0,0,7.925,,S
4,1,female,35.0,1,0,53.1,C123,S
5,3,male,35.0,0,0,8.05,,S


In [54]:
X_train.drop(['Name', 'Ticket'], axis=1, inplace=True)
test.drop(['Name', 'Ticket'], axis=1, inplace=True)

In [70]:
X_train.drop(['Cabin'], axis=1, inplace=True)
test.drop(['Cabin'], axis=1, inplace=True)

In [71]:
X_train.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [68]:
X_train['Age'].fillna(X_train['Age'].mean(), inplace=True)
X_train['Embarked'].fillna(X_train['Embarked'].value_counts().idxmax(), inplace=True)

In [72]:
test.isnull().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [73]:
test['Age'].fillna(test['Age'].mean(), inplace=True)
test['Fare'].fillna(test['Fare'].mean(), inplace=True)

In [None]:
Titanic_Categories = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

from catboost import CatBoostClassifier
model = CatBoostClassifier(
    custom_loss=['Accuracy'],
    random_seed=42
)

model.fit(
    X_train, y_train,
    cat_features=Titanic_Categories
)

model.fit(X_train, y_train, cat_features=Titanic_Categories)

In [75]:
model.predict(test)

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [76]:
mod = submission.copy()

In [77]:
mod['Survived'] = model.predict(test)

In [78]:
mod.to_csv('Catboost.csv', index=False)