# 모델 선택 및 개선

In [15]:
import os, warnings
import numpy as np
# Suppress warning messages for the rest of the notebook.
# Pass action='default' instead of 'ignore' to make them visible again.
warnings.filterwarnings(action='ignore')

In [16]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## 랜덤 포레스트 모델 선택

In [17]:
from sklearn.ensemble import RandomForestClassifier

### 교차 검증(LOOCV)

In [18]:
from sklearn.model_selection import cross_val_score, LeaveOneOut, train_test_split

### 데이터 확인 및 전처리

In [20]:
# Load the training and test sets.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Fill missing Age values. Both sets use the median of the training ages,
# so the test set is imputed with a statistic taken from training data.
age_median = train['Age'].median()
train['Age'] = train['Age'].fillna(age_median)
test['Age'] = test['Age'].fillna(age_median)

# Cabin: encode as a 0/1 flag (1 = a cabin value is present).
train['Cabin_n'] = train['Cabin'].notnull().astype(int)
test['Cabin_n'] = test['Cabin'].notnull().astype(int)

# Fill missing Embarked values in the training set with 'S'.
train['Embarked'] = train['Embarked'].fillna('S')

# Fill ONLY the missing Fare values in the test set with the test median.
# (Assigning the bare median would overwrite every row with one constant
# and make Fare useless as a feature at prediction time.)
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

# Encode Sex as a number.
sex_codes = {'male': 1, 'female': 2}
train['Sex_n'] = train['Sex'].map(sex_codes)
test['Sex_n'] = test['Sex'].map(sex_codes)

# Encode Embarked as a number.
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}
train['Embarked_n'] = train['Embarked'].map(embarked_codes)
test['Embarked_n'] = test['Embarked'].map(embarked_codes)

# Select the feature columns and separate the target.
sel = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin_n', 'Sex_n', 'Embarked_n']
X = train[sel]
y = train['Survived']

### 모델 객체화

In [24]:
# Create the model; no arguments are passed, so library defaults are used.
# NOTE(review): no random_state is given, so results may differ between
# runs — confirm this is acceptable.
rf_c = RandomForestClassifier()

## 교차 검증 : LeaveOneOut

In [22]:
def ML_loo_score(model_name):
    """Print leave-one-out cross-validation scores for several split ratios.

    For test fractions 0.1 through 0.5 the module-level ``X`` and ``y`` are
    split with ``train_test_split`` (``random_state=0``), ``model_name`` is
    fitted on the training part, and ``cross_val_score`` with ``LeaveOneOut``
    is run on the held-out part. The individual scores and their mean are
    printed for every ratio.

    Args:
        model_name: Estimator object (not a string, despite the name) that
            is handed to ``fit`` and to ``cross_val_score``.

    Returns:
        None. Results are only printed.
    """
    # NOTE(review): cross_val_score only receives the held-out part, so the
    # fit on the training part presumably does not influence the printed
    # scores — confirm this is the intended evaluation.
    splitter = LeaveOneOut()
    for tenths in range(1, 6):
        parts = train_test_split(X, y, test_size=(tenths/10), random_state=0)
        train_X, holdout_X, train_y, holdout_y = parts
        # Fit on the training part of this split.
        model_name.fit(train_X, train_y)
        # One score per held-out sample (each sample is its own fold).
        loo_scores = cross_val_score(model_name, holdout_X, holdout_y, cv=splitter)
        mean_score = loo_scores.mean()
        print(f"훈련 : 테스트 = {10-tenths} : {tenths}")
        print(f"교차 검증 점수 : {loo_scores}")
        print("교자 검증 점수 평균 :", mean_score)
        print()

## RandomForestClassifier 모델1
* 'Age' : median
* test.fare : median
* stratify 설정
* 교차검증 : LeaveOneOut()
* 점수
    * all => 1(예측 확인)

In [11]:
# Print the leave-one-out scores of the random forest for each split ratio.
ML_loo_score(rf_c)

훈련 : 테스트 = 9 : 1
교차 검증 점수 : [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
교자 검증 점수 평균 : 1.0

훈련 : 테스트 = 8 : 2
교차 검증 점수 : [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
교자 검증 점수 평균 : 1.0

훈련 : 테스트 = 7 : 3
교차 검증 점수 : [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.

### 예측 확인

In [27]:
# Vary the held-out fraction from 0.1 to 0.5, refit the model on each
# training part, and print the predicted labels for the held-out part.
for i in range(1, 6):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=(i/10), random_state=0
    )
    # fit() returns the estimator itself, so training and prediction chain.
    pred = rf_c.fit(X_train, y_train).predict(X_test)
    print(f"훈련 : 테스트 = {10-i} : {i}")
    # The trailing blank line separates the output of consecutive ratios.
    print(pred, end="\n\n")

훈련 : 테스트 = 9 : 1
[0 0 0 1 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 1 0 1 0 1 0
 0 0 0 1 0 0 0 1 0 0 1 0 0 1 1 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0 1 1 1 1 0 0
 0 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0]

훈련 : 테스트 = 8 : 2
[0 0 0 1 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 1 0 1 0 1 0
 0 0 0 1 0 0 0 1 0 0 1 0 0 1 1 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0 1 1 1 1 0 0
 0 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0
 1 1 0 1 1 1 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1
 1 0 0 1 0 0 1 0 0 1 0 1 0 1 1 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0]

훈련 : 테스트 = 7 : 3
[0 0 0 1 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 1 0 1 0 1 0
 0 0 0 1 0 0 0 1 0 0 1 0 0 1 1 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0 1 1 1 1 0 0
 0 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0
 1 1 0 1 1 1 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1
 1 0 0 1 0 0 1 0 0 1 0 1 0 1 1 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0
 0 1 0 0 1 0 0 1 1 0 0 0

In [None]:
# Load the sample submission file and display its first 15 rows.
# NOTE(review): the train/test CSVs are read from 'data/', but this file is
# read from './titanic/' — confirm the path is correct.
sub = pd.read_csv("./titanic/sample_submission.csv")
sub.head(15)