# 모델 선택 및 개선

In [7]:
import os, warnings
# 경고 메세지 무시하거나 숨길때(ignore), 다시보이게(default)
# warnings.filterwarnings(action='ignore')
warnings.filterwarnings(action='ignore')

In [8]:
import pandas as pd

## 로지스틱 회귀 모델 선택

In [9]:
from sklearn.linear_model import LogisticRegression

### 데이터 분할

In [10]:
from sklearn.model_selection import train_test_split

### 데이터 확인 및 전처리

In [11]:
# 데이터 읽기
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# 결측값 채우기(Age)
train.Age = train.Age.fillna(train.Age.median())
test.Age = test.Age.fillna(train.Age.median())
# 결측값 채우고 숫자형으로 변환(Cabin)
train['Cabin_n'] = train['Cabin'].notnull().astype(int)
test['Cabin_n'] = test['Cabin'].notnull().astype(int)
# 결측값 채우기(train.Embarked)
train.Embarked = train.Embarked.fillna('S')
# 결측값 채우기(test.Fare)
test['Fare'] = test['Fare'].median()

# 데이터 수치형으로 변환(Sex)
train['Sex_n'] = train.Sex.map({'male':1, 'female':2})
test['Sex_n'] = test.Sex.map({'male':1, 'female':2})
# 데이터 수치형으로 변환(Embarked)
train['Embarked_n'] = train['Embarked'].map({'S':1, 'C':2, 'Q':3})
test['Embarked_n'] = test['Embarked'].map({'S':1, 'C':2, 'Q':3})

# 데이터 특성 선택 및 분할
sel = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin_n', 'Sex_n', 'Embarked_n']
X = train[sel]
y = train['Survived']

### 모델 객체화

In [13]:
# 모델 생성
lg_r = LogisticRegression()

### 예측 확인

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify =y, test_size = 0.3, random_state=0)
# 모델 훈련
lg_r.fit(X_train,  y_train)

# 예측 확인
pred = lg_r.predict(X_test)
print("훈련 : 테스트 = 7 : 3")
print(pred)

훈련 : 테스트 = 7 : 3
[0 1 0 1 0 1 0 0 1 1 0 0 1 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0
 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 0 0 1 0 1 1 1 0 0 0 0 1 0 1 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1
 0 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 1 1 1 0 0 0 0 0 1 1 1 0 0 0
 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0
 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 0 0 0 1 1 0
 0 1 1 0 1 0 0 0 0]


## 예측 출력 하기

In [15]:
test_X = test[sel]
sub = pd.read_csv("data/gender_submission.csv")
sub.head(15)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [16]:
pred = lg_r.predict(test_X)
sub['Survived'] = pred
sub.to_csv('2021.10.20_submit.csv', index=False)

### Kaggle 제출 결과
* test_size / public score
* 0.3 / 0.75358