# scikit-learn을 활용한 타이타닉 탑승객 생존 예측 Classification

In [None]:
# Mount Google Drive so files stored there can be read from this notebook.
from google.colab import drive

# Use the absolute mount point: the dataset is read from '/content/drive/...',
# and a relative 'drive' only resolves to that location when the working
# directory happens to be /content.
drive.mount('/content/drive')

In [None]:
# 1. Load the data

import pandas as pd

# Path of the Titanic CSV inside the mounted Google Drive.
dataset_path = '/content/drive/My Drive/Titanic_dataset.csv'
data = pd.read_csv(dataset_path)
data.head()  # preview the first rows

In [None]:
# 2. Inspect the data
# include='all' summarizes every column, not only the numeric ones.

data.describe(include='all')

In [None]:
# 3. Check for missing values

# Number of missing entries in each column.
data.isna().sum()

In [None]:
# 4. Remove the features that will not be used
unused_features = ['cabin', 'boat', 'body', 'home.dest', 'name', 'ticket']
data.drop(columns=unused_features, inplace=True)

In [None]:
# Show column dtypes and non-null counts after dropping the unused features.
data.info()

In [None]:
# Re-count the missing values per column now that the unused features are gone.
data.isna().sum()

In [None]:
# 5. Fare

# Mean fare; the same value is used below to fill the missing fares.
data['fare'].mean()

In [None]:
# Fill the missing fares with the column mean, then re-count missing values.
data['fare'] = data['fare'].fillna(data['fare'].mean())
data.isna().sum()

In [None]:
# 6. Age

# Mean age; the same value is used below to fill the missing ages.
data['age'].mean()

In [None]:
# Fill the missing ages with the column mean, then re-count missing values.
data['age'] = data['age'].fillna(data['age'].mean())
data.isna().sum()

In [None]:
# 7. embarked

# Row count for each 'embarked' value (rows where it is missing are left out
# by groupby).
embarked_groups = data.groupby('embarked')
embarked_groups.size()

In [None]:
# Fill the missing 'embarked' entries with 'S', then re-count missing values.
# NOTE(review): 'S' is presumably the most frequent value in the counts
# above — confirm against the data.
data['embarked'] = data['embarked'].fillna('S')
data.isna().sum()

In [None]:
# 8. Visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Share of each 'survived' class (normalize=True gives fractions, not counts).
data['survived'].value_counts(normalize=True)

In [None]:
# Bar chart of the number of rows in each 'survived' class.
# Name the variable by keyword: seaborn 0.12+ treats the first positional
# argument as `data`, so a bare positional column is no longer read as x.
sns.countplot(x='survived', data=data)
plt.title('Count of survived')

In [None]:
# 8-1. Number of survivors by gender
# One bar group per gender, split by 'survived'. Variables are named by
# keyword because seaborn 0.12+ treats the first positional argument as
# `data`, not as x.
sns.countplot(x='gender', hue='survived', data=data)
plt.title('Relationship between Gender and Survived')

여성일 경우 생존할 확률이 남성에 비해 2배 가량 높다

In [None]:
# 8-2. Survival by passenger class

# Bivariate density of 'pclass' against 'survived'. Name both variables by
# keyword: seaborn 0.12+ accepts only `data` positionally, so passing two
# positional vectors no longer works.
sns.kdeplot(x='pclass', y='survived', data=data)
plt.title('Relationship between Class and Survived')

선실 등급이 3등급일 때는 생존하지 못한 사람의 비율이 높고, 1등급일 때는 생존한 사람의 비율이 높다.

In [None]:
# 8-3. Feature heatmap
plt.figure(figsize=(14, 12))
# Correlate only the numeric columns: 'gender' and 'embarked' still hold
# strings at this point, and DataFrame.corr() raises on non-numeric columns
# in pandas >= 2.0 instead of silently skipping them.
numeric_corr = data.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, linewidths=0.1,
            linecolor='white', annot=True)
plt.show()

In [None]:
# 9. Encode the categorical columns as integers
# Mapping the whole column at once yields a real integer column, instead of
# writing ints one category at a time into a text column (which leaves the
# column as object dtype and is rejected by string-typed columns).
# astype(int) fails right here if a value is not covered by the mapping,
# rather than letting a leftover string reach the model.
data['gender'] = data['gender'].map({'male': 0, 'female': 1}).astype(int)

data['embarked'] = data['embarked'].map({'S': 0, 'Q': 1, 'C': 2}).astype(int)

data.head()

In [None]:
# 10. Split into features (X) and target (Y)

Y = data['survived']
X = data.drop(columns='survived')

# Preview the first five rows of each.
print(X.head(5))
print(Y.head(5))

In [None]:
# 11. Split into training and evaluation sets

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=109
)

# Shapes of the four pieces, in the order they were unpacked.
for subset in (X_train, X_test, Y_train, Y_test):
    print(subset.shape)

In [None]:
# 12. Train the model

from sklearn.linear_model import LogisticRegression

# Raise max_iter above the default of 100: the features are not scaled, and
# the solver can stop before converging (ConvergenceWarning) at the default.
log_classifier = LogisticRegression(max_iter=1000)
log_classifier.fit(X_train, Y_train)


In [None]:
# 13. Evaluate model performance

from sklearn.metrics import accuracy_score, recall_score, precision_score

# Predict on the held-out set, then report the fraction of correct predictions.
y_predict = log_classifier.predict(X_test)
acc = accuracy_score(y_true=Y_test, y_pred=y_predict)
print(acc)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(Y_test, y_predict)
# fmt='d' prints the cell counts as plain integers; the default '.2g' format
# renders counts of 100 or more in scientific notation (e.g. 1.4e+02).
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')


In [None]:
😊