# Titanic Data 실습

In [1]:
# 결측치 처리, 형변환(카테고리로 분류할만한것 분류하기)
# 10대 20대 30대 분류하기

In [2]:
# 모듈 로딩
import pandas as pd
import numpy as np

# 파일 관련 변수 선언
DIR_PATH = '../Data/'
FILE = DIR_PATH + 'titanic.csv'

# DF객체 생성
titanicDF = pd.read_csv(FILE)
titanicDF.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# DF객체 요약정보 확인
titanicDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# 결측치 확인 및 처리방안 구상
titanicDF.isnull().sum()

# 'Age'와 'Cabin', 'Embarked' column에서 결측치 관측

# 'Age' column의 결측치 => 평균 나이로 치환
titanicDF['Age'].fillna(titanicDF['Age'].mean(), inplace=True)

# 'Cabin' column => class의 정보를 활용하면 될거같아서 객실번호에 대한 data는 삭제
titanicDF.drop('Cabin', axis=1, inplace=True)

# 'Embarked' column => 승선지의 정보는 생존률과 무관하다 생각하여 삭제
titanicDF.drop('Embarked', axis=1, inplace=True)

In [5]:
# 요약정보 재확인 및 결측치 검사
titanicDF.info()
titanicDF.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
dtypes: float64(2), int64(5), object(3)
memory usage: 69.7+ KB


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
dtype: int64

In [6]:
# column data의 type처리


# 'PassengerId' => 인덱스로 설정
titanicDF.set_index('PassengerId')

# 'Survived' => int64 유지

# 'Pclass' => category로 설정
titanicDF['Pclass'] = titanicDF['Pclass'].astype('category')

# 'Name' => object 유지

# 'Sex' => category로 설정
titanicDF['Sex'] = titanicDF['Sex'].astype('category')

# 'Age' => int64로 설정
titanicDF['Age'] = titanicDF['Age'].astype('int')

# 'SibSp', 'Parch' => 합쳐서 int64로 설정
titanicDF.insert(7, 'passenger', titanicDF['SibSp'] + titanicDF['Parch'])
titanicDF.drop(['SibSp', 'Parch'], axis=1, inplace=True)

# 'Ticket' => object 유지

# 'Fare' => float64 유지

In [7]:
# 'Age' 구간별로 나누기
bins = [0, 15, 55, titanicDF['Age'].max()]
titanicDF.insert(6, 'Age_boundary', pd.cut(titanicDF['Age'], bins, labels=['child', 'young', 'old']))

# 'Fare' 구간별로 나누기
bins = [0, 10, 50, titanicDF['Fare'].max()]
titanicDF.insert(10, 'Fare_boundary', pd.cut(titanicDF['Fare'], bins, labels=['cheap', 'normal', 'expansive']))

In [8]:
# titanicDF 객체 확인
titanicDF

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Age_boundary,passenger,Ticket,Fare,Fare_boundary
0,1,0,3,"Braund, Mr. Owen Harris",male,22,young,1,A/5 21171,7.2500,cheap
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,young,1,PC 17599,71.2833,expansive
2,3,1,3,"Heikkinen, Miss. Laina",female,26,young,0,STON/O2. 3101282,7.9250,cheap
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,young,1,113803,53.1000,expansive
4,5,0,3,"Allen, Mr. William Henry",male,35,young,0,373450,8.0500,cheap
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27,young,0,211536,13.0000,normal
887,888,1,1,"Graham, Miss. Margaret Edith",female,19,young,0,112053,30.0000,normal
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29,young,3,W./C. 6607,23.4500,normal
889,890,1,1,"Behr, Mr. Karl Howell",male,26,young,0,111369,30.0000,normal


In [10]:
print('===============연령대별 생존률===============\n', 
      titanicDF.groupby('Age_boundary')['Survived'].mean(), '\n')

print('===============성별별 생존률===============\n', 
      titanicDF.groupby('Sex')['Survived'].mean(), '\n')

print('===============객실별 생존률===============\n', 
      titanicDF.groupby('Pclass')['Survived'].mean(), '\n')

print('===============동승자수별 생존률===============\n', 
      titanicDF.groupby('passenger')['Survived'].mean(), '\n')

print('===============요금별 생존률===============\n', 
      titanicDF.groupby('Fare_boundary')['Survived'].mean(), '\n')

print('===============요금별 생존률===============\n', 
      titanicDF.groupby(['Age_boundary', 'Sex'])['Survived'].mean(), '\n')

 Age_boundary
child    0.552632
young    0.365410
old      0.307692
Name: Survived, dtype: float64 

 Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64 

 Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64 

 passenger
0     0.303538
1     0.552795
2     0.578431
3     0.724138
4     0.200000
5     0.136364
6     0.333333
7     0.000000
10    0.000000
Name: Survived, dtype: float64 

 Fare_boundary
cheap        0.205607
normal       0.420253
expansive    0.681250
Name: Survived, dtype: float64 

 Age_boundary  Sex   
child         female    0.634146
              male      0.457143
young         female    0.751908
              male      0.165680
old           female    0.888889
              male      0.133333
Name: Survived, dtype: float64 

