# 타이타닉 생존자 데이터 셋

In [90]:
# -q (quiet) 옵션은 설치 로그를 안보여줌(조용히함)
%pip install -q seaborn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [91]:
# 타이타닉 데이터셋 불러오기
# (pandas에 기본 내장된 예제 데이터셋)
import seaborn as sns
import pandas as pd

# titanic 데이터 DataFrame
titanic = sns.load_dataset('titanic')

# 처음 5개 행 보기
print("타이타닉 데이터 미리보기:")
print(titanic.head())

# 데이터셋 정보 확인
print("\n데이터셋 정보:")
print(titanic.info())

# 데이터 요약 통계
print("\n데이터 요약 통계:")
print(titanic.describe())

타이타닉 데이터 미리보기:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

데이터셋 정보:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0 

## 사전 탐색

In [92]:
# col별 결측치 합계
titanic.isna().sum()

# 승객 등급 등장 빈도 (1, 2, 3 등석)
print(titanic['class'].value_counts())

# 성별 빈도 (남/녀)
print(titanic['sex'].value_counts())

# 생존 여부 빈도 (생존자/사망자)
print(titanic['survived'].value_counts())

class
Third     491
First     216
Second    184
Name: count, dtype: int64
sex
male      577
female    314
Name: count, dtype: int64
survived
0    549
1    342
Name: count, dtype: int64


In [93]:
# 성별에 따른 생존율
print(titanic.groupby('sex')['survived'].mean()*100)
# 위와 같은 방법인데, 더 복잡한 통계 가능
titanic.groupby('sex').agg({'survived': ['mean']})

# 승객 등급에 따른 생존율
print(titanic.groupby('class')['survived'].mean()*100)


sex
female    74.203822
male      18.890815
Name: survived, dtype: float64
class
First     62.962963
Second    47.282609
Third     24.236253
Name: survived, dtype: float64


  print(titanic.groupby('class')['survived'].mean()*100)


In [94]:
# 성별 & 승객 등급에 따른 생존율
# unstack() 쓰면 두번째 index가 col이 됨
titanic.groupby(['sex', 'class'])['survived'].mean().unstack()

# 피벗테이블
titanic.pivot_table(
    values='survived',
    index='sex',
    columns='class',
    aggfunc='mean'
)


# 나이 그룹별 생존율
titanic['age_group'] = pd.cut(
    titanic['age'],
    bins=[0, 12, 18, 35, 60, 100], # 초과 ~ 이하
    labels=['아동', '청소년', '청년', '중장년', '노년'] # 어떻게 나누는게 더 의미있는 인사이트를 도출할까? 이건 분석가의 역량!!
)

titanic.head(3)

# 카테고리가 있는데, 해당되는 내용이 없을 때 표시할래? 말래? observed=True (있는것만 보여줘)
titanic.groupby('age_group')['survived'].mean()


# 성별 + 나이그룹으로 생존율 확인
titanic.groupby(['sex', 'age_group'])['survived'].mean().unstack()

  titanic.groupby(['sex', 'class'])['survived'].mean().unstack()
  titanic.pivot_table(
  titanic.groupby('age_group')['survived'].mean()
  titanic.groupby(['sex', 'age_group'])['survived'].mean().unstack()


age_group,아동,청소년,청년,중장년,노년
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
female,0.59375,0.75,0.783333,0.771429,1.0
male,0.567568,0.088235,0.180672,0.192,0.105263


# 결측치 처리

In [95]:
# 결측치 확인
missing = titanic.isna().sum()

# 결측치 있는 항목만 확인
missing[missing>0]

# 결측 비율
missing_p = titanic.isna().mean() * 100
missing_p[missing_p>0]

age            19.865320
embarked        0.224467
deck           77.216611
embark_town     0.224467
age_group      19.865320
dtype: float64

In [96]:
# 결측치 채우기
# age: 중요한 정보: 평균/중앙값 대체
# embarked: 가장 많은 사람들이 탄 곳으로 대체
# deck: 추측 불가능(의미 없음) -> 삭제
titanic = sns.load_dataset('titanic')

# 보통 카피 떠서 진행! 기존 DF는 조회만, 새로 만든건 변경만!
titanic_processed = titanic.copy()

# 비어있던 행들 마스킹
age_mask = titanic['age'].isna()


# 나이 결측치 채우기 (남녀 상관없이 전체 평균으로 채운 것)
mean_age = titanic['age'].mean()
titanic_processed['age'] = titanic['age'].fillna(mean_age)

# 결측치 있는지 재확인
titanic_processed.isna().sum()

# 평균으로 채워진 값들만 확인
titanic_processed[age_mask]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
5,0,3,male,29.699118,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
17,1,2,male,29.699118,0,0,13.0000,S,Second,man,True,,Southampton,yes,True
19,1,3,female,29.699118,0,0,7.2250,C,Third,woman,False,,Cherbourg,yes,True
26,0,3,male,29.699118,0,0,7.2250,C,Third,man,True,,Cherbourg,no,True
28,1,3,female,29.699118,0,0,7.8792,Q,Third,woman,False,,Queenstown,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,0,3,male,29.699118,0,0,7.2292,C,Third,man,True,,Cherbourg,no,True
863,0,3,female,29.699118,8,2,69.5500,S,Third,woman,False,,Southampton,no,False
868,0,3,male,29.699118,0,0,9.5000,S,Third,man,True,,Southampton,no,True
878,0,3,male,29.699118,0,0,7.8958,S,Third,man,True,,Southampton,no,True


In [97]:
# 전체 평균으로 대체하는 게 옳을까...?
# 성별/객실별로 평균 나이를 추정하는건?


# 모든 사람들을 성별/객실 그룹의 평균 나이로 바꾼 Series -> .transform()은 value를 바꿔주는 거
mean_ages = titanic.groupby(['sex','pclass'])['age'].transform('mean')
print(mean_ages)

# titanic의 age col 중, 빈 값만 mean_ages로 채움
titanic_processed['age'] = titanic['age'].fillna(mean_ages)

# 원래 비어있던 애들만 확인
titanic_processed.loc[age_mask]

0      26.507589
1      34.611765
2      21.750000
3      34.611765
4      26.507589
         ...    
886    30.740707
887    34.611765
888    21.750000
889    41.281386
890    26.507589
Name: age, Length: 891, dtype: float64


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
5,0,3,male,26.507589,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
17,1,2,male,30.740707,0,0,13.0000,S,Second,man,True,,Southampton,yes,True
19,1,3,female,21.750000,0,0,7.2250,C,Third,woman,False,,Cherbourg,yes,True
26,0,3,male,26.507589,0,0,7.2250,C,Third,man,True,,Cherbourg,no,True
28,1,3,female,21.750000,0,0,7.8792,Q,Third,woman,False,,Queenstown,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,0,3,male,26.507589,0,0,7.2292,C,Third,man,True,,Cherbourg,no,True
863,0,3,female,21.750000,8,2,69.5500,S,Third,woman,False,,Southampton,no,False
868,0,3,male,26.507589,0,0,9.5000,S,Third,man,True,,Southampton,no,True
878,0,3,male,26.507589,0,0,7.8958,S,Third,man,True,,Southampton,no,True


In [98]:
# embarked (탑승 도시) -> 최빈값으로 채우겠다 

# 비어있는 항구 개수
titanic['embarked'].isna().sum()

# 최빈값 (Series - 벡터)
mode_embarked = titanic['embarked'].mode()[0] # Series에서 1개 뽑기

titanic_processed['embarked'] = titanic['embarked'].fillna(mode_embarked) # 시리즈를 타이타닉_프로세스에 넣어

#처리 완료 확인 (빈값 0)
titanic_processed['embarked'].isna().sum()

np.int64(0)

In [None]:
# deck은 77%가 비어있으니 삭제
titanic['deck'].isna().mean() *100 # 빈값 비율

# deck 컬럼 삭제 - inplace는 실행하면, 2번째 실행 불가능
titanic_processed.drop('deck', axis=1, inplace=True)


In [100]:
titanic_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     891 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  embark_town  889 non-null    object  
 12  alive        891 non-null    object  
 13  alone        891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 79.4+ KB
