In [44]:
# --------------------------------------------------------------
# Missing-value handling - (1) Replacement (imputation)
# - Uses the Titanic dataset provided by seaborn
# --------------------------------------------------------------
import seaborn as sns
import pandas as pd
import numpy as np

In [45]:
# Load data -----------------------------------------------------
# sns.load_dataset() returns the named example dataset as a DataFrame
titanicDF = sns.load_dataset("titanic")

In [46]:
# Inspect the data ----------------------------------------------
# info() lists each column's non-null count and dtype
titanicDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [47]:
# Look at the actual rows ---------------------------------------
titanicDF.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [48]:
# Count missing values per column -------------------------------
# isna() flags every missing cell; summing the flags column-wise
# yields one count per column.
missingSR = titanicDF.isna().sum()

# Confirm that the counts come back as a pandas Series
type(missingSR)

pandas.core.series.Series

In [49]:
# Keep only the columns that actually contain missing values ----
# A comparison produces a True/False Series, which is then used
# for Boolean indexing.
FILTER = missingSR.gt(0)    # columns with at least one missing value
MASK = missingSR.gt(100)    # columns with more than 100 missing values
missingSR.loc[FILTER], missingSR.loc[MASK]

(age            177
 embarked         2
 deck           688
 embark_town      2
 dtype: int64,
 age     177
 deck    688
 dtype: int64)

In [50]:
# Replace the values in columns that contain missing data so the gaps disappear
# => fillna() method
# First, list the names of the columns that need treatment
missingSR[FILTER].index

Index(['age', 'embarked', 'deck', 'embark_town'], dtype='object')

In [51]:
# Handle missing values in the age column -----------------------
# fillna(replacement) => the replacement value is the analyst's choice
# The call returns a new Series; titanicDF itself is not modified here.
titanicDF['age'].fillna(0) # append .isnull().sum() to verify that no NaN remains

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     0.0
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [52]:
# fillna(replacement) => use the column mean => mean()
# Both the mean lookup and the fill use bracket-style column access.
titanicDF['age'].fillna(titanicDF['age'].mean())

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: age, Length: 891, dtype: float64

In [53]:
# fillna(replacement) => use the most frequent value => mode()
# mode() returns a Series because several values can tie for most
# frequent; the entry labelled 0 is used as the fill value.
modeValues = titanicDF['age'].mode()
titanicDF['age'].fillna(value=modeValues[0])

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    24.0
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [61]:
# Most frequent value of every column (mode() applied to the whole DataFrame)
titanicDF.mode()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,24.0,0,0,8.05,S,Third,man,True,C,Southampton,no,True


In [62]:
# Frequency tables for the survived and pclass columns
titanicDF['survived'].value_counts(), titanicDF['pclass'].value_counts()

(0    549
 1    342
 Name: survived, dtype: int64,
 3    491
 1    216
 2    184
 Name: pclass, dtype: int64)

In [54]:
# Forward fill: replace each missing age with the value from the preceding row.
# Series.ffill() is used because fillna(method=...) is deprecated since pandas 2.1.
# Note: a missing value in the very first row has nothing before it and stays NaN.
titanicDF['age'].ffill()

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    19.0
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [55]:
# Backward fill: replace each missing age with the value from the following row.
# Series.bfill() is used because fillna(method=...) is deprecated since pandas 2.1.
# Note: a missing value in the very last row has nothing after it and stays NaN.
titanicDF['age'].bfill()

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    26.0
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [56]:
# Missing-value counts per column; the fill calls above returned new Series,
# so the counts in titanicDF are unchanged at this point
titanicDF.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [59]:
# Fill the missing embarked values with a single-space placeholder.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute-accessed Series: that chained in-place
# form warns in pandas 2.x and leaves titanicDF unchanged under Copy-on-Write.
titanicDF['embarked'] = titanicDF['embarked'].fillna(" ")

In [60]:
# Re-count missing values per column to check the result of the fill above
titanicDF.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64