<a href="https://colab.research.google.com/github/kgpark88/bigdata/blob/main/code/Pandas_MissingData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 결측치(Missing Data) 처리

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small demo frame: columns 'a' and 'b' contain missing values (NaN),
# column 'c' is complete.
df = pd.DataFrame(
    data=dict(
        a=[1, 2, np.nan, 4, 5],
        b=[10, np.nan, np.nan, 40, 50],
        c=[100, 200, 300, 400, 500],
    )
)

In [3]:
# Display the sample frame; the NaN entries in columns 'a' and 'b' are the missing values.
df

Unnamed: 0,a,b,c
0,1.0,10.0,100
1,2.0,,200
2,,,300
3,4.0,40.0,400
4,5.0,50.0,500


## 결측치(missing data) 제거

In [4]:
# Drop every row that contains at least one missing value
# (the defaults spelled out: operate on rows, drop on any NaN).
df.dropna(axis=0, how='any')

Unnamed: 0,a,b,c
0,1.0,10.0,100
3,4.0,40.0,400
4,5.0,50.0,500


In [5]:
# Drop every column that contains at least one missing value.
df.dropna(axis='columns')

Unnamed: 0,c
0,100
1,200
2,300
3,400
4,500


### Threshold

In [6]:
# Keep only the rows that have at least 2 non-missing values;
# row 2 has a single valid value (column 'c') and is removed.
df.dropna(axis='index', thresh=2)

Unnamed: 0,a,b,c
0,1.0,10.0,100
1,2.0,,200
3,4.0,40.0,400
4,5.0,50.0,500


## 결측 데이터 채우기(Filling)

In [7]:
# Replace every missing value with a fixed placeholder; returns a new frame.
df.fillna('FILL VALUE')

Unnamed: 0,a,b,c
0,1.0,10.0,100
1,2.0,FILL VALUE,200
2,FILL VALUE,FILL VALUE,300
3,4.0,40.0,400
4,5.0,50.0,500


In [8]:
# df still contains its NaN values: the fillna call above returned a new frame
# and did not modify df.
df

Unnamed: 0,a,b,c
0,1.0,10.0,100
1,2.0,,200
2,,,300
3,4.0,40.0,400
4,5.0,50.0,500


In [9]:
# Fill the gaps of a single column with 0.
df['a'].fillna(0)

0    1.0
1    2.0
2    0.0
3    4.0
4    5.0
Name: a, dtype: float64

In [10]:
# Fill the gaps of column 'a' with the mean of its observed values.
df['a'].fillna(value=df['a'].mean())

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
Name: a, dtype: float64

In [11]:
# Fill each column's gaps with that column's own mean
# (df.mean() yields one value per column, matched by column label).
df.fillna(value=df.mean())

Unnamed: 0,a,b,c
0,1.0,10.0,100
1,2.0,33.333333,200
2,3.0,33.333333,300
3,4.0,40.0,400
4,5.0,50.0,500


In [12]:
# Forward-fill: carry the last valid value of each column down into the gaps.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,a,b,c
0,1.0,10.0,100
1,2.0,10.0,200
2,2.0,10.0,300
3,4.0,40.0,400
4,5.0,50.0,500


#  Titanic 데이터셋 결측치(Missing Data) 처리 사례

### seaborn 라이브러리에서 Titanic 데이터셋 가져오기

In [13]:
import seaborn as sns
# Rebind df to seaborn's 'titanic' example dataset (replaces the small demo frame).
# NOTE(review): load_dataset presumably fetches the data online — confirm network access.
df = sns.load_dataset('titanic')

### 누락 데이터  찾기
누락 데이터를 찾는 방법으로 isnull(), notnull() 메소드가 있다.

In [14]:
# Boolean mask with the same shape as df: True where a value is missing.
df.isnull()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
887,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False
889,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [15]:
# Inverse mask: True where a value is present.
df.notnull()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True
887,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
888,True,True,True,False,True,True,True,True,True,True,True,False,True,True,True
889,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [16]:
# Sum the boolean mask per column to get the number of missing values in each column.
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

### 누락 데이터 처리
① 누락 데이터를 평균값으로 치환 : age 컬럼  
② 누락 데이터를 최빈값으로 치환 : embark_town 컬럼  
③ 누락 데이터가 많은 경우에는 컬럼 삭제 : deck 컬럼

① 누락 데이터를 평균값으로 치환 : age 컬럼

In [17]:
# Peek at the first 10 ages before imputation; row 5 is NaN.
df['age'].head(10)

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: age, dtype: float64

In [18]:
# Replace missing ages with the mean of the observed ages.
mean_age = df['age'].mean()   # mean age; NaN values are skipped by default
# Assign the filled column back instead of df['age'].fillna(..., inplace=True):
# the in-place call acts on an intermediate object (chained assignment), which
# raises a FutureWarning in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['age'] = df['age'].fillna(mean_age)

In [19]:
# Same 10 rows after imputation; row 5 now holds the mean age instead of NaN.
df['age'].head(10)

0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
6    54.000000
7     2.000000
8    27.000000
9    14.000000
Name: age, dtype: float64

② 누락 데이터를 최빈값으로 치환 : embark_town 컬럼

In [20]:
# Select the rows whose embark_town is missing.
df.loc[df['embark_town'].isnull()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
61,1,1,female,38.0,0,0,80.0,,First,woman,False,B,,yes,True
829,1,1,female,62.0,0,0,80.0,,First,woman,False,B,,yes,True


In [21]:
# Most frequent embark_town: count occurrences (NaN excluded) and take the
# label with the highest count.
most_embark_town = (
    df['embark_town']
    .value_counts(dropna=True)
    .idxmax()
)
print(most_embark_town)

Southampton


In [22]:
# Fill missing embark_town values with the most frequent town.
# Assign the result back instead of fillna(..., inplace=True) on df['embark_town']:
# that chained in-place call raises a FutureWarning in pandas 2.x and does not
# update df under Copy-on-Write.
df['embark_town'] = df['embark_town'].fillna(most_embark_town)

In [23]:
# Check the row at position 61 (previously missing) after the fill.
df['embark_town'].iloc[61:62]

61    Southampton
Name: embark_town, dtype: object

③ 누락 데이터가 많은 경우에는 컬럼 삭제 : deck 컬럼  
NaN이 아닌 유효한 값이 600개 미만인 컬럼을 모두 삭제 (thresh=600 : 유효한 값이 600개 이상인 컬럼만 유지)

In [24]:
# Keep only the columns with at least 600 non-missing values, modifying df in place.
# 'deck' has 688 missing values out of 891 rows, so it falls below the threshold.
df.dropna(axis='columns', thresh=600, inplace=True)

In [25]:
# List the remaining columns; 'deck' is no longer present.
print(df.columns)

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
       'alone'],
      dtype='object')


### 데이터프레임 행데이터 중복값 제거

In [26]:
# Flag rows that exactly repeat an earlier row; the first occurrence stays False.
df_dup = df.duplicated(keep='first')
print(df_dup)

0      False
1      False
2      False
3      False
4      False
       ...  
886     True
887    False
888    False
889    False
890    False
Length: 891, dtype: bool


In [27]:
# Remove duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

In [28]:
# Re-run the duplicate check on the de-duplicated frame.
df_dup = df.duplicated(keep='first')
print(df_dup)

0      False
1      False
2      False
3      False
4      False
       ...  
885    False
887    False
888    False
889    False
890    False
Length: 780, dtype: bool
