## Missing Value 결측치 처리

1. 결측치 있는지 확인 isna

2. 행 삭제 dropna()
- 하나라도 있으면 삭제 dropna()
- 특정 컬럼 기준으로만 삭제 subset=[...]
- 모든 값이 NaN일 때만 삭제 how='all'
- 하나라도 NaN이 있으면 삭제 (기본값) how='any'
- thresh=n, 정상데이터가 n개 이상이어야 살린다 !

>inplace=True를 쓰지 않으면 삭제된 결과를 새 DataFrame으로 반환할 뿐 원본은 그대로! (원본에 반영하려면 df = df.dropna()처럼 다시 대입한다)

3. 결측치 채우기 fillna

- 데이터로 채우기
- 컬럼별로 다르게 채우기 등 특정값으로 채우기가 있다
- 통계값으로 채우기가 있다
- 평균 .mean
- 중앙값 .median
- 최빈값 .mode()[0]

- 앞/뒤값으로 채우기
- ffill() : 앞의 값으로 뒤를 채운다
- bfill():뒤의 값으로 앞을 채운다
- limit=n : 연속된 결측치를 최대 n개까지만 채운다 (무한정 채우지 않는 법)
- interpolate :보간법

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample data set: every column contains at least one missing value (NaN)
# so the dropna / fillna examples below have something to act on.
df = pd.DataFrame(
    data={
        'name': ['IronMan', 'Hulk', np.nan, 'Thor', 'Hulk'],
        'age': [30, 45, np.nan, np.nan, 45],
        'score': [100, 90, 80, np.nan, np.nan],
        'blood_type': ['A', 'B', np.nan, 'A', 'B'],
    },
    columns=['name', 'age', 'score', 'blood_type'],
)

df

Unnamed: 0,name,age,score,blood_type
0,IronMan,30.0,100.0,A
1,Hulk,45.0,90.0,B
2,,,80.0,
3,Thor,,,A
4,Hulk,45.0,,B


In [7]:
# Boolean mask with the same shape as df: True where a value is missing.
df.isna()
# df.isnull() is an alias and gives the same result

Unnamed: 0,name,age,score,blood_type
0,False,False,False,False
1,False,False,False,False
2,True,True,False,True
3,False,True,True,False
4,False,False,True,False


In [6]:
# Summing the boolean mask column-wise counts the missing values per column.
df.isna().sum()

name          1
age           2
score         2
blood_type    1
dtype: int64

In [8]:
# dropna()

In [9]:
# Default (how='any'): drop every row that contains at least one NaN.
df.dropna()

Unnamed: 0,name,age,score,blood_type
0,IronMan,30.0,100.0,A
1,Hulk,45.0,90.0,B


In [10]:
# Only 'name' and 'score' are inspected; a row is dropped if either is NaN.
df.dropna(subset=['name','score'])

Unnamed: 0,name,age,score,blood_type
0,IronMan,30.0,100.0,A
1,Hulk,45.0,90.0,B


In [11]:
# how='all': drop a row only when every value in it is NaN.
# No row in df is completely empty, so nothing is removed here.
df.dropna(how='all')

Unnamed: 0,name,age,score,blood_type
0,IronMan,30.0,100.0,A
1,Hulk,45.0,90.0,B
2,,,80.0,
3,Thor,,,A
4,Hulk,45.0,,B


In [15]:
# With subset, the default how='any' drops a row if ANY listed column is NaN.
# To drop a row only when ALL listed columns are NaN, pass how='all' explicitly.
df.dropna(subset=['name','score'],how='all')

Unnamed: 0,name,age,score,blood_type
0,IronMan,30.0,100.0,A
1,Hulk,45.0,90.0,B
2,,,80.0,
3,Thor,,,A
4,Hulk,45.0,,B


In [17]:
# thresh=2: keep only rows that have at least 2 non-NaN values.
df.dropna(thresh=2)

Unnamed: 0,name,age,score,blood_type
0,IronMan,30.0,100.0,A
1,Hulk,45.0,90.0,B
3,Thor,,,A
4,Hulk,45.0,,B


In [18]:
# The original df is still unchanged: none of the dropna() results above
# were assigned back to df.
df

Unnamed: 0,name,age,score,blood_type
0,IronMan,30.0,100.0,A
1,Hulk,45.0,90.0,B
2,,,80.0,
3,Thor,,,A
4,Hulk,45.0,,B


In [19]:
# fillna

In [20]:
# Replace every NaN with the same scalar. Note that this also puts the number 0
# into the text columns ('name', 'blood_type').
df.fillna(0)

Unnamed: 0,name,age,score,blood_type
0,IronMan,30.0,100.0,A
1,Hulk,45.0,90.0,B
2,0,0.0,80.0,0
3,Thor,0.0,0.0,A
4,Hulk,45.0,0.0,B


In [21]:
# A dict maps column name -> fill value, so each column gets its own replacement.
# NOTE(review): 'Anoymous' looks like a typo for 'Anonymous'; left as-is because
# it is the literal value this cell writes into the data.
df.fillna({
    'name':'Anoymous',
    'age':0,
    'score':50,
    'blood_type':'Unknown'
})

Unnamed: 0,name,age,score,blood_type
0,IronMan,30.0,100.0,A
1,Hulk,45.0,90.0,B
2,Anoymous,0.0,80.0,Unknown
3,Thor,0.0,50.0,A
4,Hulk,45.0,50.0,B


In [22]:
# Mean imputation: .mean() ignores NaN, so the average of the known ages
# is used to fill the missing ones. The result goes into a new column.
df['age_mean_filled']=df['age'].fillna(df['age'].mean())
df[['age','age_mean_filled']]

Unnamed: 0,age,age_mean_filled
0,30.0,30.0
1,45.0,45.0
2,,40.0
3,,40.0
4,45.0,45.0


In [24]:
# Median imputation: fill missing scores with the median of the known scores.
df['score_median_filled']=df['score'].fillna(df['score'].median())
df[['score', 'score_median_filled']]

Unnamed: 0,score,score_median_filled
0,100.0,100.0
1,90.0,90.0
2,80.0,80.0
3,,90.0
4,,90.0


In [28]:
# Mode (most frequent value)
# Without mode()[0] there is no error, but the NaN may not get filled either.
# Why is [0] needed? Series.mode() always returns a Series (there can be ties),
# not a single scalar value.
mode_series=df['blood_type'].mode()
print(type(mode_series))

<class 'pandas.Series'>


In [32]:
# Nothing gets filled, yet no error is raised.
# fillna() with a Series aligns on index labels: the mode Series only has
# labels 0 and 1 ('A' and 'B' are tied), so the NaN at label 2 finds no match.
df['blood_type_filled'] = df['blood_type'].fillna(df['blood_type'].mode())
df['blood_type_filled']

0      A
1      B
2    NaN
3      A
4      B
Name: blood_type_filled, dtype: str

In [33]:
# [0] extracts the first mode as a plain scalar (here a str).
mode_result=df['blood_type'].mode()[0]
print(type(mode_result))

<class 'str'>


In [31]:
# With [0] a single scalar is passed to fillna(), so every NaN is filled with it.
df['blood_type_filled'] = df['blood_type'].fillna(df['blood_type'].mode()[0])
df['blood_type_filled']

0    A
1    B
2    A
3    A
4    B
Name: blood_type_filled, dtype: str

In [34]:
# 앞/뒤 값으로 채우기

In [35]:
# Six consecutive daily dates with gaps in the sales figures; used for the
# forward-fill / backward-fill / interpolation examples.
ts = pd.DataFrame(
    data={
        'date': pd.date_range(start="2024-01-01", periods=6),
        'sales': [100, np.nan, np.nan, 130, np.nan, 150],
    },
    columns=['date', 'sales'],
)
ts

Unnamed: 0,date,sales
0,2024-01-01,100.0
1,2024-01-02,
2,2024-01-03,
3,2024-01-04,130.0
4,2024-01-05,
5,2024-01-06,150.0


In [37]:
# ffill: carry the last known value forward into the NaNs that follow it.
ts['sales_ffill'] = ts['sales'].ffill()
ts

Unnamed: 0,date,sales,sales_ffill
0,2024-01-01,100.0,100.0
1,2024-01-02,,100.0
2,2024-01-03,,100.0
3,2024-01-04,130.0,130.0
4,2024-01-05,,130.0
5,2024-01-06,150.0,150.0


In [38]:
# bfill: pull the next known value backward into the NaNs that precede it.
ts['sales_bfill'] = ts['sales'].bfill()
ts

Unnamed: 0,date,sales,sales_ffill,sales_bfill
0,2024-01-01,100.0,100.0,100.0
1,2024-01-02,,100.0,130.0
2,2024-01-03,,100.0,130.0
3,2024-01-04,130.0,130.0,130.0
4,2024-01-05,,130.0,150.0
5,2024-01-06,150.0,150.0,150.0


In [40]:
# limit=1: fill at most one consecutive NaN after each known value,
# so the second NaN in a run stays missing.
ts['sales_ffill_1'] = ts['sales'].ffill(limit=1)
ts

Unnamed: 0,date,sales,sales_ffill,sales_bfill,sales_ffill_1
0,2024-01-01,100.0,100.0,100.0,100.0
1,2024-01-02,,100.0,130.0,100.0
2,2024-01-03,,100.0,130.0,
3,2024-01-04,130.0,130.0,130.0,130.0
4,2024-01-05,,130.0,150.0,130.0
5,2024-01-06,150.0,150.0,150.0,150.0


In [41]:
# bfill with limit=1: fill at most one consecutive NaN before each known value.
# NOTE(review): the result is stored under 'sales_ffill_1', which overwrites the
# ffill(limit=1) column and makes the name misleading; a separate name such as
# 'sales_bfill_1' would keep both results side by side.
ts['sales_ffill_1'] = ts['sales'].bfill(limit=1)
ts

Unnamed: 0,date,sales,sales_ffill,sales_bfill,sales_ffill_1
0,2024-01-01,100.0,100.0,100.0,100.0
1,2024-01-02,,100.0,130.0,
2,2024-01-03,,100.0,130.0,130.0
3,2024-01-04,130.0,130.0,130.0,130.0
4,2024-01-05,,130.0,150.0,150.0
5,2024-01-06,150.0,150.0,150.0,150.0


In [45]:
# interpolate on the 'sales' column: method='linear' treats the rows as equally
# spaced and fills each gap on a straight line between the neighbouring values.
ts['sales_interpolated'] = ts['sales'].interpolate(method='linear')
ts

Unnamed: 0,date,sales,sales_ffill,sales_bfill,sales_ffill_1,sales_interpolated
0,2024-01-01,100.0,100.0,100.0,100.0,100.0
1,2024-01-02,,100.0,130.0,,110.0
2,2024-01-03,,100.0,130.0,130.0,120.0
3,2024-01-04,130.0,130.0,130.0,130.0,130.0
4,2024-01-05,,130.0,150.0,150.0,140.0
5,2024-01-06,150.0,150.0,150.0,150.0,150.0


In [61]:
# method='time'
# The date column must be the index (a DatetimeIndex) for method='time' to work!

In [71]:
# Sales data on an irregular date axis: the dates are still plain strings and
# the frame keeps its default integer index at this point.
df2 = pd.DataFrame(
    data={
        "date": [
            "2024-01-01",
            "2024-01-02",
            "2024-01-03",
            "2024-01-07",   # <- gap in the dates (jumps from the 3rd to the 7th)
            "2024-01-08",
            "2024-01-09",
        ],
        "sales": [100, np.nan, np.nan, 200, np.nan, 300],
    },
    columns=["date", "sales"],
)

df2
type(df2.index)

pandas.RangeIndex

In [72]:
# Raises ValueError: method='time' needs a DatetimeIndex, but df2 still has a RangeIndex here.
# df2['time'] = df2['sales'].interpolate(method='time')
# df2

In [73]:
# 1. Convert the date column from strings to real datetime values
df2['date'] = pd.to_datetime(df2['date'])
# 2. Make the dates the index
df2 = df2.set_index('date')

In [74]:
# Time-weighted interpolation: gaps are filled in proportion to the elapsed
# time between index dates, so the jump from Jan 3 to Jan 7 is accounted for.
df2=df2.interpolate(method='time')
df2

Unnamed: 0_level_0,sales
date,Unnamed: 1_level_1
2024-01-01,100.0
2024-01-02,116.666667
2024-01-03,133.333333
2024-01-07,200.0
2024-01-08,250.0
2024-01-09,300.0
