# 데이터가 없을 때 할 수 있는 전략

* drop
* 데이터가 최대 몇 개까지 없을 수 있는지 정해서 sample drop
* 데이터가 거의 없는 feature는 drop
* 최빈값, 평균값으로 비어있는 데이터 채우기

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Example adapted from https://chrisalbon.com/python/pandas_missing_data.html
# Row 1 is missing in every column; row 2 is missing only the two test scores.
raw_data = {
    'first_name': ['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
    'age': [42, np.nan, 36, 24, 73],
    'sex': ['m', np.nan, 'm', 'm', 'f'],
    'preTestScore': [4, np.nan, np.nan, 2, 3],
    'postTestScore': [25, np.nan, np.nan, 62, 70],
}
# The column order is spelled out so the frame layout does not depend on the dict.
df = pd.DataFrame(
    raw_data,
    columns=[
        'first_name',
        'last_name',
        'age',
        'sex',
        'preTestScore',
        'postTestScore',
    ],
)
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,m,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [3]:
# Add a feature whose value is missing (None) in every row.
df['empty'] = None
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,empty
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,,,
2,Tina,Ali,36.0,m,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [4]:
# Fraction of missing values per column (mean of the boolean "is missing" mask).
df.isna().mean()

first_name       0.2
last_name        0.2
age              0.2
sex              0.2
preTestScore     0.4
postTestScore    0.4
empty            1.0
dtype: float64

## 1. drop

In [5]:
# Drop every feature (column) whose values are all NaN; this modifies df itself.
df.dropna(axis='columns', how='all', inplace=True)
# Then keep only the samples (rows) that contain no NaN at all.
df_no_missing = df.dropna(axis='index', how='any')
df_no_missing

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [6]:
# Mean of each numeric column. numeric_only=True is required because df also
# holds string columns (first_name, last_name, sex); without it pandas >= 2.0
# raises TypeError instead of silently skipping them.
df.mean(numeric_only=True)

age              43.750000
preTestScore      3.000000
postTestScore    52.333333
dtype: float64

In [7]:
# Median of each numeric column. numeric_only=True is required because df also
# holds string columns (first_name, last_name, sex); without it pandas >= 2.0
# raises TypeError instead of silently skipping them.
df.median(numeric_only=True)

age              39.0
preTestScore      3.0
postTestScore    62.0
dtype: float64

In [8]:
# Most frequent value of the 'sex' column (missing values are not counted).
df['sex'].mode(dropna=True)

0    m
dtype: object

In [9]:
# Display df again; the all-NaN 'empty' column was already dropped in place.
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,m,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [10]:
# A row is kept only if at least 5 of its values are present (non-NaN).
df.dropna(axis='index', thresh=5)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


## 2. fillna

In [11]:
# Replace every missing value with 0; returns a new frame, df is not modified.
df.fillna(value=0)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,0,0,0.0,0,0.0,0.0
2,Tina,Ali,36.0,m,0.0,0.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [21]:
# Work on an independent copy so the edits below cannot leak back into df.
# pd.DataFrame(df) does not copy a DataFrame input by default, so the original
# form left df_cp sharing its underlying data with df.
df_cp = df.copy()
df_cp

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,m,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [23]:
# Remove, in place, only the samples (rows) in which every value is missing.
df_cp.dropna(axis='index', how='all', inplace=True)
df_cp

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,m,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [24]:
# Encode the 'sex' column as integers in place: 'm' -> 0, 'f' -> 1.
dic = dict(m=0, f=1)
df_cp.replace(to_replace={'sex': dic}, inplace=True)
df_cp

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,0,4.0,25.0
2,Tina,Ali,36.0,0,,
3,Jake,Milner,24.0,0,2.0,62.0
4,Amy,Cooze,73.0,1,3.0,70.0


In [27]:
# NaN => column mean, for the two score features.
mean_list = ['preTestScore', 'postTestScore']

# Fill both columns at once and assign the result back. Calling
# fillna(inplace=True) on df_cp[feature] is a chained in-place operation:
# it is deprecated and leaves df_cp unchanged when pandas Copy-on-Write is on.
# Passing the Series of means fills each column with its own mean.
df_cp[mean_list] = df_cp[mean_list].fillna(df_cp[mean_list].mean())

df_cp

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,0,4.0,25.0
2,Tina,Ali,36.0,0,3.0,52.333333
3,Jake,Milner,24.0,0,2.0,62.0
4,Amy,Cooze,73.0,1,3.0,70.0


In [35]:
# Fill missing preTestScore values with the column mean. The result is assigned
# back rather than using fillna(inplace=True) on df["preTestScore"], because
# that chained in-place form is deprecated and does not update df when pandas
# Copy-on-Write is enabled.
df["preTestScore"] = df["preTestScore"].fillna(df["preTestScore"].mean())
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,3.0,
2,Tina,Ali,36.0,m,3.0,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [36]:
# For each row, the mean postTestScore of that row's 'sex' group.
df.groupby(by='sex')['postTestScore'].transform('mean')

0    43.5
1     NaN
2    43.5
3    43.5
4    70.0
Name: postTestScore, dtype: float64

In [38]:
# Broadcast each sex group's mean postTestScore back onto its rows.
temp_sr = df.groupby(by='sex')['postTestScore'].transform('mean')
# Row 1 has NaN for sex, so the group mean is NaN there too; set it by hand.
temp_sr.loc[1] = 40
temp_sr

0    43.5
1    40.0
2    43.5
3    43.5
4    70.0
Name: postTestScore, dtype: float64

In [39]:
# Fill missing postTestScore values from temp_sr, matched by row label. The
# result is assigned back rather than using fillna(inplace=True) on
# df['postTestScore'], because that chained in-place form is deprecated and
# does not update df when pandas Copy-on-Write is enabled.
df['postTestScore'] = df['postTestScore'].fillna(temp_sr)
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,3.0,40.0
2,Tina,Ali,36.0,m,3.0,43.5
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [40]:
# Boolean mask: True where 'age' is present.
df['age'].notna()

0     True
1    False
2     True
3     True
4     True
Name: age, dtype: bool

In [42]:
# Boolean mask: True where 'sex' is present.
df['sex'].notna()

0     True
1    False
2     True
3     True
4     True
Name: sex, dtype: bool

In [47]:
# Keep only the rows in which both 'age' and 'sex' are present.
df[df[['age', 'sex']].notna().all(axis=1)]

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,m,3.0,43.5
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [48]:
# Boolean-list indexing: a row is selected when its position holds True.
row_mask = [False, True, False, False, True]
df[row_mask]

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
1,,,,,3.0,40.0
4,Amy,Cooze,73.0,f,3.0,70.0
