In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### 空值/缺失值用什么来表示？

In [3]:
df = pd.DataFrame(np.arange(6).reshape((2, 3)))
df

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5


In [4]:
df.iloc[0, 0] = np.nan
df

Unnamed: 0,0,1,2
0,,1,2
1,3.0,4,5


In [5]:
# np.nan 是什么类型
type(np.nan)

float

## 空值处理案例

In [15]:
movies = pd.read_csv('IMDB/IMDB-Movie-Data.csv')
movies.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [9]:
# 判断是否有空值
np.any(movies.isnull())

True

In [16]:
# 删除包含空值的记录
movies2 = movies.dropna()
movies2.head(2)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0


In [17]:
# 已经不存在空值了
np.any(movies2.isnull())

False

In [20]:
# 用包含空值的列的平均值填充该列的空值

# 判断包含空值的列
# np.any(movies.isnull(), axis=0)
for column_name in movies.columns:
    if np.any(movies[column_name].isnull()):
        print(column_name)

Revenue (Millions)
Metascore


In [22]:
# 手动填充
movies['Revenue (Millions)'].fillna(movies['Revenue (Millions)'].mean(), inplace=True)
movies['Metascore'].fillna(movies['Metascore'].mean(), inplace=True)

In [23]:
np.any(movies.isnull())

False

## 处理不用np.nan来表示的缺失值

In [24]:
data = pd.read_csv('breast-cancer-wisconsin.data.csv')
data.head(2)

Unnamed: 0,1000025,5,1,1.1,1.2,2,1.3,3,1.4,1.5,2.1
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2


In [25]:
# 是否有np.nan
np.any(data.isnull())

False

In [31]:
# 先把用别的符号表示的空值替换成np.nan
data.replace('?', np.nan, inplace=True)

In [32]:
data.isin(['?']).sum()

1000025    0
5          0
1          0
1.1        0
1.2        0
2          0
1.3        0
3          0
1.4        0
1.5        0
2.1        0
dtype: int64

In [33]:
# 用之前的方法做进一步的判断和处理
np.any(data.isnull())

True