# P001-检测数据每列的缺失值

In [68]:
# 对于数据字典，输出每列的缺失值的比例
import numpy as np
import pandas as pd

# Sample product records stored column-wise; np.nan marks a missing entry.
data = dict(
    size=["XL", "L", "M", np.nan, "M", "M"],
    color=["red", "green", "blue", "green", "red", "green"],
    gender=["female", "male", np.nan, "female", "female", "male"],
    price=[199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    weight=[500, 450, 300, np.nan, 410, np.nan],
    bought=["yes", "no", "yes", "no", "yes", "no"],
)


In [69]:
# Load the column-wise records into a DataFrame and display it.
df = pd.DataFrame(data=data)
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


In [70]:
# Number of missing entries in each column.
df.isna().sum()

size      1
color     0
gender    1
price     1
weight    2
bought    0
dtype: int64

In [71]:
# Fraction of missing entries per column: the mean of the boolean mask
# is the missing count divided by the number of rows.
df.isnull().mean()

size      0.166667
color     0.000000
gender    0.166667
price     0.166667
weight    0.333333
bought    0.000000
dtype: float64

In [72]:
# Per-column missing ratio, rounded to two decimal places.
df.isnull().mean().round(2)

size      0.17
color     0.00
gender    0.17
price     0.17
weight    0.33
bought    0.00
dtype: float64

# P002-填充weight列的缺失值

In [74]:
# 填充weight列的缺失值
# 使用平均值进行填充
from sklearn.impute import SimpleImputer

In [75]:
# Imputer configured to replace np.nan entries using the "mean" strategy.
imputer = SimpleImputer(
    strategy="mean",
    missing_values=np.nan,
)

In [76]:
# Fit the imputer on the weight column and write the imputed values back.
# The double brackets keep the selection two-dimensional (a one-column
# DataFrame) instead of a Series.
df[["weight"]] = imputer.fit_transform(df[["weight"]])

In [77]:
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


# P003-获取填充缺失值的统计值

In [79]:
# The gaps were already filled with SimpleImputer; how can we tell which value
# was used? After fitting, statistics_ holds the fill value of each fitted column.
imputer.statistics_

array([415.])

In [80]:
imputer.statistics_[0]

415.0

# P004-使用常数值填充price列的缺失值

In [82]:
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


In [83]:
from sklearn.impute import SimpleImputer

In [84]:
# Imputer configured to replace np.nan entries with the fixed value 99.0.
imputer = SimpleImputer(strategy="constant", fill_value=99.0, missing_values=np.nan)

In [85]:
# Fit the imputer on the price column and write the imputed values back.
# The double brackets keep the selection two-dimensional (a one-column
# DataFrame) instead of a Series.
df[["price"]] = imputer.fit_transform(df[["price"]])

In [86]:
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,99.0,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


In [87]:
imputer.statistics_

array([99.])

# P005-使用最频繁值填充缺失值

In [89]:
# size是字符串列，非数值列，使用出现最频繁的值做缺失值填充

In [90]:
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,99.0,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


In [91]:
# Imputer configured to replace np.nan entries using the "most_frequent" strategy.
imputer = SimpleImputer(
    strategy="most_frequent",
    missing_values=np.nan,
)

In [92]:
# Fit the imputer on the size column (strings, not numbers) and write the
# imputed values back. The double brackets keep the selection two-dimensional
# (a one-column DataFrame) instead of a Series.
df[["size"]] = imputer.fit_transform(df[["size"]])

In [93]:
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,99.0,300.0,yes
3,M,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


# P006-按空值过滤并计算均值

In [117]:
# 问题：
# 1.过滤掉weight为空值的行
# 2.筛选出数字类型的列
# 3.计算这些列的均值

In [123]:
# Rebuild the sample records so this exercise starts from the original,
# un-imputed values (np.nan marks a missing entry), then display the frame.
data = dict(
    size=["XL", "L", "M", np.nan, "M", "M"],
    color=["red", "green", "blue", "green", "red", "green"],
    gender=["female", "male", np.nan, "female", "female", "male"],
    price=[199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    weight=[500, 450, 300, np.nan, 410, np.nan],
    bought=["yes", "no", "yes", "no", "yes", "no"],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


In [125]:
# Rows whose weight is missing.
df.loc[df["weight"].isna()]

Unnamed: 0,size,color,gender,price,weight,bought
3,,green,female,129.0,,no
5,M,green,male,89.0,,no


In [127]:
# Rows whose weight is present.
df[df["weight"].notnull()]

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
4,M,red,female,79.0,410.0,yes


In [131]:
# Keep the rows with a known weight, then keep only the numeric columns.
# "number" matches every numeric dtype, so integer columns are retained
# alongside the float ones.
df[df["weight"].notna()].select_dtypes(include="number")

Unnamed: 0,price,weight
0,199.0,500.0
1,89.0,450.0
2,,300.0
4,79.0,410.0


In [133]:
# Mean of every numeric column, computed over the rows with a known weight.
# "number" matches every numeric dtype, so integer columns are averaged
# alongside the float ones.
df[df["weight"].notna()].select_dtypes(include="number").mean()

price     122.333333
weight    415.000000
dtype: float64

# P007-使用常量填充字符串列

In [138]:
# 问题
# 1.筛选出字符串类型的列（object类型）
# 2.使用常量字符串empty进行填充

In [140]:
# Rebuild the sample records so this exercise starts from the original,
# un-imputed values (np.nan marks a missing entry), then display the frame.
data = dict(
    size=["XL", "L", "M", np.nan, "M", "M"],
    color=["red", "green", "blue", "green", "red", "green"],
    gender=["female", "male", np.nan, "female", "female", "male"],
    price=[199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    weight=[500, 450, 300, np.nan, 410, np.nan],
    bought=["yes", "no", "yes", "no", "yes", "no"],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


In [142]:
from sklearn.impute import SimpleImputer

In [144]:
# Imputer configured to replace np.nan entries with the constant string "empty".
imputer = SimpleImputer(
    strategy="constant",
    fill_value="empty",
    missing_values=np.nan,
)

In [146]:
# Labels of the object-dtype columns (the string-valued ones in this frame).
columns = df.select_dtypes(include="object").columns

In [148]:
columns

Index(['size', 'color', 'gender', 'bought'], dtype='object')

In [150]:
imputer.fit_transform(df[columns])

array([['XL', 'red', 'female', 'yes'],
       ['L', 'green', 'male', 'no'],
       ['M', 'blue', 'empty', 'yes'],
       ['empty', 'green', 'female', 'no'],
       ['M', 'red', 'female', 'yes'],
       ['M', 'green', 'male', 'no']], dtype=object)

In [152]:
# Impute every column listed in `columns` and write the result back into df,
# covering all rows of those columns.
df.loc[:, columns] = imputer.fit_transform(df[columns])

In [154]:
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,empty,,300.0,yes
3,empty,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no
