In [25]:
import numpy as np
import pandas as pd


# 创建数据集
data = {
    "ID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    "Name": [
        "Alice",
        "Bob",
        "Charlie",
        "David",
        "Eve",
        "Frank",
        "Grace",
        "Hannah",
        "Isaac",
        "Jack",
        "Bob",
        "Grace",
    ],
    "Age": [23, -1, 22, 24, 29, np.nan, 26, 21, 24, 28, -1, 26],
    "Score": [88, 92, 9999, 95, 78, 85, 90, 88, 89, 91, 92, 90],
    "City": [
        "New York",
        "Los Angeles",
        "Chicago",
        -1,
        "Houston",
        -1,
        "San Francisco",
        "New York",
        "Chicago",
        np.nan,
        "Los Angeles",
        "San Francisco",
    ],
}

df = pd.DataFrame(data)
df

Unnamed: 0,ID,Name,Age,Score,City
0,1,Alice,23.0,88,New York
1,2,Bob,-1.0,92,Los Angeles
2,3,Charlie,22.0,9999,Chicago
3,4,David,24.0,95,-1
4,5,Eve,29.0,78,Houston
5,6,Frank,,85,-1
6,7,Grace,26.0,90,San Francisco
7,8,Hannah,21.0,88,New York
8,9,Isaac,24.0,89,Chicago
9,10,Jack,28.0,91,


In [26]:
# 1. 缺失值检查
for col in df.columns:
    print(col, df[col].isnull().sum())

ID 0
Name 0
Age 1
Score 0
City 1


In [27]:
# 2. 删除重复数据
df.drop_duplicates(inplace=True)
# 另一种写法：df = df.drop_duplicates()

# 上述2个写法都可以

# 注意：因为ID是唯一的，没有任何行完全重复，因此，这里怎么做都对
# 但是实践中要具体情况具体分析。因为ID可能是前面的操作，在没有去重的情况下添加的，因此可能ID本身就是错的。
# 所以这道题没有标准答案。
df

Unnamed: 0,ID,Name,Age,Score,City
0,1,Alice,23.0,88,New York
1,2,Bob,-1.0,92,Los Angeles
2,3,Charlie,22.0,9999,Chicago
3,4,David,24.0,95,-1
4,5,Eve,29.0,78,Houston
5,6,Frank,,85,-1
6,7,Grace,26.0,90,San Francisco
7,8,Hannah,21.0,88,New York
8,9,Isaac,24.0,89,Chicago
9,10,Jack,28.0,91,


In [28]:
# 注意：希望大家能够发现，你不能在有-1等特殊值的情况下计算均值！
# 对于数值型列，正确的顺序是先处理特殊值，再处理缺失值，否则计算出来的均值或者中位数会不对。
# 字符型的列，看具体的需要。

# 3. 填补缺失值 4. 替换特殊值
# 注意，题目3和题目4，Age和Score采用同样的替换方式。
# 因此，可以先把特殊值替换成NA，然后一次性按要求替换成指定的值

df["Age"] = df["Age"].replace(-1, np.nan)
df["Score"] = df["Score"].replace(9999, np.nan)

df["Age"] = df["Age"].fillna(df["Age"].mean()).round(2)
df["Score"] = df["Score"].fillna(df["Score"].median()).round(2)

df

Unnamed: 0,ID,Name,Age,Score,City
0,1,Alice,23.0,88.0,New York
1,2,Bob,24.78,92.0,Los Angeles
2,3,Charlie,22.0,90.0,Chicago
3,4,David,24.0,95.0,-1
4,5,Eve,29.0,78.0,Houston
5,6,Frank,24.78,85.0,-1
6,7,Grace,26.0,90.0,San Francisco
7,8,Hannah,21.0,88.0,New York
8,9,Isaac,24.0,89.0,Chicago
9,10,Jack,28.0,91.0,


In [33]:
df["City"] = df["City"].replace(np.nan, "Unknow")

replacement = df["City"].mode()[0]

  df["City"].mode()[0]


'New York'

In [30]:
df["City"] = df["City"].replace(
    "-1",
    df["City"].mode()[0],  # 填充众数
)  # 注意：这里的'-1'是字符串，因此是'-1'。
df

  df["City"].mode()[0],  # 填充众数


Unnamed: 0,ID,Name,Age,Score,City
0,1,Alice,23.0,88.0,New York
1,2,Bob,24.78,92.0,Los Angeles
2,3,Charlie,22.0,90.0,Chicago
3,4,David,24.0,95.0,-1
4,5,Eve,29.0,78.0,Houston
5,6,Frank,24.78,85.0,-1
6,7,Grace,26.0,90.0,San Francisco
7,8,Hannah,21.0,88.0,New York
8,9,Isaac,24.0,89.0,Chicago
9,10,Jack,28.0,91.0,Unknow


In [31]:
df["City"].mode()[0]

  df["City"].mode()[0]


'New York'

In [32]:
# 3. 填补缺失值
# 实践中，这一步要看具体的要求。如果你是自己设计方案，要看你的目的是什么。

df["Age"] = df["Age"].fillna(df["Age"].mean())  # 填充均值
df["Score"] = df["Score"].fillna(df["Score"].median())  # 填充中位数
df["City"] = df["City"].fillna("Unknown")  # 填充Unknown

df  # 看一眼数据

Unnamed: 0,ID,Name,Age,Score,City
0,1,Alice,23.0,88.0,New York
1,2,Bob,24.78,92.0,Los Angeles
2,3,Charlie,22.0,90.0,Chicago
3,4,David,24.0,95.0,-1
4,5,Eve,29.0,78.0,Houston
5,6,Frank,24.78,85.0,-1
6,7,Grace,26.0,90.0,San Francisco
7,8,Hannah,21.0,88.0,New York
8,9,Isaac,24.0,89.0,Chicago
9,10,Jack,28.0,91.0,Unknow
