In [26]:
import pandas as pd

# Scrape the HTML tables from the reference page; read_html returns a list of
# DataFrames.
source_url = "https://www.runoob.com/pandas/pandas-functions.html"
df_list = pd.read_html(source_url)

# Section headings for the generated README, matched to the tables by position.
titles = [
    "读取数据",
    "查看数据",
    "数据清洗",
    "数据选择和切片",
    "数据排序",
    "数据分组和聚合",
    "数据合并",
    "数据选择和过滤",
    "数据统计和描述",
]

# Check before opening the file: mode "w" truncates README.md immediately, so
# running out of titles in the middle of the loop would leave a half-written
# file behind.
if len(df_list) > len(titles):
    raise ValueError(
        f"{source_url} returned {len(df_list)} tables but only "
        f"{len(titles)} titles are defined"
    )

# Explicit UTF-8: the headings are Chinese, and the platform default encoding
# is not guaranteed to be UTF-8.
with open("./README.md", "w", encoding="utf-8") as f:
    f.write("# Pandas常用函数\n\n")
    f.write(f"> {source_url} \n\n")

    # Pair each table with its heading; any surplus titles are simply unused.
    for title, df in zip(titles, df_list):
        f.write(f"## {title}\n")
        f.write(df.to_markdown(index=False))
        f.write("\n\n")

In [38]:
# Worked example: load, clean, rename, sort, aggregate and filter a small table.

# Load the records from the JSON file.
records_raw = pd.read_json("./data.json")
print(records_raw)

# Discard every row that contains a missing value.
# (Disabled alternative: fill the gaps instead,
#  e.g. fillna({"age": 0, "score": 0}).)
records_complete = records_raw.dropna()
print(records_complete)

# Switch to the Chinese column labels, then order the rows by score,
# highest first.
column_labels = {"name": "姓名", "age": "年龄", "gender": "性别", "score": "成绩"}
records_ranked = records_complete.rename(columns=column_labels).sort_values(
    by="成绩", ascending=False
)
print(records_ranked)

# Mean age and mean score per gender.
grouped = records_ranked.groupby("性别").agg(dict.fromkeys(["年龄", "成绩"], "mean"))
print(grouped)

# Keep the rows whose score is at least 80, retaining only the name and
# score columns.
passing = records_ranked["成绩"] >= 80
df = records_ranked.loc[passing, ["姓名", "成绩"]]
print(df)

# Basic summary statistics of the filtered frame.
stats = df.describe()
print(stats)

# Further per-column summaries, left disabled:
#   mean = df.mean()      # average of each column
#   median = df.median()  # median of each column
#   mode = df.mode()      # most frequent value(s) of each column
#   count = df.count()    # number of non-missing values in each column

      name   age  gender  score
0    Alice  25.0  female   80.0
1      Bob   NaN    male   90.0
2  Charlie  30.0    male    NaN
3    David  35.0    male   70.0
    name   age  gender  score
0  Alice  25.0  female   80.0
3  David  35.0    male   70.0
      姓名    年龄      性别    成绩
0  Alice  25.0  female  80.0
3  David  35.0    male  70.0
          年龄    成绩
性别                
female  25.0  80.0
male    35.0  70.0
      姓名    成绩
0  Alice  80.0
         成绩
count   1.0
mean   80.0
std     NaN
min    80.0
25%    80.0
50%    80.0
75%    80.0
max    80.0
