# 创建 DataFrame


In [3]:
from datetime import datetime
import polars as pl

## 类型


### 自动推断类型


In [3]:
# No schema is supplied here, so Polars infers a (strict) dtype for every
# column from the Python values it receives.
member_columns = {
    "name": ["satori", "scarlet", "marisa"],
    "length": [155.3, 145.9, 152.1],
    "salary": [12000, 14000, 9000],
    "join_time": [
        datetime(1998, 12, 11, 12, 43, 18),
        datetime(1997, 8, 21),
        datetime(2005, 6, 18, 7, 22, 37),
    ],
}
df = pl.DataFrame(member_columns)
df

name,length,salary,join_time
str,f64,i64,datetime[μs]
"""satori""",155.3,12000,1998-12-11 12:43:18
"""scarlet""",145.9,14000,1997-08-21 00:00:00
"""marisa""",152.1,9000,2005-06-18 07:22:37


### 指定类型


In [5]:
# Declare every column's dtype explicitly with a {name: dtype} mapping
# instead of relying on inference.
explicit_schema = {"col1": pl.Float32, "col2": pl.Int64}
df = pl.DataFrame({"col1": [0, 2], "col2": [3, 7]}, schema=explicit_schema)
df

col1,col2
f32,i64
0.0,3
2.0,7


In [6]:
# The schema can also be written as a list of (name, dtype) pairs.
pl.DataFrame(
    data={"col1": [0, 2], "col2": [3, 7]},
    schema=[("col1", pl.Float32), ("col2", pl.Int64)],
)

col1,col2
f32,i64
0.0,3
2.0,7


## 创建方式


### 基于 Series


In [7]:
# Every named Series becomes one column and carries its own dtype.
col1_series = pl.Series("col1", [0, 2], dtype=pl.Float32)
col2_series = pl.Series("col2", [3, 7], dtype=pl.Int64)
df = pl.DataFrame([col1_series, col2_series])
df

col1,col2
f32,i64
0.0,3
2.0,7


### 基于 list


In [12]:
# Nested lists plus a list of column names. In the recorded output each
# inner list ends up as one column (col1 = [0, 2], col2 = [3, 7]).
df2 = pl.DataFrame([[0, 2], [3, 7]], schema=["col1", "col2"])
df2

col1,col2
i64,i64
0,3
2,7


### 基于 dict


In [15]:
# Mapping of subject name -> list of scores; every key becomes a column
# and every list supplies that column's values.
dict_scores_data = {
    "语文": [62, 72, 93, 88, 93],
    "数学": [95, 65, 86, 66, 87],
    "英语": [66, 75, 82, 69, 82],
}
df = pl.DataFrame(dict_scores_data)
df

语文,数学,英语
i64,i64,i64
62,95,66
72,65,75
93,86,82
88,66,69
93,87,82


### 基于 list[dict]

如果某条记录中缺少某个字段，那么该字段对应的值会被设置为空；在 Polars 里面，空值用 null 表示


In [22]:
# Each dict is one row. Only the last record carries "col3", so the other
# rows are filled with null in that column.
row_records = [
    {"col1": 0, "col2": 3},
    {"col1": 2, "col2": 7},
    {"col1": 4, "col2": 5, "col3": 3},
]
df = pl.DataFrame(row_records)
df

col1,col2,col3
i64,i64,i64
0,3,
2,7,
4,5,3


## 解释方向 orient

-- 建议手动指定此参数（数据为 dict 或 Series 时除外，因为它们固定按列解释，该参数对其无效）

- "row"
- "col"
- None :默认为 None，表示让 Polars 自己推断(如果参数是字典，那么一个键值对就是一列；如果参数是 Series ，那么一个 Series 就是一列)


### col

内部每个列表都是一列


In [13]:
# orient="col": every inner list is one column,
# so col1 = [0, 2] and col2 = [3, 7].
df = pl.DataFrame([[0, 2], [3, 7]], schema=["col1", "col2"], orient="col")
df

col1,col2
i64,i64
0,3
2,7


### row

内部每个列表都是一行


In [14]:
# orient="row": every inner list is one row,
# so the first row is (0, 2) and the second row is (3, 7).
df = pl.DataFrame([[0, 2], [3, 7]], schema=["col1", "col2"], orient="row")
df

col1,col2
i64,i64
0,2
3,7


### 解释方向对 Series/dict 无效

因为这两个固定都是列


In [17]:
# Build the same pair of Series once and construct a frame with each
# orientation; the outputs of df1 and df2 below turn out identical, i.e.
# a Series is always treated as a column regardless of `orient`.
series_columns = [
    pl.Series("col1", [0, 2], dtype=pl.Float32),
    pl.Series("col2", [3, 7], dtype=pl.Int64),
]
df1 = pl.DataFrame(series_columns, orient="row")
df2 = pl.DataFrame(series_columns, orient="col")

In [18]:
# Constructed with orient="row"; the Series still appear as columns.
df1

col1,col2
f32,i64
0.0,3
2.0,7


In [19]:
# Constructed with orient="col"; same layout as df1.
df2

col1,col2
f32,i64
0.0,3
2.0,7


## 定义列名


### 手动指定

通过 schema 指定


In [20]:
# The schema mapping supplies both the column names and their dtypes.
df = pl.DataFrame([[0, 2], [3, 7]], schema={"col1": pl.Float32, "col2": pl.Int64})
df

col1,col2
f32,i64
0.0,3
2.0,7


### 自动生成

Polars 会自动以 column_0、column_1、··· 的方式赋予列名


In [24]:
# No schema is given, so Polars generates the column names itself
# (column_0, column_1, ...).
df = pl.DataFrame([[0, 2], [3, 7]])
df

# 读取


## select

- select() 会返回一个新的 DataFrame，不会影响原有的 df


In [24]:
# Sample frame used by the select() examples that follow.
person_columns = {
    "name": ["satori", "scarlet", "marisa"],
    "age": [16, 400, 18],
    "length": [155.3, 145.9, 152.1],
    "salary": [6000, 7500, 5000],
}
df = pl.DataFrame(person_columns)
df

name,age,length,salary
str,i64,f64,i64
"""satori""",16,155.3,6000
"""scarlet""",400,145.9,7500
"""marisa""",18,152.1,5000


### 读取列


#### 普通


In [7]:
# Column names passed as separate positional arguments.
df.select("name", "length")

name,length
str,f64
"""satori""",155.3
"""scarlet""",145.9
"""marisa""",152.1


In [8]:
# Same selection, with the column names passed as a single list.
df.select(["name", "length"])

name,length
str,f64
"""satori""",155.3
"""scarlet""",145.9
"""marisa""",152.1


#### col()函数


In [12]:
# Same selection written with one pl.col() expression per column.
df.select(pl.col("name"), pl.col("length"))

name,length
str,f64
"""satori""",155.3
"""scarlet""",145.9
"""marisa""",152.1


In [13]:
# The pl.col() expressions may also be passed as a single list.
df.select([pl.col("name"), pl.col("length")])

name,length
str,f64
"""satori""",155.3
"""scarlet""",145.9
"""marisa""",152.1


In [14]:
# A single pl.col() call can name several columns at once.
df.select(pl.col("age", "length"))

age,length
i64,f64
16,155.3
400,145.9
18,152.1


In [15]:
# The wildcard "*" selects every column.
df.select(pl.col("*"))

name,age,length,salary
str,i64,f64,i64
"""satori""",16,155.3,6000
"""scarlet""",400,145.9,7500
"""marisa""",18,152.1,5000


### 读取并修改


In [10]:
# An expression can transform a column while selecting it: "age" comes back
# with 100 added. select() returns a new frame, so `df` itself is unchanged.
df.select("name", pl.col("age") + 100)

name,age
str,i64
"""satori""",116
"""scarlet""",500
"""marisa""",118


# 筛选列 -select

找到符合条件的列


In [31]:
# Selectors, used below for more advanced column selection.
import polars.selectors as cs

In [4]:
# Rebuilds the same sample frame used in the select() section, so the
# selector examples below can be run on their own.
df = pl.DataFrame(
    {
        "name": ["satori", "scarlet", "marisa"],
        "age": [16, 400, 18],
        "length": [155.3, 145.9, 152.1],
        "salary": [6000, 7500, 5000],
    }
)
df

name,age,length,salary
str,i64,f64,i64
"""satori""",16,155.3,6000
"""scarlet""",400,145.9,7500
"""marisa""",18,152.1,5000


## 全部


In [23]:
# cs.all() selects every column.
df.select(cs.all())

name,age,length,salary
str,i64,f64,i64
"""satori""",16,155.3,6000
"""scarlet""",400,145.9,7500
"""marisa""",18,152.1,5000


## 按照字段类型


In [18]:
# Select every column whose dtype is Int64, using the selector form
# (equivalent to the expression spelling `pl.col(pl.Int64)`).
df.select(cs.by_dtype(pl.Int64))

age,salary
i64,i64
16,6000
400,7500
18,5000


## 首尾列


In [26]:
# Select the first and the last column of the frame.
df.select(cs.first(), cs.last())

name,salary
str,i64
"""satori""",6000
"""scarlet""",7500
"""marisa""",5000


## 字符串


In [33]:
# Select the columns whose names start with "na".
df.select(cs.starts_with("na"))

name
str
"""satori"""
"""scarlet"""
"""marisa"""


In [36]:
# Select the columns whose names do NOT start with "na" (~ negates a selector).
df.select(~cs.starts_with("na"))

age,length,salary
i64,f64,i64
16,155.3,6000
400,145.9,7500
18,152.1,5000


In [35]:
# Select the columns whose names end with "e".
df.select(cs.ends_with("e"))

name,age
str,i64
"""satori""",16
"""scarlet""",400
"""marisa""",18


In [37]:
# Select the columns whose names contain "ame".
df.select(cs.contains("ame"))

name
str
"""satori"""
"""scarlet"""
"""marisa"""


# 筛选数据 -filter

找到符合条件的数据


## base


In [38]:
# Keep only the rows whose age is greater than 18.
df.filter(pl.col("age") > 18)

name,age,length,salary
str,i64,f64,i64
"""scarlet""",400,145.9,7500


## 与 select 联用


In [5]:
# Filter the rows first, then keep just the "name" and "age" columns.
df.filter(pl.col("age") > 18).select(pl.col("name", "age"))

name,age
str,i64
"""scarlet""",400


## 过滤字符串


In [8]:
# Keep the rows whose "name" value contains "risa".
df.filter(pl.col("name").str.contains("risa"))

name,age,length,salary
str,i64,f64,i64
"""marisa""",18,152.1,5000


# 聚合


## base


In [10]:
# One record per (language, framework) pair.
framework_records = [
    {"language": "Python", "framework": "FastAPI"},
    {"language": "Python", "framework": "Sanic"},
    {"language": "Python", "framework": "Blacksheep"},
    {"language": "Python", "framework": "Flask"},
    {"language": "Golang", "framework": "Gin"},
    {"language": "Golang", "framework": "Beego"},
    {"language": "Golang", "framework": "Iris"},
    {"language": "Rust", "framework": "Axum"},
    {"language": "Rust", "framework": "Tokio"},
]
df = pl.DataFrame(framework_records)

# Group the rows by language and compute several aggregates per group.
# NOTE(review): the groups in the recorded output are not in input order;
# presumably group order is not guaranteed unless maintain_order=True is
# passed to group_by -- confirm against the Polars documentation.
frameworks_by_language = df.group_by("language")
frameworks_by_language.agg(
    # Number of frameworks in each group.
    pl.col("framework").count().alias("count"),
    # First framework of each group.
    pl.col("framework").first().alias("first"),
    # Last framework of each group.
    pl.col("framework").last().alias("last"),
    # Other aggregations follow the same pattern, e.g. the maximum (strings
    # compare lexicographically, so it is not meaningful here) or a sum:
    # pl.col("framework").max(),
    # pl.col("framework").sum(),
)

language,count,first,last
str,u32,str,str
"""Golang""",3,"""Gin""","""Iris"""
"""Rust""",2,"""Axum""","""Tokio"""
"""Python""",4,"""FastAPI""","""Flask"""


## 聚合并筛选


In [12]:
# Aggregate per language first, then keep only the groups whose
# aggregated "count" is greater than 3.
framework_stats = df.group_by("language").agg(
    pl.col("framework").count().alias("count"),
    pl.col("framework").first().alias("first"),
    pl.col("framework").last().alias("last"),
)
framework_stats.filter(pl.col("count") > 3)

language,count,first,last
str,u32,str,str
"""Python""",4,"""FastAPI""","""Flask"""


# 数据类型转换


In [13]:
# Small numeric frame used to demonstrate casting between dtypes.
numeric_columns = {
    "integers": [1, 2, 3, 4, 5],
    "floats": [4.0, 5.0, 6.0, 7.0, 8.0],
    "floats_with_decimal": [4.532, 5.5, 6.5, 7.5, 8.5],
}
df = pl.DataFrame(numeric_columns)
df

integers,floats,floats_with_decimal
i64,f64,f64
1,4.0,4.532
2,5.0,5.5
3,6.0,6.5
4,7.0,7.5
5,8.0,8.5


In [14]:
# Integer column converted to 32-bit floats.
integers_as_floats = pl.col("integers").cast(pl.Float32).alias("integers_as_floats")
# Whole-number floats converted to 32-bit integers.
floats_as_integers = pl.col("floats").cast(pl.Int32).alias("floats_as_integers")
# Floats with a fractional part: in the recorded output the fraction is
# dropped (4.532 -> 4, 5.5 -> 5), i.e. the cast does not round.
floats_with_decimal_as_integers = (
    pl.col("floats_with_decimal")
    .cast(pl.Int32)
    .alias("floats_with_decimal_as_integers")
)
df.select(integers_as_floats, floats_as_integers, floats_with_decimal_as_integers)

integers_as_floats,floats_as_integers,floats_with_decimal_as_integers
f32,i32,i32
1.0,4,4
2.0,5,5
3.0,6,6
4.0,7,7
5.0,8,8
