# 创建 DataFrame


## 类型


### 自动推断类型


In [6]:
import polars as pl
import datetime as dt

df = pl.DataFrame(
    {
        "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
        "birthdate": [
            dt.date(1997, 1, 10),
            dt.date(1985, 2, 15),
            dt.date(1981, 4, 30),
            dt.date(1989, 4, 30),
        ],
        "weight": [57.9, 72.5, 53.6, 83.1],  # (kg)
        "height": [1.56, 1.77, 1.65, 1.75],  # (m)
    }
)
# Columns are strictly typed; no schema is given here, so Polars infers
# each column's dtype from the data (see the dtype row of the output).
df

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1981-04-30,53.6,1.65
"""Daniel Donovan""",1989-04-30,83.1,1.75


### 指定类型


In [1]:
import polars as pl

df = pl.DataFrame(
    {"col1": [0, 2], "col2": [3, 7]}, schema={"col1": pl.Float32, "col2": pl.Int64}
)

df

col1,col2
f32,i64
0.0,3
2.0,7


In [6]:
pl.DataFrame(
    {"col1": [0, 2], "col2": [3, 7]}, schema=[("col1", pl.Float32), ("col2", pl.Int64)]
)

col1,col2
f32,i64
0.0,3
2.0,7


## 创建方式


### 基于 Series


In [7]:
df = pl.DataFrame(
    [
        pl.Series("col1", [0, 2], dtype=pl.Float32),
        pl.Series("col2", [3, 7], dtype=pl.Int64),
    ]
)
df

col1,col2
f32,i64
0.0,3
2.0,7


### 基于 list


In [12]:
df2 = pl.DataFrame([[0, 2], [3, 7]], schema=["col1", "col2"])
df2

col1,col2
i64,i64
0,3
2,7


### 基于 dict


In [15]:
dict_scores_data = {
    "语文": [62, 72, 93, 88, 93],
    "数学": [95, 65, 86, 66, 87],
    "英语": [66, 75, 82, 69, 82],
}
df = pl.DataFrame(dict_scores_data)
df

语文,数学,英语
i64,i64,i64
62,95,66
72,65,75
93,86,82
88,66,69
93,87,82


### 基于 list[dict]

如果某个字段不存在，那么会被设置为空，在 Polars 里面空使用 null


In [22]:
df = pl.DataFrame(
    [
        {"col1": 0, "col2": 3},
        {"col1": 2, "col2": 7},
        {"col1": 4, "col2": 5, "col3": 3},
    ]
)
df

col1,col2,col3
i64,i64,i64
0,3,
2,7,
4,5,3.0


### 基于 list[list] k 线


## 方向 orient

--建议手动指定此参数(数据为 Series 或 dict 时除外：它们固定按列解释，orient 对其无效)

- "row"
- "col"
- None :默认为 None，表示让 Polars 自己推断(如果参数是字典，那么一个键值对就是一列；如果参数是 Series ，那么一个 Series 就是一列)


### col

内部每个列表都是一列


In [13]:
df = pl.DataFrame([[0, 2], [3, 7]], schema=["col1", "col2"], orient="col")
df

col1,col2
i64,i64
0,3
2,7


### row

内部每个列表都是一行


In [14]:
df = pl.DataFrame([[0, 2], [3, 7]], schema=["col1", "col2"], orient="row")
df

col1,col2
i64,i64
0,2
3,7


### 解释方向对 Series/dict 无效

因为这两个固定都是列


In [17]:
df1 = pl.DataFrame(
    [
        pl.Series("col1", [0, 2], dtype=pl.Float32),
        pl.Series("col2", [3, 7], dtype=pl.Int64),
    ],
    orient="row",
)
df2 = pl.DataFrame(
    [
        pl.Series("col1", [0, 2], dtype=pl.Float32),
        pl.Series("col2", [3, 7], dtype=pl.Int64),
    ],
    orient="col",
)

In [18]:
df1

col1,col2
f32,i64
0.0,3
2.0,7


In [19]:
df2

col1,col2
f32,i64
0.0,3
2.0,7


## 定义列名


### 手动指定

通过 schema 指定


In [20]:
df = pl.DataFrame([[0, 2], [3, 7]], schema={"col1": pl.Float32, "col2": pl.Int64})
df

col1,col2
f32,i64
0.0,3
2.0,7


### 自动生成

Polars 会自动以 column_0、column_1、··· 的方式赋予列名


In [24]:
df = pl.DataFrame([[0, 2], [3, 7]])
df

# 类型转换


In [None]:
df = pl.DataFrame(
    {
        "integers": [1, 2, 3, 4, 5],
        "floats": [4.0, 5.0, 6.0, 7.0, 8.0],
        "floats_with_decimal": [4.532, 5.5, 6.5, 7.5, 8.5],
    }
)
df

integers,floats,floats_with_decimal
i64,f64,f64
1,4.0,4.532
2,5.0,5.5
3,6.0,6.5
4,7.0,7.5
5,8.0,8.5


In [None]:
df.select(
    pl.col("integers").cast(pl.Float32).alias("integers_as_floats"),
    pl.col("floats").cast(pl.Int32).alias("floats_as_integers"),
    pl.col("floats_with_decimal")
    .cast(pl.Int32)
    .alias("floats_with_decimal_as_integers"),
)

integers_as_floats,floats_as_integers,floats_with_decimal_as_integers
f32,i32,i32
1.0,4,4
2.0,5,5
3.0,6,6
4.0,7,7
5.0,8,8


# Contexts(上下文)

- 上下文负责执行表达式
- select、with_columns、filter、group_by 都是上下文


## select

- select() 会返回一个新的 DataFrame，不会影响原有的 df
- 在基础列上读出更少的列
- 越读越少
- 更详细的 select 见 Expression->分类->列选择


In [69]:
df = pl.DataFrame(
    {
        "name": ["satori", "scarlet", "marisa"],
        "age": [16, 400, 18],
        "length": [155.3, 145.9, 152.1],
        "salary": [6000, 7500, 5000],
    }
)
df

name,age,length,salary
str,i64,f64,i64
"""satori""",16,155.3,6000
"""scarlet""",400,145.9,7500
"""marisa""",18,152.1,5000


In [70]:
df.select(pl.col("name", "age"))

name,age
str,i64
"""satori""",16
"""scarlet""",400
"""marisa""",18


## with_columns

- 在基础列上读出更多的列
- 越读越多


In [7]:
import polars as pl

df = pl.DataFrame(
    {
        "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
        "birthdate": [
            dt.date(1997, 1, 10),
            dt.date(1985, 2, 15),
            dt.date(1981, 4, 30),
            dt.date(1989, 4, 30),
        ],
        "weight": [57.9, 72.5, 53.6, 83.1],  # (kg)
        "height": [1.56, 1.77, 1.65, 1.75],  # (m)
    }
)
df.with_columns(
    birth_year=pl.col("birthdate").dt.year(),
    bmi=pl.col("weight") / (pl.col("height") ** 2),
)

name,birthdate,weight,height,birth_year,bmi
str,date,f64,f64,i32,f64
"""Alice Archer""",1997-01-10,57.9,1.56,1997,23.791913
"""Ben Brown""",1985-02-15,72.5,1.77,1985,23.141498
"""Chloe Cooper""",1981-04-30,53.6,1.65,1981,19.687787
"""Daniel Donovan""",1989-04-30,83.1,1.75,1989,27.134694


## filter 筛选

找到符合条件的数据


In [8]:
df = pl.DataFrame(
    {
        "name": ["satori", "scarlet", "marisa"],
        "age": [16, 400, 18],
        "length": [155.3, 145.9, 152.1],
        "salary": [6000, 7500, 5000],
    }
)

### > < =


In [10]:
df.filter(pl.col("age") > 18)

name,age,length,salary
str,i64,f64,i64
"""scarlet""",400,145.9,7500


### 字符串包含 str.contains


In [8]:
df.filter(pl.col("name").str.contains("risa"))

name,age,length,salary
str,i64,f64,i64
"""marisa""",18,152.1,5000


### 区间 is_between


In [11]:
df.filter(pl.col("age").is_between(10, 30))

name,age,length,salary
str,i64,f64,i64
"""satori""",16,155.3,6000
"""marisa""",18,152.1,5000


### 与 select 联用


In [5]:
df.filter(pl.col("age") > 18).select(pl.col("name", "age"))

name,age
str,i64
"""scarlet""",400


## group_by 分组


### 分组并统计


In [12]:
df = pl.DataFrame(
    {
        "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
        "birthdate": [
            dt.date(1997, 1, 10),
            dt.date(1985, 2, 15),
            dt.date(1981, 4, 30),
            dt.date(1989, 4, 30),
        ],
        "weight": [57.9, 72.5, 53.6, 83.1],  # (kg)
        "height": [1.56, 1.77, 1.65, 1.75],  # (m)
    }
)

In [13]:
# Count how many people were born in each decade: the birth year is floored
# to a multiple of 10 and used as the group key.
df.group_by(
    (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
    maintain_order=True,
).len()

decade,len
i32,u32
1990,1
1980,3


### 分组并聚合 agg


In [14]:
# Per birth decade: the number of people (sample_size), their mean weight
# rounded to 2 decimals (avg_weight), and the maximum height (tallest).
df.group_by(
    (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
    maintain_order=True,
).agg(
    pl.len().alias("sample_size"),
    pl.col("weight").mean().round(2).alias("avg_weight"),
    pl.col("height").max().alias("tallest"),
)

decade,sample_size,avg_weight,tallest
i32,u32,f64,f64
1990,1,57.9,1.56
1980,3,69.73,1.77


### 分组，聚合并筛选


In [12]:
# The `df` defined above has no "language"/"framework" columns, so this
# example needs its own frame. The sample rows are illustrative only; they
# were chosen so that the result matches the output shown below.
df = pl.DataFrame(
    {
        "language": ["Python", "Python", "Python", "Python", "Go", "Go", "Java"],
        "framework": ["FastAPI", "Django", "Tornado", "Flask", "Gin", "Echo", "Spring"],
    }
)

# Per language: count the frameworks and take the first and last one, then
# keep only the languages that have more than 3 frameworks.
df.group_by("language").agg(
    pl.col("framework").count().alias("count"),
    pl.col("framework").first().alias("first"),
    pl.col("framework").last().alias("last"),
).filter(pl.col("count") > 3)

language,count,first,last
str,u32,str,str
"""Python""",4,"""FastAPI""","""Flask"""


# Expression (表达式)

- 表达式是数据转换的惰性表示
- 表达式需要一个执行它们以产生结果的上下文(例如 select)


## 分类


In [71]:
import numpy as np
import polars as pl

df = pl.DataFrame(
    {
        "nrs": [1, 2, 3, None, 5],
        "names": ["foo", "ham", "spam", "egg", None],
        "random": np.random.rand(5),
        "groups": ["A", "A", "B", "C", "B"],
    }
)
df

nrs,names,random,groups
i64,str,f64,str
1.0,"""foo""",0.796994,"""A"""
2.0,"""ham""",0.095054,"""A"""
3.0,"""spam""",0.310776,"""B"""
,"""egg""",0.891863,"""C"""
5.0,,0.167284,"""B"""


### 运算

- Numerical 数字
- Logical 逻辑判断


In [72]:
# Arithmetic expression examples: column-with-scalar and column-with-column.

df_numerical = df.select(
    (pl.col("nrs") + 5).alias("nrs + 5"),
    (pl.col("nrs") - 5).alias("nrs - 5"),
    (pl.col("nrs") * pl.col("random")).alias("nrs * random"),
    (pl.col("nrs") / pl.col("random")).alias("nrs / random"),
)
df_numerical

nrs + 5,nrs - 5,nrs * random,nrs / random
i64,i64,f64,f64
6.0,-4.0,0.796994,1.254714
7.0,-3.0,0.190107,21.04074
8.0,-2.0,0.932328,9.653253
,,,
10.0,0.0,0.836419,29.889329


In [73]:
# Logical (comparison and boolean) expression examples.
df_logical = df.select(
    (pl.col("nrs") > 1).alias("nrs > 1"),
    (pl.col("random") <= 0.5).alias("random <= .5"),
    (pl.col("nrs") != 1).alias("nrs != 1"),
    (pl.col("nrs") == 1).alias("nrs == 1"),
    ((pl.col("random") <= 0.5) & (pl.col("nrs") > 1)).alias("and_expr"),  # logical AND
    ((pl.col("random") <= 0.5) | (pl.col("nrs") > 1)).alias("or_expr"),  # logical OR
)
df_logical

nrs > 1,random <= .5,nrs != 1,nrs == 1,and_expr,or_expr
bool,bool,bool,bool,bool,bool
False,False,False,True,False,False
True,True,True,False,True,True
True,True,True,False,True,True
,False,,,False,
True,True,True,False,True,True


### 列选择

-


In [77]:
from datetime import date, datetime

import polars as pl

df = pl.DataFrame(
    {
        "id": [9, 4, 2],
        "place": ["Mars", "Earth", "Saturn"],
        "date": pl.date_range(date(2022, 1, 1), date(2022, 1, 3), "1d", eager=True),
        "sales": [33.4, 2142134.1, 44.7],
        "has_people": [False, True, False],
        "logged_at": pl.datetime_range(
            datetime(2022, 12, 1), datetime(2022, 12, 1, 0, 0, 2), "1s", eager=True
        ),
    }
).with_row_index("index")
df

index,id,place,date,sales,has_people,logged_at
u32,i64,str,date,f64,bool,datetime[μs]
0,9,"""Mars""",2022-01-01,33.4,False,2022-12-01 00:00:00
1,4,"""Earth""",2022-01-02,2142134.1,True,2022-12-01 00:00:01
2,2,"""Saturn""",2022-01-03,44.7,False,2022-12-01 00:00:02


### selectors

找到符合条件的列


In [None]:
import polars

In [80]:
# 用于实现更复杂的筛选的选择器
import polars.selectors as cs
import polars

In [78]:
df = pl.DataFrame(
    {
        "name": ["satori", "scarlet", "marisa"],
        "age": [16, 400, 18],
        "length": [155.3, 145.9, 152.1],
        "salary": [6000, 7500, 5000],
    }
)
df

name,age,length,salary
str,i64,f64,i64
"""satori""",16,155.3,6000
"""scarlet""",400,145.9,7500
"""marisa""",18,152.1,5000


#### 全部


In [84]:
df.select(cs.first())

name
str
"""satori"""
"""scarlet"""
"""marisa"""


In [81]:
df.select(cs.all())

name,age,length,salary
str,i64,f64,i64
"""satori""",16,155.3,6000
"""scarlet""",400,145.9,7500
"""marisa""",18,152.1,5000


#### 按照字段类型


In [None]:
# Select every column whose dtype is Int64, using the selector API that
# this section is about (`cs` is `polars.selectors`, imported above).
df.select(cs.by_dtype(pl.Int64))

age,salary
i64,i64
16,6000
400,7500
18,5000


#### 首尾列


In [None]:
df.select(cs.first(), cs.last())

name,salary
str,i64
"""satori""",6000
"""scarlet""",7500
"""marisa""",5000


#### 字符串


In [None]:
# Select the columns whose names start with "na".
df.select(cs.starts_with("na"))

name
str
"""satori"""
"""scarlet"""
"""marisa"""


In [None]:
# Select the columns whose names do NOT start with "na" (`~` inverts a selector).
df.select(~cs.starts_with("na"))

age,length,salary
i64,f64,i64
16,155.3,6000
400,145.9,7500
18,152.1,5000


In [None]:
# Select the columns whose names end with "e".
df.select(cs.ends_with("e"))

name,age
str,i64
"""satori""",16
"""scarlet""",400
"""marisa""",18


In [None]:
# Select the columns whose names contain "ame".
df.select(cs.contains("ame"))

name
str
"""satori"""
"""scarlet"""
"""marisa"""


### Window functions: over

- over:在分组内计算，但不会改变 DataFrame 的原始大小(行数)
- group by :修改 DataFrame 为按照 xx 分组后的大小

* 和 group by 的区别：将逻辑放在单个表达式中,不影响整个 DataFrame,使 API 更加干净


In [3]:
import polars as pl

# then let's load some csv data with information about pokemon
df = pl.read_csv(
    "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv"
)
df.head()

#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
i64,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,bool
1,"""Bulbasaur""","""Grass""","""Poison""",318,45,49,49,65,65,45,1,False
2,"""Ivysaur""","""Grass""","""Poison""",405,60,62,63,80,80,60,1,False
3,"""Venusaur""","""Grass""","""Poison""",525,80,82,83,100,100,80,1,False
3,"""VenusaurMega Venusaur""","""Grass""","""Poison""",625,80,100,123,122,120,80,1,False
4,"""Charmander""","""Fire""",,309,39,52,43,60,50,65,1,False


In [8]:
out = df.select(
    "Type 1",
    "Type 2",
    pl.col("Attack").mean().over("Type 1").alias("avg_attack_by_type"),
    pl.col("Attack").mean().alias("avg_attack"),
)
out.head(10)

Type 1,Type 2,avg_attack_by_type,avg_attack
str,str,f64,f64
"""Grass""","""Poison""",72.923077,75.349693
"""Grass""","""Poison""",72.923077,75.349693
"""Grass""","""Poison""",72.923077,75.349693
"""Grass""","""Poison""",72.923077,75.349693
"""Fire""",,88.642857,75.349693
"""Fire""",,88.642857,75.349693
"""Fire""","""Flying""",88.642857,75.349693
"""Fire""","""Dragon""",88.642857,75.349693
"""Fire""","""Flying""",88.642857,75.349693
"""Water""",,74.193548,75.349693


#### 分组并排序


In [10]:
filtered = df.filter(pl.col("Type 2") == "Psychic").select(
    "Name",
    "Type 1",
    "Speed",
)
filtered

Name,Type 1,Speed
str,str,i64
"""Slowpoke""","""Water""",15
"""Slowbro""","""Water""",30
"""SlowbroMega Slowbro""","""Water""",30
"""Exeggcute""","""Grass""",40
"""Exeggutor""","""Grass""",55
"""Starmie""","""Water""",115
"""Jynx""","""Ice""",95


In [11]:
filtered.with_columns(
    pl.col("Name", "Speed").sort_by("Speed", descending=True).over("Type 1"),
)

Name,Type 1,Speed
str,str,i64
"""Starmie""","""Water""",115
"""Slowbro""","""Water""",30
"""SlowbroMega Slowbro""","""Water""",30
"""Exeggutor""","""Grass""",55
"""Exeggcute""","""Grass""",40
"""Slowpoke""","""Water""",15
"""Jynx""","""Ice""",95


# 组合

- join 增加列
- concat 增加行


In [15]:
df1 = pl.DataFrame(
    {
        "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
        "birthdate": [
            dt.date(1997, 1, 10),
            dt.date(1985, 2, 15),
            dt.date(1981, 4, 30),
            dt.date(1989, 4, 30),
        ],
        "weight": [57.9, 72.5, 53.6, 83.1],  # (kg)
        "height": [1.56, 1.77, 1.65, 1.75],  # (m)
    }
)

## join


In [17]:
# Extra per-person attributes, keyed by the same "name" column as df1
# (the rows are deliberately in a different order).
df2 = pl.DataFrame(
    {
        "name": ["Ben Brown", "Daniel Donovan", "Alice Archer", "Chloe Cooper"],
        "parent": [True, False, False, False],
        "siblings": [1, 2, 3, 4],
    }
)

# Left join: keep every row of df1 and add the matching columns from df2.
# Use df1 (defined just above); `df` at this point is a different frame
# that has no "name" column.
df1.join(df2, on="name", how="left")

name,birthdate,weight,height,parent,siblings
str,date,f64,f64,bool,i64
"""Alice Archer""",1997-01-10,57.9,1.56,False,3
"""Ben Brown""",1985-02-15,72.5,1.77,True,1
"""Chloe Cooper""",1981-04-30,53.6,1.65,False,4
"""Daniel Donovan""",1989-04-30,83.1,1.75,False,2


## concat


In [18]:
# A second batch of people with the same columns and dtypes as df1.
df3 = pl.DataFrame(
    {
        "name": ["Ethan Edwards", "Fiona Foster", "Grace Gibson", "Henry Harris"],
        "birthdate": [
            dt.date(1977, 5, 10),
            dt.date(1975, 6, 23),
            dt.date(1973, 7, 22),
            dt.date(1971, 8, 3),
        ],
        "weight": [67.9, 72.5, 57.6, 93.1],  # (kg)
        "height": [1.76, 1.6, 1.66, 1.8],  # (m)
    }
)

# Vertical concat appends the rows of df3 below those of df1. Use df1
# (defined at the top of this section), not `df`, which is a different frame
# with a different schema at this point.
pl.concat([df1, df3], how="vertical")

name,birthdate,weight,height
str,date,f64,f64
"""Alice Archer""",1997-01-10,57.9,1.56
"""Ben Brown""",1985-02-15,72.5,1.77
"""Chloe Cooper""",1981-04-30,53.6,1.65
"""Daniel Donovan""",1989-04-30,83.1,1.75
"""Ethan Edwards""",1977-05-10,67.9,1.76
"""Fiona Foster""",1975-06-23,72.5,1.6
"""Grace Gibson""",1973-07-22,57.6,1.66
"""Henry Harris""",1971-08-03,93.1,1.8


# 分析


In [44]:
# 极值
df = pl.DataFrame(
    {
        "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
        "birthdate": [
            dt.date(1997, 1, 10),
            dt.date(1985, 2, 15),
            dt.date(1981, 4, 30),
            dt.date(1989, 4, 30),
        ],
        "weight": [57.9, 72.5, 53.6, 83.1],  # (kg)
        "height": [1.56, 1.77, 1.65, 1.75],  # (m)
    }
)

## describe


In [45]:
df.describe()

statistic,name,birthdate,weight,height
str,str,str,f64,f64
"""count""","""4""","""4""",4.0,4.0
"""null_count""","""0""","""0""",0.0,0.0
"""mean""",,"""1988-03-14 18:00:00""",66.775,1.6825
"""std""",,,13.560082,0.097082
"""min""","""Alice Archer""","""1981-04-30""",53.6,1.56
"""25%""",,"""1985-02-15""",57.9,1.65
"""50%""",,"""1989-04-30""",72.5,1.75
"""75%""",,"""1989-04-30""",72.5,1.75
"""max""","""Daniel Donovan""","""1997-01-10""",83.1,1.77


# Window Functions


## over

用来标识或操作分组的数据范围,over一般跟在操作后面


In [9]:
import polars as pl

df = pl.DataFrame(
    {
        "group": ["A", "A", "B", "B", "C", "C", "C"],
        "value": [10, 20, 30, 40, 50, 60, 70],
    }
)

df

group,value
str,i64
"""A""",10
"""A""",20
"""B""",30
"""B""",40
"""C""",50
"""C""",60
"""C""",70


### 分组极值

In [10]:
# Maximum of "value" within each "group", written back onto every row of that
# group (the frame keeps its original number of rows).
df_result = df.with_columns(pl.col("value").max().over("group").alias("max_in_group"))
df_result

group,value,max_in_group
str,i64,i64
"""A""",10,20
"""A""",20,20
"""B""",30,40
"""B""",40,40
"""C""",50,70
"""C""",60,70
"""C""",70,70


### 分组count

In [18]:
# Without .over(...), cum_count() runs over the whole column, so despite the
# alias the numbering does NOT restart per group (compare with the next cell,
# which adds .over("group")).
df_result = df.with_columns(
    pl.col("value").cum_count().alias("row_number_in_group")
)
df_result

group,value,row_number_in_group
str,i64,u32
"""A""",10,1
"""A""",20,2
"""B""",30,3
"""B""",40,4
"""C""",50,5
"""C""",60,6
"""C""",70,7


In [17]:
df_result = df.with_columns(
    pl.col("value").cum_count().over("group").alias("row_number_in_group")
)
df_result

group,value,row_number_in_group
str,i64,u32
"""A""",10,1
"""A""",20,2
"""B""",30,1
"""B""",40,2
"""C""",50,1
"""C""",60,2
"""C""",70,3


In [None]:
### 高级

In [24]:
# .over() also accepts an expression as the partition key. Here the key is
# when/then/otherwise(...).cum_count(): the when/then/otherwise expression
# yields 1 or 0 on every row, and its cumulative count therefore differs on
# every row. Each row ends up in its own partition, which is why the outer
# cum_count() is 1 everywhere in the output.
# NOTE(review): a cumulative sum of a 0/1 "new run starts here" flag may have
# been intended as the partition key -- confirm.
df_result = df.with_columns(
    pl.col("value")
    .cum_count()
    .over(
        pl.when(pl.col("value") >= pl.col("value").shift(1))
        .then(1)
        .otherwise(0)
        .cum_count()
    )
    .alias("row_number_in_group")
)
df_result

group,value,row_number_in_group
str,i64,u32
"""A""",10,1
"""A""",20,1
"""B""",30,1
"""B""",40,1
"""C""",50,1
"""C""",60,1
"""C""",70,1


## rank


# 特殊函数


## 头尾 head/tail


In [None]:
df = pl.DataFrame(
    {
        "name": ["satori", "scarlet", "marisa"],
        "age": [16, 400, 18],
        "length": [155.3, 145.9, 152.1],
        "salary": [6000, 7500, 5000],
    }
)

In [None]:
df.head(2)

name,age,length,salary
str,i64,f64,i64
"""satori""",16,155.3,6000
"""scarlet""",400,145.9,7500


In [None]:
df.tail(2)

name,age,length,salary
str,i64,f64,i64
"""scarlet""",400,145.9,7500
"""marisa""",18,152.1,5000


## 随机读 sample


In [None]:
df.sample(2)

name,age,length,salary
str,i64,f64,i64
"""satori""",16,155.3,6000
"""marisa""",18,152.1,5000
