# Polars（据说比Pandas快不少）
* 安装方法：`pip3 install -U "polars[pyarrow]"`（方括号需加引号，否则 zsh 等 shell 会把它当作通配符）
* 用户指南: https://pola-rs.github.io/polars-book/
* h2oai 的 db-benchmark 基准测试结果: https://h2oai.github.io/db-benchmark/
* Python文档: https://pola-rs.github.io/polars/py-polars/html/reference/index.html
* 用户指南（目录页）: https://pola-rs.github.io/polars-book/user-guide/index.html

In [1]:
import pandas as pd
import polars as pl
import timeit
# Five-row demo frame used by the cells below: two integer columns
# (A counts up 1..5, B counts down 5..1) and two string columns
# ("fruits", "cars") that serve as grouping keys.
df = pl.DataFrame({
    "A": [1, 2, 3, 4, 5],
    "fruits": ["banana", "banana", "apple", "apple", "banana"],
    "B": [5, 4, 3, 2, 1],
    "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
})

In [2]:
# Bare last expression: the notebook renders the demo frame as a table.
df

A,fruits,B,cars
i64,str,i64,str
1,"""banana""",5,"""beetle"""
2,"""banana""",4,"""audi"""
3,"""apple""",3,"""beetle"""
4,"""apple""",2,"""beetle"""
5,"""banana""",1,"""beetle"""


In [3]:
# Sort by "fruits", then project a mix of plain columns, a literal,
# a filtered aggregate, and several window expressions built with .over().
(
    df
    .sort("fruits")
    .select(
        [
            # Plain columns, passed through unchanged.
            "fruits",
            "cars",
            # Constant string column holding the text "fruits".
            pl.lit("fruits").alias("literal_string_fruits"),
            # Sum of B restricted to rows whose car is "beetle"; no .over(),
            # so it is one value for the whole frame (11 in the recorded output).
            pl.col("B")
            .filter(pl.col("cars") == "beetle")
            .sum(),
            # Sum of A over rows with B > 2, computed per "cars" group.
            pl.col("A")
            .filter(pl.col("B") > 2)
            .sum()
            .over("cars")
            .alias("sum_A_by_cars"),
            # Sum of A per "fruits" group.
            pl.col("A")
            .sum()
            .over("fruits")
            .alias("sum_A_by_fruits"),
            # A reversed within each "fruits" group.
            # NOTE(review): the recorded output shows list-typed values here;
            # the shape may differ across Polars versions - confirm.
            pl.col("A")
            .reverse()
            .over("fruits")
            .alias("rev_A_by_fruits"),
            # A ordered by B within each "fruits" group.
            pl.col("A")
            .sort_by("B")
            .over("fruits")
            .alias("sort_A_by_B_by_fruits"),
        ]
    )
)

fruits,cars,literal_string_fruits,B,sum_A_by_cars,sum_A_by_fruits,rev_A_by_fruits,sort_A_by_B_by_fruits
str,str,str,i64,i64,i64,list,list
"""apple""","""beetle""","""fruits""",11,4,7,"[4, 3]","[4, 3]"
"""apple""","""beetle""","""fruits""",11,4,7,"[4, 3]","[4, 3]"
"""banana""","""beetle""","""fruits""",11,4,8,"[5, 2, 1]","[5, 2, 1]"
"""banana""","""audi""","""fruits""",11,2,8,"[5, 2, 1]","[5, 2, 1]"
"""banana""","""beetle""","""fruits""",11,4,8,"[5, 2, 1]","[5, 2, 1]"


In [7]:
# Benchmark inputs, hoisted so each path is spelled exactly once.
# NOTE(review): these are machine-specific absolute paths - point them at
# your own CSV files before running.
CSV_A = "E:/data/a.csv"
CSV_B = "E:/data/b.csv"

# --- Read + sort: pandas vs. Polars ---
# Each timed span covers reading the CSV, sorting by "current" in
# descending order and taking head().
start_df = timeit.default_timer()
# Dedicated names are used instead of `df`, so the Polars demo frame built
# earlier in the notebook is not overwritten and those cells stay re-runnable.
pd_data = pd.read_csv(CSV_A)
pd_top = pd_data.sort_values("current", ascending=False).head()
stop_df = timeit.default_timer()
print('Pandas_time: ', stop_df - start_df)

start_pl = timeit.default_timer()
data = pl.read_csv(CSV_A)
# `descending=` is the keyword accepted by current Polars; older releases
# spelled it `reverse=`, which now raises a TypeError.
data.sort(by="current", descending=True).head()
stop_pl = timeit.default_timer()
print('Polars_time: ', stop_pl - start_pl)

# --- Vertical concatenation: pandas vs. Polars ---
# Note that each timed span also includes reading both CSV files, not only
# the concatenation itself.
start_df1 = timeit.default_timer()
df_1 = pd.read_csv(CSV_A)
df_2 = pd.read_csv(CSV_B)
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
# call and works on old and new pandas alike.
pd.concat([df_1, df_2], ignore_index=True)
stop_df1 = timeit.default_timer()
print('Pandas_time1: ', stop_df1 - start_df1)

start_pl1 = timeit.default_timer()
pl_1 = pl.read_csv(CSV_A)
pl_2 = pl.read_csv(CSV_B)
pl_1.vstack(pl_2)
stop_pl1 = timeit.default_timer()
print('Polars_time1: ', stop_pl1 - start_pl1)


Pandas_time:  0.2836948000000348
Polars_time:  0.0953109999999242
Pandas_time1:  0.343936900000017
Polars_time1:  0.042977000000064436
