In [1]:
import polars as pl

working with text, temporal and ensted data types


string methods

In [2]:
corpus = pl.DataFrame(
    {
        "raw_text": [
            " Data Science is amazing",
            "Data_analysis > Data entry",
            " Python&Polars;Fast"
        ]
    }
)
corpus

raw_text
str
""" Data Science is amazing"""
"""Data_analysis > Data entry"""
""" Python&Polars;Fast"""


just me playing around 

In [3]:
(
    corpus
    .select(pl.col("raw_text").str.split(" ").explode().alias("token"))
    .filter(pl.col("token") != "")
    .select(pl.col("token").unique().alias("unique_token"))
    .with_row_index("id")
)

id,unique_token
u32,str
0,""">"""
1,"""Data_analysis"""
2,"""Python&Polars;Fast"""
3,"""Data"""
4,"""amazing"""
5,"""is"""
6,"""entry"""
7,"""Science"""


In [4]:
corpus = corpus.with_columns(
    processed_text=pl.col("raw_text")
    .str.strip_chars()
    .str.to_lowercase()
    .str.replace_all("_", "")
)
corpus

raw_text,processed_text
str,str
""" Data Science is amazing""","""data science is amazing"""
"""Data_analysis > Data entry""","""dataanalysis > data entry"""
""" Python&Polars;Fast""","""python&polars;fast"""


In [None]:
corpus.with_columns(
    first_5_chars=pl.col("processed_text").str.slice(0, 5),
    first_word=pl.col("processed_text")
    .str.split("")
    .list.get(0),
    second_word=pl.col("processed_text").str.split("").list.get(1),
)

raw_text,processed_text,first_5_chars,first_word,second_word
str,str,str,str,str
""" Data Science is amazing""","""data science is amazing""","""data ""","""d""","""a"""
"""Data_analysis > Data entry""","""dataanalysis > data entry""","""dataa""","""d""","""a"""
""" Python&Polars;Fast""","""python&polars;fast""","""pytho""","""p""","""y"""


In [None]:
corpus.with_columns(
    len_chars=pl.col("processed_text").str.len_chars(),
    len_bytes=pl.col("processed_text").str.len_bytes(),
    count_a=pl.col("processed_text").str.count_matches("a",)
)

raw_text,processed_text,len_chars,len_bytes,count_a
str,str,u32,u32,u32
""" Data Science is amazing""","""data science is amazing""",23,23,4
"""Data_analysis > Data entry""","""dataanalysis > data entry""",25,25,6
""" Python&Polars;Fast""","""python&polars;fast""",18,18,2


In [None]:
posts = pl.DataFrame(
    {"post": ["Loving #python and #polars!",
              "a boomer post without a hashtag"]}
)

In [None]:
hashtag_regex = r"#(\w+)"

posts.with_columns(
    hashtags=pl.col("post").str.extract_all(hashtag_regex)
)

post,hashtags
str,list[str]
"""Loving #python and #polars!""","[""#python"", ""#polars""]"
"""a boomer post without a hashta…",[]


categorical stuff

In [None]:
cats = pl.DataFrame(
    {"name": ["Persian cat", "Siamese cat", "Lynx", "Lynx"]},
    schema={"name": pl.Categorical},
)

cats.with_columns(name_physical=pl.col("name").to_physical())

name,name_physical
cat,u32
"""Persian cat""",0
"""Siamese cat""",1
"""Lynx""",2
"""Lynx""",2


In [None]:
more_cats = pl.DataFrame(
    {"name": ["Maine Coon Cat", "Lynx", "lynx", "Siamese Cat"]},
    schema={"name": pl.Categorical},
)

more_cats.with_columns(
    pl.col("name").alias("name_physical").to_physical()
)

name,name_physical
cat,u32
"""Maine Coon Cat""",0
"""Lynx""",1
"""lynx""",2
"""Siamese Cat""",3


In [15]:
cats.join(more_cats, on="name")

  cats.join(more_cats, on="name")


name
cat
"""Lynx"""
"""Lynx"""


In [None]:
bear_enum_dtype = pl.Enum(["Polar", "Panda", "Brown"])

bear_enum_series = pl.Series(
    ["Polar", "Panda", "Brown", "Polar"], dtype=bear_enum_dtype
)

bear_cat_series = pl.Series(
    ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=pl.Categorical
)

In [23]:
bear_enum_series

"""Polar"""
"""Panda"""
"""Brown"""
"""Polar"""


In [24]:
bear_cat_series

"""Polar"""
"""Panda"""
"""Brown"""
"""Brown"""
"""Polar"""


In [None]:
data1 = {"name": ["kane", "bane", "shane"],
         "type": ["Polar", "Polar", "Brown"]}

data2 = {"avg_weight": [255, 128, 512], "type": ["Polar", "Panda", "Brown"]}

bear1 = pl.DataFrame(
    data1, schema={"name": pl.String, "type": bear_enum_dtype})
bear2 = pl.DataFrame(
    data2, schema={"avg_weight": pl.Int16, "type": bear_enum_dtype})

In [36]:
bear1

name,type
str,enum
"""kane""","""Polar"""
"""bane""","""Polar"""
"""shane""","""Brown"""


In [37]:
bear2

avg_weight,type
i16,enum
255,"""Polar"""
128,"""Panda"""
512,"""Brown"""


In [38]:
bear1.join(bear2, on="type")

name,type,avg_weight
str,enum,i16
"""kane""","""Polar""",255
"""bane""","""Polar""",255
"""shane""","""Brown""",512


time

In [None]:
pl.read_csv("data/all_stocks.csv", try_parse_dates=True)

symbol,date,open,high,low,close,adj close,volume
str,date,f64,f64,f64,f64,f64,i64
"""ASML""",1999-01-04,11.765625,12.28125,11.765625,12.140625,7.522523,1801867
"""ASML""",1999-01-05,11.859375,14.25,11.71875,13.96875,8.655257,8241600
"""ASML""",1999-01-06,14.25,17.601563,14.203125,16.875,10.456018,16400267
"""ASML""",1999-01-07,14.742188,17.8125,14.53125,16.851563,10.441495,17722133
"""ASML""",1999-01-08,16.078125,16.289063,15.023438,15.796875,9.787995,10696000
…,…,…,…,…,…,…,…
"""TSM""",2023-06-26,102.019997,103.040001,100.089996,100.110001,99.125954,8560000
"""TSM""",2023-06-27,101.150002,102.790001,100.019997,102.080002,101.076591,9732000
"""TSM""",2023-06-28,100.5,101.879997,100.220001,100.919998,99.927986,8160900
"""TSM""",2023-06-29,101.339996,101.519997,100.019997,100.639999,99.650742,7383900
