# Chapter 10: Selecting and Creating Columns

In [1]:
import polars as pl
# Print the installed Polars version information so readers can compare their
# environment with the one used to build this chapter.
pl.show_versions()  # The book is built with Polars version 1.0.0

In [2]:
# Load the full dataset once; `rebels` is the small running example: the same
# data without the "films" column, restricted to three named characters.
starwars = pl.read_parquet("data/starwars.parquet")
rebels = starwars.drop("films").filter(
    pl.col("name").is_in(["Luke Skywalker", "Leia Organa", "Han Solo"])
)

# Print the frame as three consecutive column slices (0-5, 6-10, 11-end),
# one slice per table.
print(
    rebels[:, :6],
    rebels[:, 6:11],
    rebels[:, 11:],
    sep="\n",
)

## Selecting Columns

In [4]:
# Four ways to specify a column in select(): a plain name, an explicit
# pl.col(), a regular expression inside pl.col(), and a derived expression
# that is given a new name.
rebels.select(
    "name",
    pl.col("homeworld"),
    pl.col("^.*_color$"),
    height_m=pl.col("height") / 100,
)

### Introducing Selectors

In [6]:
import polars.selectors as cs

In [7]:
# Selector-based version of the earlier pl.col() example.
# cs.by_name() is for exact column names; regular expressions belong in
# cs.matches(), so the "*_color" columns are selected with that instead of
# relying on by_name() to interpret a regex string.
rebels.select(
    "name",
    cs.by_name("homeworld"),
    cs.matches("^.*_color$"),
    (cs.by_name("height") / 100).alias("height_m"),
)

### Selecting Based on Name

In [9]:
# Columns whose names begin with "birth_".
rebels.select(
    cs.starts_with("birth_"),
)

In [10]:
# Columns whose names end with "_color".
rebels.select(
    cs.ends_with("_color"),
)

In [11]:
# Columns whose names contain an underscore anywhere.
rebels.select(
    cs.contains("_"),
)

In [12]:
# Columns whose names are exactly four lowercase ASCII letters.
rebels.select(
    cs.matches("^[a-z]{4}$"),
)

### Selecting Based on Data Type

In [14]:
# Mean of every numeric column per hair colour.
# maintain_order=True keeps the groups in order of first appearance; without
# it the row order of the result is not guaranteed and can differ between
# runs, which makes the displayed output irreproducible.
rebels.group_by("hair_color", maintain_order=True).agg(cs.numeric().mean())

In [15]:
# Columns with a string data type.
rebels.select(
    cs.string(),
)

In [16]:
# Columns with a temporal data type.
rebels.select(
    cs.temporal(),
)

In [17]:
# Columns whose data type is exactly "list of strings".
rebels.select(
    cs.by_dtype(pl.List(pl.String)),
)

### Selecting Based on Position

In [19]:
# Every third column, starting with the first one.
# The stop value 999 is presumably just a bound larger than the number of
# columns in the frame.
rebels.select(
    cs.by_index(range(0, 999, 3)),
)

In [20]:
# "name" followed by the last two columns (negative indices count from the end).
rebels.select(
    "name",
    cs.by_index(range(-2, 0)),
)

In [21]:
# Request the single column at position 20.
# NOTE(review): if the frame has fewer than 21 columns this presumably raises;
# confirm that this cell is meant as an error demonstration.
rebels.select(cs.by_index(20))

In [22]:
# Request positions 20 and 21 through a range instead of a single integer.
# NOTE(review): out-of-range handling for a range may differ from the
# single-index call above; confirm the expected result against the Polars docs.
rebels.select(cs.by_index(range(20, 22)))

### Combining Selectors

In [24]:
# Union of two selectors: the "hair_color" column plus every numeric column.
rebels.select(
    cs.by_name("hair_color") | cs.numeric(),
)

In [25]:
# One-row frame used to demonstrate set operations on selectors.
df = pl.DataFrame({"d": 1, "i": True, "s": True, "c": True, "o": 1.0})

print(df)

# x selects three columns by name; y selects every Boolean column.
x = cs.by_name("d", "i", "s")
y = cs.boolean()

print("\nselector => columns")

# Map each label to the selector it describes. Building the selectors
# explicitly avoids eval() on strings while printing exactly the same table.
selector_examples = {
    "x": x,
    "y": y,
    "x | y": x | y,  # union
    "x & y": x & y,  # intersection
    "x - y": x - y,  # difference
    "x ^ y": x ^ y,  # symmetric difference
    "~x": ~x,        # complement
    "x - x": x - x,  # empty selection
}

for s, selector in selector_examples.items():
    print(f"{s:8} => {cs.expand_selector(df, selector)}")

In [26]:
# Subtracting a selector from itself leaves no columns to select.
df.select(
    x - x,
)

In [27]:
# Move chosen columns to the front: bind a selector to `first` inline, then
# append its complement so all remaining columns follow.
print(
    df.select(
        first := cs.by_name("c", "i"),
        ~first,
    )
)
# Same pattern, this time moving the last column to the front.
print(
    df.select(
        first := cs.last(),
        ~first,
    )
)

## Creating Columns

In [29]:
# Add a body-mass-index column: mass divided by the square of height / 100.
rebels.with_columns(
    (pl.col("mass") / ((pl.col("height") / 100) ** 2)).alias("bmi"),
)

In [30]:
from datetime import datetime

# Add two columns in a single call: the body-mass index and the age (in
# years) on 25 May 1983.
rebels.with_columns(
    bmi=pl.col("mass") / ((pl.col("height") / 100) ** 2),
    # Elapsed days divided by 365 days per year (the original divisor 356 was
    # a digit transposition); the cast to UInt8 then drops the fractional part.
    age_destroy=(
        (datetime(1983, 5, 25) - pl.col("birth_date")).dt.total_days() / 365
    ).cast(pl.UInt8),
)

In [31]:
# Deliberate failure: all expressions passed to one with_columns() call are
# evaluated against the input frame, so the "bmi" column does not exist yet
# when bmi_cat refers to it. The error is caught and printed so the notebook
# still runs from top to bottom.
try:
    rebels.with_columns(
        bmi=pl.col("mass") / ((pl.col("height") / 100) ** 2),
        bmi_cat=pl.col("bmi").cut(
            [18.5, 25], labels=["Underweight", "Normal", "Overweight"]
        ),
    )
except pl.exceptions.ColumnNotFoundError as error:
    print(f"{type(error).__name__}: {error}")

In [32]:
# Two chained with_columns() calls: the first creates "bmi", the second can
# therefore refer to it when binning the values into labelled categories.
(
    rebels
    .with_columns(
        bmi=pl.col("mass") / ((pl.col("height") / 100) ** 2),
    )
    .with_columns(
        bmi_cat=pl.col("bmi").cut(
            breaks=[18.5, 25],
            labels=["Underweight", "Normal", "Overweight"],
        ),
    )
)

In [33]:
# From the full dataset keep name, a computed "bmi" column and species; drop
# rows containing nulls, then keep the five rows with the largest bmi.
(
    starwars
    .select(
        "name",
        (pl.col("mass") / ((pl.col("height") / 100) ** 2)).alias("bmi"),
        "species",
    )
    .drop_nulls()
    .top_k(k=5, by="bmi")
)

## Related Column Operations

In [35]:
# Remove three columns by name; strict=False means a name that is not present
# in the frame is skipped instead of raising.
rebels.drop(
    ["name", "films", "screen_time"],
    strict=False,
)

In [36]:
# Same idea expressed as a selection: everything except the named columns,
# using the complement of a by_name() selector.
rebels.select(
    ~cs.by_name("name", "films", "screen_time"),
)

In [37]:
# And once more with the dedicated exclude() selector.
rebels.select(
    cs.exclude("name", "films", "screen_time"),
)

In [38]:
# Rename in two passes: first an explicit old -> new mapping, then a function
# applied to every column name that strips a trailing "_color". Finally show
# only the columns affected by the renames, plus "name".
(
    rebels
    .rename(
        {
            "homeworld": "planet",
            "mass": "weight",
        }
    )
    .rename(lambda column_name: column_name.removesuffix("_color"))
    .select("name", "planet", "weight", "hair", "skin", "eye")
)

In [39]:
# Three pieces to glue together side by side: a one-column frame of names,
# a frame holding the "*_color" columns, and a standalone Series of quotes.
rebel_names = rebels.select("name")
rebel_colors = rebels.select(cs.ends_with("_color"))
rebel_quotes = pl.Series(
    name="quote",
    values=[
        "You know, sometimes I amaze myself.",
        "That doesn't sound too hard.",
        "I have a bad feeling about this.",
    ],
)

# hstack() takes either another DataFrame or a list of Series.
rebel_names.hstack(rebel_colors).hstack([rebel_quotes])

In [40]:
# Prepend a row-number column called "rebel_id" that starts counting at 1.
rebels.with_row_index(
    name="rebel_id",
    offset=1,
)

## Takeaways