# Chapter 10: Selecting and Creating Columns

In [None]:
import polars as pl
# Display the installed Polars version so it can be compared with the book's.
pl.__version__  # The book is built with Polars version 1.20.0

In [None]:
# Load the Star Wars characters from the Parquet file.
starwars = pl.read_parquet("data/starwars.parquet")

# Drop the "films" column and keep only the three named characters.
is_rebel = pl.col("name").is_in(["Luke Skywalker", "Leia Organa", "Han Solo"])
rebels = starwars.drop("films").filter(is_rebel)

# Print the frame in three column windows (columns 0-5, 6-10, and 11 onwards),
# presumably so that each part fits the output width.
print(rebels[:, :6], rebels[:, 6:11], rebels[:, 11:], sep="\n")

## Selecting Columns

In [None]:
# select() accepts several ways of referring to columns; each argument below
# demonstrates a different one.
rebels.select(
    "name",  # a plain column name
    pl.col("homeworld"),  # an explicit column expression
    pl.col("^.*_color$"),  # a regular expression: every column ending in "_color"
    # A derived column, renamed with alias(); presumably converts centimetres
    # to metres, judging by the new name.
    (pl.col("height") / 100).alias("height_m"),
)

### Introducing Selectors

In [None]:
import polars.selectors as cs

In [None]:
# The same kind of selection written with selectors (cs) instead of pl.col().
rebels.select(
    "name",
    cs.by_name("homeworld"),  # selector counterpart of pl.col("homeworld")
    # NOTE(review): by_name() is documented as matching column names; confirm
    # that this regex-shaped string is expanded as a pattern in the Polars
    # version in use (cs.matches() is the explicit regex selector).
    cs.by_name("^.*_color$"),
    # Selectors can be used like expressions: arithmetic and alias() work here.
    (cs.by_name("height") / 100).alias("height_m"),
)

### Selecting Based on Name

In [None]:
# Select the columns whose names start with "birth_".
rebels.select(cs.starts_with("birth_"))

In [None]:
# Select the columns whose names end with "_color".
rebels.select(cs.ends_with("_color"))

In [None]:
# Select the columns whose names contain an underscore.
rebels.select(cs.contains("_"))

In [None]:
# Select the columns whose names match the regular expression, i.e. names made
# up of exactly four lowercase letters a-z.
rebels.select(cs.matches("^[a-z]{4}$"))

### Selecting Based on Data Type

In [None]:
# Group by hair color and average every numeric column within each group.
rebels.group_by("hair_color").agg(cs.numeric().mean())

In [None]:
# Select the columns with a String data type.
rebels.select(cs.string())

In [None]:
# Select the temporal columns (date, datetime, duration, and time data types).
rebels.select(cs.temporal())

In [None]:
# Select the columns whose data type is a list of strings.
rebels.select(cs.by_dtype(pl.List(pl.String)))

### Selecting Based on Position

In [None]:
# Select every third column, starting with the first. The range deliberately
# overshoots the number of columns; require_all=False makes by_index() ignore
# the indices that are out of range instead of raising.
rebels.select(cs.by_index(range(0, 999, 3), require_all=False))

In [None]:
# Select "name" plus the last two columns: range(-2, 0) yields -2 and -1, and
# negative indices count from the end.
rebels.select("name", cs.by_index(range(-2, 0)))

In [None]:
# This raises a ColumnNotFoundError:
# rebels.select(cs.by_index(20))

In [None]:
# With require_all=False, out-of-range indices are ignored instead of raising.
# NOTE(review): presumably neither index 20 nor 21 exists in rebels, so no
# column is selected — confirm against the actual column count.
rebels.select(cs.by_index(range(20, 22), require_all=False))

### Combining Selectors

In [None]:
# Combine two selectors with | (union): "hair_color" plus all numeric columns.
rebels.select(cs.by_name("hair_color") | cs.numeric())

In [None]:
# A small frame built from scalars: "i", "s" and "c" hold Booleans, while "d"
# and "o" hold numbers.
df = pl.DataFrame({"d": 1, "i": True, "s": True, "c": True, "o": 1.0})

print(df)

# Two selectors to combine: one by name, one by data type.
x = cs.by_name("d", "i", "s")
y = cs.boolean()

print("\nselector => columns")

# Pair each label with the selector it describes instead of passing the label
# through eval(); the printed output is the same.
selector_examples = {
    "x": x,
    "y": y,
    "x | y": x | y,  # union
    "x & y": x & y,  # intersection
    "x - y": x - y,  # difference
    "x ^ y": x ^ y,  # symmetric difference
    "~x": ~x,  # complement
    "x - x": x - x,  # matches no columns
}

# expand_selector() resolves a selector against df and returns the matching
# column names.
for label, selector in selector_examples.items():
    print(f"{label:8} => {cs.expand_selector(df, selector)}")

In [None]:
# x - x matches no columns, so this selects nothing.
df.select(x - x)

In [None]:
# Put columns "c" and "i" in front: the walrus operator binds the selector to
# `first` so that its complement (~first) can supply the remaining columns in
# the same call.
print(df.select(first := cs.by_name("c", "i"), ~first))
# Show how the selector and its complement are represented as strings.
print(f"first: {first}, ~first: {~first}")

In [None]:
# Move the last column to the front: the walrus operator binds cs.last() to
# `first`, which is selected first, followed by all other columns (~first).
print(df.select(first := cs.last(), ~first))
# Show how the selector and its complement are represented as strings.
print(f"first: {first}, ~first: {~first}")

## Creating Columns

In [None]:
# Body-mass index: mass divided by the square of the height in metres.
# Assumes "height" is stored in centimetres and "mass" in kilograms — TODO confirm.
height_m = pl.col("height") / 100
rebels.with_columns(bmi=pl.col("mass") / height_m**2)

In [None]:
df = pl.DataFrame({"a": [1, 2, 3]})
# The expression keeps the name of its input column "a", so with_columns()
# replaces the existing column rather than adding a new one.
df.with_columns(pl.col("a") * 2)

In [None]:
# Naming the expression with a keyword argument adds it as a new column "a2"
# and leaves "a" untouched.
df.with_columns(a2=pl.col("a") * 2)

In [None]:
# Building blocks for the two new columns.
# Assumes "height" is stored in centimetres and "mass" in kilograms — TODO confirm.
height_m = pl.col("height") / 100
days_until_reference = (
    pl.date(1983, 5, 25) - pl.col("birth_date")
).dt.total_days()

# Add both columns in a single with_columns() call. Dividing the day count by
# 365 approximates years (leap days are ignored); the result is cast to an
# unsigned 8-bit integer.
rebels.with_columns(
    bmi=pl.col("mass") / height_m**2,
    age_destroy=(days_until_reference / 365).cast(pl.UInt8),
)

In [None]:
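# This raises a ColumnNotFoundError, because "bmi_cat" refers to "bmi", which
# is created in the same with_columns() call and therefore does not exist yet: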
# rebels.with_columns(
#     bmi=pl.col("mass") / ((pl.col("height") / 100) ** 2),
#     bmi_cat=pl.col("bmi").cut(
#         [18.5, 25], labels=["Underweight", "Normal", "Overweight"]
#     ),
# )

In [None]:
# Body-mass index: mass divided by the square of the height in metres.
# Assumes "height" is stored in centimetres and "mass" in kilograms — TODO confirm.
bmi_expr = pl.col("mass") / (pl.col("height") / 100) ** 2

# Bin the BMI at the break points 18.5 and 25 into three labelled categories.
bmi_cat_expr = pl.col("bmi").cut(
    [18.5, 25], labels=["Underweight", "Normal", "Overweight"]
)

# "bmi_cat" reads the "bmi" column, so "bmi" is added by an earlier, separate
# with_columns() call.
rebels.with_columns(bmi=bmi_expr).with_columns(bmi_cat=bmi_cat_expr)

In [None]:
# This raises a SyntaxError, because the positional argument "species" follows
# the keyword argument bmi=...:
# starwars.select(
#     "name",
#     bmi=(pl.col("mass") / ((pl.col("height") / 100) ** 2)),
#     "species",
# )

In [None]:
# Body-mass index: mass divided by the square of the height in metres.
# Assumes "height" is stored in centimetres and "mass" in kilograms — TODO confirm.
height_m = pl.col("height") / 100
bmi = (pl.col("mass") / height_m**2).alias("bmi")

# The expression is named with alias() rather than a keyword argument, so it
# can sit between the positional column names. Rows containing nulls are
# dropped before taking the five rows with the largest "bmi".
starwars.select("name", bmi, "species").drop_nulls().top_k(5, by="bmi")

In [None]:
# pl.lit(1) is a literal value; aliased as "ones", it is added as a new column.
df.with_columns(pl.lit(1).alias("ones"))

In [None]:
# select() returns only what is listed, so pl.all() is needed to keep the
# existing columns alongside the new literal column "ones".
df.select(pl.all(), pl.lit(1).alias("ones"))

## Related Column Operations

### Dropping

In [None]:
# Drop columns by name.
rebels.drop("name", "screen_time", strict=False)  # strict=False: names missing from the frame are ignored instead of raising

In [None]:
# Selecting the complement (~) of a by_name() selector keeps every column
# except the named ones.
rebels.select(~cs.by_name("name", "screen_time"))

In [None]:
# cs.exclude() selects every column except the named ones.
rebels.select(cs.exclude("name", "screen_time"))

### Renaming

In [None]:
# Step 1: rename two columns explicitly with an old-name -> new-name mapping.
renamed = rebels.rename({"homeworld": "planet", "mass": "weight"})

# Step 2: rename with a function that is applied to every column name; here it
# strips a trailing "_color" where present.
renamed = renamed.rename(
    lambda column_name: column_name.removesuffix("_color")
)

# Show a subset of the columns under their new names.
renamed.select("name", "planet", "weight", "hair", "skin", "eye")

### Stacking

In [None]:
# Two frames taken from rebels, plus a standalone Series, to stack side by side.
rebel_names = rebels.select("name")
rebel_colors = rebels.select(cs.ends_with("_color"))

# One quote per row. NOTE(review): assumes rebels has exactly three rows —
# confirm against the filtered data.
rebel_quotes = pl.Series(
    name="quote",
    values=[
        "You know, sometimes I amaze myself.",
        "That doesn't sound too hard.",
        "I have a bad feeling about this.",
    ],
)

# hstack() appends columns horizontally; it is given a DataFrame in the first
# call and a list of Series in the second.
names_and_colors = rebel_names.hstack(rebel_colors)
names_and_colors.hstack([rebel_quotes])

### Adding Row Indices

In [None]:
# Add a row-index column named "rebel_id" as the first column; offset=1 makes
# the numbering start at 1 instead of 0.
rebels.with_row_index(name="rebel_id", offset=1)

## Takeaways