# Data Manipulation with Polars Using The Fuel Economy Dataset

### Loading Libraries

In [1]:
# ZipFiles
import zipfile

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import polars as pl
import polars.selectors as cs

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
pl.__version__

'1.14.0'

### Getting Data

In [3]:
# Path
path = '/Users/isisromero/desktop/polars/datasets/vehicles.csv'

In [4]:
raw = pl.read_csv(path, null_values=['NA'])

#### Placing `tweak_auto` Function

In [5]:
def tweak_auto(df):
    cols = ['year', 'make', 'model', 'displ', 'cylinders', 'trany', 
            'drive', 'VClass', 'fuelType', 'barrels08', 'city08', 'highway08', 'createdOn']
    return (df
            .select(pl.col(cols))
            .with_columns(pl.col('year').cast(pl.Int16),
                          pl.col(['cylinders', 'highway08', 'city08']).cast(pl.UInt8),
                          pl.col(['displ', 'barrels08']).cast(pl.Float32),
                          pl.col(['make', 'model', 'VClass', 'drive', 'fuelType']).cast(pl.Categorical),
                          pl.col('createdOn').str.to_datetime('%a %b %d %H:%M:%S %Z %Y'),
                          is_automatic=pl.col('trany')                    
                          .str.contains('Automatic')
                          .fill_null('Automatic'),
                          num_gears=pl.col('trany')
                          .str.extract(r'(\d+)')
                          .cast(pl.UInt8)
                          .fill_null(6))
           )

In [6]:
autos = tweak_auto(raw)

In [7]:
print(autos)

shape: (48_231, 15)
┌──────┬────────────┬──────────────┬───────┬───┬───────────┬─────────────┬─────────────┬───────────┐
│ year ┆ make       ┆ model        ┆ displ ┆ … ┆ highway08 ┆ createdOn   ┆ is_automati ┆ num_gears │
│ ---  ┆ ---        ┆ ---          ┆ ---   ┆   ┆ ---       ┆ ---         ┆ c           ┆ ---       │
│ i16  ┆ cat        ┆ cat          ┆ f32   ┆   ┆ u8        ┆ datetime[μs ┆ ---         ┆ u8        │
│      ┆            ┆              ┆       ┆   ┆           ┆ ]           ┆ str         ┆           │
╞══════╪════════════╪══════════════╪═══════╪═══╪═══════════╪═════════════╪═════════════╪═══════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider       ┆ 2.0   ┆ … ┆ 25        ┆ 2013-01-01  ┆ false       ┆ 5         │
│      ┆            ┆ Veloce 2000  ┆       ┆   ┆           ┆ 00:00:00    ┆             ┆           │
│ 1985 ┆ Ferrari    ┆ Testarossa   ┆ 4.9   ┆ … ┆ 14        ┆ 2013-01-01  ┆ false       ┆ 5         │
│      ┆            ┆              ┆       ┆   ┆           ┆ 00:00:00  

### Adding Columns

In [8]:
print(autos
      .select(pl.col(['highway08', 'city08']),
              mpg_ratio=(pl.col('highway08') / pl.col('city08'))
             )
     )

shape: (48_231, 3)
┌───────────┬────────┬───────────┐
│ highway08 ┆ city08 ┆ mpg_ratio │
│ ---       ┆ ---    ┆ ---       │
│ u8        ┆ u8     ┆ f64       │
╞═══════════╪════════╪═══════════╡
│ 25        ┆ 19     ┆ 1.315789  │
│ 14        ┆ 9      ┆ 1.555556  │
│ 33        ┆ 23     ┆ 1.434783  │
│ 12        ┆ 10     ┆ 1.2       │
│ 23        ┆ 17     ┆ 1.352941  │
│ …         ┆ …      ┆ …         │
│ 26        ┆ 19     ┆ 1.368421  │
│ 28        ┆ 20     ┆ 1.4       │
│ 24        ┆ 18     ┆ 1.333333  │
│ 24        ┆ 18     ┆ 1.333333  │
│ 21        ┆ 16     ┆ 1.3125    │
└───────────┴────────┴───────────┘


In [9]:
# Adding mpg_ratio column to Dataframe
print(autos
      .with_columns(mpg_ratio=pl.col('highway08') / pl.col('city08'))
     )

shape: (48_231, 16)
┌──────┬────────────┬──────────────┬───────┬───┬─────────────┬─────────────┬───────────┬───────────┐
│ year ┆ make       ┆ model        ┆ displ ┆ … ┆ createdOn   ┆ is_automati ┆ num_gears ┆ mpg_ratio │
│ ---  ┆ ---        ┆ ---          ┆ ---   ┆   ┆ ---         ┆ c           ┆ ---       ┆ ---       │
│ i16  ┆ cat        ┆ cat          ┆ f32   ┆   ┆ datetime[μs ┆ ---         ┆ u8        ┆ f64       │
│      ┆            ┆              ┆       ┆   ┆ ]           ┆ str         ┆           ┆           │
╞══════╪════════════╪══════════════╪═══════╪═══╪═════════════╪═════════════╪═══════════╪═══════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider       ┆ 2.0   ┆ … ┆ 2013-01-01  ┆ false       ┆ 5         ┆ 1.315789  │
│      ┆            ┆ Veloce 2000  ┆       ┆   ┆ 00:00:00    ┆             ┆           ┆           │
│ 1985 ┆ Ferrari    ┆ Testarossa   ┆ 4.9   ┆ … ┆ 2013-01-01  ┆ false       ┆ 5         ┆ 1.555556  │
│      ┆            ┆              ┆       ┆   ┆ 00:00:00    ┆         

### Simulating The Index

In [10]:
# Creating & Simulating Index
print(autos
      .with_row_index('index')
     )

shape: (48_231, 16)
┌───────┬──────┬────────────┬──────────────┬───┬───────────┬─────────────┬─────────────┬───────────┐
│ index ┆ year ┆ make       ┆ model        ┆ … ┆ highway08 ┆ createdOn   ┆ is_automati ┆ num_gears │
│ ---   ┆ ---  ┆ ---        ┆ ---          ┆   ┆ ---       ┆ ---         ┆ c           ┆ ---       │
│ u32   ┆ i16  ┆ cat        ┆ cat          ┆   ┆ u8        ┆ datetime[μs ┆ ---         ┆ u8        │
│       ┆      ┆            ┆              ┆   ┆           ┆ ]           ┆ str         ┆           │
╞═══════╪══════╪════════════╪══════════════╪═══╪═══════════╪═════════════╪═════════════╪═══════════╡
│ 0     ┆ 1985 ┆ Alfa Romeo ┆ Spider       ┆ … ┆ 25        ┆ 2013-01-01  ┆ false       ┆ 5         │
│       ┆      ┆            ┆ Veloce 2000  ┆   ┆           ┆ 00:00:00    ┆             ┆           │
│ 1     ┆ 1985 ┆ Ferrari    ┆ Testarossa   ┆ … ┆ 14        ┆ 2013-01-01  ┆ false       ┆ 5         │
│       ┆      ┆            ┆              ┆   ┆           ┆ 00:00:00  

### Removing Columns

In [11]:
print(autos
      .drop('createdOn')
      .columns
     )

['year', 'make', 'model', 'displ', 'cylinders', 'trany', 'drive', 'VClass', 'fuelType', 'barrels08', 'city08', 'highway08', 'is_automatic', 'num_gears']


In [12]:
final_cols = ['year', 'make', 'model', 'displ', 'cylinders', 'trany', 
              'drive', 'VClass', 'fuelType', 'barrels08', 'city08', 'highway08', 
              'is_automatic', 'num_gears']

In [13]:
print(autos
      .select(final_cols)
     )

shape: (48_231, 14)
┌──────┬────────────┬──────────────────┬───────┬───┬────────┬───────────┬──────────────┬───────────┐
│ year ┆ make       ┆ model            ┆ displ ┆ … ┆ city08 ┆ highway08 ┆ is_automatic ┆ num_gears │
│ ---  ┆ ---        ┆ ---              ┆ ---   ┆   ┆ ---    ┆ ---       ┆ ---          ┆ ---       │
│ i16  ┆ cat        ┆ cat              ┆ f32   ┆   ┆ u8     ┆ u8        ┆ str          ┆ u8        │
╞══════╪════════════╪══════════════════╪═══════╪═══╪════════╪═══════════╪══════════════╪═══════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider Veloce    ┆ 2.0   ┆ … ┆ 19     ┆ 25        ┆ false        ┆ 5         │
│      ┆            ┆ 2000             ┆       ┆   ┆        ┆           ┆              ┆           │
│ 1985 ┆ Ferrari    ┆ Testarossa       ┆ 4.9   ┆ … ┆ 9      ┆ 14        ┆ false        ┆ 5         │
│ 1985 ┆ Dodge      ┆ Charger          ┆ 2.2   ┆ … ┆ 23     ┆ 33        ┆ false        ┆ 5         │
│ 1985 ┆ Dodge      ┆ B150/B250 Wagon  ┆ 5.2   ┆ … ┆ 10     ┆ 12       

### Renaming Columns

In [14]:
print(autos
      .select(pl.all()
              .exclude(['createdOn', 'barrels08'])
              .name.suffix('_auto'))
             )

shape: (48_231, 13)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ year_auto ┆ make_auto ┆ model_aut ┆ displ_aut ┆ … ┆ city08_au ┆ highway08 ┆ is_automa ┆ num_gear │
│ ---       ┆ ---       ┆ o         ┆ o         ┆   ┆ to        ┆ _auto     ┆ tic_auto  ┆ s_auto   │
│ i16       ┆ cat       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│           ┆           ┆ cat       ┆ f32       ┆   ┆ u8        ┆ u8        ┆ str       ┆ u8       │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 1985      ┆ Alfa      ┆ Spider    ┆ 2.0       ┆ … ┆ 19        ┆ 25        ┆ false     ┆ 5        │
│           ┆ Romeo     ┆ Veloce    ┆           ┆   ┆           ┆           ┆           ┆          │
│           ┆           ┆ 2000      ┆           ┆   ┆           ┆           ┆           ┆          │
│ 1985      ┆ Ferrari   ┆ Testaross ┆ 4.9       ┆ … ┆ 9         ┆ 14   

In [15]:
final_cols = ['year', 'make', 'model', 'city_mpg', 'highway_mpg']

In [16]:
print(autos
      .with_columns(pl.col('city08').alias('city_mpg'),
                    highway_mpg=pl.col('highway08'))
      .select(final_cols)
     )

shape: (48_231, 5)
┌──────┬────────────┬─────────────────────┬──────────┬─────────────┐
│ year ┆ make       ┆ model               ┆ city_mpg ┆ highway_mpg │
│ ---  ┆ ---        ┆ ---                 ┆ ---      ┆ ---         │
│ i16  ┆ cat        ┆ cat                 ┆ u8       ┆ u8          │
╞══════╪════════════╪═════════════════════╪══════════╪═════════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider Veloce 2000  ┆ 19       ┆ 25          │
│ 1985 ┆ Ferrari    ┆ Testarossa          ┆ 9        ┆ 14          │
│ 1985 ┆ Dodge      ┆ Charger             ┆ 23       ┆ 33          │
│ 1985 ┆ Dodge      ┆ B150/B250 Wagon 2WD ┆ 10       ┆ 12          │
│ 1993 ┆ Subaru     ┆ Legacy AWD Turbo    ┆ 17       ┆ 23          │
│ …    ┆ …          ┆ …                   ┆ …        ┆ …           │
│ 1993 ┆ Subaru     ┆ Legacy              ┆ 19       ┆ 26          │
│ 1993 ┆ Subaru     ┆ Legacy              ┆ 20       ┆ 28          │
│ 1993 ┆ Subaru     ┆ Legacy AWD          ┆ 18       ┆ 24          │
│ 1993 ┆ Subaru

##### `SyntaxError` due to keyword argument

In [17]:
print(autos
...  .with_columns(highway_mpg=pl.col('highway08'),
...                pl.col('city08').alias('city_mpg'))
...  .select(final_cols)
...  )

SyntaxError: positional argument follows keyword argument (2241669681.py, line 3)

#### `.rename` as a Dictionary Mapping

In [18]:
final_cols = ['year', 'make', 'model', 'city_mpg', 'highway_mpg']

In [19]:
print(autos
      .rename({'city08':'city_mpg', 'highway08':'highway_mpg'})
      .select(final_cols)
     )

shape: (48_231, 5)
┌──────┬────────────┬─────────────────────┬──────────┬─────────────┐
│ year ┆ make       ┆ model               ┆ city_mpg ┆ highway_mpg │
│ ---  ┆ ---        ┆ ---                 ┆ ---      ┆ ---         │
│ i16  ┆ cat        ┆ cat                 ┆ u8       ┆ u8          │
╞══════╪════════════╪═════════════════════╪══════════╪═════════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider Veloce 2000  ┆ 19       ┆ 25          │
│ 1985 ┆ Ferrari    ┆ Testarossa          ┆ 9        ┆ 14          │
│ 1985 ┆ Dodge      ┆ Charger             ┆ 23       ┆ 33          │
│ 1985 ┆ Dodge      ┆ B150/B250 Wagon 2WD ┆ 10       ┆ 12          │
│ 1993 ┆ Subaru     ┆ Legacy AWD Turbo    ┆ 17       ┆ 23          │
│ …    ┆ …          ┆ …                   ┆ …        ┆ …           │
│ 1993 ┆ Subaru     ┆ Legacy              ┆ 19       ┆ 26          │
│ 1993 ┆ Subaru     ┆ Legacy              ┆ 20       ┆ 28          │
│ 1993 ┆ Subaru     ┆ Legacy AWD          ┆ 18       ┆ 24          │
│ 1993 ┆ Subaru

### Joining Data Frames: Left Join

In [20]:
trucks = pl.DataFrame(
    {'make': ['Ford', 'Tesla', 'Chevy', 'Custom', 'Ford'],
     'model': ['F150', 'Cybertruck', 'Silverado', 'HotRod', 'F250'],
     'year': [2018, 2024, 2019, 1967, 2017],
     'city_mpg': [19, None, 17, 12, 18],
    })

In [21]:
print(trucks)

shape: (5, 4)
┌────────┬────────────┬──────┬──────────┐
│ make   ┆ model      ┆ year ┆ city_mpg │
│ ---    ┆ ---        ┆ ---  ┆ ---      │
│ str    ┆ str        ┆ i64  ┆ i64      │
╞════════╪════════════╪══════╪══════════╡
│ Ford   ┆ F150       ┆ 2018 ┆ 19       │
│ Tesla  ┆ Cybertruck ┆ 2024 ┆ null     │
│ Chevy  ┆ Silverado  ┆ 2019 ┆ 17       │
│ Custom ┆ HotRod     ┆ 1967 ┆ 12       │
│ Ford   ┆ F250       ┆ 2017 ┆ 18       │
└────────┴────────────┴──────┴──────────┘


In [22]:
manufacturer = pl.DataFrame(
    {'name': ['Ford', 'Tesla', 'Chevy', 'Toyota'],
     'country': ['USA', 'USA', 'USA', 'Japan'],
     'founded': [1903, 2003, 1911, 1937],
     'employees': [199_000, 48_000, 225_000, 370_000],
     'vehicles': [80, 3, 45, 30],
    })

In [23]:
print(manufacturer)

shape: (4, 5)
┌────────┬─────────┬─────────┬───────────┬──────────┐
│ name   ┆ country ┆ founded ┆ employees ┆ vehicles │
│ ---    ┆ ---     ┆ ---     ┆ ---       ┆ ---      │
│ str    ┆ str     ┆ i64     ┆ i64       ┆ i64      │
╞════════╪═════════╪═════════╪═══════════╪══════════╡
│ Ford   ┆ USA     ┆ 1903    ┆ 199000    ┆ 80       │
│ Tesla  ┆ USA     ┆ 2003    ┆ 48000     ┆ 3        │
│ Chevy  ┆ USA     ┆ 1911    ┆ 225000    ┆ 45       │
│ Toyota ┆ Japan   ┆ 1937    ┆ 370000    ┆ 30       │
└────────┴─────────┴─────────┴───────────┴──────────┘


In [24]:
print(manufacturer.join(trucks, how='left'))

ValueError: must specify `on` OR `left_on` and `right_on`

In [25]:
print(manufacturer.join(trucks, how='left', left_on='name', right_on='make'))

shape: (5, 8)
┌────────┬─────────┬─────────┬───────────┬──────────┬────────────┬──────┬──────────┐
│ name   ┆ country ┆ founded ┆ employees ┆ vehicles ┆ model      ┆ year ┆ city_mpg │
│ ---    ┆ ---     ┆ ---     ┆ ---       ┆ ---      ┆ ---        ┆ ---  ┆ ---      │
│ str    ┆ str     ┆ i64     ┆ i64       ┆ i64      ┆ str        ┆ i64  ┆ i64      │
╞════════╪═════════╪═════════╪═══════════╪══════════╪════════════╪══════╪══════════╡
│ Ford   ┆ USA     ┆ 1903    ┆ 199000    ┆ 80       ┆ F150       ┆ 2018 ┆ 19       │
│ Ford   ┆ USA     ┆ 1903    ┆ 199000    ┆ 80       ┆ F250       ┆ 2017 ┆ 18       │
│ Tesla  ┆ USA     ┆ 2003    ┆ 48000     ┆ 3        ┆ Cybertruck ┆ 2024 ┆ null     │
│ Chevy  ┆ USA     ┆ 1911    ┆ 225000    ┆ 45       ┆ Silverado  ┆ 2019 ┆ 17       │
│ Toyota ┆ Japan   ┆ 1937    ┆ 370000    ┆ 30       ┆ null       ┆ null ┆ null     │
└────────┴─────────┴─────────┴───────────┴──────────┴────────────┴──────┴──────────┘


### Right Join

In [26]:
print(trucks.join(manufacturer, how='left', right_on='name', left_on='make'))

shape: (5, 8)
┌────────┬────────────┬──────┬──────────┬─────────┬─────────┬───────────┬──────────┐
│ make   ┆ model      ┆ year ┆ city_mpg ┆ country ┆ founded ┆ employees ┆ vehicles │
│ ---    ┆ ---        ┆ ---  ┆ ---      ┆ ---     ┆ ---     ┆ ---       ┆ ---      │
│ str    ┆ str        ┆ i64  ┆ i64      ┆ str     ┆ i64     ┆ i64       ┆ i64      │
╞════════╪════════════╪══════╪══════════╪═════════╪═════════╪═══════════╪══════════╡
│ Ford   ┆ F150       ┆ 2018 ┆ 19       ┆ USA     ┆ 1903    ┆ 199000    ┆ 80       │
│ Tesla  ┆ Cybertruck ┆ 2024 ┆ null     ┆ USA     ┆ 2003    ┆ 48000     ┆ 3        │
│ Chevy  ┆ Silverado  ┆ 2019 ┆ 17       ┆ USA     ┆ 1911    ┆ 225000    ┆ 45       │
│ Custom ┆ HotRod     ┆ 1967 ┆ 12       ┆ null    ┆ null    ┆ null      ┆ null     │
│ Ford   ┆ F250       ┆ 2017 ┆ 18       ┆ USA     ┆ 1903    ┆ 199000    ┆ 80       │
└────────┴────────────┴──────┴──────────┴─────────┴─────────┴───────────┴──────────┘


### Inner Joins

In [27]:
print(manufacturer.join(trucks, left_on='name', right_on='make'))

shape: (4, 8)
┌───────┬─────────┬─────────┬───────────┬──────────┬────────────┬──────┬──────────┐
│ name  ┆ country ┆ founded ┆ employees ┆ vehicles ┆ model      ┆ year ┆ city_mpg │
│ ---   ┆ ---     ┆ ---     ┆ ---       ┆ ---      ┆ ---        ┆ ---  ┆ ---      │
│ str   ┆ str     ┆ i64     ┆ i64       ┆ i64      ┆ str        ┆ i64  ┆ i64      │
╞═══════╪═════════╪═════════╪═══════════╪══════════╪════════════╪══════╪══════════╡
│ Ford  ┆ USA     ┆ 1903    ┆ 199000    ┆ 80       ┆ F150       ┆ 2018 ┆ 19       │
│ Tesla ┆ USA     ┆ 2003    ┆ 48000     ┆ 3        ┆ Cybertruck ┆ 2024 ┆ null     │
│ Chevy ┆ USA     ┆ 1911    ┆ 225000    ┆ 45       ┆ Silverado  ┆ 2019 ┆ 17       │
│ Ford  ┆ USA     ┆ 1903    ┆ 199000    ┆ 80       ┆ F250       ┆ 2017 ┆ 18       │
└───────┴─────────┴─────────┴───────────┴──────────┴────────────┴──────┴──────────┘


### Outer Joins

In [28]:
# In how, 'outer' parameter has been replaced by 'full'
print(manufacturer.join(trucks, how='full', left_on='name', right_on='make'))

shape: (6, 9)
┌────────┬─────────┬─────────┬───────────┬───┬────────┬────────────┬──────┬──────────┐
│ name   ┆ country ┆ founded ┆ employees ┆ … ┆ make   ┆ model      ┆ year ┆ city_mpg │
│ ---    ┆ ---     ┆ ---     ┆ ---       ┆   ┆ ---    ┆ ---        ┆ ---  ┆ ---      │
│ str    ┆ str     ┆ i64     ┆ i64       ┆   ┆ str    ┆ str        ┆ i64  ┆ i64      │
╞════════╪═════════╪═════════╪═══════════╪═══╪════════╪════════════╪══════╪══════════╡
│ Ford   ┆ USA     ┆ 1903    ┆ 199000    ┆ … ┆ Ford   ┆ F150       ┆ 2018 ┆ 19       │
│ Tesla  ┆ USA     ┆ 2003    ┆ 48000     ┆ … ┆ Tesla  ┆ Cybertruck ┆ 2024 ┆ null     │
│ Chevy  ┆ USA     ┆ 1911    ┆ 225000    ┆ … ┆ Chevy  ┆ Silverado  ┆ 2019 ┆ 17       │
│ null   ┆ null    ┆ null    ┆ null      ┆ … ┆ Custom ┆ HotRod     ┆ 1967 ┆ 12       │
│ Ford   ┆ USA     ┆ 1903    ┆ 199000    ┆ … ┆ Ford   ┆ F250       ┆ 2017 ┆ 18       │
│ Toyota ┆ Japan   ┆ 1937    ┆ 370000    ┆ … ┆ null   ┆ null       ┆ null ┆ null     │
└────────┴─────────┴─────────

### Semi Joins

In [29]:
print(manufacturer.join(trucks, how='semi', left_on='name', right_on='make'))

shape: (3, 5)
┌───────┬─────────┬─────────┬───────────┬──────────┐
│ name  ┆ country ┆ founded ┆ employees ┆ vehicles │
│ ---   ┆ ---     ┆ ---     ┆ ---       ┆ ---      │
│ str   ┆ str     ┆ i64     ┆ i64       ┆ i64      │
╞═══════╪═════════╪═════════╪═══════════╪══════════╡
│ Ford  ┆ USA     ┆ 1903    ┆ 199000    ┆ 80       │
│ Tesla ┆ USA     ┆ 2003    ┆ 48000     ┆ 3        │
│ Chevy ┆ USA     ┆ 1911    ┆ 225000    ┆ 45       │
└───────┴─────────┴─────────┴───────────┴──────────┘


### Cross Joins

In [30]:
sizes = pl.DataFrame({'size': ['small', 'medium', 'large'],})

colors = pl.DataFrame({'color': ['red', 'green', ],})

In [31]:
print(sizes.join(colors, how='cross'))

shape: (6, 2)
┌────────┬───────┐
│ size   ┆ color │
│ ---    ┆ ---   │
│ str    ┆ str   │
╞════════╪═══════╡
│ small  ┆ red   │
│ small  ┆ green │
│ medium ┆ red   │
│ medium ┆ green │
│ large  ┆ red   │
│ large  ┆ green │
└────────┴───────┘


### Anti Joins

In [32]:
print(manufacturer.join(trucks, how='anti', left_on='name', right_on='make'))

shape: (1, 5)
┌────────┬─────────┬─────────┬───────────┬──────────┐
│ name   ┆ country ┆ founded ┆ employees ┆ vehicles │
│ ---    ┆ ---     ┆ ---     ┆ ---       ┆ ---      │
│ str    ┆ str     ┆ i64     ┆ i64       ┆ i64      │
╞════════╪═════════╪═════════╪═══════════╪══════════╡
│ Toyota ┆ Japan   ┆ 1937    ┆ 370000    ┆ 30       │
└────────┴─────────┴─────────┴───────────┴──────────┘


In [33]:
print(trucks.join(manufacturer, how='anti', right_on='name', left_on='make'))

shape: (1, 4)
┌────────┬────────┬──────┬──────────┐
│ make   ┆ model  ┆ year ┆ city_mpg │
│ ---    ┆ ---    ┆ ---  ┆ ---      │
│ str    ┆ str    ┆ i64  ┆ i64      │
╞════════╪════════╪══════╪══════════╡
│ Custom ┆ HotRod ┆ 1967 ┆ 12       │
└────────┴────────┴──────┴──────────┘


### Join Validation

In [34]:
print(manufacturer.join(trucks, left_on='name', right_on='make', validate='1:1'))

ComputeError: join keys did not fulfill 1:1 validation

In [35]:
print(manufacturer.join(trucks, left_on='name', right_on='make', validate='m:1'))

ComputeError: join keys did not fulfill m:1 validation

In [36]:
print(manufacturer.join(trucks, left_on='name', right_on='make', validate='1:m'))

shape: (4, 8)
┌───────┬─────────┬─────────┬───────────┬──────────┬────────────┬──────┬──────────┐
│ name  ┆ country ┆ founded ┆ employees ┆ vehicles ┆ model      ┆ year ┆ city_mpg │
│ ---   ┆ ---     ┆ ---     ┆ ---       ┆ ---      ┆ ---        ┆ ---  ┆ ---      │
│ str   ┆ str     ┆ i64     ┆ i64       ┆ i64      ┆ str        ┆ i64  ┆ i64      │
╞═══════╪═════════╪═════════╪═══════════╪══════════╪════════════╪══════╪══════════╡
│ Ford  ┆ USA     ┆ 1903    ┆ 199000    ┆ 80       ┆ F150       ┆ 2018 ┆ 19       │
│ Tesla ┆ USA     ┆ 2003    ┆ 48000     ┆ 3        ┆ Cybertruck ┆ 2024 ┆ null     │
│ Chevy ┆ USA     ┆ 1911    ┆ 225000    ┆ 45       ┆ Silverado  ┆ 2019 ┆ 17       │
│ Ford  ┆ USA     ┆ 1903    ┆ 199000    ┆ 80       ┆ F250       ┆ 2017 ┆ 18       │
└───────┴─────────┴─────────┴───────────┴──────────┴────────────┴──────┴──────────┘


In [37]:
print(trucks
      .filter(pl.col('make').is_duplicated())
     )

shape: (2, 4)
┌──────┬───────┬──────┬──────────┐
│ make ┆ model ┆ year ┆ city_mpg │
│ ---  ┆ ---   ┆ ---  ┆ ---      │
│ str  ┆ str   ┆ i64  ┆ i64      │
╞══════╪═══════╪══════╪══════════╡
│ Ford ┆ F150  ┆ 2018 ┆ 19       │
│ Ford ┆ F250  ┆ 2017 ┆ 18       │
└──────┴───────┴──────┴──────────┘


### Speeding Up Joins with Sorting

In [38]:
# Retrieving Data from:
url = 'https://data.openei.org/files/907/'\
   '2016cityandcountylightdutyvehicleinventory.xlsb'

In [39]:
inv_raw = pl.read_excel(url, engine='calamine', 
    read_options=dict(header_row=1), sheet_name='City')

In [40]:
print(inv_raw
      .select(['state_abbr', 'gisjoin', 'city_id', 'city_name','fuel_type_org', 'fuel_type', 'class', '2000', '2001'])
     )

shape: (307_828, 9)
┌────────────┬───────────┬─────────┬───────────┬───┬───────────────┬─────────┬──────────┬──────────┐
│ state_abbr ┆ gisjoin   ┆ city_id ┆ city_name ┆ … ┆ fuel_type     ┆ class   ┆ 2000     ┆ 2001     │
│ ---        ┆ ---       ┆ ---     ┆ ---       ┆   ┆ ---           ┆ ---     ┆ ---      ┆ ---      │
│ str        ┆ str       ┆ i64     ┆ str       ┆   ┆ str           ┆ str     ┆ f64      ┆ f64      │
╞════════════╪═══════════╪═════════╪═══════════╪═══╪═══════════════╪═════════╪══════════╪══════════╡
│ AL         ┆ G01000124 ┆ 100124  ┆ Abbeville ┆ … ┆ Other/Unknown ┆ Car     ┆ null     ┆ 0.000135 │
│ AL         ┆ G01000124 ┆ 100124  ┆ Abbeville ┆ … ┆ Diesel        ┆ Car     ┆ 0.00027  ┆ null     │
│            ┆           ┆         ┆           ┆   ┆ vehicle       ┆         ┆          ┆          │
│ AL         ┆ G01000124 ┆ 100124  ┆ Abbeville ┆ … ┆ Diesel        ┆ Truck   ┆ 0.001622 ┆ 0.001893 │
│            ┆           ┆         ┆           ┆   ┆ vehicle       ┆   

In [41]:
years = [str(i) for i in range(2000, 2019)]

In [42]:
years = [str(i) for i in range(2000, 2019)]

In [43]:
inv_yr = (inv_raw
    .with_columns(
        [pl.when(pl.col(year).cast(pl.Utf8) == "")
         .then(0)
         .otherwise(pl.col(year))
         .cast(pl.Float32)
         .alias(year)
         for year in years]) 
    .unpivot(
        index=['state_abbr', 'gisjoin', 'city_id', 'city_name',
               'fuel_type_org', 'fuel_type', 'class'],
        on=years
    )
    .rename({'variable': 'year', 'value': 'percent'})
    .with_columns(
        percent=pl.col('percent').fill_null(0)
    )
    .select(
        'state_abbr', 'city_name', 'fuel_type_org', 'fuel_type', 'class', 
        year=pl.col('year').cast(pl.Int16),
        percent=(pl.col('percent') * 100)
    )
)

In [44]:
print(inv_yr)

shape: (5_848_732, 7)
┌────────────┬───────────┬──────────────────┬──────────────────────────┬─────────┬──────┬──────────┐
│ state_abbr ┆ city_name ┆ fuel_type_org    ┆ fuel_type                ┆ class   ┆ year ┆ percent  │
│ ---        ┆ ---       ┆ ---              ┆ ---                      ┆ ---     ┆ ---  ┆ ---      │
│ str        ┆ str       ┆ str              ┆ str                      ┆ str     ┆ i16  ┆ f32      │
╞════════════╪═══════════╪══════════════════╪══════════════════════════╪═════════╪══════╪══════════╡
│ AL         ┆ Abbeville ┆ BI               ┆ Other/Unknown            ┆ Car     ┆ 2000 ┆ 0.0      │
│ AL         ┆ Abbeville ┆ DIES             ┆ Diesel vehicle           ┆ Car     ┆ 2000 ┆ 0.027042 │
│ AL         ┆ Abbeville ┆ DIES             ┆ Diesel vehicle           ┆ Truck   ┆ 2000 ┆ 0.16225  │
│ AL         ┆ Abbeville ┆ DIES             ┆ Diesel vehicle           ┆ Unknown ┆ 2000 ┆ 0.0      │
│ AL         ┆ Abbeville ┆ ELECTRIC VEHICLE ┆ Electric vehicle       

In [45]:
# make simple cat
gas_mapping = {
    'Diesel': 'Diesel vehicle',
    'Regular': 'Gasoline vehicle',
    'Premium': 'Gasoline vehicle',
    'Midgrade': 'Gasoline vehicle',
    'Gasoline or E85': 'Flex fuel vehicle',
    'Premium or E85': 'Flex fuel vehicle',
    'Premium Gas or Electricity': 'Plug-in hybrid electric vehicle',
    'Regular Gas or Electricity': 'Plug-in hybrid electric vehicle',
    'Premium and Electricity': 'Hybrid electric vehicle',
    'Regular Gas and Electricity': 'Hybrid electric vehicle',
    'Electricity': 'Electric vehicle',
    'Gasoline or natural gas': 'Other/Unknown',
    'Gasoline or propane': 'Other/Unknown',
    'CNG': 'Other/Unknown',

}
agg_yr = (autos
 .with_columns(VClass=pl.col('VClass').cast(pl.String))
 .with_columns(
    simple_class=pl.when(pl.col('VClass')
                         .str.to_lowercase().str.contains('car'))
                        .then(pl.lit('Car'))
                   .when(pl.col('VClass')
                         .str.to_lowercase().str.contains('truck'))
                        .then(pl.lit('Truck'))
                   .otherwise(pl.lit('Other')),
    fuel_type=pl.col('fuelType').cast(pl.String)
                .replace_strict(gas_mapping, default='Missing')) #replace_strict fixed
 .group_by(['year', 'simple_class', 'fuel_type'])
 .agg(pl.col('city08').mean().alias('mean_mpg'))                                    
 )

In [46]:
print(agg_yr)

shape: (447, 4)
┌──────┬──────────────┬─────────────────────────────────┬───────────┐
│ year ┆ simple_class ┆ fuel_type                       ┆ mean_mpg  │
│ ---  ┆ ---          ┆ ---                             ┆ ---       │
│ i16  ┆ str          ┆ str                             ┆ f64       │
╞══════╪══════════════╪═════════════════════════════════╪═══════════╡
│ 2019 ┆ Car          ┆ Gasoline vehicle                ┆ 22.237232 │
│ 2023 ┆ Truck        ┆ Flex fuel vehicle               ┆ 16.4      │
│ 2022 ┆ Car          ┆ Hybrid electric vehicle         ┆ 27.294118 │
│ 1986 ┆ Other        ┆ Gasoline vehicle                ┆ 16.353919 │
│ 2019 ┆ Car          ┆ Plug-in hybrid electric vehicl… ┆ 24.2      │
│ …    ┆ …            ┆ …                               ┆ …         │
│ 2002 ┆ Other        ┆ Other/Unknown                   ┆ 12.0      │
│ 1996 ┆ Car          ┆ Gasoline vehicle                ┆ 18.94026  │
│ 1993 ┆ Other        ┆ Other/Unknown                   ┆ 15.0      │
│ 20

In [47]:
print (agg_yr
       .join(inv_yr, left_on=['year', 'simple_class', 'fuel_type'], right_on=['year', 'class', 'fuel_type'])
      )

shape: (3_234_418, 8)
┌──────┬──────────────┬──────────────┬───────────┬────────────┬───────────┬─────────────┬──────────┐
│ year ┆ simple_class ┆ fuel_type    ┆ mean_mpg  ┆ state_abbr ┆ city_name ┆ fuel_type_o ┆ percent  │
│ ---  ┆ ---          ┆ ---          ┆ ---       ┆ ---        ┆ ---       ┆ rg          ┆ ---      │
│ i16  ┆ str          ┆ str          ┆ f64       ┆ str        ┆ str       ┆ ---         ┆ f32      │
│      ┆              ┆              ┆           ┆            ┆           ┆ str         ┆          │
╞══════╪══════════════╪══════════════╪═══════════╪════════════╪═══════════╪═════════════╪══════════╡
│ 2000 ┆ Car          ┆ Other/Unknow ┆ 17.6      ┆ AL         ┆ Abbeville ┆ BI          ┆ 0.0      │
│      ┆              ┆ n            ┆           ┆            ┆           ┆             ┆          │
│ 2000 ┆ Car          ┆ Diesel       ┆ 32.0      ┆ AL         ┆ Abbeville ┆ DIES        ┆ 0.027042 │
│      ┆              ┆ vehicle      ┆           ┆            ┆      

##### Placing `Benchmarks`:

In [48]:
agg_yr_shuf = agg_yr.sample(len(agg_yr), with_replacement=False, seed=42)

inv_yr_shuf = inv_yr.sample(len(inv_yr), with_replacement=False, seed=42)

In [49]:
%%timeit
(agg_yr_shuf
 .join(inv_yr_shuf, left_on=['year', 'simple_class', 'fuel_type'], right_on=['year', 'class', 'fuel_type'])
          )

45.1 ms ± 899 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [50]:
# Sortin by Year
agg_yr_sort = agg_yr.sort('year')

inv_yr_sort = inv_yr.sort('year')

In [51]:
%%timeit
(agg_yr_sort
 .join(inv_yr_sort, left_on=['year', 'simple_class', 'fuel_type'], right_on=['year', 'class', 'fuel_type'])
)

45 ms ± 666 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [52]:
# Sort by Year & Class
agg_yr_sort2 = agg_yr.sort('year', 'simple_class', 'fuel_type')

inv_yr_sort2 = inv_yr.sort('year', 'class', 'fuel_type')

In [53]:
%%timeit
(agg_yr_sort2
 .join(inv_yr_sort2, left_on=['year', 'simple_class', 'fuel_type'], right_on=['year', 'class', 'fuel_type'])
)

43.3 ms ± 1.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [54]:
# Switching into Categorical Values
with pl.StringCache():
    agg_yr_cat = agg_yr_sort2.with_columns(
        pl.col('simple_class', 'fuel_type').cast(pl.Categorical))
    inv_yr_cat = inv_yr_sort2.with_columns(
        pl.col('class', 'fuel_type').cast(pl.Categorical))

In [55]:
%%timeit
(agg_yr_cat
 .join(inv_yr_cat, left_on=['year', 'simple_class', 'fuel_type'], right_on=['year', 'class', 'fuel_type'])
)

26.1 ms ± 228 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [56]:
# Comparing with Panda Sorting
agg_pd = agg_yr_sort2.to_pandas().sort_values(['year', 'simple_class', 'fuel_type'])

inv_pd = inv_yr_sort2.to_pandas().sort_values(['year', 'class', 'fuel_type'])

In [57]:
%%timeit
(agg_pd
 .merge(inv_pd, left_on=['year', 'simple_class', 'fuel_type'], right_on=['year', 'class', 'fuel_type'])
)

794 ms ± 8.77 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Visualizing The Join

##### Note: On pivot argument `column` has been replaced for `on`

In [61]:
print(agg_yr
      .join(inv_yr, left_on=['year', 'simple_class', 'fuel_type'],
            right_on=['year', 'class', 'fuel_type'])
      .filter(city_name='Salt Lake City')
      .pivot(index='year', on='simple_class', values='mean_mpg', aggregate_function='mean')
     )

shape: (19, 3)
┌──────┬───────────┬───────────┐
│ year ┆ Car       ┆ Truck     │
│ ---  ┆ ---       ┆ ---       │
│ i16  ┆ f64       ┆ f64       │
╞══════╪═══════════╪═══════════╡
│ 2000 ┆ 20.465806 ┆ 12.558559 │
│ 2001 ┆ 19.972288 ┆ 12.685439 │
│ 2002 ┆ 20.834157 ┆ 12.618696 │
│ 2003 ┆ 20.505458 ┆ 12.45977  │
│ 2004 ┆ 19.001403 ┆ 12.260606 │
│ …    ┆ …         ┆ …         │
│ 2014 ┆ 37.992402 ┆ 16.691919 │
│ 2015 ┆ 35.871053 ┆ 17.464186 │
│ 2016 ┆ 34.458447 ┆ 17.664933 │
│ 2017 ┆ 43.256047 ┆ 17.57901  │
│ 2018 ┆ 41.715381 ┆ 17.953026 │
└──────┴───────────┴───────────┘


In [64]:
import altair as alt
print(alt.__version__)

5.5.0


In [67]:
# print(agg_yr
#       .join(inv_yr, left_on=['year', 'simple_class', 'fuel_type'],
#             right_on=['year', 'class', 'fuel_type'])
#       .filter(city_name='Salt Lake City')
#       .pivot(index='year', on='simple_class', values='mean_mpg', aggregate_function='mean')
#       .plot.line(x='year', y=['Car', 'Truck'], title='Salt Lake City Mileage')
#      )

### Adding Rows

In [68]:
print(autos
      .tail(10)
      .vstack(autos.head(10))
     )

shape: (20, 15)
┌──────┬────────┬─────────┬───────┬───┬───────────┬─────────────────────┬──────────────┬───────────┐
│ year ┆ make   ┆ model   ┆ displ ┆ … ┆ highway08 ┆ createdOn           ┆ is_automatic ┆ num_gears │
│ ---  ┆ ---    ┆ ---     ┆ ---   ┆   ┆ ---       ┆ ---                 ┆ ---          ┆ ---       │
│ i16  ┆ cat    ┆ cat     ┆ f32   ┆   ┆ u8        ┆ datetime[μs]        ┆ str          ┆ u8        │
╞══════╪════════╪═════════╪═══════╪═══╪═══════════╪═════════════════════╪══════════════╪═══════════╡
│ 1993 ┆ Saab   ┆ 900     ┆ 2.1   ┆ … ┆ 24        ┆ 2013-01-01 00:00:00 ┆ false        ┆ 5         │
│ 1993 ┆ Saturn ┆ SL      ┆ 1.9   ┆ … ┆ 33        ┆ 2013-01-01 00:00:00 ┆ true         ┆ 4         │
│ 1993 ┆ Saturn ┆ SL      ┆ 1.9   ┆ … ┆ 30        ┆ 2013-01-01 00:00:00 ┆ true         ┆ 4         │
│ 1993 ┆ Saturn ┆ SL      ┆ 1.9   ┆ … ┆ 33        ┆ 2013-01-01 00:00:00 ┆ false        ┆ 5         │
│ 1993 ┆ Saturn ┆ SL      ┆ 1.9   ┆ … ┆ 32        ┆ 2013-01-01 00:00:00 ┆ f

### Reshaping & Pivoting Data

In [69]:
top_n = (autos
         .group_by('make')
         .agg(pl.col('city08').count())
         .sort('city08')
         .tail(5)
        )

In [70]:
print(top_n)

shape: (5, 2)
┌───────────┬────────┐
│ make      ┆ city08 │
│ ---       ┆ ---    │
│ cat       ┆ u32    │
╞═══════════╪════════╡
│ BMW       ┆ 2537   │
│ Dodge     ┆ 2696   │
│ GMC       ┆ 2823   │
│ Ford      ┆ 3835   │
│ Chevrolet ┆ 4472   │
└───────────┴────────┘


In [72]:
print(autos
      .filter(pl.col('make').is_in(top_n['make']))
      .pivot(index='year', on='make', values='city08', aggregate_function='median')
     )

shape: (42, 6)
┌──────┬───────┬──────┬───────────┬──────┬──────┐
│ year ┆ Dodge ┆ BMW  ┆ Chevrolet ┆ Ford ┆ GMC  │
│ ---  ┆ ---   ┆ ---  ┆ ---       ┆ ---  ┆ ---  │
│ i16  ┆ f64   ┆ f64  ┆ f64       ┆ f64  ┆ f64  │
╞══════╪═══════╪══════╪═══════════╪══════╪══════╡
│ 1985 ┆ 17.0  ┆ 18.0 ┆ 15.0      ┆ 15.0 ┆ 15.0 │
│ 1993 ┆ 14.0  ┆ 15.0 ┆ 15.0      ┆ 15.0 ┆ 14.0 │
│ 1994 ┆ 14.5  ┆ 15.5 ┆ 15.0      ┆ 15.0 ┆ 14.0 │
│ 1995 ┆ 15.0  ┆ 16.5 ┆ 15.0      ┆ 15.0 ┆ 14.0 │
│ 1996 ┆ 13.0  ┆ 18.0 ┆ 15.0      ┆ 15.0 ┆ 14.0 │
│ …    ┆ …     ┆ …    ┆ …         ┆ …    ┆ …    │
│ 2021 ┆ 15.0  ┆ 21.0 ┆ 16.0      ┆ 20.0 ┆ 15.0 │
│ 2022 ┆ 15.0  ┆ 21.0 ┆ 16.0      ┆ 20.0 ┆ 15.0 │
│ 2023 ┆ 15.0  ┆ 22.0 ┆ 17.0      ┆ 19.0 ┆ 16.0 │
│ 2024 ┆ 16.0  ┆ 23.0 ┆ 17.0      ┆ 19.0 ┆ 16.0 │
│ 2025 ┆ 17.0  ┆ 25.0 ┆ 16.0      ┆ 21.5 ┆ 16.0 │
└──────┴───────┴──────┴───────────┴──────┴──────┘


In [73]:
pivoted = (autos
           .filter(pl.col('make').is_in(top_n['make']))     
           .pivot(index='year', on='make', values='city08', aggregate_function='median')
           .sort('year')
          )

In [74]:
print(pivoted)

shape: (42, 6)
┌──────┬───────┬──────┬───────────┬──────┬──────┐
│ year ┆ Dodge ┆ BMW  ┆ Chevrolet ┆ Ford ┆ GMC  │
│ ---  ┆ ---   ┆ ---  ┆ ---       ┆ ---  ┆ ---  │
│ i16  ┆ f64   ┆ f64  ┆ f64       ┆ f64  ┆ f64  │
╞══════╪═══════╪══════╪═══════════╪══════╪══════╡
│ 1984 ┆ 18.0  ┆ 18.0 ┆ 16.0      ┆ 14.5 ┆ 15.0 │
│ 1985 ┆ 17.0  ┆ 18.0 ┆ 15.0      ┆ 15.0 ┆ 15.0 │
│ 1986 ┆ 14.5  ┆ 15.0 ┆ 15.0      ┆ 16.0 ┆ 15.0 │
│ 1987 ┆ 15.0  ┆ 15.0 ┆ 15.0      ┆ 15.0 ┆ 15.0 │
│ 1988 ┆ 14.0  ┆ 15.0 ┆ 15.0      ┆ 15.0 ┆ 15.0 │
│ …    ┆ …     ┆ …    ┆ …         ┆ …    ┆ …    │
│ 2021 ┆ 15.0  ┆ 21.0 ┆ 16.0      ┆ 20.0 ┆ 15.0 │
│ 2022 ┆ 15.0  ┆ 21.0 ┆ 16.0      ┆ 20.0 ┆ 15.0 │
│ 2023 ┆ 15.0  ┆ 22.0 ┆ 17.0      ┆ 19.0 ┆ 16.0 │
│ 2024 ┆ 16.0  ┆ 23.0 ┆ 17.0      ┆ 19.0 ┆ 16.0 │
│ 2025 ┆ 17.0  ┆ 25.0 ┆ 16.0      ┆ 21.5 ┆ 16.0 │
└──────┴───────┴──────┴───────────┴──────┴──────┘


In [76]:
# Preserving order with `.set_sorted`
pivoted = (autos
           .filter(pl.col('make').is_in(top_n['make']))
           .sort('year')
           .set_sorted('year')
           .pivot(index='year', on='make', values='city08', aggregate_function='median')
          )

In [77]:
print(pivoted)

shape: (42, 6)
┌──────┬───────────┬──────┬──────┬───────┬──────┐
│ year ┆ Chevrolet ┆ Ford ┆ BMW  ┆ Dodge ┆ GMC  │
│ ---  ┆ ---       ┆ ---  ┆ ---  ┆ ---   ┆ ---  │
│ i16  ┆ f64       ┆ f64  ┆ f64  ┆ f64   ┆ f64  │
╞══════╪═══════════╪══════╪══════╪═══════╪══════╡
│ 1984 ┆ 16.0      ┆ 14.5 ┆ 18.0 ┆ 18.0  ┆ 15.0 │
│ 1985 ┆ 15.0      ┆ 15.0 ┆ 18.0 ┆ 17.0  ┆ 15.0 │
│ 1986 ┆ 15.0      ┆ 16.0 ┆ 15.0 ┆ 14.5  ┆ 15.0 │
│ 1987 ┆ 15.0      ┆ 15.0 ┆ 15.0 ┆ 15.0  ┆ 15.0 │
│ 1988 ┆ 15.0      ┆ 15.0 ┆ 15.0 ┆ 14.0  ┆ 15.0 │
│ …    ┆ …         ┆ …    ┆ …    ┆ …     ┆ …    │
│ 2021 ┆ 16.0      ┆ 20.0 ┆ 21.0 ┆ 15.0  ┆ 15.0 │
│ 2022 ┆ 16.0      ┆ 20.0 ┆ 21.0 ┆ 15.0  ┆ 15.0 │
│ 2023 ┆ 17.0      ┆ 19.0 ┆ 22.0 ┆ 15.0  ┆ 16.0 │
│ 2024 ┆ 17.0      ┆ 19.0 ┆ 23.0 ┆ 16.0  ┆ 16.0 │
│ 2025 ┆ 16.0      ┆ 21.5 ┆ 25.0 ┆ 17.0  ┆ 16.0 │
└──────┴───────────┴──────┴──────┴───────┴──────┘


In [79]:
pivoted = (autos
           .filter(pl.col('make').is_in(top_n['make']))
           .sort('year').pivot(index='year', on='make', values='city08', aggregate_function='median', maintain_order=True)
          )

In [80]:
print(pivoted)

shape: (42, 6)
┌──────┬───────────┬──────┬──────┬───────┬──────┐
│ year ┆ Chevrolet ┆ Ford ┆ BMW  ┆ Dodge ┆ GMC  │
│ ---  ┆ ---       ┆ ---  ┆ ---  ┆ ---   ┆ ---  │
│ i16  ┆ f64       ┆ f64  ┆ f64  ┆ f64   ┆ f64  │
╞══════╪═══════════╪══════╪══════╪═══════╪══════╡
│ 1984 ┆ 16.0      ┆ 14.5 ┆ 18.0 ┆ 18.0  ┆ 15.0 │
│ 1985 ┆ 15.0      ┆ 15.0 ┆ 18.0 ┆ 17.0  ┆ 15.0 │
│ 1986 ┆ 15.0      ┆ 16.0 ┆ 15.0 ┆ 14.5  ┆ 15.0 │
│ 1987 ┆ 15.0      ┆ 15.0 ┆ 15.0 ┆ 15.0  ┆ 15.0 │
│ 1988 ┆ 15.0      ┆ 15.0 ┆ 15.0 ┆ 14.0  ┆ 15.0 │
│ …    ┆ …         ┆ …    ┆ …    ┆ …     ┆ …    │
│ 2021 ┆ 16.0      ┆ 20.0 ┆ 21.0 ┆ 15.0  ┆ 15.0 │
│ 2022 ┆ 16.0      ┆ 20.0 ┆ 21.0 ┆ 15.0  ┆ 15.0 │
│ 2023 ┆ 17.0      ┆ 19.0 ┆ 22.0 ┆ 15.0  ┆ 16.0 │
│ 2024 ┆ 17.0      ┆ 19.0 ┆ 23.0 ┆ 16.0  ┆ 16.0 │
│ 2025 ┆ 16.0      ┆ 21.5 ┆ 25.0 ┆ 17.0  ┆ 16.0 │
└──────┴───────────┴──────┴──────┴───────┴──────┘


### Melting Data