# Getting Started with Polars

### Loading Libraries

In [1]:
# ZipFiles
import zipfile

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import polars as pl

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [2]:
pl.__version__

'1.14.0'

### Laziness

In [3]:
# DataFrame Operation
df_ops = set(x for x in dir(pl.DataFrame()) if not x.startswith('_'))

# Lazy Operations
lazy_ops = set(x for x in dir(pl.LazyFrame()) if not x.startswith('_'))

In [4]:
print(sorted(df_ops - lazy_ops))

['corr', 'drop_in_place', 'equals', 'estimated_size', 'extend', 'flags', 'fold', 'get_column', 'get_column_index', 'get_columns', 'glimpse', 'hash_rows', 'height', 'hstack', 'insert_column', 'is_duplicated', 'is_empty', 'is_unique', 'item', 'iter_columns', 'iter_rows', 'iter_slices', 'map_rows', 'max_horizontal', 'mean_horizontal', 'min_horizontal', 'n_chunks', 'n_unique', 'partition_by', 'pivot', 'plot', 'product', 'rechunk', 'replace_column', 'row', 'rows', 'rows_by_key', 'sample', 'shape', 'shrink_to_fit', 'style', 'sum_horizontal', 'to_arrow', 'to_dict', 'to_dicts', 'to_dummies', 'to_init_repr', 'to_jax', 'to_numpy', 'to_pandas', 'to_series', 'to_struct', 'to_torch', 'transpose', 'unstack', 'upsample', 'vstack', 'write_avro', 'write_clipboard', 'write_csv', 'write_database', 'write_delta', 'write_excel', 'write_ipc', 'write_ipc_stream', 'write_json', 'write_ndjson', 'write_parquet']


In [5]:
print(sorted(lazy_ops & df_ops))

['approx_n_unique', 'bottom_k', 'cast', 'clear', 'clone', 'collect_schema', 'columns', 'count', 'describe', 'deserialize', 'drop', 'drop_nulls', 'dtypes', 'explode', 'fill_nan', 'fill_null', 'filter', 'gather_every', 'group_by', 'group_by_dynamic', 'head', 'interpolate', 'join', 'join_asof', 'join_where', 'lazy', 'limit', 'max', 'mean', 'median', 'melt', 'merge_sorted', 'min', 'null_count', 'pipe', 'quantile', 'rename', 'reverse', 'rolling', 'schema', 'select', 'select_seq', 'serialize', 'set_sorted', 'shift', 'slice', 'sort', 'sql', 'std', 'sum', 'tail', 'top_k', 'unique', 'unnest', 'unpivot', 'update', 'var', 'width', 'with_columns', 'with_columns_seq', 'with_row_count', 'with_row_index']


### Contexts & Expressions

In [6]:
col = pl.col('sample')

In [7]:
col_ops = set(x for x in dir(col) if not x.startswith('_'))

print(sorted(col_ops))

['abs', 'add', 'agg_groups', 'alias', 'all', 'and_', 'any', 'append', 'approx_n_unique', 'arccos', 'arccosh', 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg_max', 'arg_min', 'arg_sort', 'arg_true', 'arg_unique', 'arr', 'backward_fill', 'bin', 'bitwise_and', 'bitwise_count_ones', 'bitwise_count_zeros', 'bitwise_leading_ones', 'bitwise_leading_zeros', 'bitwise_or', 'bitwise_trailing_ones', 'bitwise_trailing_zeros', 'bitwise_xor', 'bottom_k', 'bottom_k_by', 'cast', 'cat', 'cbrt', 'ceil', 'clip', 'cos', 'cosh', 'cot', 'count', 'cum_count', 'cum_max', 'cum_min', 'cum_prod', 'cum_sum', 'cumulative_eval', 'cut', 'degrees', 'deserialize', 'diff', 'dot', 'drop_nans', 'drop_nulls', 'dt', 'entropy', 'eq', 'eq_missing', 'ewm_mean', 'ewm_mean_by', 'ewm_std', 'ewm_var', 'exclude', 'exp', 'explode', 'extend_constant', 'fill_nan', 'fill_null', 'filter', 'first', 'flatten', 'floor', 'floordiv', 'forward_fill', 'from_json', 'gather', 'gather_every', 'ge', 'get', 'gt', 'has_nulls', 'hash', 'head', 'hist'

In [8]:
(col
 .cast(pl.Int32)
 .fill_null(col.mean())
 .clip(upper_bound=100)
 .sample(10)
 .mean()
)

In [9]:
# Same Previous Approach:
ex1 = col.cast(pl.Int64)
ex2 = ex1.fill_null(col.cast(pl.Int64))
ex3 = ex2.clip(upper_bound=100)
ex4 = ex3.sample(10)
ex5 = ex4.mean()

In [10]:
pl.col.a_column

### Reading CSV Files

In [11]:
# Path
path = '/Users/isisromero/desktop/polars/datasets/vehicles.csv'

In [12]:
df = pl.read_csv(path, null_values=['NA'])

print(df)

shape: (48_231, 84)
┌───────────┬────────────┬───────────┬───────────┬───┬───────────┬──────────┬─────────┬──────────┐
│ barrels08 ┆ barrelsA08 ┆ charge120 ┆ charge240 ┆ … ┆ startStop ┆ phevCity ┆ phevHwy ┆ phevComb │
│ ---       ┆ ---        ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---      ┆ ---     ┆ ---      │
│ f64       ┆ f64        ┆ f64       ┆ f64       ┆   ┆ str       ┆ i64      ┆ i64     ┆ i64      │
╞═══════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪══════════╪═════════╪══════════╡
│ 14.167143 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ null      ┆ 0        ┆ 0       ┆ 0        │
│ 27.046364 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ null      ┆ 0        ┆ 0       ┆ 0        │
│ 11.018889 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ null      ┆ 0        ┆ 0       ┆ 0        │
│ 27.046364 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ null      ┆ 0        ┆ 0       ┆ 0        │
│ 15.658421 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ null      ┆ 0        ┆ 0       ┆ 0

### Lazy CSV Reading

In [13]:
lazy = pl.scan_csv(path, null_values=['NA'])
print(lazy)

naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

Csv SCAN [/Users/isisromero/desktop/polars/datasets/vehicles.csv]
PROJECT */84 COLUMNS


In [14]:
print(lazy
      .filter((pl.col('year') >= 1990) & (pl.col('year') < 2000))
      .select(['year', 'make', 'model'])
     )

naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

 SELECT [col("year"), col("make"), col("model")] FROM
  FILTER [(col("year")) >= (1990)] FROM
    FILTER [(col("year")) < (2000)] FROM
      Csv SCAN [/Users/isisromero/desktop/polars/datasets/vehicles.csv]
      PROJECT */84 COLUMNS


### Data Type Inference & Manual Overrides

In [15]:
print(df.dtypes)

[Float64, Float64, Float64, Float64, Int64, Float64, Int64, Float64, Float64, Float64, Float64, Int64, Int64, Float64, Float64, Int64, Float64, Int64, Float64, Float64, Float64, Float64, Int64, Float64, String, Int64, String, Int64, Int64, Int64, String, String, Int64, Int64, Int64, Float64, Int64, Float64, Float64, Float64, Float64, Int64, Int64, Int64, Int64, Int64, String, String, String, Boolean, Int64, Int64, Int64, Float64, Float64, Float64, Float64, String, Float64, Float64, Float64, Float64, String, Int64, Int64, String, String, String, String, String, String, String, String, String, String, String, Float64, String, String, String, String, Int64, Int64, Int64]


In [16]:
print(df.schema)

Schema([('barrels08', Float64), ('barrelsA08', Float64), ('charge120', Float64), ('charge240', Float64), ('city08', Int64), ('city08U', Float64), ('cityA08', Int64), ('cityA08U', Float64), ('cityCD', Float64), ('cityE', Float64), ('cityUF', Float64), ('co2', Int64), ('co2A', Int64), ('co2TailpipeAGpm', Float64), ('co2TailpipeGpm', Float64), ('comb08', Int64), ('comb08U', Float64), ('combA08', Int64), ('combA08U', Float64), ('combE', Float64), ('combinedCD', Float64), ('combinedUF', Float64), ('cylinders', Int64), ('displ', Float64), ('drive', String), ('engId', Int64), ('eng_dscr', String), ('feScore', Int64), ('fuelCost08', Int64), ('fuelCostA08', Int64), ('fuelType', String), ('fuelType1', String), ('ghgScore', Int64), ('ghgScoreA', Int64), ('highway08', Int64), ('highway08U', Float64), ('highwayA08', Int64), ('highwayA08U', Float64), ('highwayCD', Float64), ('highwayE', Float64), ('highwayUF', Float64), ('hlv', Int64), ('hpv', Int64), ('id', Int64), ('lv2', Int64), ('lv4', Int64), (

In [17]:
cols = ['year', 'make', 'model', 'displ', 'cylinders', 'trany', 'drive',
        'VClass', 'fuelType', 'barrels08', 'city08', 'highway08', 'createdOn']

In [18]:
print(df.select(cols))

shape: (48_231, 13)
┌──────┬────────────┬────────────────┬───────┬───┬───────────┬────────┬───────────┬────────────────┐
│ year ┆ make       ┆ model          ┆ displ ┆ … ┆ barrels08 ┆ city08 ┆ highway08 ┆ createdOn      │
│ ---  ┆ ---        ┆ ---            ┆ ---   ┆   ┆ ---       ┆ ---    ┆ ---       ┆ ---            │
│ i64  ┆ str        ┆ str            ┆ f64   ┆   ┆ f64       ┆ i64    ┆ i64       ┆ str            │
╞══════╪════════════╪════════════════╪═══════╪═══╪═══════════╪════════╪═══════════╪════════════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider Veloce  ┆ 2.0   ┆ … ┆ 14.167143 ┆ 19     ┆ 25        ┆ Tue Jan 01     │
│      ┆            ┆ 2000           ┆       ┆   ┆           ┆        ┆           ┆ 00:00:00 EST   │
│      ┆            ┆                ┆       ┆   ┆           ┆        ┆           ┆ 2013           │
│ 1985 ┆ Ferrari    ┆ Testarossa     ┆ 4.9   ┆ … ┆ 27.046364 ┆ 9      ┆ 14        ┆ Tue Jan 01     │
│      ┆            ┆                ┆       ┆   ┆           ┆        ┆

In [19]:
print(df.select(pl.col(cols)))

shape: (48_231, 13)
┌──────┬────────────┬────────────────┬───────┬───┬───────────┬────────┬───────────┬────────────────┐
│ year ┆ make       ┆ model          ┆ displ ┆ … ┆ barrels08 ┆ city08 ┆ highway08 ┆ createdOn      │
│ ---  ┆ ---        ┆ ---            ┆ ---   ┆   ┆ ---       ┆ ---    ┆ ---       ┆ ---            │
│ i64  ┆ str        ┆ str            ┆ f64   ┆   ┆ f64       ┆ i64    ┆ i64       ┆ str            │
╞══════╪════════════╪════════════════╪═══════╪═══╪═══════════╪════════╪═══════════╪════════════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider Veloce  ┆ 2.0   ┆ … ┆ 14.167143 ┆ 19     ┆ 25        ┆ Tue Jan 01     │
│      ┆            ┆ 2000           ┆       ┆   ┆           ┆        ┆           ┆ 00:00:00 EST   │
│      ┆            ┆                ┆       ┆   ┆           ┆        ┆           ┆ 2013           │
│ 1985 ┆ Ferrari    ┆ Testarossa     ┆ 4.9   ┆ … ┆ 27.046364 ┆ 9      ┆ 14        ┆ Tue Jan 01     │
│      ┆            ┆                ┆       ┆   ┆           ┆        ┆

In [20]:
# On Chaining Method Design
print(df
      .select(pl.col(cols))
      .select(pl.col(pl.Int64))
      .describe()
     )

shape: (9, 5)
┌────────────┬────────────┬───────────┬───────────┬───────────┐
│ statistic  ┆ year       ┆ cylinders ┆ city08    ┆ highway08 │
│ ---        ┆ ---        ┆ ---       ┆ ---       ┆ ---       │
│ str        ┆ f64        ┆ f64       ┆ f64       ┆ f64       │
╞════════════╪════════════╪═══════════╪═══════════╪═══════════╡
│ count      ┆ 48231.0    ┆ 47266.0   ┆ 48231.0   ┆ 48231.0   │
│ null_count ┆ 0.0        ┆ 965.0     ┆ 0.0       ┆ 0.0       │
│ mean       ┆ 2004.60571 ┆ 5.698388  ┆ 19.932533 ┆ 25.822002 │
│ std        ┆ 12.687496  ┆ 1.772877  ┆ 12.656606 ┆ 11.084974 │
│ min        ┆ 1984.0     ┆ 2.0       ┆ 6.0       ┆ 9.0       │
│ 25%        ┆ 1993.0     ┆ 4.0       ┆ 15.0      ┆ 20.0      │
│ 50%        ┆ 2006.0     ┆ 6.0       ┆ 18.0      ┆ 24.0      │
│ 75%        ┆ 2016.0     ┆ 6.0       ┆ 21.0      ┆ 28.0      │
│ max        ┆ 2025.0     ┆ 16.0      ┆ 153.0     ┆ 142.0     │
└────────────┴────────────┴───────────┴───────────┴───────────┘


In [21]:
# Adding & Selecting Columns 
print(df
      .select(pl.col(cols))
      .with_columns(pl.col('year').cast(pl.Int16),
                    pl.col('cylinders').cast(pl.UInt8),
                    pl.col('highway08').cast(pl.UInt8),
                    pl.col('city08').cast(pl.UInt8))
     )

shape: (48_231, 13)
┌──────┬────────────┬────────────────┬───────┬───┬───────────┬────────┬───────────┬────────────────┐
│ year ┆ make       ┆ model          ┆ displ ┆ … ┆ barrels08 ┆ city08 ┆ highway08 ┆ createdOn      │
│ ---  ┆ ---        ┆ ---            ┆ ---   ┆   ┆ ---       ┆ ---    ┆ ---       ┆ ---            │
│ i16  ┆ str        ┆ str            ┆ f64   ┆   ┆ f64       ┆ u8     ┆ u8        ┆ str            │
╞══════╪════════════╪════════════════╪═══════╪═══╪═══════════╪════════╪═══════════╪════════════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider Veloce  ┆ 2.0   ┆ … ┆ 14.167143 ┆ 19     ┆ 25        ┆ Tue Jan 01     │
│      ┆            ┆ 2000           ┆       ┆   ┆           ┆        ┆           ┆ 00:00:00 EST   │
│      ┆            ┆                ┆       ┆   ┆           ┆        ┆           ┆ 2013           │
│ 1985 ┆ Ferrari    ┆ Testarossa     ┆ 4.9   ┆ … ┆ 27.046364 ┆ 9      ┆ 14        ┆ Tue Jan 01     │
│      ┆            ┆                ┆       ┆   ┆           ┆        ┆

It's strongly recommended using `.describe` to examine all the integer columns and
determine if the column can be cast to a smaller integer type

In [22]:
print(df
      .select(pl.col(cols))
      .with_columns(pl.col('year').cast(pl.Int8))
     )

InvalidOperationError: conversion from `i64` to `i8` failed in column 'year' for 250 out of 250 values: [1999, 1999, … 2000]

### Automatic Type Shrinking

In [23]:
print(df
      .select(pl.col(cols).shrink_dtype())
     )

shape: (48_231, 13)
┌──────┬────────────┬────────────────┬───────┬───┬───────────┬────────┬───────────┬────────────────┐
│ year ┆ make       ┆ model          ┆ displ ┆ … ┆ barrels08 ┆ city08 ┆ highway08 ┆ createdOn      │
│ ---  ┆ ---        ┆ ---            ┆ ---   ┆   ┆ ---       ┆ ---    ┆ ---       ┆ ---            │
│ i16  ┆ str        ┆ str            ┆ f32   ┆   ┆ f32       ┆ i16    ┆ i16       ┆ str            │
╞══════╪════════════╪════════════════╪═══════╪═══╪═══════════╪════════╪═══════════╪════════════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider Veloce  ┆ 2.0   ┆ … ┆ 14.167143 ┆ 19     ┆ 25        ┆ Tue Jan 01     │
│      ┆            ┆ 2000           ┆       ┆   ┆           ┆        ┆           ┆ 00:00:00 EST   │
│      ┆            ┆                ┆       ┆   ┆           ┆        ┆           ┆ 2013           │
│ 1985 ┆ Ferrari    ┆ Testarossa     ┆ 4.9   ┆ … ┆ 27.046364 ┆ 9      ┆ 14        ┆ Tue Jan 01     │
│      ┆            ┆                ┆       ┆   ┆           ┆        ┆

### Float Conversion

In [24]:
print(df
      .select(pl.col(cols))
      .with_columns(pl.col('year').cast(pl.Int16),
                    pl.col('cylinders').cast(pl.UInt8),
                    pl.col('highway08').cast(pl.UInt8),
                    pl.col('city08').cast(pl.UInt8))
      .select(pl.col(pl.Float64))
      .sample(n=10, seed=42)
     )

shape: (10, 2)
┌───────┬───────────┐
│ displ ┆ barrels08 │
│ ---   ┆ ---       │
│ f64   ┆ f64       │
╞═══════╪═══════════╡
│ 2.2   ┆ 12.39625  │
│ 2.4   ┆ 13.523182 │
│ 4.0   ┆ 19.834    │
│ 5.3   ┆ 18.594375 │
│ 2.8   ┆ 14.8755   │
│ 1.3   ┆ 8.750294  │
│ 3.0   ┆ 14.167143 │
│ 5.3   ┆ 17.500588 │
│ 4.7   ┆ 21.250714 │
│ 1.8   ┆ 11.442692 │
└───────┴───────────┘


In [25]:
print(df
      .select(pl.col(cols))
      .with_columns(pl.col('year').cast(pl.Int16),
                    pl.col('cylinders').cast(pl.UInt8),
                    pl.col('highway08').cast(pl.UInt8),
                    pl.col('city08').cast(pl.UInt8),
                    pl.col('displ').cast(pl.Float32),
                    pl.col('barrels08').cast(pl.Float32),
                   )
                   )

shape: (48_231, 13)
┌──────┬────────────┬────────────────┬───────┬───┬───────────┬────────┬───────────┬────────────────┐
│ year ┆ make       ┆ model          ┆ displ ┆ … ┆ barrels08 ┆ city08 ┆ highway08 ┆ createdOn      │
│ ---  ┆ ---        ┆ ---            ┆ ---   ┆   ┆ ---       ┆ ---    ┆ ---       ┆ ---            │
│ i16  ┆ str        ┆ str            ┆ f32   ┆   ┆ f32       ┆ u8     ┆ u8        ┆ str            │
╞══════╪════════════╪════════════════╪═══════╪═══╪═══════════╪════════╪═══════════╪════════════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider Veloce  ┆ 2.0   ┆ … ┆ 14.167143 ┆ 19     ┆ 25        ┆ Tue Jan 01     │
│      ┆            ┆ 2000           ┆       ┆   ┆           ┆        ┆           ┆ 00:00:00 EST   │
│      ┆            ┆                ┆       ┆   ┆           ┆        ┆           ┆ 2013           │
│ 1985 ┆ Ferrari    ┆ Testarossa     ┆ 4.9   ┆ … ┆ 27.046364 ┆ 9      ┆ 14        ┆ Tue Jan 01     │
│      ┆            ┆                ┆       ┆   ┆           ┆        ┆

### Extracting Numbers from Strings

In [26]:
print(df
      .select('trany')
     )

shape: (48_231, 1)
┌─────────────────┐
│ trany           │
│ ---             │
│ str             │
╞═════════════════╡
│ Manual 5-spd    │
│ Manual 5-spd    │
│ Manual 5-spd    │
│ Automatic 3-spd │
│ Manual 5-spd    │
│ …               │
│ Automatic 4-spd │
│ Manual 5-spd    │
│ Automatic 4-spd │
│ Manual 5-spd    │
│ Automatic 4-spd │
└─────────────────┘


In [27]:
# Switching it by chaining expression
print(df
      .select('trany',
              pl.col('trany')
              .str.to_lowercase()
              .str.contains('automatic')
              .alias('automatic'))
     )

shape: (48_231, 2)
┌─────────────────┬───────────┐
│ trany           ┆ automatic │
│ ---             ┆ ---       │
│ str             ┆ bool      │
╞═════════════════╪═══════════╡
│ Manual 5-spd    ┆ false     │
│ Manual 5-spd    ┆ false     │
│ Manual 5-spd    ┆ false     │
│ Automatic 3-spd ┆ true      │
│ Manual 5-spd    ┆ false     │
│ …               ┆ …         │
│ Automatic 4-spd ┆ true      │
│ Manual 5-spd    ┆ false     │
│ Automatic 4-spd ┆ true      │
│ Manual 5-spd    ┆ false     │
│ Automatic 4-spd ┆ true      │
└─────────────────┴───────────┘


In [28]:
print(df
      .with_columns('trany',
                    pl.col('trany')
                    .str.to_lowercase()
                    .str.contains('automatic')
                    .alias('is_automatic'))
     )

shape: (48_231, 85)
┌───────────┬────────────┬───────────┬───────────┬───┬──────────┬─────────┬──────────┬─────────────┐
│ barrels08 ┆ barrelsA08 ┆ charge120 ┆ charge240 ┆ … ┆ phevCity ┆ phevHwy ┆ phevComb ┆ is_automati │
│ ---       ┆ ---        ┆ ---       ┆ ---       ┆   ┆ ---      ┆ ---     ┆ ---      ┆ c           │
│ f64       ┆ f64        ┆ f64       ┆ f64       ┆   ┆ i64      ┆ i64     ┆ i64      ┆ ---         │
│           ┆            ┆           ┆           ┆   ┆          ┆         ┆          ┆ bool        │
╞═══════════╪════════════╪═══════════╪═══════════╪═══╪══════════╪═════════╪══════════╪═════════════╡
│ 14.167143 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ 0        ┆ 0       ┆ 0        ┆ false       │
│ 27.046364 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ 0        ┆ 0       ┆ 0        ┆ false       │
│ 11.018889 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ 0        ┆ 0       ┆ 0        ┆ false       │
│ 27.046364 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ 0        ┆ 0    

In [29]:
print(df
      .with_columns('trany',
                    is_automatic=pl.col('trany')
                    .str.to_lowercase()
                    .str.contains('automatic'))
     )

shape: (48_231, 85)
┌───────────┬────────────┬───────────┬───────────┬───┬──────────┬─────────┬──────────┬─────────────┐
│ barrels08 ┆ barrelsA08 ┆ charge120 ┆ charge240 ┆ … ┆ phevCity ┆ phevHwy ┆ phevComb ┆ is_automati │
│ ---       ┆ ---        ┆ ---       ┆ ---       ┆   ┆ ---      ┆ ---     ┆ ---      ┆ c           │
│ f64       ┆ f64        ┆ f64       ┆ f64       ┆   ┆ i64      ┆ i64     ┆ i64      ┆ ---         │
│           ┆            ┆           ┆           ┆   ┆          ┆         ┆          ┆ bool        │
╞═══════════╪════════════╪═══════════╪═══════════╪═══╪══════════╪═════════╪══════════╪═════════════╡
│ 14.167143 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ 0        ┆ 0       ┆ 0        ┆ false       │
│ 27.046364 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ 0        ┆ 0       ┆ 0        ┆ false       │
│ 11.018889 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ 0        ┆ 0       ┆ 0        ┆ false       │
│ 27.046364 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ 0        ┆ 0    

In [30]:
# Checking on Entry Differential
print(df
      .group_by('trany')
      .len()
      .sort('len', descending=True)
     )

shape: (41, 2)
┌────────────────────┬───────┐
│ trany              ┆ len   │
│ ---                ┆ ---   │
│ str                ┆ u32   │
╞════════════════════╪═══════╡
│ Automatic 4-spd    ┆ 11048 │
│ Manual 5-spd       ┆ 8392  │
│ Automatic (S6)     ┆ 3352  │
│ Automatic (S8)     ┆ 3302  │
│ Automatic 3-spd    ┆ 3151  │
│ …                  ┆ …     │
│ null               ┆ 11    │
│ Automatic (AM-S9)  ┆ 6     │
│ Automatic (L4)     ┆ 2     │
│ Automatic (L3)     ┆ 2     │
│ Automatic (AM-S10) ┆ 2     │
└────────────────────┴───────┘


In [31]:
print(df
      .select(pl.col('trany')
              .value_counts(sort=True))
      .unnest('trany')
     )

shape: (41, 2)
┌────────────────────┬───────┐
│ trany              ┆ count │
│ ---                ┆ ---   │
│ str                ┆ u32   │
╞════════════════════╪═══════╡
│ Automatic 4-spd    ┆ 11048 │
│ Manual 5-spd       ┆ 8392  │
│ Automatic (S6)     ┆ 3352  │
│ Automatic (S8)     ┆ 3302  │
│ Automatic 3-spd    ┆ 3151  │
│ …                  ┆ …     │
│ null               ┆ 11    │
│ Automatic (AM-S9)  ┆ 6     │
│ Automatic (L4)     ┆ 2     │
│ Automatic (L3)     ┆ 2     │
│ Automatic (AM-S10) ┆ 2     │
└────────────────────┴───────┘


In [32]:
print(df
      .filter(pl.col('trany').is_null())
      .select('year', 'make', 'model', 'VClass')
     )

shape: (11, 4)
┌──────┬────────┬────────────────────────┬─────────────────────────────┐
│ year ┆ make   ┆ model                  ┆ VClass                      │
│ ---  ┆ ---    ┆ ---                    ┆ ---                         │
│ i64  ┆ str    ┆ str                    ┆ str                         │
╞══════╪════════╪════════════════════════╪═════════════════════════════╡
│ 2000 ┆ Nissan ┆ Altra EV               ┆ Midsize Station Wagons      │
│ 2000 ┆ Toyota ┆ RAV4 EV                ┆ Sport Utility Vehicle - 2WD │
│ 2001 ┆ Toyota ┆ RAV4 EV                ┆ Sport Utility Vehicle - 2WD │
│ 2001 ┆ Ford   ┆ Th!nk                  ┆ Two Seaters                 │
│ 2001 ┆ Ford   ┆ Explorer USPS Electric ┆ Sport Utility Vehicle - 2WD │
│ …    ┆ …      ┆ …                      ┆ …                           │
│ 2002 ┆ Toyota ┆ RAV4 EV                ┆ Sport Utility Vehicle - 2WD │
│ 2002 ┆ Ford   ┆ Explorer USPS Electric ┆ Sport Utility Vehicle - 2WD │
│ 2003 ┆ Toyota ┆ RAV4 EV           

In [33]:
# Transmission Type Extraction
print(df
      .select('trany',
              is_automatic=pl.col('trany')
              .str.contains('Automatic')
              .fill_null(True)
             )
     )

shape: (48_231, 2)
┌─────────────────┬──────────────┐
│ trany           ┆ is_automatic │
│ ---             ┆ ---          │
│ str             ┆ bool         │
╞═════════════════╪══════════════╡
│ Manual 5-spd    ┆ false        │
│ Manual 5-spd    ┆ false        │
│ Manual 5-spd    ┆ false        │
│ Automatic 3-spd ┆ true         │
│ Manual 5-spd    ┆ false        │
│ …               ┆ …            │
│ Automatic 4-spd ┆ true         │
│ Manual 5-spd    ┆ false        │
│ Automatic 4-spd ┆ true         │
│ Manual 5-spd    ┆ false        │
│ Automatic 4-spd ┆ true         │
└─────────────────┴──────────────┘


In [34]:
print(df
      .select(num_gears=pl.col('trany')
              .str.extract(r'(\d+)')
              .cast(pl.UInt8))
             )

shape: (48_231, 1)
┌───────────┐
│ num_gears │
│ ---       │
│ u8        │
╞═══════════╡
│ 5         │
│ 5         │
│ 5         │
│ 3         │
│ 5         │
│ …         │
│ 4         │
│ 5         │
│ 4         │
│ 5         │
│ 4         │
└───────────┘


In [35]:
# Looking for Unique Values
print(df
      .select(num_gears=pl.col('trany')
              .str.extract(r'(\d+)')
              .cast(pl.UInt8)
              .unique())
             )

shape: (11, 1)
┌───────────┐
│ num_gears │
│ ---       │
│ u8        │
╞═══════════╡
│ null      │
│ 1         │
│ 2         │
│ 3         │
│ 4         │
│ …         │
│ 6         │
│ 7         │
│ 8         │
│ 9         │
│ 10        │
└───────────┘


In [36]:
print(df
      .select(num_gears=pl.col('trany')
              .str.extract('r(\d+)')
              .cast(pl.UInt8))
      .filter(pl.col('num_gears').is_null())
             )

shape: (48_231, 1)
┌───────────┐
│ num_gears │
│ ---       │
│ u8        │
╞═══════════╡
│ null      │
│ null      │
│ null      │
│ null      │
│ null      │
│ …         │
│ null      │
│ null      │
│ null      │
│ null      │
│ null      │
└───────────┘


In [37]:
print(df
      .with_columns(
          is_automatic=pl.col('trany')
          .str.contains('Automatic')
          .fill_null(True),
          num_gears=pl.col('trany')
          .str.extract(r'(\d+)')
          .cast(pl.UInt8)
          .fill_null(6)
      )
     )

shape: (48_231, 86)
┌───────────┬────────────┬───────────┬───────────┬───┬─────────┬──────────┬────────────┬───────────┐
│ barrels08 ┆ barrelsA08 ┆ charge120 ┆ charge240 ┆ … ┆ phevHwy ┆ phevComb ┆ is_automat ┆ num_gears │
│ ---       ┆ ---        ┆ ---       ┆ ---       ┆   ┆ ---     ┆ ---      ┆ ic         ┆ ---       │
│ f64       ┆ f64        ┆ f64       ┆ f64       ┆   ┆ i64     ┆ i64      ┆ ---        ┆ u8        │
│           ┆            ┆           ┆           ┆   ┆         ┆          ┆ bool       ┆           │
╞═══════════╪════════════╪═══════════╪═══════════╪═══╪═════════╪══════════╪════════════╪═══════════╡
│ 14.167143 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ 0       ┆ 0        ┆ false      ┆ 5         │
│ 27.046364 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ 0       ┆ 0        ┆ false      ┆ 5         │
│ 11.018889 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ 0       ┆ 0        ┆ false      ┆ 5         │
│ 27.046364 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ 0       ┆ 0     

### Strings Columns

In [40]:
print(df
      .select(pl.col(cols))
      .with_columns(pl.col('year').cast(pl.Int16),
                    pl.col('cylinders').cast(pl.UInt8),
                    pl.col('highway08').cast(pl.UInt8),
                    pl.col('city08').cast(pl.UInt8),
                    pl.col('displ').cast(pl.Float32),
                    pl.col('barrels08').cast(pl.Float32),
                   )
      .select(pl.col(pl.String))
     )

shape: (48_231, 7)
┌────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────┬──────────────┐
│ make       ┆ model        ┆ trany        ┆ drive        ┆ VClass       ┆ fuelType ┆ createdOn    │
│ ---        ┆ ---          ┆ ---          ┆ ---          ┆ ---          ┆ ---      ┆ ---          │
│ str        ┆ str          ┆ str          ┆ str          ┆ str          ┆ str      ┆ str          │
╞════════════╪══════════════╪══════════════╪══════════════╪══════════════╪══════════╪══════════════╡
│ Alfa Romeo ┆ Spider       ┆ Manual 5-spd ┆ Rear-Wheel   ┆ Two Seaters  ┆ Regular  ┆ Tue Jan 01   │
│            ┆ Veloce 2000  ┆              ┆ Drive        ┆              ┆          ┆ 00:00:00 EST │
│            ┆              ┆              ┆              ┆              ┆          ┆ 2013         │
│ Ferrari    ┆ Testarossa   ┆ Manual 5-spd ┆ Rear-Wheel   ┆ Two Seaters  ┆ Regular  ┆ Tue Jan 01   │
│            ┆              ┆              ┆ Drive        ┆             

In [41]:
print(df
      .select(pl.col(cols))
      .with_columns(pl.col('year').cast(pl.Int16),
                    pl.col(['cylinders', 'highway08', 'city08']).cast(pl.UInt8),
                    pl.col(['displ', 'barrels08']).cast(pl.Float32),
                    pl.col(['make', 'model', 'VClass', 'drive', 'fuelType']).cast(pl.Categorical),
                   )
     )

shape: (48_231, 13)
┌──────┬────────────┬────────────────┬───────┬───┬───────────┬────────┬───────────┬────────────────┐
│ year ┆ make       ┆ model          ┆ displ ┆ … ┆ barrels08 ┆ city08 ┆ highway08 ┆ createdOn      │
│ ---  ┆ ---        ┆ ---            ┆ ---   ┆   ┆ ---       ┆ ---    ┆ ---       ┆ ---            │
│ i16  ┆ cat        ┆ cat            ┆ f32   ┆   ┆ f32       ┆ u8     ┆ u8        ┆ str            │
╞══════╪════════════╪════════════════╪═══════╪═══╪═══════════╪════════╪═══════════╪════════════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider Veloce  ┆ 2.0   ┆ … ┆ 14.167143 ┆ 19     ┆ 25        ┆ Tue Jan 01     │
│      ┆            ┆ 2000           ┆       ┆   ┆           ┆        ┆           ┆ 00:00:00 EST   │
│      ┆            ┆                ┆       ┆   ┆           ┆        ┆           ┆ 2013           │
│ 1985 ┆ Ferrari    ┆ Testarossa     ┆ 4.9   ┆ … ┆ 27.046364 ┆ 9      ┆ 14        ┆ Tue Jan 01     │
│      ┆            ┆                ┆       ┆   ┆           ┆        ┆

In [43]:
print(df
      .select(pl.col(cols))
      .with_columns(pl.col('year').cast(pl.Int16),
                    pl.col(['cylinders', 'highway08', 'city08']).cast(pl.UInt8),
                    pl.col(['displ', 'barrels08']).cast(pl.Float32),
                    pl.col(['make', 'model', 'VClass', 'drive', 'fuelType']).cast(pl.Categorical),
                    is_automatic=pl.col('trany')
                    .str.contains('Automatic')
                    .fill_null(True),
                    num_gears=pl.col('trany')
                    .str.extract(r'(\d+)')
                    .cast(pl.UInt8)
                    .fill_null(6))
                   )

shape: (48_231, 15)
┌──────┬────────────┬──────────────┬───────┬───┬───────────┬─────────────┬─────────────┬───────────┐
│ year ┆ make       ┆ model        ┆ displ ┆ … ┆ highway08 ┆ createdOn   ┆ is_automati ┆ num_gears │
│ ---  ┆ ---        ┆ ---          ┆ ---   ┆   ┆ ---       ┆ ---         ┆ c           ┆ ---       │
│ i16  ┆ cat        ┆ cat          ┆ f32   ┆   ┆ u8        ┆ str         ┆ ---         ┆ u8        │
│      ┆            ┆              ┆       ┆   ┆           ┆             ┆ bool        ┆           │
╞══════╪════════════╪══════════════╪═══════╪═══╪═══════════╪═════════════╪═════════════╪═══════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider       ┆ 2.0   ┆ … ┆ 25        ┆ Tue Jan 01  ┆ false       ┆ 5         │
│      ┆            ┆ Veloce 2000  ┆       ┆   ┆           ┆ 00:00:00    ┆             ┆           │
│      ┆            ┆              ┆       ┆   ┆           ┆ EST 2013    ┆             ┆           │
│ 1985 ┆ Ferrari    ┆ Testarossa   ┆ 4.9   ┆ … ┆ 14        ┆ Tue Jan 01

### Parsing Dates