# Getting Started with Polars

### Loading Libraries

In [16]:
# Operating System Utilities
import os

# ZipFiles
import zipfile

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import polars as pl

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Show the installed Polars version so the outputs can be tied to a specific release.
pl.__version__

'1.14.0'

### Laziness

In [3]:
# Public (non-underscore) attribute names of an eager DataFrame instance
df_ops = {name for name in dir(pl.DataFrame()) if not name.startswith('_')}

# Public (non-underscore) attribute names of a LazyFrame instance
lazy_ops = {name for name in dir(pl.LazyFrame()) if not name.startswith('_')}

In [5]:
# Names available on DataFrame but missing from LazyFrame
print(sorted(df_ops.difference(lazy_ops)))

['corr', 'drop_in_place', 'equals', 'estimated_size', 'extend', 'flags', 'fold', 'get_column', 'get_column_index', 'get_columns', 'glimpse', 'hash_rows', 'height', 'hstack', 'insert_column', 'is_duplicated', 'is_empty', 'is_unique', 'item', 'iter_columns', 'iter_rows', 'iter_slices', 'map_rows', 'max_horizontal', 'mean_horizontal', 'min_horizontal', 'n_chunks', 'n_unique', 'partition_by', 'pivot', 'plot', 'product', 'rechunk', 'replace_column', 'row', 'rows', 'rows_by_key', 'sample', 'shape', 'shrink_to_fit', 'style', 'sum_horizontal', 'to_arrow', 'to_dict', 'to_dicts', 'to_dummies', 'to_init_repr', 'to_jax', 'to_numpy', 'to_pandas', 'to_series', 'to_struct', 'to_torch', 'transpose', 'unstack', 'upsample', 'vstack', 'write_avro', 'write_clipboard', 'write_csv', 'write_database', 'write_delta', 'write_excel', 'write_ipc', 'write_ipc_stream', 'write_json', 'write_ndjson', 'write_parquet']


In [6]:
# Names available on both LazyFrame and DataFrame
print(sorted(lazy_ops.intersection(df_ops)))

['approx_n_unique', 'bottom_k', 'cast', 'clear', 'clone', 'collect_schema', 'columns', 'count', 'describe', 'deserialize', 'drop', 'drop_nulls', 'dtypes', 'explode', 'fill_nan', 'fill_null', 'filter', 'gather_every', 'group_by', 'group_by_dynamic', 'head', 'interpolate', 'join', 'join_asof', 'join_where', 'lazy', 'limit', 'max', 'mean', 'median', 'melt', 'merge_sorted', 'min', 'null_count', 'pipe', 'quantile', 'rename', 'reverse', 'rolling', 'schema', 'select', 'select_seq', 'serialize', 'set_sorted', 'shift', 'slice', 'sort', 'sql', 'std', 'sum', 'tail', 'top_k', 'unique', 'unnest', 'unpivot', 'update', 'var', 'width', 'with_columns', 'with_columns_seq', 'with_row_count', 'with_row_index']


### Contexts & Expressions

In [7]:
# Expression referring to a column named 'sample'; no DataFrame is involved at this point.
col = pl.col('sample')

In [8]:
# Public (non-underscore) attribute names exposed by the expression object
col_ops = {attr for attr in dir(col) if not attr.startswith('_')}

print(sorted(col_ops))

['abs', 'add', 'agg_groups', 'alias', 'all', 'and_', 'any', 'append', 'approx_n_unique', 'arccos', 'arccosh', 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg_max', 'arg_min', 'arg_sort', 'arg_true', 'arg_unique', 'arr', 'backward_fill', 'bin', 'bitwise_and', 'bitwise_count_ones', 'bitwise_count_zeros', 'bitwise_leading_ones', 'bitwise_leading_zeros', 'bitwise_or', 'bitwise_trailing_ones', 'bitwise_trailing_zeros', 'bitwise_xor', 'bottom_k', 'bottom_k_by', 'cast', 'cat', 'cbrt', 'ceil', 'clip', 'cos', 'cosh', 'cot', 'count', 'cum_count', 'cum_max', 'cum_min', 'cum_prod', 'cum_sum', 'cumulative_eval', 'cut', 'degrees', 'deserialize', 'diff', 'dot', 'drop_nans', 'drop_nulls', 'dt', 'entropy', 'eq', 'eq_missing', 'ewm_mean', 'ewm_mean_by', 'ewm_std', 'ewm_var', 'exclude', 'exp', 'explode', 'extend_constant', 'fill_nan', 'fill_null', 'filter', 'first', 'flatten', 'floor', 'floordiv', 'forward_fill', 'from_json', 'gather', 'gather_every', 'ge', 'get', 'gt', 'has_nulls', 'hash', 'head', 'hist'

In [13]:
# Compose one expression by chaining methods; nothing is computed here because
# the expression is not applied to any DataFrame.
(
    col
    .cast(pl.Int32)           # convert to 32-bit integers
    .fill_null(col.mean())    # replace nulls with the column mean
    .clip(upper_bound=100)    # cap values at 100
    .sample(10)               # take 10 values
    .mean()                   # average the sampled values
)

In [14]:
# Same approach as the previous chained expression, built one step at a time.
# Every step mirrors the chain (same dtype, same fill value), so ex5 is the
# equivalent of the chained expression.
ex1 = col.cast(pl.Int32)          # convert to 32-bit integers
# Fill with the column mean; filling with the column itself would leave the
# nulls untouched, because the fill values are null in the same rows.
ex2 = ex1.fill_null(col.mean())
ex3 = ex2.clip(upper_bound=100)   # cap values at 100
ex4 = ex3.sample(10)              # take 10 values
ex5 = ex4.mean()                  # average the sampled values

In [15]:
# Attribute-style access on pl.col: shorthand for pl.col('a_column').
pl.col.a_column

### Reading CSV Files

In [20]:
# Path
# Resolve the dataset location from the current user's home directory instead
# of hard-coding one machine's absolute path. The value is still a plain str.
path = os.path.expanduser('~/desktop/polars/datasets/vehicles.csv')

In [22]:
# Eager read: parse the CSV at `path` into a DataFrame, treating the literal
# string 'NA' as a null value.
df = pl.read_csv(path, null_values=['NA'])

# print() gives the plain-text table rendering of the DataFrame.
print(df)

shape: (48_231, 84)
┌───────────┬────────────┬───────────┬───────────┬───┬───────────┬──────────┬─────────┬──────────┐
│ barrels08 ┆ barrelsA08 ┆ charge120 ┆ charge240 ┆ … ┆ startStop ┆ phevCity ┆ phevHwy ┆ phevComb │
│ ---       ┆ ---        ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---      ┆ ---     ┆ ---      │
│ f64       ┆ f64        ┆ f64       ┆ f64       ┆   ┆ str       ┆ i64      ┆ i64     ┆ i64      │
╞═══════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪══════════╪═════════╪══════════╡
│ 14.167143 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ null      ┆ 0        ┆ 0       ┆ 0        │
│ 27.046364 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ null      ┆ 0        ┆ 0       ┆ 0        │
│ 11.018889 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ null      ┆ 0        ┆ 0       ┆ 0        │
│ 27.046364 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ null      ┆ 0        ┆ 0       ┆ 0        │
│ 15.658421 ┆ 0.0        ┆ 0.0       ┆ 0.0       ┆ … ┆ null      ┆ 0        ┆ 0       ┆ 0

### Lazy CSV Reading

In [23]:
# Lazy counterpart of the eager read: scan_csv returns a query plan over the
# same file (with the same 'NA' null handling) instead of a loaded table.
lazy = pl.scan_csv(path, null_values=['NA'])
# Printing the lazy object shows its naive (unoptimized) plan, not rows.
print(lazy)

naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

Csv SCAN [/Users/isisromero/desktop/polars/datasets/vehicles.csv]
PROJECT */84 COLUMNS


In [24]:
# Extend the lazy plan: keep rows with 1990 <= year < 2000, then keep three
# columns. Printing shows the resulting plan; no data is read.
print(
    lazy
    .filter((pl.col('year') >= 1990) & (pl.col('year') < 2000))
    .select('year', 'make', 'model')
)

naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

 SELECT [col("year"), col("make"), col("model")] FROM
  FILTER [(col("year")) >= (1990)] FROM
    FILTER [(col("year")) < (2000)] FROM
      Csv SCAN [/Users/isisromero/desktop/polars/datasets/vehicles.csv]
      PROJECT */84 COLUMNS


### Data Type Inference & Manual Overrides

In [25]:
# List the inferred dtype of every column of df, in column order.
print(df.dtypes)

[Float64, Float64, Float64, Float64, Int64, Float64, Int64, Float64, Float64, Float64, Float64, Int64, Int64, Float64, Float64, Int64, Float64, Int64, Float64, Float64, Float64, Float64, Int64, Float64, String, Int64, String, Int64, Int64, Int64, String, String, Int64, Int64, Int64, Float64, Int64, Float64, Float64, Float64, Float64, Int64, Int64, Int64, Int64, Int64, String, String, String, Boolean, Int64, Int64, Int64, Float64, Float64, Float64, Float64, String, Float64, Float64, Float64, Float64, String, Int64, Int64, String, String, String, String, String, String, String, String, String, String, String, Float64, String, String, String, String, Int64, Int64, Int64]


In [27]:
# Show the schema of df: each column name paired with its inferred dtype.
print(df.schema)

Schema([('barrels08', Float64), ('barrelsA08', Float64), ('charge120', Float64), ('charge240', Float64), ('city08', Int64), ('city08U', Float64), ('cityA08', Int64), ('cityA08U', Float64), ('cityCD', Float64), ('cityE', Float64), ('cityUF', Float64), ('co2', Int64), ('co2A', Int64), ('co2TailpipeAGpm', Float64), ('co2TailpipeGpm', Float64), ('comb08', Int64), ('comb08U', Float64), ('combA08', Int64), ('combA08U', Float64), ('combE', Float64), ('combinedCD', Float64), ('combinedUF', Float64), ('cylinders', Int64), ('displ', Float64), ('drive', String), ('engId', Int64), ('eng_dscr', String), ('feScore', Int64), ('fuelCost08', Int64), ('fuelCostA08', Int64), ('fuelType', String), ('fuelType1', String), ('ghgScore', Int64), ('ghgScoreA', Int64), ('highway08', Int64), ('highway08U', Float64), ('highwayA08', Int64), ('highwayA08U', Float64), ('highwayCD', Float64), ('highwayE', Float64), ('highwayUF', Float64), ('hlv', Int64), ('hpv', Int64), ('id', Int64), ('lv2', Int64), ('lv4', Int64), (

In [28]:
# Subset of column names used by the selections that follow
cols = [
    'year', 'make', 'model',
    'displ', 'cylinders', 'trany', 'drive',
    'VClass', 'fuelType',
    'barrels08', 'city08', 'highway08',
    'createdOn',
]

In [29]:
# Keep only the columns named in cols, passed as a plain list of names.
print(df.select(cols))

shape: (48_231, 13)
┌──────┬────────────┬────────────────┬───────┬───┬───────────┬────────┬───────────┬────────────────┐
│ year ┆ make       ┆ model          ┆ displ ┆ … ┆ barrels08 ┆ city08 ┆ highway08 ┆ createdOn      │
│ ---  ┆ ---        ┆ ---            ┆ ---   ┆   ┆ ---       ┆ ---    ┆ ---       ┆ ---            │
│ i64  ┆ str        ┆ str            ┆ f64   ┆   ┆ f64       ┆ i64    ┆ i64       ┆ str            │
╞══════╪════════════╪════════════════╪═══════╪═══╪═══════════╪════════╪═══════════╪════════════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider Veloce  ┆ 2.0   ┆ … ┆ 14.167143 ┆ 19     ┆ 25        ┆ Tue Jan 01     │
│      ┆            ┆ 2000           ┆       ┆   ┆           ┆        ┆           ┆ 00:00:00 EST   │
│      ┆            ┆                ┆       ┆   ┆           ┆        ┆           ┆ 2013           │
│ 1985 ┆ Ferrari    ┆ Testarossa     ┆ 4.9   ┆ … ┆ 27.046364 ┆ 9      ┆ 14        ┆ Tue Jan 01     │
│      ┆            ┆                ┆       ┆   ┆           ┆        ┆

In [30]:
# Select the columns named in cols again, this time wrapping the names in a pl.col(...) expression.
print(df.select(pl.col(cols)))

shape: (48_231, 13)
┌──────┬────────────┬────────────────┬───────┬───┬───────────┬────────┬───────────┬────────────────┐
│ year ┆ make       ┆ model          ┆ displ ┆ … ┆ barrels08 ┆ city08 ┆ highway08 ┆ createdOn      │
│ ---  ┆ ---        ┆ ---            ┆ ---   ┆   ┆ ---       ┆ ---    ┆ ---       ┆ ---            │
│ i64  ┆ str        ┆ str            ┆ f64   ┆   ┆ f64       ┆ i64    ┆ i64       ┆ str            │
╞══════╪════════════╪════════════════╪═══════╪═══╪═══════════╪════════╪═══════════╪════════════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider Veloce  ┆ 2.0   ┆ … ┆ 14.167143 ┆ 19     ┆ 25        ┆ Tue Jan 01     │
│      ┆            ┆ 2000           ┆       ┆   ┆           ┆        ┆           ┆ 00:00:00 EST   │
│      ┆            ┆                ┆       ┆   ┆           ┆        ┆           ┆ 2013           │
│ 1985 ┆ Ferrari    ┆ Testarossa     ┆ 4.9   ┆ … ┆ 27.046364 ┆ 9      ┆ 14        ┆ Tue Jan 01     │
│      ┆            ┆                ┆       ┆   ┆           ┆        ┆