# Being Lazy & Streaming

### Loading Libraries

In [1]:
# ZipFiles & IO
import io
import os
import pprint
import zipfile

#URL
import urllib.request

# Time-Zone
import pytz

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import polars as pl
import polars.selectors as cs

# Data Visualization
import hvplot
import altair as alt
import seaborn as sns
import holoviews as hv
import matplotlib.pyplot as plt

# XGBoost
import xgboost as xgb

# Scikit-Learn
import sklearn
from sklearn import decomposition
from sklearn import preprocessing, decomposition
from sklearn.model_selection import train_test_split

# Java Script Object Notation
import json

# Date & Time
from datetime import datetime
from datetime import timedelta

In [2]:
hvplot.extension('matplotlib')

### Loading Data

In [8]:
# Path
path = '/Users/isisromero/desktop/polars/datasets/vehicles.csv'

In [21]:
df_pl = pl.read_csv(path, null_values=['NA'])

In [22]:
def tweak_auto(df):
    cols = ['year', 'make', 'model', 'displ', 'cylinders', 'trany', 
            'drive', 'VClass', 'fuelType', 'barrels08', 'city08', 'highway08', 'createdOn']
    return (df
            .select(pl.col(cols))
            .with_columns(pl.col('year').cast(pl.Int16),
                          pl.col(['cylinders', 'highway08', 'city08']).cast(pl.UInt8),
                          pl.col(['displ', 'barrels08']).cast(pl.Float32),
                          pl.col(['make', 'model', 'VClass', 'drive', 'fuelType']).cast(pl.Categorical),
                          pl.col('createdOn').str.to_datetime('%a %b %d %H:%M:%S %Z %Y'),
                          is_automatic=pl.col('trany')                    
                          .str.contains('Automatic')
                          .fill_null('Automatic'),
                          num_gears=pl.col('trany')
                          .str.extract(r'(\d+)')
                          .cast(pl.UInt8)
                          .fill_null(6))
           )

In [23]:
autos = tweak_auto(df_pl)

#### `City Mileage by Origin` Filter Function() Logic, as follows:

```sh
    pl.when(pl.col('make') == 'Chevrolet')
         .then('USA')
         .when(pl.col('make') == 'Ford')
         .then('USA')
            ...
         .when(pl.col('make') == 'Tesla')
         .then('USA')
         .otherwise('Unknown')

In [24]:
def make_to_origin_expr(make_col):
    origin_dict = {
        'Chevrolet': 'USA',
        'Ford': 'USA',
        'Dodge': 'USA',
        'GMC': 'USA',
        'Toyota': 'Japan',
        'BMW': 'Germany',
        'Mercedes-Benz': 'Germany',
        'Nissan': 'Japan',
        'Volkswagen': 'Germany',
        'Mitsubishi': 'Japan',
        'Porsche': 'Germany',
        'Mazda': 'Japan',
        'Audi': 'Germany',
        'Honda': 'Japan',
        'Jeep': 'USA',
        'Pontiac': 'USA',
        'Subaru': 'Japan',
        'Volvo': 'Sweden',
        'Hyundai': 'South Korea',
        'Chrysler': 'USA',
        'Tesla': 'USA'
    }
    expr = None
    col = pl.col(make_col)
    for k, v in origin_dict.items():
        if expr is None:
            expr = pl.when(col == k).then(pl.lit(v))
        else:
            expr = expr.when(col == k).then(pl.lit(v))
            expr = expr.otherwise(pl.lit('Unknown'))
            return expr

In [25]:
result = (df_pl
          .with_columns(
              pl.col('createdOn')
              .str.to_datetime('%a %b %d %H:%M:%S %Z %Y'), origin=make_to_origin_expr('make'))
          .filter((pl.col("origin") != "Unknown") & (pl.col("year") < 2020))
          .select(['make', 'model', 'year', 'city08', 'highway08', 'origin', 'createdOn'])
          .group_by(['origin', 'year'])
          .agg(avg_city08=pl.col("city08").mean())
          .pivot(index='year', on='origin', values='avg_city08')
          .sort('year')
         )

In [26]:
print(result)

shape: (36, 2)
┌──────┬───────────┐
│ year ┆ USA       │
│ ---  ┆ ---       │
│ i64  ┆ f64       │
╞══════╪═══════════╡
│ 1984 ┆ 16.045267 │
│ 1985 ┆ 16.593258 │
│ 1986 ┆ 16.632787 │
│ 1987 ┆ 16.243056 │
│ 1988 ┆ 16.05948  │
│ …    ┆ …         │
│ 2015 ┆ 21.159763 │
│ 2016 ┆ 21.649718 │
│ 2017 ┆ 21.231183 │
│ 2018 ┆ 21.328283 │
│ 2019 ┆ 20.34715  │
└──────┴───────────┘


In [27]:
%%timeit
df_pl = pl.read_csv(path, null_values=['NA'])

result = (df_pl
          .with_columns(
              pl.col('createdOn').str.to_datetime('%a %b %d %H:%M:%S %Z %Y'), origin=make_to_origin_expr('make'))
          .filter((pl.col("origin") != "Unknown") & (pl.col("year") < 2020))
          .select(['make', 'model', 'year', 'city08', 'highway08', 'origin', 'createdOn'])
          .group_by(['origin', 'year'])
          .agg(avg_city08=pl.col("city08").mean())
          .pivot(index='year', on='origin', values='avg_city08')
          .sort('year')
         )

30.2 ms ± 493 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### The `Replace` Method()

In [56]:
# An Easier Solution()
def make_to_origin_replace(make_col):
    origin_dict = {
        'Chevrolet': 'USA',
        'Ford': 'USA',
        'Dodge': 'USA',
        'GMC': 'USA',
        'Toyota': 'Japan',
        'BMW': 'Germany',
        'Mercedes-Benz': 'Germany',
        'Nissan': 'Japan',
        'Volkswagen': 'Germany',
        'Mitsubishi': 'Japan',
        'Porsche': 'Germany',
        'Mazda': 'Japan',
        'Audi': 'Germany',
        'Honda': 'Japan',
        'Jeep': 'USA',
        'Pontiac': 'USA',
        'Subaru': 'Japan',
        'Volvo': 'Sweden',
        'Hyundai': 'South Korea',
        'Chrysler': 'USA',
        'Tesla': 'USA'
    }   
    return make_col.replace_strict(origin_dict, default='Unknown')

In [57]:
result = (df_pl
         .with_columns(
             pl.col('createdOn').str.to_datetime('%a %b %d %H:%M:%S %Z %Y'), origin=make_to_origin_replace(pl.col('make')))
         .filter((pl.col("origin") != "Unknown") & (pl.col("year") < 2020))
         .select(['make', 'model', 'year', 'city08', 'highway08', 'origin', 'createdOn'])
         .group_by(['origin', 'year'])
         .agg(avg_city08=pl.col("city08").mean())
         .pivot(index='year', on='origin', values='avg_city08')
         .sort('year')
         )

In [58]:
%%timeit
df_pl = pl.read_csv('/Users/isisromero/desktop/polars/datasets/vehicles.csv', null_values='NA')

result = (df_pl
          .with_columns(
              pl.col('createdOn').str.to_datetime('%a %b %d %H:%M:%S %Z %Y'), origin=make_to_origin_replace(pl.col('make')))
          .filter((pl.col("origin") != "Unknown") & (pl.col("year") < 2020))
          .select(['make', 'model', 'year', 'city08', 'highway08', 'origin', 'createdOn'])
          .group_by(['origin', 'year'])
          .agg(avg_city08=pl.col("city08").mean())
          .pivot(index='year', on='origin', values='avg_city08')
          .sort('year')
         )

32.1 ms ± 409 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### `Lazy Version` Take One

In [59]:
df_pl_lazy = pl.scan_csv('/Users/isisromero/desktop/polars/datasets/vehicles.csv', null_values='NA')

result = (df_pl_lazy
          .with_columns(
              pl.col('createdOn').str.to_datetime('%a %b %d %H:%M:%S %Z %Y'), origin=make_to_origin_replace(pl.col('make')))
          .filter((pl.col("origin") != "Unknown") & (pl.col("year") < 2020))
          .select(['make', 'model', 'year', 'city08', 'highway08', 'origin', 'createdOn'])
          .group_by(['origin', 'year'])
          .agg(avg_city08=pl.col("city08").mean())
          .pivot(index='year', columns='origin', values='avg_city08')
          .sort('year')
         )

AttributeError: 'LazyFrame' object has no attribute 'pivot'

In [60]:
# Applying Collect()
df_pl_lazy = pl.scan_csv('/Users/isisromero/desktop/polars/datasets/vehicles.csv', null_values='NA')

result = (df_pl_lazy
          .with_columns(
              pl.col('createdOn').str.to_datetime('%a %b %d %H:%M:%S %Z %Y'), origin=make_to_origin_replace(pl.col('make')))
          .filter((pl.col("origin") != "Unknown") & (pl.col("year") < 2020))
          .select(['make', 'model', 'year', 'city08', 'highway08', 'origin', 'createdOn'])
          .group_by(['origin', 'year'])
          .agg(avg_city08=pl.col("city08").mean())
          .collect()
          .pivot(index='year', on='origin', values='avg_city08')
          .sort('year')
         )

In [61]:
print(result)

shape: (36, 6)
┌──────┬───────────┬───────────┬───────────┬───────────┬─────────────┐
│ year ┆ USA       ┆ Germany   ┆ Sweden    ┆ Japan     ┆ South Korea │
│ ---  ┆ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---         │
│ i64  ┆ f64       ┆ f64       ┆ f64       ┆ f64       ┆ f64         │
╞══════╪═══════════╪═══════════╪═══════════╪═══════════╪═════════════╡
│ 1984 ┆ 16.458456 ┆ 20.06338  ┆ 18.540541 ┆ 21.705882 ┆ null        │
│ 1985 ┆ 16.576883 ┆ 18.175676 ┆ 17.529412 ┆ 21.533333 ┆ null        │
│ 1986 ┆ 16.424023 ┆ 18.072464 ┆ 19.0      ┆ 20.606383 ┆ 24.0        │
│ 1987 ┆ 16.042879 ┆ 17.724138 ┆ 17.5625   ┆ 20.085202 ┆ 24.0        │
│ 1988 ┆ 16.085179 ┆ 16.5      ┆ 17.714286 ┆ 19.919283 ┆ 23.333333   │
│ …    ┆ …         ┆ …         ┆ …         ┆ …         ┆ …           │
│ 2015 ┆ 21.473016 ┆ 20.730556 ┆ 20.666667 ┆ 23.891192 ┆ 22.75       │
│ 2016 ┆ 24.283951 ┆ 20.75     ┆ 21.111111 ┆ 26.324022 ┆ 23.711111   │
│ 2017 ┆ 23.414634 ┆ 21.09375  ┆ 22.272727 ┆ 26.353488 ┆ 29.29

#### Let's Time It Up

In [65]:
%%timeit
df_pl_lazy = pl.scan_csv('/Users/isisromero/desktop/polars/datasets/vehicles.csv', null_values='NA')

result = (df_pl_lazy
          .with_columns(
              pl.col('createdOn').str.to_datetime('%a %b %d %H:%M:%S %Z %Y'), origin=make_to_origin_replace(pl.col('make')))
          .filter((pl.col("origin") != "Unknown") & (pl.col("year") < 2020))
          .select(['make', 'model', 'year', 'city08', 'highway08', 'origin', 'createdOn'])
          .group_by(['origin', 'year'])
          .agg(avg_city08=pl.col("city08").mean())
          .collect()
          .pivot(index='year', on='origin', values='avg_city08')
          .sort('year')
         )

8.73 ms ± 228 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Fetching Data

In [68]:
# Replacing with Fetch()
df_pl_lazy = pl.scan_csv('/Users/isisromero/desktop/polars/datasets/vehicles.csv', null_values='NA')

result = (df_pl_lazy
          .with_columns(
              pl.col('createdOn').str.to_datetime('%a %b %d %H:%M:%S %Z %Y'), origin=make_to_origin_replace(pl.col('make')))
          .filter((pl.col("origin") != "Unknown") & (pl.col("year") < 2020))
          .select(['make', 'model', 'year', 'city08', 'highway08', 'origin', 'createdOn'])
          .group_by(['origin', 'year'])
          .agg(avg_city08=pl.col("city08").mean())
          #.collect()
          .fetch(5)
          .pivot(index='year', on='origin', values='avg_city08')
          .sort('year')
         )

  .fetch(5)


In [69]:
print(result)

shape: (2, 3)
┌──────┬───────┬──────┐
│ year ┆ Japan ┆ USA  │
│ ---  ┆ ---   ┆ ---  │
│ i64  ┆ f64   ┆ f64  │
╞══════╪═══════╪══════╡
│ 1985 ┆ null  ┆ 16.5 │
│ 1993 ┆ 17.0  ┆ null │
└──────┴───────┴──────┘


### Pandas Comparison

In [74]:
%%timeit
import pandas as pd
def make_to_origin(make):
    origin_dict = {
        'Chevrolet': 'USA',
        'Ford': 'USA',
        'Dodge': 'USA',
        'GMC': 'USA',
        'Toyota': 'Japan',
        'BMW': 'Germany',
        'Mercedes-Benz': 'Germany',
        'Nissan': 'Japan',
        'Volkswagen': 'Germany',
        'Mitsubishi': 'Japan',
        'Porsche': 'Germany',
        'Mazda': 'Japan',
        'Audi': 'Germany',
        'Honda': 'Japan',
        'Jeep': 'USA',
        'Pontiac': 'USA',
        'Subaru': 'Japan',
        'Volvo': 'Sweden',
        'Hyundai': 'South Korea',
        'Chrysler': 'USA',
        'Tesla': 'USA'
    }
    
    return origin_dict.get(make, "Unknown")

76.5 ns ± 0.634 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [76]:
df_pd = pd.read_csv('/Users/isisromero/desktop/polars/datasets/vehicles.csv', engine='pyarrow', dtype_backend='pyarrow')

(df_pd
 .assign(origin=lambda df: df['make'].apply(make_to_origin), createdOn=lambda df: df['createdOn']
           .str.replace('EDT', '-04:00').str.replace('EST', '-05:00')
        )
 .assign(
     createdOn=lambda df: pd.to_datetime(df['createdOn'], format='%a %b %d %H:%M:%S %z %Y', utc=True),
 )
 .query('origin != "Unknown" and year < 2020')
 .loc[:, ['make', 'model', 'year', 'city08', 'highway08', 'origin', 'createdOn']]
 .groupby(['origin', 'year'])
 .city08
 .mean()
 .unstack('origin')
)

### Viewing Plans

In [77]:
df_pl_lazy = pl.scan_csv('/Users/isisromero/desktop/polars/datasets/vehicles.csv', null_values='NA')

In [78]:
print(df_pl_lazy
      .with_columns(
          pl.col('createdOn').str.to_datetime('%a %b %d %H:%M:%S %Z %Y'), origin=make_to_origin_replace(pl.col('make')))
      .filter((pl.col("origin") != "Unknown") & (pl.col("year") < 2020))
      .select(['make', 'model', 'year', 'city08', 'highway08', 'origin', 'createdOn'])
      .group_by(['origin', 'year'])
      .agg(avg_city08=pl.col("city08").mean())
      #.collect()
     )

naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

AGGREGATE
	[col("city08").mean().alias("avg_city08")] BY [col("origin"), col("year")] FROM
   SELECT [col("make"), col("model"), col("year"), col("city08"), col("highway08"), col("origin"), col("createdOn")] FROM
    FILTER [(col("origin")) != (String(Unknown))] FROM
      FILTER [(col("year")) < (2020)] FROM
         WITH_COLUMNS:
         [col("createdOn").str.strptime([String(raise)]), col("make").replace_strict([Series, Series, String(Unknown)]).alias("origin")] 
          Csv SCAN [/Users/isisromero/desktop/polars/datasets/vehicles.csv]
          PROJECT */84 COLUMNS


In [None]:
df_pl_lazy = pl.scan_csv('/Users/isisromero/desktop/polars/datasets/vehicles.csv', null_values='NA')

In [79]:
print(df_pl_lazy
      .with_columns(
          pl.col('createdOn').str.to_datetime('%a %b %d %H:%M:%S %Z %Y'), origin=make_to_origin_replace(pl.col('make')))
      .filter((pl.col("origin") != "Unknown") & (pl.col("year") < 2020))
      .select(['make', 'model', 'year', 'city08', 'highway08', 'origin', 'createdOn'])
      .group_by(['origin', 'year'])
      .agg(avg_city08=pl.col("city08").mean())
      .explain()
      #.collect()
     )

AGGREGATE
	[col("city08").mean().alias("avg_city08")] BY [col("origin"), col("year")] FROM
  simple π 3/4 ["year", "city08", "origin"]
    FILTER [([(col("year")) < (2020)]) & ([(col("origin")) != (String(Unknown))])] FROM
       WITH_COLUMNS:
       [col("make").replace_strict([Series, Series, String(Unknown)]).alias("origin")] 
        Csv SCAN [/Users/isisromero/desktop/polars/datasets/vehicles.csv]
        PROJECT 3/84 COLUMNS


### Streaming

In [80]:
factor = 400

In [84]:
df_pl_lazy = pl.scan_csv(['/Users/isisromero/desktop/polars/datasets/vehicles.csv']*factor, null_values='NA')

In [85]:
print(df_pl_lazy
      .with_columns(
          pl.col('createdOn').str.to_datetime('%a %b %d %H:%M:%S %Z %Y'), origin=make_to_origin_replace(pl.col('make')))
      .filter((pl.col("origin") != "Unknown") & (pl.col("year") < 2020))
      .select(['make', 'model', 'year', 'city08', 'highway08', 'origin', 'createdOn'])
      .group_by(['origin', 'year'])
      .agg(avg_city08=pl.col("city08").mean())
      .collect(streaming=True)
      .pivot(index='year', on='origin', values='avg_city08')
      .sort('year')
     )

shape: (36, 6)
┌──────┬───────────┬───────────┬───────────┬───────────┬─────────────┐
│ year ┆ Sweden    ┆ Germany   ┆ USA       ┆ Japan     ┆ South Korea │
│ ---  ┆ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---         │
│ i64  ┆ f64       ┆ f64       ┆ f64       ┆ f64       ┆ f64         │
╞══════╪═══════════╪═══════════╪═══════════╪═══════════╪═════════════╡
│ 1984 ┆ 18.540541 ┆ 20.06338  ┆ 16.458456 ┆ 21.705882 ┆ null        │
│ 1985 ┆ 17.529412 ┆ 18.175676 ┆ 16.576883 ┆ 21.533333 ┆ null        │
│ 1986 ┆ 19.0      ┆ 18.072464 ┆ 16.424023 ┆ 20.606383 ┆ 24.0        │
│ 1987 ┆ 17.5625   ┆ 17.724138 ┆ 16.042879 ┆ 20.085202 ┆ 24.0        │
│ 1988 ┆ 17.714286 ┆ 16.5      ┆ 16.085179 ┆ 19.919283 ┆ 23.333333   │
│ …    ┆ …         ┆ …         ┆ …         ┆ …         ┆ …           │
│ 2015 ┆ 20.666667 ┆ 20.730556 ┆ 21.473016 ┆ 23.891192 ┆ 22.75       │
│ 2016 ┆ 21.111111 ┆ 20.75     ┆ 24.283951 ┆ 26.324022 ┆ 23.711111   │
│ 2017 ┆ 22.272727 ┆ 21.09375  ┆ 23.414634 ┆ 26.353488 ┆ 29.29