# Extending Polars

### Loading Libraries

In [22]:
# ZipFiles & IO
import io
import os
import pprint
import zipfile

#URL
import urllib.request

# Time-Zone
import pytz

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import polars as pl
import polars.testing as pt
import polars.selectors as cs

# Numba
import numba
from numba import njit

# Data Visualization
import hvplot
import altair as alt
import seaborn as sns
import holoviews as hv
import matplotlib.pyplot as plt
from IPython.display import display, Math

# XGBoost
import xgboost as xgb

# Scikit-Learn
import sklearn
from sklearn import decomposition
from sklearn import preprocessing, decomposition
from sklearn.model_selection import train_test_split

# Java Script Object Notation
import json

# Date & Time
from datetime import datetime
from datetime import timedelta

In [14]:
hvplot.extension('matplotlib')

### Loans with Polars

#### Loan Formula:
$$
P = L \cdot \frac{c(1 + c)^n}{(1 + c)^n - 1}
$$

In [15]:
# Loan amount
L = 600_000

# Monthly interest
c = 0.055 / 12

# Number of payments
n = 360

In [16]:
# Loan payment amount due
P = (L * 
     (c * (1 + c) ** n) / ((1 + c) ** n - 1))

In [17]:
print(P)

3406.734008082003


#### Dictionary Sequences:

In [18]:
def payment_schedule_gen(principal, number_of_payments, monthly_interest_rate, monthly_payment):
    remaining_balance = principal
    done = False
    for month in range(number_of_payments):
        if remaining_balance < monthly_payment:
            interest_payment = remaining_balance * monthly_interest_rate
            monthly_payment = remaining_balance + interest_payment
            principal_payment = remaining_balance
            remaining_balance = 0
            done = True
        else:
            interest_payment = remaining_balance * monthly_interest_rate
            principal_payment = monthly_payment - interest_payment
            remaining_balance -= principal_payment
            yield {'month': month,
                   'Principal': principal_payment,
                   'Interest': interest_payment,
                   'Remaining Balance': remaining_balance,
                   'Monthly Payment': monthly_payment}
        if done:
            break

In [19]:
print(pl.DataFrame(payment_schedule_gen(L, n, c, P)))

shape: (359, 5)
┌───────┬─────────────┬─────────────┬───────────────────┬─────────────────┐
│ month ┆ Principal   ┆ Interest    ┆ Remaining Balance ┆ Monthly Payment │
│ ---   ┆ ---         ┆ ---         ┆ ---               ┆ ---             │
│ i64   ┆ f64         ┆ f64         ┆ f64               ┆ f64             │
╞═══════╪═════════════╪═════════════╪═══════════════════╪═════════════════╡
│ 0     ┆ 656.734008  ┆ 2750.0      ┆ 599343.265992     ┆ 3406.734008     │
│ 1     ┆ 659.744039  ┆ 2746.989969 ┆ 598683.521953     ┆ 3406.734008     │
│ 2     ┆ 662.767866  ┆ 2743.966142 ┆ 598020.754087     ┆ 3406.734008     │
│ 3     ┆ 665.805552  ┆ 2740.928456 ┆ 597354.948535     ┆ 3406.734008     │
│ 4     ┆ 668.857161  ┆ 2737.876847 ┆ 596686.091375     ┆ 3406.734008     │
│ …     ┆ …           ┆ …           ┆ …                 ┆ …               │
│ 354   ┆ 3314.533509 ┆ 92.200499   ┆ 16801.939082      ┆ 3406.734008     │
│ 355   ┆ 3329.725121 ┆ 77.008887   ┆ 13472.213962      ┆ 3406.734008   

In [20]:
%%timeit
pl.DataFrame(payment_schedule_gen(L, n, c, P))

190 μs ± 3.74 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


### Using Numba

In [21]:
print(numba.__version__)

0.60.0


In [26]:
@njit
def payment_schedule_numba(principal, number_of_payments, monthly_interest_rate, monthly_payment):
    remaining_balance = principal
    done = False
    results = np.zeros((number_of_payments, 5), dtype=np.float64)
    for month in range(number_of_payments):
        if remaining_balance < monthly_payment:
            interest_payment = remaining_balance * monthly_interest_rate
            monthly_payment = remaining_balance + interest_payment
            principal_payment = remaining_balance
            remaining_balance = 0
            done = True
        else:
            interest_payment = remaining_balance * monthly_interest_rate
            principal_payment = monthly_payment - interest_payment
            remaining_balance -= principal_payment
            results[month, 0] = month
            results[month, 1] = principal_payment
            results[month, 2] = interest_payment
            results[month, 3] = remaining_balance
            results[month, 4] = monthly_payment
        if done:
            break
        return results

In [27]:
print(pl.DataFrame(payment_schedule_numba(L, n, c, P))
     .rename({'column_0': 'month', 'column_1': 'Principal', 
              'column_2': 'Interest', 'column_3': 'Remaining Balance', 
              'column_4': 'Monthly Payment'}))        

shape: (360, 5)
┌───────┬────────────┬──────────┬───────────────────┬─────────────────┐
│ month ┆ Principal  ┆ Interest ┆ Remaining Balance ┆ Monthly Payment │
│ ---   ┆ ---        ┆ ---      ┆ ---               ┆ ---             │
│ f64   ┆ f64        ┆ f64      ┆ f64               ┆ f64             │
╞═══════╪════════════╪══════════╪═══════════════════╪═════════════════╡
│ 0.0   ┆ 656.734008 ┆ 2750.0   ┆ 599343.265992     ┆ 3406.734008     │
│ 0.0   ┆ 0.0        ┆ 0.0      ┆ 0.0               ┆ 0.0             │
│ 0.0   ┆ 0.0        ┆ 0.0      ┆ 0.0               ┆ 0.0             │
│ 0.0   ┆ 0.0        ┆ 0.0      ┆ 0.0               ┆ 0.0             │
│ 0.0   ┆ 0.0        ┆ 0.0      ┆ 0.0               ┆ 0.0             │
│ …     ┆ …          ┆ …        ┆ …                 ┆ …               │
│ 0.0   ┆ 0.0        ┆ 0.0      ┆ 0.0               ┆ 0.0             │
│ 0.0   ┆ 0.0        ┆ 0.0      ┆ 0.0               ┆ 0.0             │
│ 0.0   ┆ 0.0        ┆ 0.0      ┆ 0.0           

In [28]:
%%timeit
>>> (pl.DataFrame(payment_schedule_numba(L, n, c, P))
     .rename({'column_0': 'month', 'column_1': 'Principal', 
              'column_2': 'Interest', 'column_3': 'Remaining Balance', 
              'column_4': 'Monthly Payment'}))

15.5 μs ± 189 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


### Closed Form Solution

#### Formula, as follows:
$$
B = P \cdot \frac{(1 + c)^n - (1 + c)^p}{(1 + c)^n - 1}
$$

In [29]:
# Schedule
sched = pl.DataFrame({'month': np.arange(360)})

In [30]:
def remaining_balance_pl(principal, number_of_payments, monthly_interest_rate, num_month):
    return principal * ((
        (1 + monthly_interest_rate)**number_of_payments - ( 1 + monthly_interest_rate)**num_month ) / 
                        ((1 + monthly_interest_rate)**number_of_payments - 1 ))

In [31]:
print(sched
      .with_columns(
          remaining_balance_pl(L, n, c, pl.col('month')).alias('Remaining Balance'), pl.lit(P).alias('Monthly Payment'))
      .with_columns(Interest=(pl.col('Remaining Balance') * c))
      .with_columns(Principal=(pl.col('Monthly Payment') - pl.col('Interest')))
     )

shape: (360, 5)
┌───────┬───────────────────┬─────────────────┬─────────────┬─────────────┐
│ month ┆ Remaining Balance ┆ Monthly Payment ┆ Interest    ┆ Principal   │
│ ---   ┆ ---               ┆ ---             ┆ ---         ┆ ---         │
│ i64   ┆ f64               ┆ f64             ┆ f64         ┆ f64         │
╞═══════╪═══════════════════╪═════════════════╪═════════════╪═════════════╡
│ 0     ┆ 600000.0          ┆ 3406.734008     ┆ 2750.0      ┆ 656.734008  │
│ 1     ┆ 599343.265992     ┆ 3406.734008     ┆ 2746.989969 ┆ 659.744039  │
│ 2     ┆ 598683.521953     ┆ 3406.734008     ┆ 2743.966142 ┆ 662.767866  │
│ 3     ┆ 598020.754087     ┆ 3406.734008     ┆ 2740.928456 ┆ 665.805552  │
│ 4     ┆ 597354.948535     ┆ 3406.734008     ┆ 2737.876847 ┆ 668.857161  │
│ …     ┆ …                 ┆ …               ┆ …           ┆ …           │
│ 355   ┆ 16801.939082      ┆ 3406.734008     ┆ 77.008887   ┆ 3329.725121 │
│ 356   ┆ 13472.213962      ┆ 3406.734008     ┆ 61.747647   ┆ 3344.98636

In [33]:
%%timeit
print(sched
      .with_columns(
          remaining_balance_pl(L, n, c, pl.col('month')).alias('Remaining Balance'), pl.lit(P).alias('Monthly Payment'))
      .with_columns(Interest=(pl.col('Remaining Balance') * c))
      .with_columns(Principal=(pl.col('Monthly Payment') - pl.col('Interest')))
     )

shape: (360, 5)
┌───────┬───────────────────┬─────────────────┬─────────────┬─────────────┐
│ month ┆ Remaining Balance ┆ Monthly Payment ┆ Interest    ┆ Principal   │
│ ---   ┆ ---               ┆ ---             ┆ ---         ┆ ---         │
│ i64   ┆ f64               ┆ f64             ┆ f64         ┆ f64         │
╞═══════╪═══════════════════╪═════════════════╪═════════════╪═════════════╡
│ 0     ┆ 600000.0          ┆ 3406.734008     ┆ 2750.0      ┆ 656.734008  │
│ 1     ┆ 599343.265992     ┆ 3406.734008     ┆ 2746.989969 ┆ 659.744039  │
│ 2     ┆ 598683.521953     ┆ 3406.734008     ┆ 2743.966142 ┆ 662.767866  │
│ 3     ┆ 598020.754087     ┆ 3406.734008     ┆ 2740.928456 ┆ 665.805552  │
│ 4     ┆ 597354.948535     ┆ 3406.734008     ┆ 2737.876847 ┆ 668.857161  │
│ …     ┆ …                 ┆ …               ┆ …           ┆ …           │
│ 355   ┆ 16801.939082      ┆ 3406.734008     ┆ 77.008887   ┆ 3329.725121 │
│ 356   ┆ 13472.213962      ┆ 3406.734008     ┆ 61.747647   ┆ 3344.98636

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



shape: (360, 5)
┌───────┬───────────────────┬─────────────────┬─────────────┬─────────────┐
│ month ┆ Remaining Balance ┆ Monthly Payment ┆ Interest    ┆ Principal   │
│ ---   ┆ ---               ┆ ---             ┆ ---         ┆ ---         │
│ i64   ┆ f64               ┆ f64             ┆ f64         ┆ f64         │
╞═══════╪═══════════════════╪═════════════════╪═════════════╪═════════════╡
│ 0     ┆ 600000.0          ┆ 3406.734008     ┆ 2750.0      ┆ 656.734008  │
│ 1     ┆ 599343.265992     ┆ 3406.734008     ┆ 2746.989969 ┆ 659.744039  │
│ 2     ┆ 598683.521953     ┆ 3406.734008     ┆ 2743.966142 ┆ 662.767866  │
│ 3     ┆ 598020.754087     ┆ 3406.734008     ┆ 2740.928456 ┆ 665.805552  │
│ 4     ┆ 597354.948535     ┆ 3406.734008     ┆ 2737.876847 ┆ 668.857161  │
│ …     ┆ …                 ┆ …               ┆ …           ┆ …           │
│ 355   ┆ 16801.939082      ┆ 3406.734008     ┆ 77.008887   ┆ 3329.725121 │
│ 356   ┆ 13472.213962      ┆ 3406.734008     ┆ 61.747647   ┆ 3344.98636

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



shape: (360, 5)
┌───────┬───────────────────┬─────────────────┬─────────────┬─────────────┐
│ month ┆ Remaining Balance ┆ Monthly Payment ┆ Interest    ┆ Principal   │
│ ---   ┆ ---               ┆ ---             ┆ ---         ┆ ---         │
│ i64   ┆ f64               ┆ f64             ┆ f64         ┆ f64         │
╞═══════╪═══════════════════╪═════════════════╪═════════════╪═════════════╡
│ 0     ┆ 600000.0          ┆ 3406.734008     ┆ 2750.0      ┆ 656.734008  │
│ 1     ┆ 599343.265992     ┆ 3406.734008     ┆ 2746.989969 ┆ 659.744039  │
│ 2     ┆ 598683.521953     ┆ 3406.734008     ┆ 2743.966142 ┆ 662.767866  │
│ 3     ┆ 598020.754087     ┆ 3406.734008     ┆ 2740.928456 ┆ 665.805552  │
│ 4     ┆ 597354.948535     ┆ 3406.734008     ┆ 2737.876847 ┆ 668.857161  │
│ …     ┆ …                 ┆ …               ┆ …           ┆ …           │
│ 355   ┆ 16801.939082      ┆ 3406.734008     ┆ 77.008887   ┆ 3329.725121 │
│ 356   ┆ 13472.213962      ┆ 3406.734008     ┆ 61.747647   ┆ 3344.98636

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



shape: (360, 5)
┌───────┬───────────────────┬─────────────────┬─────────────┬─────────────┐
│ month ┆ Remaining Balance ┆ Monthly Payment ┆ Interest    ┆ Principal   │
│ ---   ┆ ---               ┆ ---             ┆ ---         ┆ ---         │
│ i64   ┆ f64               ┆ f64             ┆ f64         ┆ f64         │
╞═══════╪═══════════════════╪═════════════════╪═════════════╪═════════════╡
│ 0     ┆ 600000.0          ┆ 3406.734008     ┆ 2750.0      ┆ 656.734008  │
│ 1     ┆ 599343.265992     ┆ 3406.734008     ┆ 2746.989969 ┆ 659.744039  │
│ 2     ┆ 598683.521953     ┆ 3406.734008     ┆ 2743.966142 ┆ 662.767866  │
│ 3     ┆ 598020.754087     ┆ 3406.734008     ┆ 2740.928456 ┆ 665.805552  │
│ 4     ┆ 597354.948535     ┆ 3406.734008     ┆ 2737.876847 ┆ 668.857161  │
│ …     ┆ …                 ┆ …               ┆ …           ┆ …           │
│ 355   ┆ 16801.939082      ┆ 3406.734008     ┆ 77.008887   ┆ 3329.725121 │
│ 356   ┆ 13472.213962      ┆ 3406.734008     ┆ 61.747647   ┆ 3344.98636

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



shape: (360, 5)
┌───────┬───────────────────┬─────────────────┬─────────────┬─────────────┐
│ month ┆ Remaining Balance ┆ Monthly Payment ┆ Interest    ┆ Principal   │
│ ---   ┆ ---               ┆ ---             ┆ ---         ┆ ---         │
│ i64   ┆ f64               ┆ f64             ┆ f64         ┆ f64         │
╞═══════╪═══════════════════╪═════════════════╪═════════════╪═════════════╡
│ 0     ┆ 600000.0          ┆ 3406.734008     ┆ 2750.0      ┆ 656.734008  │
│ 1     ┆ 599343.265992     ┆ 3406.734008     ┆ 2746.989969 ┆ 659.744039  │
│ 2     ┆ 598683.521953     ┆ 3406.734008     ┆ 2743.966142 ┆ 662.767866  │
│ 3     ┆ 598020.754087     ┆ 3406.734008     ┆ 2740.928456 ┆ 665.805552  │
│ 4     ┆ 597354.948535     ┆ 3406.734008     ┆ 2737.876847 ┆ 668.857161  │
│ …     ┆ …                 ┆ …               ┆ …           ┆ …           │
│ 355   ┆ 16801.939082      ┆ 3406.734008     ┆ 77.008887   ┆ 3329.725121 │
│ 356   ┆ 13472.213962      ┆ 3406.734008     ┆ 61.747647   ┆ 3344.98636

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



shape: (360, 5)
┌───────┬───────────────────┬─────────────────┬─────────────┬─────────────┐
│ month ┆ Remaining Balance ┆ Monthly Payment ┆ Interest    ┆ Principal   │
│ ---   ┆ ---               ┆ ---             ┆ ---         ┆ ---         │
│ i64   ┆ f64               ┆ f64             ┆ f64         ┆ f64         │
╞═══════╪═══════════════════╪═════════════════╪═════════════╪═════════════╡
│ 0     ┆ 600000.0          ┆ 3406.734008     ┆ 2750.0      ┆ 656.734008  │
│ 1     ┆ 599343.265992     ┆ 3406.734008     ┆ 2746.989969 ┆ 659.744039  │
│ 2     ┆ 598683.521953     ┆ 3406.734008     ┆ 2743.966142 ┆ 662.767866  │
│ 3     ┆ 598020.754087     ┆ 3406.734008     ┆ 2740.928456 ┆ 665.805552  │
│ 4     ┆ 597354.948535     ┆ 3406.734008     ┆ 2737.876847 ┆ 668.857161  │
│ …     ┆ …                 ┆ …               ┆ …           ┆ …           │
│ 355   ┆ 16801.939082      ┆ 3406.734008     ┆ 77.008887   ┆ 3329.725121 │
│ 356   ┆ 13472.213962      ┆ 3406.734008     ┆ 61.747647   ┆ 3344.98636

### Adding `PCA API` to Polars

#### Using `Fuel Economy` Dataset

In [34]:
# Path
path = '/Users/isisromero/desktop/polars/datasets/vehicles.csv'

In [35]:
raw = pl.read_csv(path, null_values=['NA'])

In [36]:
def tweak_auto(df):
    cols = ['year', 'make', 'model', 'displ', 'cylinders', 'trany', 
            'drive', 'VClass', 'fuelType', 'barrels08', 'city08', 'highway08', 'createdOn']
    return (df
            .select(pl.col(cols))
            .with_columns(pl.col('year').cast(pl.Int16),
                          pl.col(['cylinders', 'highway08', 'city08']).cast(pl.UInt8),
                          pl.col(['displ', 'barrels08']).cast(pl.Float32),
                          pl.col(['make', 'model', 'VClass', 'drive', 'fuelType']).cast(pl.Categorical),
                          pl.col('createdOn').str.to_datetime('%a %b %d %H:%M:%S %Z %Y'),
                          is_automatic=pl.col('trany')                    
                          .str.contains('Automatic')
                          .fill_null('Automatic'),
                          num_gears=pl.col('trany')
                          .str.extract(r'(\d+)')
                          .cast(pl.UInt8)
                          .fill_null(6))
           )

In [37]:
autos = tweak_auto(raw)

### Calculating The Principal Components

In [38]:
X = (autos
     .select(cs.numeric().fill_null(0))
     .select((pl.all() - pl.all().mean()) / pl.all().std()) # 1
)

num_df = X
centered = (num_df # 2
            .select((pl.all() - pl.all().mean()))
           )

cov = np.cov(centered.transpose()) # 3

vals, vecs = np.linalg.eig(cov) # 4

exp_var = pl.DataFrame( # 5
    {'PC': [f'PC{i+1}' for i in range(len(num_df.columns))],
     'var':sorted(vals, reverse=True)})

idxs = np.argsort(vals)[::-1]

comps = (pl.DataFrame(vecs[:, idxs]) # 6
         .rename(mapping={f'column_{i}': f'PC{i+1}' 
                          for i in range(len(num_df.columns))})
)

pcas = (pl.DataFrame(np.dot(centered, comps)) # 7
    .rename(mapping={f'column_{i}': f'PC{i+1}' 
                     for i in range(len(num_df.columns))})
)

In [39]:
print(pcas)

shape: (48_231, 7)
┌───────────┬──────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ PC1       ┆ PC2      ┆ PC3       ┆ PC4       ┆ PC5       ┆ PC6       ┆ PC7       │
│ ---       ┆ ---      ┆ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ f64       ┆ f64      ┆ f64       ┆ f64       ┆ f64       ┆ f64       ┆ f64       │
╞═══════════╪══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 0.549902  ┆ 1.378063 ┆ 1.077457  ┆ 0.657628  ┆ 0.288561  ┆ -0.039205 ┆ -0.035689 │
│ -4.218674 ┆ 0.901778 ┆ -1.210626 ┆ 0.921096  ┆ 0.413656  ┆ 1.603396  ┆ -0.020481 │
│ 1.271853  ┆ 1.275372 ┆ 0.637652  ┆ 1.069356  ┆ -0.084872 ┆ -0.221518 ┆ 0.185363  │
│ -3.38195  ┆ 1.991488 ┆ -0.796244 ┆ -0.308666 ┆ 0.790254  ┆ 0.188206  ┆ -0.003548 │
│ 0.275128  ┆ 0.96946  ┆ 1.004431  ┆ 0.090781  ┆ 0.39297   ┆ -0.052182 ┆ 0.004378  │
│ …         ┆ …        ┆ …         ┆ …         ┆ …         ┆ …         ┆ …         │
│ 0.71862   ┆ 1.316193 ┆ 0.692686  ┆ -0.017141