# Data Import & Export

### Loading Libraries

In [1]:
# ZipFiles & IO
import io
import os
import pprint
import zipfile

#URL
import urllib.request

# Time-Zone
import pytz

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import polars as pl
import polars.selectors as cs

# Data Visualization
import hvplot
import altair as alt
import seaborn as sns
import holoviews as hv
import matplotlib.pyplot as plt

# XGBoost
import xgboost as xgb

# Scikit-Learn
import sklearn
from sklearn import decomposition
from sklearn import preprocessing, decomposition
from sklearn.model_selection import train_test_split

# Java Script Object Notation
import json

# Date & Time
from datetime import datetime
from datetime import timedelta

In [2]:
hvplot.extension('matplotlib')

### Loading Data

In [3]:
# Path
path = '/Users/isisromero/desktop/polars/datasets/vehicles.csv'

In [4]:
raw = pl.read_csv(path, null_values=['NA'])

In [5]:
def tweak_auto(df):
    cols = ['year', 'make', 'model', 'displ', 'cylinders', 'trany', 
            'drive', 'VClass', 'fuelType', 'barrels08', 'city08', 'highway08', 'createdOn']
    return (df
            .select(pl.col(cols))
            .with_columns(pl.col('year').cast(pl.Int16),
                          pl.col(['cylinders', 'highway08', 'city08']).cast(pl.UInt8),
                          pl.col(['displ', 'barrels08']).cast(pl.Float32),
                          pl.col(['make', 'model', 'VClass', 'drive', 'fuelType']).cast(pl.Categorical),
                          pl.col('createdOn').str.to_datetime('%a %b %d %H:%M:%S %Z %Y'),
                          is_automatic=pl.col('trany')                    
                          .str.contains('Automatic')
                          .fill_null('Automatic'),
                          num_gears=pl.col('trany')
                          .str.extract(r'(\d+)')
                          .cast(pl.UInt8)
                          .fill_null(6))
           )

In [6]:
autos = tweak_auto(raw)

### Exporting to `CSV`

In [7]:
print(autos
      .head(3)
      .write_csv()
     )

year,make,model,displ,cylinders,trany,drive,VClass,fuelType,barrels08,city08,highway08,createdOn,is_automatic,num_gears
1985,Alfa Romeo,Spider Veloce 2000,2.0,4,Manual 5-spd,Rear-Wheel Drive,Two Seaters,Regular,14.167143,19,25,2013-01-01T00:00:00.000000,false,5
1985,Ferrari,Testarossa,4.9,12,Manual 5-spd,Rear-Wheel Drive,Two Seaters,Regular,27.046364,9,14,2013-01-01T00:00:00.000000,false,5
1985,Dodge,Charger,2.2,4,Manual 5-spd,Front-Wheel Drive,Subcompact Cars,Regular,11.018888,23,33,2013-01-01T00:00:00.000000,false,5



In [8]:
print(autos
      .head(3)
      .write_csv(float_precision=2)
     )

year,make,model,displ,cylinders,trany,drive,VClass,fuelType,barrels08,city08,highway08,createdOn,is_automatic,num_gears
1985,Alfa Romeo,Spider Veloce 2000,2.00,4,Manual 5-spd,Rear-Wheel Drive,Two Seaters,Regular,14.17,19,25,2013-01-01T00:00:00.000000,false,5
1985,Ferrari,Testarossa,4.90,12,Manual 5-spd,Rear-Wheel Drive,Two Seaters,Regular,27.05,9,14,2013-01-01T00:00:00.000000,false,5
1985,Dodge,Charger,2.20,4,Manual 5-spd,Front-Wheel Drive,Subcompact Cars,Regular,11.02,23,33,2013-01-01T00:00:00.000000,false,5



### Exporting to `JSON`

In [9]:
pprint.pprint(json.loads(
    autos.head(2)
    .write_json()))

[{'VClass': 'Two Seaters',
  'barrels08': 14.167143,
  'city08': 19,
  'createdOn': '2013-01-01 00:00:00',
  'cylinders': 4,
  'displ': 2.0,
  'drive': 'Rear-Wheel Drive',
  'fuelType': 'Regular',
  'highway08': 25,
  'is_automatic': 'false',
  'make': 'Alfa Romeo',
  'model': 'Spider Veloce 2000',
  'num_gears': 5,
  'trany': 'Manual 5-spd',
  'year': 1985},
 {'VClass': 'Two Seaters',
  'barrels08': 27.046364,
  'city08': 9,
  'createdOn': '2013-01-01 00:00:00',
  'cylinders': 12,
  'displ': 4.9,
  'drive': 'Rear-Wheel Drive',
  'fuelType': 'Regular',
  'highway08': 14,
  'is_automatic': 'false',
  'make': 'Ferrari',
  'model': 'Testarossa',
  'num_gears': 5,
  'trany': 'Manual 5-spd',
  'year': 1985}]


### Reading `JSON`

In [10]:
from io import StringIO

print(pl.read_json(StringIO(autos.write_json())))

shape: (48_231, 15)
┌──────┬────────────┬──────────────┬───────┬───┬───────────┬─────────────┬─────────────┬───────────┐
│ year ┆ make       ┆ model        ┆ displ ┆ … ┆ highway08 ┆ createdOn   ┆ is_automati ┆ num_gears │
│ ---  ┆ ---        ┆ ---          ┆ ---   ┆   ┆ ---       ┆ ---         ┆ c           ┆ ---       │
│ i64  ┆ str        ┆ str          ┆ f64   ┆   ┆ i64       ┆ str         ┆ ---         ┆ i64       │
│      ┆            ┆              ┆       ┆   ┆           ┆             ┆ str         ┆           │
╞══════╪════════════╪══════════════╪═══════╪═══╪═══════════╪═════════════╪═════════════╪═══════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider       ┆ 2.0   ┆ … ┆ 25        ┆ 2013-01-01  ┆ false       ┆ 5         │
│      ┆            ┆ Veloce 2000  ┆       ┆   ┆           ┆ 00:00:00    ┆             ┆           │
│ 1985 ┆ Ferrari    ┆ Testarossa   ┆ 4.9   ┆ … ┆ 14        ┆ 2013-01-01  ┆ false       ┆ 5         │
│      ┆            ┆              ┆       ┆   ┆           ┆ 00:00:00  

In [11]:
# autos.head(3).to_pandas().to_json('tmp/pd.json', orient='records')

In [12]:
# pprint.pprint(json.loads(autos.head(2)
#                          .to_pandas().to_json(orient='records')))

In [13]:
# from_pd = pl.read_json('/tmp/pd.json')

# print(from_pd)

In [14]:
# print(from_pd
#       .with_columns(createdOn=pl.from_epoch('createdOn', time_unit='s'))
#      )

### `Custom JSON` Handling

In [15]:
# Using Default Pandas Orientation
pprint.pprint(json.loads(autos.head(2)
                         .to_pandas().to_json()))

{'VClass': {'0': 'Two Seaters', '1': 'Two Seaters'},
 'barrels08': {'0': 14.167142868, '1': 27.0463638306},
 'city08': {'0': 19, '1': 9},
 'createdOn': {'0': 1356998400, '1': 1356998400},
 'cylinders': {'0': 4, '1': 12},
 'displ': {'0': 2.0, '1': 4.9000000954},
 'drive': {'0': 'Rear-Wheel Drive', '1': 'Rear-Wheel Drive'},
 'fuelType': {'0': 'Regular', '1': 'Regular'},
 'highway08': {'0': 25, '1': 14},
 'is_automatic': {'0': 'false', '1': 'false'},
 'make': {'0': 'Alfa Romeo', '1': 'Ferrari'},
 'model': {'0': 'Spider Veloce 2000', '1': 'Testarossa'},
 'num_gears': {'0': 5, '1': 5},
 'trany': {'0': 'Manual 5-spd', '1': 'Manual 5-spd'},
 'year': {'0': 1985, '1': 1985}}


In [16]:
print(pl.read_json(io.StringIO(autos.head(2).to_pandas().to_json())))

shape: (1, 15)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ year      ┆ make      ┆ model     ┆ displ     ┆ … ┆ highway08 ┆ createdOn ┆ is_automa ┆ num_gear │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ tic       ┆ s        │
│ struct[2] ┆ struct[2] ┆ struct[2] ┆ struct[2] ┆   ┆ struct[2] ┆ struct[2] ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆ struct[2] ┆ struct[2 │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ ]        │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ {1985,198 ┆ {"Alfa    ┆ {"Spider  ┆ {2.0,4.9} ┆ … ┆ {25,14}   ┆ {13569984 ┆ {"false", ┆ {5,5}    │
│ 5}        ┆ Romeo","F ┆ Veloce    ┆           ┆   ┆           ┆ 00,135699 ┆ "false"}  ┆          │
│           ┆ errari"}  ┆ 2000","Te ┆           ┆   ┆           ┆ 8400}     

In [17]:
print(pl.DataFrame({
    'num': [1, 2, 3], 
    'listy': [[1, 2], [3, 4], [5, 6]],
    'structy': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}, {'a': 5, 'b': 6}],
})
     )

shape: (3, 3)
┌─────┬───────────┬───────────┐
│ num ┆ listy     ┆ structy   │
│ --- ┆ ---       ┆ ---       │
│ i64 ┆ list[i64] ┆ struct[2] │
╞═════╪═══════════╪═══════════╡
│ 1   ┆ [1, 2]    ┆ {1,2}     │
│ 2   ┆ [3, 4]    ┆ {3,4}     │
│ 3   ┆ [5, 6]    ┆ {5,6}     │
└─────┴───────────┴───────────┘


In [18]:
# Exploiding Data
print(pl.DataFrame({
    'num': [1, 2, 3], 
    'listy': [[1, 2], [3, 4], [5, 6]],
    'structy': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}, {'a': 5, 'b': 6}],
})
      .explode('listy')
     )

shape: (6, 3)
┌─────┬───────┬───────────┐
│ num ┆ listy ┆ structy   │
│ --- ┆ ---   ┆ ---       │
│ i64 ┆ i64   ┆ struct[2] │
╞═════╪═══════╪═══════════╡
│ 1   ┆ 1     ┆ {1,2}     │
│ 1   ┆ 2     ┆ {1,2}     │
│ 2   ┆ 3     ┆ {3,4}     │
│ 2   ┆ 4     ┆ {3,4}     │
│ 3   ┆ 5     ┆ {5,6}     │
│ 3   ┆ 6     ┆ {5,6}     │
└─────┴───────┴───────────┘


In [19]:
print(pl.DataFrame({
    'num': [1, 2, 3], 
    'listy': [[1, 2], [3, 4], [5, 6]],
    'structy': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}, {'a': 5, 'b': 6}],
})
      .explode('structy')
     )

InvalidOperationError: `explode` operation not supported for dtype `struct[2]`

In [None]:
# Converting struct to a list
print(pl.DataFrame({
    'num': [1, 2, 3], 
    'listy': [[1, 2], [3, 4], [5, 6]],
    'structy': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}, {'a': 5, 'b': 6}]})
      .with_columns(structy=pl.col('structy').map_elements(
          lambda d: list(d.values())
      )
                   )
     )

In [None]:
print(pl.read_json(io.StringIO(autos.to_pandas().to_json()))
      .with_columns(pl.all().map_elements(lambda d: list(d.values())))
      .explode(pl.all())
      .with_columns(createdOn=pl.from_epoch('createdOn', time_unit='s'))
     )

### Munging `JSON`

In [None]:
pprint.pprint(json.loads(autos.head(2).to_pandas()
                         .to_json(orient='split')))

In [20]:
# Reading into Polars
print(pl.read_json(io.StringIO(autos.to_pandas()
                               .to_json(orient='split'))))

shape: (1, 3)
┌─────────────────────────────────┬─────────────────┬─────────────────────────────────┐
│ columns                         ┆ index           ┆ data                            │
│ ---                             ┆ ---             ┆ ---                             │
│ list[str]                       ┆ list[i64]       ┆ list[list[str]]                 │
╞═════════════════════════════════╪═════════════════╪═════════════════════════════════╡
│ ["year", "make", … "num_gears"… ┆ [0, 1, … 48230] ┆ [["1985", "Alfa Romeo", … "5"]… │
└─────────────────────────────────┴─────────────────┴─────────────────────────────────┘


In [21]:
def split_json_to_dict(json_str):
    """ Convert pandas "split" json to a sequence of dictionaries
    representing the rows of the dataframe.
    """
    data = json.loads(json_str)
    columns = data['columns']
    for row in data['data']:
        yield dict(zip(columns, row))

In [22]:
print(pl.DataFrame(
    split_json_to_dict(autos.to_pandas().to_json(orient='split'))))

shape: (48_231, 15)
┌──────┬────────────┬──────────────┬───────┬───┬───────────┬────────────┬──────────────┬───────────┐
│ year ┆ make       ┆ model        ┆ displ ┆ … ┆ highway08 ┆ createdOn  ┆ is_automatic ┆ num_gears │
│ ---  ┆ ---        ┆ ---          ┆ ---   ┆   ┆ ---       ┆ ---        ┆ ---          ┆ ---       │
│ i64  ┆ str        ┆ str          ┆ f64   ┆   ┆ i64       ┆ i64        ┆ str          ┆ i64       │
╞══════╪════════════╪══════════════╪═══════╪═══╪═══════════╪════════════╪══════════════╪═══════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider       ┆ 2.0   ┆ … ┆ 25        ┆ 1356998400 ┆ false        ┆ 5         │
│      ┆            ┆ Veloce 2000  ┆       ┆   ┆           ┆            ┆              ┆           │
│ 1985 ┆ Ferrari    ┆ Testarossa   ┆ 4.9   ┆ … ┆ 14        ┆ 1356998400 ┆ false        ┆ 5         │
│ 1985 ┆ Dodge      ┆ Charger      ┆ 2.2   ┆ … ┆ 33        ┆ 1356998400 ┆ false        ┆ 5         │
│ 1985 ┆ Dodge      ┆ B150/B250    ┆ 5.2   ┆ … ┆ 12        ┆ 1356998400

### Exporting to `Excel`

In [23]:
(autos
 .head(3)
 .write_excel('/Users/isisromero/desktop/polars/datasets/autos.xlsx')
)

<xlsxwriter.workbook.Workbook at 0x16dc47450>

In [24]:
a3 = (pl.read_excel('/Users/isisromero/desktop/polars/datasets/autos.xlsx'))

print(a3)

shape: (3, 15)
┌──────┬────────────┬──────────────┬───────┬───┬───────────┬────────────┬──────────────┬───────────┐
│ year ┆ make       ┆ model        ┆ displ ┆ … ┆ highway08 ┆ createdOn  ┆ is_automatic ┆ num_gears │
│ ---  ┆ ---        ┆ ---          ┆ ---   ┆   ┆ ---       ┆ ---        ┆ ---          ┆ ---       │
│ i64  ┆ str        ┆ str          ┆ f64   ┆   ┆ i64       ┆ date       ┆ str          ┆ i64       │
╞══════╪════════════╪══════════════╪═══════╪═══╪═══════════╪════════════╪══════════════╪═══════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider       ┆ 2.0   ┆ … ┆ 25        ┆ 2013-01-01 ┆ false        ┆ 5         │
│      ┆            ┆ Veloce 2000  ┆       ┆   ┆           ┆            ┆              ┆           │
│ 1985 ┆ Ferrari    ┆ Testarossa   ┆ 4.9   ┆ … ┆ 14        ┆ 2013-01-01 ┆ false        ┆ 5         │
│ 1985 ┆ Dodge      ┆ Charger      ┆ 2.2   ┆ … ┆ 33        ┆ 2013-01-01 ┆ false        ┆ 5         │
└──────┴────────────┴──────────────┴───────┴───┴───────────┴────────────┴───

### Exporting to `Parquet`

In [25]:
(autos
 .head(3)
 .write_parquet('/Users/isisromero/desktop/polars/datasets/a3.parquet')
)

In [26]:
pd.read_parquet('/Users/isisromero/desktop/polars/datasets/a3.parquet').to_parquet('/Users/isisromero/desktop/polars/datasets/a4.parquet')

In [27]:
a4 = pl.read_parquet('/Users/isisromero/desktop/polars/datasets/a4.parquet')

print(a4)

shape: (3, 15)
┌──────┬────────────┬──────────────┬───────┬───┬───────────┬─────────────┬─────────────┬───────────┐
│ year ┆ make       ┆ model        ┆ displ ┆ … ┆ highway08 ┆ createdOn   ┆ is_automati ┆ num_gears │
│ ---  ┆ ---        ┆ ---          ┆ ---   ┆   ┆ ---       ┆ ---         ┆ c           ┆ ---       │
│ i16  ┆ cat        ┆ cat          ┆ f32   ┆   ┆ u8        ┆ datetime[μs ┆ ---         ┆ u8        │
│      ┆            ┆              ┆       ┆   ┆           ┆ ]           ┆ str         ┆           │
╞══════╪════════════╪══════════════╪═══════╪═══╪═══════════╪═════════════╪═════════════╪═══════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider       ┆ 2.0   ┆ … ┆ 25        ┆ 2013-01-01  ┆ false       ┆ 5         │
│      ┆            ┆ Veloce 2000  ┆       ┆   ┆           ┆ 00:00:00    ┆             ┆           │
│ 1985 ┆ Ferrari    ┆ Testarossa   ┆ 4.9   ┆ … ┆ 14        ┆ 2013-01-01  ┆ false       ┆ 5         │
│      ┆            ┆              ┆       ┆   ┆           ┆ 00:00:00    ┆  

In [28]:
autos.head(3).equals(a4)

True

In [29]:
(autos
 .head(3)
 .select(pl.all().shrink_dtype())
 .write_parquet('/Users/isisromero/desktop/polars/datasets/a3-shrink.parquet')
)

### Exporting to `SQL`

In [30]:
import sqlite3

In [31]:
with sqlite3.connect('/tmp/vehicles.db') as conn:
    uri = 'sqlite:////tmp/vehicles.db'
    autos.head(3).write_database(table_name='autos', connection=uri, 
                                 if_table_exists='replace')

In [32]:
from sqlalchemy import create_engine

In [33]:
uri = 'sqlite:////Users/isisromero/desktop/polars/datasets/vehicles.db'

In [34]:
# with create_engine(uri).connect() as conn:
#     query = 'SELECT * FROM autos'
#     a4 = pl.read_database(query=query, connection=conn)

In [35]:
# print(a4)

### Using `Arrow` to Convert DataFrames

In [36]:
import duckdb

In [37]:
sql = '''SELECT mean(city08) AS mean_city08,
    mean(highway08) AS mean_highway08,
    year
    FROM autos
    GROUP BY year'''

In [38]:
agg = duckdb.sql(sql)

In [39]:
print(agg)

┌────────────────────┬────────────────────┬───────┐
│    mean_city08     │   mean_highway08   │ year  │
│       double       │       double       │ int16 │
├────────────────────┼────────────────────┼───────┤
│  17.87830687830688 │  23.04232804232804 │  1985 │
│ 16.569803516028955 │ 22.671147880041364 │  1995 │
│ 17.113300492610836 │  23.54679802955665 │  1998 │
│ 17.982688391038696 │ 23.075356415478616 │  1984 │
│   24.2846034214619 │ 29.767496111975117 │  2021 │
│ 17.135170603674542 │ 23.451443569553806 │  1997 │
│ 17.275521405049396 │  23.32821075740944 │  2001 │
│ 17.665289256198346 │ 22.699173553719007 │  1986 │
│  16.78065134099617 │ 22.836206896551722 │  2003 │
│ 17.333615580016936 │ 24.018628281117696 │  2009 │
│          ·         │          ·         │    ·  │
│          ·         │          ·         │    ·  │
│          ·         │          ·         │    ·  │
│ 17.289780077619664 │  23.56921086675291 │  1996 │
│ 16.893333333333334 │  23.03076923076923 │  2002 │
│ 16.8055307

In [40]:
# Getting Pandas DataFrame outcomes
pd_agg = agg.pl().to_pandas()

In [41]:
print(pd_agg)

    mean_city08  mean_highway08  year
0     16.918534       22.725051  1994
1     17.272300       23.552817  1999
2     16.848940       22.253534  1991
3     19.325480       26.070681  2012
4     23.820350       29.780604  2020
5     16.998170       22.780421  1993
6     16.740642       23.064171  2004
7     16.605684       23.083481  2007
8     17.310345       22.445068  1987
9     18.668739       25.168739  2011
10    16.626812       23.048913  2006
11    16.900590       23.455771  2008
12    17.221429       23.414286  2000
13    18.106691       24.950271  2010
14    17.143972       22.465742  1989
15    17.033395       22.337662  1990
16    21.050736       27.987725  2014
17    21.455113       28.590164  2015
18    22.611852       29.309630  2018
19    29.485714       33.742857  2023
20    35.170084       38.342581  2025
21    16.851630       23.297599  2005
22    17.333628       22.702655  1988
23    20.656780       27.497458  2013
24    22.850309       29.627315  2017
25    23.461

In [42]:
# Plotting outcomes
fig, ax = plt.subplots(figsize=(10, 5))

(agg
 .pl()
 .to_pandas()
 .set_index('year')
 .plot(ax=ax, title='Average MPG by Year'))

fig.savefig('/Users/isisromero/desktop/polars/img/agg1.png', bbox_inches='tight')

##### Sorting By

In [43]:
autos = tweak_auto(raw)

In [44]:
sql = '''SELECT mean(city08) AS mean_city08,
    mean(highway08) AS mean_highway08,
    year
    FROM autos
    GROUP BY year
    ORDER BY year'''

In [45]:
agg = duckdb.sql(sql)

In [46]:
fig, ax = plt.subplots(figsize=(10, 5))

(agg
 .pl()
 .to_pandas()
 .set_index('year')
 .plot(ax=ax, title='Average MPG by Year'))

fig.savefig('/Users/isisromero/desktop/polars/img/agg2.png', bbox_inches='tight')

### Converting to Pandas

In [47]:
autos2 = (autos.to_pandas(use_pyarrow_extension_array=True)
          .pipe(pl.from_pandas)
         )

In [48]:
autos2.equals(autos)

False

## Working with Other Libraries

### Using XGBoost to Predict Mileage

In [49]:
# Features
X = (autos
     .select(cs.numeric() - cs.matches('(city08|highway08)'))
    )

In [50]:
# Label
y = (autos.select(pl.col('city08')))

In [51]:
print(X)

shape: (48_231, 5)
┌──────┬───────┬───────────┬───────────┬───────────┐
│ year ┆ displ ┆ cylinders ┆ barrels08 ┆ num_gears │
│ ---  ┆ ---   ┆ ---       ┆ ---       ┆ ---       │
│ i16  ┆ f32   ┆ u8        ┆ f32       ┆ u8        │
╞══════╪═══════╪═══════════╪═══════════╪═══════════╡
│ 1985 ┆ 2.0   ┆ 4         ┆ 14.167143 ┆ 5         │
│ 1985 ┆ 4.9   ┆ 12        ┆ 27.046364 ┆ 5         │
│ 1985 ┆ 2.2   ┆ 4         ┆ 11.018888 ┆ 5         │
│ 1985 ┆ 5.2   ┆ 8         ┆ 27.046364 ┆ 3         │
│ 1993 ┆ 2.2   ┆ 4         ┆ 15.658422 ┆ 5         │
│ …    ┆ …     ┆ …         ┆ …         ┆ …         │
│ 1993 ┆ 2.2   ┆ 4         ┆ 13.523182 ┆ 4         │
│ 1993 ┆ 2.2   ┆ 4         ┆ 12.935218 ┆ 5         │
│ 1993 ┆ 2.2   ┆ 4         ┆ 14.167143 ┆ 4         │
│ 1993 ┆ 2.2   ┆ 4         ┆ 14.167143 ┆ 5         │
│ 1993 ┆ 2.2   ┆ 4         ┆ 16.528334 ┆ 4         │
└──────┴───────┴───────────┴───────────┴───────────┘


In [52]:
print(y)

shape: (48_231, 1)
┌────────┐
│ city08 │
│ ---    │
│ u8     │
╞════════╡
│ 19     │
│ 9      │
│ 23     │
│ 10     │
│ 17     │
│ …      │
│ 19     │
│ 20     │
│ 18     │
│ 18     │
│ 16     │
└────────┘


In [53]:
# Splitting Prodecure
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [54]:
# Model Regressor
xg = xgb.XGBRegressor()

In [55]:
# Fitting Model
xg.fit(X_train, y_train)

In [56]:
# Getting score
xg.score(X_test, y_test)

0.9917415380477905

### Plotting Residuals

In [57]:
residuals = y_test.to_series() - xg.predict(X_test) 

In [58]:
print(residuals)

shape: (12_058,)
Series: 'city08' [f32]
[
	-0.931131
	0.365319
	1.016228
	1.363846
	0.539216
	…
	0.278222
	-5.394417
	0.027104
	0.871807
	-0.27842
]


In [59]:
fig, ax = plt.subplots(figsize=(10, 5))

In [60]:
fig, ax = plt.subplots(figsize=(10, 5))

ax.scatter(y_test, residuals)
ax.scatter(pl.DataFrame(xg.predict(X_test)) , residuals, alpha=0.1)
ax.set_title('City MPG Residuals vs. Predicted City MPG')
ax.set_xlabel('Predicted City MPG')
ax.set_ylabel('Residuals')

fig.savefig('/Users/isisromero/desktop/polars/img/agg3.png', bbox_inches='tight')

### PCA of The Autos Data

In [61]:
# Setting Up
sklearn.set_config(transform_output='polars')

In [63]:
# Placing StandardScaler()
std = preprocessing.StandardScaler()

In [64]:
X_std = std.fit_transform(
    autos.select(pl.col(['displ', 'cylinders', 'barrels08', 'city08', 'highway08'])
                 .fill_null(0)))

In [65]:
# PCA Model
pca = decomposition.PCA(n_components=2)

In [66]:
# Fitting Model
res = pca.fit_transform(X_std)

In [67]:
print(res)

shape: (48_231, 2)
┌───────────┬───────────┐
│ pca0      ┆ pca1      │
│ ---       ┆ ---       │
│ f64       ┆ f64       │
╞═══════════╪═══════════╡
│ -0.767061 ┆ -0.897247 │
│ 4.078251  ┆ 1.287975  │
│ -1.490803 ┆ -0.351204 │
│ 3.296114  ┆ 0.431262  │
│ -0.402969 ┆ -0.975991 │
│ …         ┆ …         │
│ -0.811728 ┆ -0.789896 │
│ -0.987886 ┆ -0.669054 │
│ -0.629879 ┆ -0.90998  │
│ -0.629879 ┆ -0.90998  │
│ -0.198157 ┆ -1.093019 │
└───────────┴───────────┘


In [68]:
def fix_naming(name):
    return f'{name.upper()[:2]}{int(name[-1])+1}'   

In [69]:
(res
 .rename(fix_naming)
 .with_columns(color=autos['cylinders'])
 .plot.scatter('PC1', 'PC2', color='color', title='PCA of Autos', cmap='viridis')
)

TypeError: _EncodingMixin.encode() got an unexpected keyword argument 'title'

In [75]:
print(sampled_data.columns)

Index(['pca0', 'pca1'], dtype='object')


In [76]:
print(autos.columns)

['year', 'make', 'model', 'displ', 'cylinders', 'trany', 'drive', 'VClass', 'fuelType', 'barrels08', 'city08', 'highway08', 'createdOn', 'is_automatic', 'num_gears']


In [77]:
res = res.with_columns(color=autos["cylinders"])

In [78]:
print(res.columns)

['pca0', 'pca1', 'color']


In [79]:
sampled_data = res.sample(n=5000).to_pandas()

print(sampled_data.columns)

Index(['pca0', 'pca1', 'color'], dtype='object')


In [80]:
sampled_data["color"] = sampled_data["color"].fillna(0)

In [83]:
import altair as alt

scatter_plot = (
    alt.Chart(sampled_data)
    .mark_point()
    .encode(
        x=alt.X("pca0:Q", title="PC1"),
        y=alt.Y("pca1:Q", title="PC2"),
        color=alt.Color("color:Q", scale=alt.Scale(scheme="viridis"), title="Cylinders"),
    )
    .properties(title="PCA of Autos", width=600, height=400)
)

scatter_plot.show()


### Configuration

```sh
pl.Config.set_tbl_width_chars(70)
pl.Config.set_float_precision(2)
pl.Config.set_tbl_cols(6)

with pl.Config(set_tbl_width_chars=70):
    print(df)