# Data Import & Export

### Loading Libraries

In [1]:
# ZipFiles & IO
import io
import os
import pprint
import zipfile

#URL
import urllib.request

# Time-Zone
import pytz

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import polars as pl
import polars.selectors as cs

# Data Visualization
import hvplot
import altair as alt
import seaborn as sns
import holoviews as hv
import matplotlib.pyplot as plt

# XGBoost
import xgboost as xgb

# Scikit-Learn
from sklearn import decomposition

# Java Script Object Notation
import json

# Date & Time
from datetime import datetime
from datetime import timedelta

In [2]:
hvplot.extension('matplotlib')

### Loading Data

In [3]:
# Path
path = '/Users/isisromero/desktop/polars/datasets/vehicles.csv'

In [4]:
raw = pl.read_csv(path, null_values=['NA'])

In [5]:
def tweak_auto(df):
    cols = ['year', 'make', 'model', 'displ', 'cylinders', 'trany', 
            'drive', 'VClass', 'fuelType', 'barrels08', 'city08', 'highway08', 'createdOn']
    return (df
            .select(pl.col(cols))
            .with_columns(pl.col('year').cast(pl.Int16),
                          pl.col(['cylinders', 'highway08', 'city08']).cast(pl.UInt8),
                          pl.col(['displ', 'barrels08']).cast(pl.Float32),
                          pl.col(['make', 'model', 'VClass', 'drive', 'fuelType']).cast(pl.Categorical),
                          pl.col('createdOn').str.to_datetime('%a %b %d %H:%M:%S %Z %Y'),
                          is_automatic=pl.col('trany')                    
                          .str.contains('Automatic')
                          .fill_null('Automatic'),
                          num_gears=pl.col('trany')
                          .str.extract(r'(\d+)')
                          .cast(pl.UInt8)
                          .fill_null(6))
           )

In [6]:
autos = tweak_auto(raw)

### Exporting to `CSV`

In [7]:
print(autos
      .head(3)
      .write_csv()
     )

year,make,model,displ,cylinders,trany,drive,VClass,fuelType,barrels08,city08,highway08,createdOn,is_automatic,num_gears
1985,Alfa Romeo,Spider Veloce 2000,2.0,4,Manual 5-spd,Rear-Wheel Drive,Two Seaters,Regular,14.167143,19,25,2013-01-01T00:00:00.000000,false,5
1985,Ferrari,Testarossa,4.9,12,Manual 5-spd,Rear-Wheel Drive,Two Seaters,Regular,27.046364,9,14,2013-01-01T00:00:00.000000,false,5
1985,Dodge,Charger,2.2,4,Manual 5-spd,Front-Wheel Drive,Subcompact Cars,Regular,11.018888,23,33,2013-01-01T00:00:00.000000,false,5



In [8]:
print(autos
      .head(3)
      .write_csv(float_precision=2)
     )

year,make,model,displ,cylinders,trany,drive,VClass,fuelType,barrels08,city08,highway08,createdOn,is_automatic,num_gears
1985,Alfa Romeo,Spider Veloce 2000,2.00,4,Manual 5-spd,Rear-Wheel Drive,Two Seaters,Regular,14.17,19,25,2013-01-01T00:00:00.000000,false,5
1985,Ferrari,Testarossa,4.90,12,Manual 5-spd,Rear-Wheel Drive,Two Seaters,Regular,27.05,9,14,2013-01-01T00:00:00.000000,false,5
1985,Dodge,Charger,2.20,4,Manual 5-spd,Front-Wheel Drive,Subcompact Cars,Regular,11.02,23,33,2013-01-01T00:00:00.000000,false,5



### Exporting to `JSON`

In [9]:
pprint.pprint(json.loads(
    autos.head(2)
    .write_json()))

[{'VClass': 'Two Seaters',
  'barrels08': 14.167143,
  'city08': 19,
  'createdOn': '2013-01-01 00:00:00',
  'cylinders': 4,
  'displ': 2.0,
  'drive': 'Rear-Wheel Drive',
  'fuelType': 'Regular',
  'highway08': 25,
  'is_automatic': 'false',
  'make': 'Alfa Romeo',
  'model': 'Spider Veloce 2000',
  'num_gears': 5,
  'trany': 'Manual 5-spd',
  'year': 1985},
 {'VClass': 'Two Seaters',
  'barrels08': 27.046364,
  'city08': 9,
  'createdOn': '2013-01-01 00:00:00',
  'cylinders': 12,
  'displ': 4.9,
  'drive': 'Rear-Wheel Drive',
  'fuelType': 'Regular',
  'highway08': 14,
  'is_automatic': 'false',
  'make': 'Ferrari',
  'model': 'Testarossa',
  'num_gears': 5,
  'trany': 'Manual 5-spd',
  'year': 1985}]


### Reading `JSON`

In [10]:
from io import StringIO

print(pl.read_json(StringIO(autos.write_json())))

shape: (48_231, 15)
┌──────┬────────────┬──────────────┬───────┬───┬───────────┬─────────────┬─────────────┬───────────┐
│ year ┆ make       ┆ model        ┆ displ ┆ … ┆ highway08 ┆ createdOn   ┆ is_automati ┆ num_gears │
│ ---  ┆ ---        ┆ ---          ┆ ---   ┆   ┆ ---       ┆ ---         ┆ c           ┆ ---       │
│ i64  ┆ str        ┆ str          ┆ f64   ┆   ┆ i64       ┆ str         ┆ ---         ┆ i64       │
│      ┆            ┆              ┆       ┆   ┆           ┆             ┆ str         ┆           │
╞══════╪════════════╪══════════════╪═══════╪═══╪═══════════╪═════════════╪═════════════╪═══════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider       ┆ 2.0   ┆ … ┆ 25        ┆ 2013-01-01  ┆ false       ┆ 5         │
│      ┆            ┆ Veloce 2000  ┆       ┆   ┆           ┆ 00:00:00    ┆             ┆           │
│ 1985 ┆ Ferrari    ┆ Testarossa   ┆ 4.9   ┆ … ┆ 14        ┆ 2013-01-01  ┆ false       ┆ 5         │
│      ┆            ┆              ┆       ┆   ┆           ┆ 00:00:00  

In [11]:
# autos.head(3).to_pandas().to_json('tmp/pd.json', orient='records')

In [12]:
# pprint.pprint(json.loads(autos.head(2)
#                          .to_pandas().to_json(orient='records')))

In [13]:
# from_pd = pl.read_json('/tmp/pd.json')

# print(from_pd)

In [14]:
# print(from_pd
#       .with_columns(createdOn=pl.from_epoch('createdOn', time_unit='s'))
#      )

### `Custom JSON` Handling

In [15]:
# Using Default Pandas Orientation
pprint.pprint(json.loads(autos.head(2)
                         .to_pandas().to_json()))

{'VClass': {'0': 'Two Seaters', '1': 'Two Seaters'},
 'barrels08': {'0': 14.167142868, '1': 27.0463638306},
 'city08': {'0': 19, '1': 9},
 'createdOn': {'0': 1356998400, '1': 1356998400},
 'cylinders': {'0': 4, '1': 12},
 'displ': {'0': 2.0, '1': 4.9000000954},
 'drive': {'0': 'Rear-Wheel Drive', '1': 'Rear-Wheel Drive'},
 'fuelType': {'0': 'Regular', '1': 'Regular'},
 'highway08': {'0': 25, '1': 14},
 'is_automatic': {'0': 'false', '1': 'false'},
 'make': {'0': 'Alfa Romeo', '1': 'Ferrari'},
 'model': {'0': 'Spider Veloce 2000', '1': 'Testarossa'},
 'num_gears': {'0': 5, '1': 5},
 'trany': {'0': 'Manual 5-spd', '1': 'Manual 5-spd'},
 'year': {'0': 1985, '1': 1985}}


In [16]:
print(pl.read_json(io.StringIO(autos.head(2).to_pandas().to_json())))

shape: (1, 15)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ year      ┆ make      ┆ model     ┆ displ     ┆ … ┆ highway08 ┆ createdOn ┆ is_automa ┆ num_gear │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ tic       ┆ s        │
│ struct[2] ┆ struct[2] ┆ struct[2] ┆ struct[2] ┆   ┆ struct[2] ┆ struct[2] ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆ struct[2] ┆ struct[2 │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ ]        │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ {1985,198 ┆ {"Alfa    ┆ {"Spider  ┆ {2.0,4.9} ┆ … ┆ {25,14}   ┆ {13569984 ┆ {"false", ┆ {5,5}    │
│ 5}        ┆ Romeo","F ┆ Veloce    ┆           ┆   ┆           ┆ 00,135699 ┆ "false"}  ┆          │
│           ┆ errari"}  ┆ 2000","Te ┆           ┆   ┆           ┆ 8400}     

In [17]:
print(pl.DataFrame({
    'num': [1, 2, 3], 
    'listy': [[1, 2], [3, 4], [5, 6]],
    'structy': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}, {'a': 5, 'b': 6}],
})
     )

shape: (3, 3)
┌─────┬───────────┬───────────┐
│ num ┆ listy     ┆ structy   │
│ --- ┆ ---       ┆ ---       │
│ i64 ┆ list[i64] ┆ struct[2] │
╞═════╪═══════════╪═══════════╡
│ 1   ┆ [1, 2]    ┆ {1,2}     │
│ 2   ┆ [3, 4]    ┆ {3,4}     │
│ 3   ┆ [5, 6]    ┆ {5,6}     │
└─────┴───────────┴───────────┘


In [18]:
# Exploiding Data
print(pl.DataFrame({
    'num': [1, 2, 3], 
    'listy': [[1, 2], [3, 4], [5, 6]],
    'structy': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}, {'a': 5, 'b': 6}],
})
      .explode('listy')
     )

shape: (6, 3)
┌─────┬───────┬───────────┐
│ num ┆ listy ┆ structy   │
│ --- ┆ ---   ┆ ---       │
│ i64 ┆ i64   ┆ struct[2] │
╞═════╪═══════╪═══════════╡
│ 1   ┆ 1     ┆ {1,2}     │
│ 1   ┆ 2     ┆ {1,2}     │
│ 2   ┆ 3     ┆ {3,4}     │
│ 2   ┆ 4     ┆ {3,4}     │
│ 3   ┆ 5     ┆ {5,6}     │
│ 3   ┆ 6     ┆ {5,6}     │
└─────┴───────┴───────────┘


In [19]:
print(pl.DataFrame({
    'num': [1, 2, 3], 
    'listy': [[1, 2], [3, 4], [5, 6]],
    'structy': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}, {'a': 5, 'b': 6}],
})
      .explode('structy')
     )

InvalidOperationError: `explode` operation not supported for dtype `struct[2]`

In [None]:
# Converting struct to a list
print(pl.DataFrame({
    'num': [1, 2, 3], 
    'listy': [[1, 2], [3, 4], [5, 6]],
    'structy': [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}, {'a': 5, 'b': 6}]})
      .with_columns(structy=pl.col('structy').map_elements(
          lambda d: list(d.values())
      )
                   )
     )

In [20]:
print(pl.read_json(io.StringIO(autos.to_pandas().to_json()))
      .with_columns(pl.all().map_elements(lambda d: list(d.values())))
      .explode(pl.all())
      .with_columns(createdOn=pl.from_epoch('createdOn', time_unit='s'))
     )

shape: (48_231, 15)
┌──────┬───────────┬──────────────┬───────┬───┬───────────┬──────────────┬─────────────┬───────────┐
│ year ┆ make      ┆ model        ┆ displ ┆ … ┆ highway08 ┆ createdOn    ┆ is_automati ┆ num_gears │
│ ---  ┆ ---       ┆ ---          ┆ ---   ┆   ┆ ---       ┆ ---          ┆ c           ┆ ---       │
│ i64  ┆ str       ┆ str          ┆ f64   ┆   ┆ i64       ┆ datetime[μs] ┆ ---         ┆ i64       │
│      ┆           ┆              ┆       ┆   ┆           ┆              ┆ str         ┆           │
╞══════╪═══════════╪══════════════╪═══════╪═══╪═══════════╪══════════════╪═════════════╪═══════════╡
│ 1984 ┆ Chevrolet ┆ El Camino    ┆ 5.7   ┆ … ┆ 24        ┆ 2013-01-01   ┆ true        ┆ 3         │
│      ┆           ┆ Pickup 2WD   ┆       ┆   ┆           ┆ 00:00:00     ┆             ┆           │
│ 2016 ┆ Chevrolet ┆ Colorado 4WD ┆ 2.5   ┆ … ┆ 24        ┆ 2015-07-16   ┆ true        ┆ 6         │
│      ┆           ┆              ┆       ┆   ┆           ┆ 00:00:00   



### Munging `JSON`

In [21]:
pprint.pprint(json.loads(autos.head(2).to_pandas()
                         .to_json(orient='split')))

{'columns': ['year',
             'make',
             'model',
             'displ',
             'cylinders',
             'trany',
             'drive',
             'VClass',
             'fuelType',
             'barrels08',
             'city08',
             'highway08',
             'createdOn',
             'is_automatic',
             'num_gears'],
 'data': [[1985,
           'Alfa Romeo',
           'Spider Veloce 2000',
           2.0,
           4,
           'Manual 5-spd',
           'Rear-Wheel Drive',
           'Two Seaters',
           'Regular',
           14.167142868,
           19,
           25,
           1356998400,
           'false',
           5],
          [1985,
           'Ferrari',
           'Testarossa',
           4.9000000954,
           12,
           'Manual 5-spd',
           'Rear-Wheel Drive',
           'Two Seaters',
           'Regular',
           27.0463638306,
           9,
           14,
           1356998400,
           'false',
       

In [22]:
# Reading into Polars
print(pl.read_json(io.StringIO(autos.to_pandas()
                               .to_json(orient='split'))))

shape: (1, 3)
┌─────────────────────────────────┬─────────────────┬─────────────────────────────────┐
│ columns                         ┆ index           ┆ data                            │
│ ---                             ┆ ---             ┆ ---                             │
│ list[str]                       ┆ list[i64]       ┆ list[list[str]]                 │
╞═════════════════════════════════╪═════════════════╪═════════════════════════════════╡
│ ["year", "make", … "num_gears"… ┆ [0, 1, … 48230] ┆ [["1985", "Alfa Romeo", … "5"]… │
└─────────────────────────────────┴─────────────────┴─────────────────────────────────┘


In [23]:
def split_json_to_dict(json_str):
    """ Convert pandas "split" json to a sequence of dictionaries
    representing the rows of the dataframe.
    """
    data = json.loads(json_str)
    columns = data['columns']
    for row in data['data']:
        yield dict(zip(columns, row))

In [24]:
print(pl.DataFrame(
    split_json_to_dict(autos.to_pandas().to_json(orient='split'))))

shape: (48_231, 15)
┌──────┬────────────┬──────────────┬───────┬───┬───────────┬────────────┬──────────────┬───────────┐
│ year ┆ make       ┆ model        ┆ displ ┆ … ┆ highway08 ┆ createdOn  ┆ is_automatic ┆ num_gears │
│ ---  ┆ ---        ┆ ---          ┆ ---   ┆   ┆ ---       ┆ ---        ┆ ---          ┆ ---       │
│ i64  ┆ str        ┆ str          ┆ f64   ┆   ┆ i64       ┆ i64        ┆ str          ┆ i64       │
╞══════╪════════════╪══════════════╪═══════╪═══╪═══════════╪════════════╪══════════════╪═══════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider       ┆ 2.0   ┆ … ┆ 25        ┆ 1356998400 ┆ false        ┆ 5         │
│      ┆            ┆ Veloce 2000  ┆       ┆   ┆           ┆            ┆              ┆           │
│ 1985 ┆ Ferrari    ┆ Testarossa   ┆ 4.9   ┆ … ┆ 14        ┆ 1356998400 ┆ false        ┆ 5         │
│ 1985 ┆ Dodge      ┆ Charger      ┆ 2.2   ┆ … ┆ 33        ┆ 1356998400 ┆ false        ┆ 5         │
│ 1985 ┆ Dodge      ┆ B150/B250    ┆ 5.2   ┆ … ┆ 12        ┆ 1356998400

### Exporting to `Excel`

In [26]:
(autos
 .head(3)
 .write_excel('/Users/isisromero/desktop/polars/datasets/autos.xlsx')
)

<xlsxwriter.workbook.Workbook at 0x317435350>

In [27]:
a3 = (pl.read_excel('/Users/isisromero/desktop/polars/datasets/autos.xlsx'))

print(a3)

shape: (3, 15)
┌──────┬────────────┬──────────────┬───────┬───┬───────────┬────────────┬──────────────┬───────────┐
│ year ┆ make       ┆ model        ┆ displ ┆ … ┆ highway08 ┆ createdOn  ┆ is_automatic ┆ num_gears │
│ ---  ┆ ---        ┆ ---          ┆ ---   ┆   ┆ ---       ┆ ---        ┆ ---          ┆ ---       │
│ i64  ┆ str        ┆ str          ┆ f64   ┆   ┆ i64       ┆ date       ┆ str          ┆ i64       │
╞══════╪════════════╪══════════════╪═══════╪═══╪═══════════╪════════════╪══════════════╪═══════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider       ┆ 2.0   ┆ … ┆ 25        ┆ 2013-01-01 ┆ false        ┆ 5         │
│      ┆            ┆ Veloce 2000  ┆       ┆   ┆           ┆            ┆              ┆           │
│ 1985 ┆ Ferrari    ┆ Testarossa   ┆ 4.9   ┆ … ┆ 14        ┆ 2013-01-01 ┆ false        ┆ 5         │
│ 1985 ┆ Dodge      ┆ Charger      ┆ 2.2   ┆ … ┆ 33        ┆ 2013-01-01 ┆ false        ┆ 5         │
└──────┴────────────┴──────────────┴───────┴───┴───────────┴────────────┴───

### Exporting to `Parquet`

In [28]:
(autos
 .head(3)
 .write_parquet('/Users/isisromero/desktop/polars/datasets/a3.parquet')
)

In [29]:
pd.read_parquet('/Users/isisromero/desktop/polars/datasets/a3.parquet').to_parquet('/Users/isisromero/desktop/polars/datasets/a4.parquet')

In [31]:
a4 = pl.read_parquet('/Users/isisromero/desktop/polars/datasets/a4.parquet')

print(a4)

shape: (3, 15)
┌──────┬────────────┬──────────────┬───────┬───┬───────────┬─────────────┬─────────────┬───────────┐
│ year ┆ make       ┆ model        ┆ displ ┆ … ┆ highway08 ┆ createdOn   ┆ is_automati ┆ num_gears │
│ ---  ┆ ---        ┆ ---          ┆ ---   ┆   ┆ ---       ┆ ---         ┆ c           ┆ ---       │
│ i16  ┆ cat        ┆ cat          ┆ f32   ┆   ┆ u8        ┆ datetime[μs ┆ ---         ┆ u8        │
│      ┆            ┆              ┆       ┆   ┆           ┆ ]           ┆ str         ┆           │
╞══════╪════════════╪══════════════╪═══════╪═══╪═══════════╪═════════════╪═════════════╪═══════════╡
│ 1985 ┆ Alfa Romeo ┆ Spider       ┆ 2.0   ┆ … ┆ 25        ┆ 2013-01-01  ┆ false       ┆ 5         │
│      ┆            ┆ Veloce 2000  ┆       ┆   ┆           ┆ 00:00:00    ┆             ┆           │
│ 1985 ┆ Ferrari    ┆ Testarossa   ┆ 4.9   ┆ … ┆ 14        ┆ 2013-01-01  ┆ false       ┆ 5         │
│      ┆            ┆              ┆       ┆   ┆           ┆ 00:00:00    ┆  

In [32]:
autos.head(3).equals(a4)

True

In [34]:
(autos
 .head(3)
 .select(pl.all().shrink_dtype())
 .write_parquet('/Users/isisromero/desktop/polars/datasets/a3-shrink.parquet')
)

### Exporting to `SQL`

In [35]:
import sqlite3

In [37]:
with sqlite3.connect('/tmp/vehicles.db') as conn:
    uri = 'sqlite:////tmp/vehicles.db'
    autos.head(3).write_database(table_name='autos', connection=uri, 
                                 if_table_exists='replace')

In [38]:
from sqlalchemy import create_engine

In [39]:
uri = 'sqlite:////Users/isisromero/desktop/polars/datasets/vehicles.db'

In [41]:
# with create_engine(uri).connect() as conn:
#     query = 'SELECT * FROM autos'
#     a4 = pl.read_database(query=query, connection=conn)

In [42]:
# print(a4)

### Using `Arrow` to Convert DataFrames