# Chapter 6: Reading and Writing Data

In [None]:
import polars as pl
pl.__version__  # The book is built with Polars version 1.20.0

## Format Overview

## Reading CSV Files

In [None]:
# Shell escape: print the raw CSV so its contents can be inspected before parsing.
! cat data/penguins.csv

In [None]:
# Read the CSV with read_csv's default options; the bare name on the last
# line makes the notebook display the resulting DataFrame.
penguins = pl.read_csv("data/penguins.csv")
penguins

## Parsing Missing Values Correctly

In [None]:
# Re-read the file, this time telling the parser to treat the string "NA"
# as a missing value.
penguins = pl.read_csv("data/penguins.csv", null_values="NA")
penguins

In [None]:
# Count the missing values per column, then transpose the result so that
# each original column becomes a row with its count in "null_count".
(
    penguins.null_count()
    .transpose(include_header=True, column_names=["null_count"])
)

## Reading Files with Encodings Other Than UTF-8

In [None]:
# This raises a ComputeError:
# pl.read_csv("data/directors.csv")

In [None]:
# Explicitly decode the file as EUC-CN rather than relying on the default encoding.
pl.read_csv("data/directors.csv", encoding="EUC-CN")

In [None]:
import chardet


def detect_encoding(filename: str) -> str:
    """Guess the character encoding of the file at *filename*.

    The whole file is read as raw bytes and passed to ``chardet.detect``;
    the value stored under the ``"encoding"`` key of its result is returned.
    """
    with open(filename, "rb") as stream:
        payload = stream.read()

    # Detection only needs the bytes, so the file is closed before it runs.
    return chardet.detect(payload)["encoding"]


# Ask chardet for its best guess at the encoding of the directors file.
detect_encoding("data/directors.csv")

In [None]:
# Read the same file again, now decoding it as EUC-JP.
pl.read_csv("data/directors.csv", encoding="EUC-JP")

## Reading Excel Spreadsheets

In [None]:
# Load the Excel workbook into a DataFrame using read_excel's default options.
songs = pl.read_excel("data/top2000-2023.xlsx")
songs

## Working with Multiple Files

In [None]:
# The "?" in the glob matches exactly one character, so this reads every
# file named 201<x>.csv in data/stock/nvda into a single DataFrame.
pl.read_csv("data/stock/nvda/201?.csv")

In [None]:
# "**" matches the subdirectories of data/stock, so the CSV files found
# beneath them are all read into one DataFrame.
all_stocks = pl.read_csv("data/stock/**/*.csv")
all_stocks

In [None]:
import calendar

# Keep only the leap years from 1999 through 2023 (the range end, 2024, is
# exclusive) and build one path under data/stock/asml per remaining year.
filenames = [
    f"data/stock/asml/{leap_year}.csv"
    for leap_year in filter(calendar.isleap, range(1999, 2024))
]

filenames

In [None]:
# Read each listed file on its own, then stack the frames into one.
pl.concat([pl.read_csv(path) for path in filenames])

## Reading Parquet

In [None]:
%%time
# The cell magic above must stay on the first line; it reports how long the
# cell takes to run. The "*" wildcard makes read_parquet load every file in
# data/taxi whose name matches yellow_tripdata_*.parquet.
trips = pl.read_parquet("data/taxi/yellow_tripdata_*.parquet")
trips

## Reading JSON and NDJSON

### JSON

In [None]:
# Shell escape: print the raw JSON document to see its nesting.
! cat data/pokedex.json

In [None]:
# Parse the JSON document into a DataFrame; nested values are kept as
# nested columns rather than being flattened here.
pokedex = pl.read_json("data/pokedex.json")
pokedex

In [None]:
# Flatten the nested "pokemon" column (presumably a list of structs; verify
# against the data): explode gives every list element its own row, unnest
# turns the struct fields into columns, and select keeps five of them.
pokedex.explode("pokemon").unnest("pokemon").select(
    "id", "name", "type", "height", "weight"
)

### NDJSON

In [None]:
# Shell escape: print the raw NDJSON file.
! cat data/wikimedia.ndjson

In [None]:
from json import loads
from pprint import pprint

# Pretty-print only the first line of the file to inspect one record's
# structure. JSON text is UTF-8, so the encoding is given explicitly instead
# of depending on the platform's locale default, which can fail or garble
# non-ASCII characters on systems that do not default to UTF-8.
with open("data/wikimedia.ndjson", encoding="utf-8") as f:
    pprint(loads(f.readline()))

In [None]:
# Parse the newline-delimited JSON file into a DataFrame.
wikimedia = pl.read_ndjson("data/wikimedia.ndjson")
wikimedia

In [None]:
# "id" is renamed before unnesting, presumably because the "meta" struct
# carries its own "id" field that would otherwise clash; verify against the
# data. After unnesting, four columns are kept.
wikimedia.rename({"id": "edit_id"}).unnest("meta").select(
    "timestamp", "title", "user", "comment"
)

## Other File Formats

In [None]:
import pandas as pd

url = "https://en.wikipedia.org/wiki/List_of_Latin_abbreviations"

# pandas fetches the page, sending a browser-like User-Agent header, and
# parses its HTML tables; the first table is converted to a Polars DataFrame.
pl.from_pandas(
    pd.read_html(
        url,
        storage_options={"User-Agent": "Mozilla/5.0"},
    )[0]
)

## Querying Databases

In [None]:
# Let the database do the work: the query joins film, film_category and
# category, divides the film length by 60.0, and returns at most 10 rows.
pl.read_database_uri(
    uri="sqlite:::data/sakila.db",
    query="""
    SELECT
        f.film_id,
        f.title,
        c.name AS category,
        f.rating,
        f.length / 60.0 AS length
    FROM
        film AS f,
        film_category AS fc,
        category AS c
    WHERE
        fc.film_id = f.film_id
        AND fc.category_id = c.category_id
    LIMIT 10
    """,
)

In [None]:
# Polars counterpart of an SQL join: load the three tables in full and
# combine them in memory instead of inside the database.
db = "sqlite:::data/sakila.db"
films = pl.read_database_uri(query="SELECT * FROM film", uri=db)
film_categories = pl.read_database_uri(
    query="SELECT * FROM film_category", uri=db
)
categories = pl.read_database_uri(query="SELECT * FROM category", uri=db)

# The suffixes rename right-hand columns whose names collide with columns
# already present in the left frame.
films.join(film_categories, on="film_id", suffix="_fc").join(
    categories, on="category_id", suffix="_c"
).select(
    "film_id",
    "title",
    pl.col("name").alias("category"),
    "rating",
    pl.col("length") / 60,
).limit(10)

## Writing Data

### CSV Format

In [None]:
# Write the combined stock data to data/all_stocks.csv.
all_stocks.write_csv("data/all_stocks.csv")

### Excel Format

In [None]:
# Write the same DataFrame as an Excel workbook.
all_stocks.write_excel("data/all_stocks.xlsx")

### Parquet Format

In [None]:
# Write the same DataFrame as a Parquet file.
all_stocks.write_parquet("data/all_stocks.parquet")

### Other Considerations

## Takeaways