## Download data

In [0]:
%python
def path_exists(path):
    """Report whether ``path`` can be listed with ``dbutils.fs.ls``.

    Returns True when the listing succeeds and False when it fails with an
    error whose message names one of the known file-not-found exception
    classes. Any other failure is re-raised unchanged.
    """
    not_found_markers = (
        "com.databricks.sql.io.CloudFileNotFoundException",
        "java.io.FileNotFoundException",
    )
    try:
        dbutils.fs.ls(path)
    except Exception as err:
        # The error type is only identifiable through its message text.
        text = str(err)
        if any(marker in text for marker in not_found_markers):
            return False
        raise
    return True


In [0]:
%python
def download_dataset(source, target):
    """Copy each entry listed directly under ``source`` into ``target``.

    Entries whose target path already exists (per ``path_exists``) are
    skipped, so calling this again only copies what is still missing.

    Args:
        source: URI of the directory to copy from; a trailing "/" is allowed.
        target: URI of the directory to copy into; a trailing "/" is allowed.
    """
    # Strip trailing slashes so the joins below never produce "//" in a path.
    source_root = source.rstrip("/")
    target_root = target.rstrip("/")

    for entry in dbutils.fs.ls(source):
        source_path = f"{source_root}/{entry.name}"
        target_path = f"{target_root}/{entry.name}"
        if not path_exists(target_path):
            print(f"Copying {entry.name} ...")
            # Third argument enables recursive copy.
            dbutils.fs.cp(source_path, target_path, True)

In [0]:
%python
# Where the bookstore dataset is read from, where it is copied to, and the
# catalog the notebook works in.
data_source_uri = "s3://dalhussein-courses/datasets/bookstore/v1/"
dataset_bookstore = "dbfs:/mnt/demo-datasets/bookstore"
data_catalog = "hive_metastore"

# Publish the dataset location as a Spark conf value; SQL cells reference it
# as ${dataset.bookstore}.
spark.conf.set("dataset.bookstore", dataset_bookstore)

# Point the S3A connector at the eu-west-3 endpoint and use anonymous credentials.
spark.conf.set("fs.s3a.endpoint", "s3.eu-west-3.amazonaws.com")
spark.conf.set(
    "fs.s3a.aws.credentials.provider",
    "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider",
)

In [0]:
%python
def get_index(dir):
    """Return the index of the next numbered file to load into ``dir``.

    Files are expected to be named ``<number>.<extension>`` (``load_file``
    writes names such as ``03.parquet``). The result is one more than the
    highest number found, or 1 when ``dir`` holds no numbered files.

    Args:
        dir: URI of the directory to inspect. The name shadows the builtin
            but is kept so existing callers keep working.

    Returns:
        int: highest numeric file stem in ``dir`` plus one.
    """
    numbers = []
    for entry in dbutils.fs.ls(dir):
        stem = entry.name.rsplit(".", maxsplit=1)[0]
        # Skip sub-directories and files whose stem is not a plain number.
        if stem.isdigit():
            numbers.append(int(stem))
    return max(numbers, default=0) + 1


In [0]:
%python
def set_current_catalog(catalog_name):
    """Make ``catalog_name`` the session's current catalog via ``USE CATALOG``."""
    statement = "USE CATALOG {}".format(catalog_name)
    spark.sql(statement)

In [0]:
%python
# Directory load_file copies parquet files from, and the directory it copies them to.
streaming_dir = "{}/orders-streaming".format(dataset_bookstore)
raw_dir = "{}/orders-raw".format(dataset_bookstore)

def load_file(current_index):
    """Copy one numbered parquet file from ``streaming_dir`` into ``raw_dir``.

    The file name is ``current_index`` zero-padded to two characters followed
    by ``.parquet`` (for example ``3`` -> ``03.parquet``).
    """
    file_name = str(current_index).zfill(2) + ".parquet"
    print("Loading " + file_name + " file to the bookstore dataset")
    source_path = f"{streaming_dir}/{file_name}"
    target_path = f"{raw_dir}/{file_name}"
    dbutils.fs.cp(source_path, target_path)
    

In [0]:
%python
def load_new_data(all=False, last_index=10):
    """Load the next numbered file, or every remaining one, into ``raw_dir``.

    Loading starts at the index returned by ``get_index(raw_dir)`` and never
    goes past ``last_index``. The single-file path and the load-everything
    path share that bound, so the final file can be loaded either way.

    Args:
        all: When truthy, load every file from the starting index through
            ``last_index``; otherwise load only the starting index. The name
            shadows the builtin but is kept for existing callers.
        last_index: Highest file index available to load.
    """
    next_index = get_index(raw_dir)
    if next_index > last_index:
        print("No more data to load\n")
        return

    final_index = last_index if all else next_index
    for file_index in range(next_index, final_index + 1):
        load_file(file_index)
    


In [0]:
%python
# Copy the bookstore dataset into DBFS, then switch to the configured catalog.
download_dataset(source=data_source_uri, target=dataset_bookstore)
set_current_catalog(catalog_name=data_catalog)

In [0]:
-- Make hive_metastore the current catalog for the SQL cells in this notebook.
USE CATALOG hive_metastore;

In [0]:
%python
# List the dataset root through dbutils; a %fs magic is not valid on a
# non-first line of a %python cell.
display(dbutils.fs.ls(dataset_bookstore))

## Querying JSON

In [0]:
%python
# List the entries of the customers-json directory before querying them.
files = dbutils.fs.ls(f"{dataset_bookstore}/customers-json")
display(files)

In [0]:
-- Query one JSON export file directly by its path.
SELECT *
FROM json.`dbfs:/mnt/demo-datasets/bookstore/customers-json/export_001.json`


In [0]:
-- Query every file matching export_*.json using a wildcard in the path.
SELECT *
FROM json.`dbfs:/mnt/demo-datasets/bookstore/customers-json/export_*.json`

In [0]:
-- Query the whole customers-json directory at once.
SELECT *
FROM json.`dbfs:/mnt/demo-datasets/bookstore/customers-json/`

In [0]:
-- Count the rows read from the whole customers-json directory.
SELECT COUNT(*)
FROM json.`dbfs:/mnt/demo-datasets/bookstore/customers-json/`

In [0]:
-- Return every column plus _metadata.file_path, exposed as source_file.
-- The directory comes from the dataset.bookstore conf value.
SELECT
  *,
  _metadata.file_path AS source_file
FROM json.`${dataset.bookstore}/customers-json`

## Querying Text Format

In [0]:
-- Read the customers-json files through the text format instead of json.
SELECT *
FROM text.`${dataset.bookstore}/customers-json`

## Querying BinaryFile Format

In [0]:
-- Read the customers-json files through the binaryFile format.
SELECT *
FROM binaryFile.`${dataset.bookstore}/customers-json`

## Querying CSV

In [0]:
%fs ls 'dbfs:/mnt/demo-datasets/bookstore/books-csv'

In [0]:
-- Query the books-csv directory directly, with no CSV options supplied.
SELECT *
FROM csv.`dbfs:/mnt/demo-datasets/bookstore/books-csv`

In [0]:
-- Declare a table over the books-csv directory, with an explicit schema,
-- a header row and ";" as the delimiter.
-- IF NOT EXISTS keeps the cell re-runnable: a second run no longer fails
-- because books_csv already exists.
CREATE TABLE IF NOT EXISTS books_csv
  (book_id STRING, title STRING, author STRING, category STRING, price DOUBLE)
USING CSV
OPTIONS (
  header = "true",
  delimiter = ";"
)
LOCATION "dbfs:/mnt/demo-datasets/bookstore/books-csv"

In [0]:
-- Show the rows of the CSV-backed table.
SELECT *
FROM books_csv

## Limitations of Non-Delta Tables

In [0]:
-- Show the extended metadata of the CSV-backed table.
DESCRIBE EXTENDED books_csv

In [0]:
%python
# List the books-csv directory before new files are appended to it.
files = dbutils.fs.ls("dbfs:/mnt/demo-datasets/bookstore/books-csv")
display(files)

In [0]:
%python
# Append the table's current rows as new CSV files into the directory that
# backs the table, using the same header and ";" delimiter options the table
# was declared with. Each run adds more files to that directory.
(spark.read.table("books_csv")
            .write
            .mode("append")
            .format("csv")
            .option("header","true")
            .option("delimiter", ";")
            .save("dbfs:/mnt/demo-datasets/bookstore/books-csv"))


In [0]:
%python
# List the books-csv directory again to see the files written by the append.
files = dbutils.fs.ls("dbfs:/mnt/demo-datasets/bookstore/books-csv")
display(files)

In [0]:
-- Count the table's rows after the append, before the table is refreshed.
SELECT COUNT(*) FROM books_csv

In [0]:
-- Refresh the table; presumably so the next count reflects the appended files.
REFRESH TABLE books_csv

In [0]:
-- Count the table's rows again, now that it has been refreshed.
SELECT COUNT(*) FROM books_csv

## CTAS Statements

In [0]:
-- Build the customers table from the JSON directory with a CTAS statement.
-- CREATE OR REPLACE keeps the cell re-runnable: a second run rebuilds the
-- table instead of failing because customers already exists.
CREATE OR REPLACE TABLE customers AS
SELECT * FROM json.`dbfs:/mnt/demo-datasets/bookstore/customers-json`;

-- Show the extended metadata of the new table.
DESCRIBE EXTENDED customers;

In [0]:
-- Build a table from the CSV directory with a CTAS statement and no CSV
-- options (no header or delimiter is specified here).
-- CREATE OR REPLACE keeps the cell re-runnable.
CREATE OR REPLACE TABLE books_unparsed AS
SELECT * FROM csv.`dbfs:/mnt/demo-datasets/bookstore/books-csv`;

-- Show what the table contains when the files are read without those options.
SELECT * FROM books_unparsed;

In [0]:
-- Declare a temporary view over the export_*.csv files with an explicit
-- schema, a header row and ";" as the delimiter.
-- OR REPLACE keeps the cell re-runnable within the same session.
CREATE OR REPLACE TEMP VIEW books_tmp_vw
   (book_id STRING, title STRING, author STRING, category STRING, price DOUBLE)
USING CSV
OPTIONS (
  path = "dbfs:/mnt/demo-datasets/bookstore/books-csv/export_*.csv",
  header = "true",
  delimiter = ";"
);

-- Build the books table from the view; OR REPLACE lets a re-run rebuild it
-- instead of failing because books already exists.
CREATE OR REPLACE TABLE books AS
  SELECT * FROM books_tmp_vw;
  
SELECT * FROM books

In [0]:
-- Show the extended metadata of the books table created by the CTAS above.
DESCRIBE EXTENDED books

In [0]:
-- Show the extended metadata of the temporary view over the CSV files.
DESCRIBE EXTENDED books_tmp_vw