In [0]:
# Databricks notebook source
def path_exists(path):
  try:
    dbutils.fs.ls(path)
    return True
  except Exception as e:
    if 'java.io.FileNotFoundException' in str(e):
      return False
    else:
      raise

# COMMAND ----------

def download_dataset(source, target):
    files = dbutils.fs.ls(source)

    for f in files:
        source_path = f"{source}/{f.name}"
        target_path = f"{target}/{f.name}"
        if not path_exists(target_path):
            print(f"Copying {f.name} ...")
            dbutils.fs.cp(source_path, target_path, True)

# COMMAND ----------

data_source_uri = "s3://dalhussein-courses/datasets/bookstore/v1/"
dataset_bookstore = 'dbfs:/mnt/demo-datasets/bookstore'
data_catalog = 'hive_metastore'
spark.conf.set(f"dataset.bookstore", dataset_bookstore)

# COMMAND ----------

def get_index(dir):
    files = dbutils.fs.ls(dir)
    index = 0
    if files:
        file = max(files).name
        index = int(file.rsplit('.', maxsplit=1)[0])
    return index+1

# COMMAND ----------

def set_current_catalog(catalog_name):
    spark.sql(f"USE CATALOG {catalog_name}")

# COMMAND ----------

# Structured Streaming
streaming_dir = f"{dataset_bookstore}/orders-streaming"
raw_dir = f"{dataset_bookstore}/orders-raw"

def load_file(current_index):
    latest_file = f"{str(current_index).zfill(2)}.parquet"
    print(f"Loading {latest_file} file to the bookstore dataset")
    dbutils.fs.cp(f"{streaming_dir}/{latest_file}", f"{raw_dir}/{latest_file}")

    
def load_new_data(all=False):
    index = get_index(raw_dir)
    if index >= 10:
        print("No more data to load\n")

    elif all == True:
        while index <= 10:
            load_file(index)
            index += 1
    else:
        load_file(index)
        index += 1

# COMMAND ----------

# DLT
streaming_orders_dir = f"{dataset_bookstore}/orders-json-streaming"
streaming_books_dir = f"{dataset_bookstore}/books-streaming"

raw_orders_dir = f"{dataset_bookstore}/orders-json-raw"
raw_books_dir = f"{dataset_bookstore}/books-cdc"

def load_json_file(current_index):
    latest_file = f"{str(current_index).zfill(2)}.json"
    print(f"Loading {latest_file} orders file to the bookstore dataset")
    dbutils.fs.cp(f"{streaming_orders_dir}/{latest_file}", f"{raw_orders_dir}/{latest_file}")
    print(f"Loading {latest_file} books file to the bookstore dataset")
    dbutils.fs.cp(f"{streaming_books_dir}/{latest_file}", f"{raw_books_dir}/{latest_file}")

    
def load_new_json_data(all=False):
    index = get_index(raw_orders_dir)
    if index >= 10:
        print("No more data to load\n")

    elif all == True:
        while index <= 10:
            load_json_file(index)
            index += 1
    else:
        load_json_file(index)
        index += 1

# COMMAND ----------

download_dataset(data_source_uri, dataset_bookstore)
set_current_catalog(data_catalog)


In [0]:
files = dbutils.fs.ls('mnt/demo-datasets/bookstore') # finding all the files in the directory 

In [0]:
display(files)

path,name,size,modificationTime
dbfs:/mnt/demo-datasets/bookstore/books-cdc/,books-cdc/,0,0
dbfs:/mnt/demo-datasets/bookstore/books-csv/,books-csv/,0,0
dbfs:/mnt/demo-datasets/bookstore/books-csv-new/,books-csv-new/,0,0
dbfs:/mnt/demo-datasets/bookstore/books-streaming/,books-streaming/,0,0
dbfs:/mnt/demo-datasets/bookstore/customers-json/,customers-json/,0,0
dbfs:/mnt/demo-datasets/bookstore/customers-json-new/,customers-json-new/,0,0
dbfs:/mnt/demo-datasets/bookstore/orders/,orders/,0,0
dbfs:/mnt/demo-datasets/bookstore/orders-json-raw/,orders-json-raw/,0,0
dbfs:/mnt/demo-datasets/bookstore/orders-json-streaming/,orders-json-streaming/,0,0
dbfs:/mnt/demo-datasets/bookstore/orders-new/,orders-new/,0,0


In [0]:
customer_files = dbutils.fs.ls(f"{dataset_bookstore}/customers-json")
display(customer_files)

path,name,size,modificationTime
dbfs:/mnt/demo-datasets/bookstore/customers-json/export_001.json,export_001.json,79378,1735708628000
dbfs:/mnt/demo-datasets/bookstore/customers-json/export_002.json,export_002.json,80001,1735708629000
dbfs:/mnt/demo-datasets/bookstore/customers-json/export_003.json,export_003.json,79781,1735708629000
dbfs:/mnt/demo-datasets/bookstore/customers-json/export_004.json,export_004.json,79976,1735708630000
dbfs:/mnt/demo-datasets/bookstore/customers-json/export_005.json,export_005.json,79727,1735708631000
dbfs:/mnt/demo-datasets/bookstore/customers-json/export_006.json,export_006.json,53243,1735708631000


In [0]:
%sql
SELECT * FROM json.`${dataset.bookstore}/customers-json/export_001.json` -- querying a single file directly

customer_id,email,profile,updated
C00001,dabby2y@japanpost.jp,"{""first_name"":""Dniren"",""last_name"":""Abby"",""gender"":""Female"",""address"":{""street"":""768 Mesta Terrace"",""city"":""Annecy"",""country"":""France""}}",2021-12-14T23:15:43.375Z
C00002,eabbysc1@github.com,"{""first_name"":""Etti"",""last_name"":""Abbys"",""gender"":""Female"",""address"":{""street"":""1748 Vidon Plaza"",""city"":""Varge Mondar"",""country"":""Portugal""}}",2021-12-14T23:15:43.375Z
C00003,rabelovd1@wikispaces.com,"{""first_name"":""Ronnie"",""last_name"":""Abelov"",""gender"":""Male"",""address"":{""street"":""363 Randy Park"",""city"":""San Celestio"",""country"":""Philippines""}}",2021-12-14T23:15:43.375Z
C00004,rabels9g@behance.net,"{""first_name"":""Ray"",""last_name"":""Abels"",""gender"":""Female"",""address"":{""street"":""613 Lyons Way"",""city"":""Oudtshoorn"",""country"":""South Africa""}}",2021-12-14T23:15:43.375Z
C00005,sabendrothin@cargocollective.com,"{""first_name"":""Shanon"",""last_name"":""Abendroth"",""gender"":""Female"",""address"":{""street"":""30292 Manufacturers Junction"",""city"":""Ani-e"",""country"":""Philippines""}}",2021-12-14T23:15:43.375Z
C00006,,"{""first_name"":""Norman"",""last_name"":""Abernethy"",""gender"":""Male"",""address"":{""street"":""9292 Oxford Center"",""city"":""Gibara"",""country"":""Cuba""}}",2021-12-14T23:15:43.375Z
C00007,sabrahmson3h@blinklist.com,"{""first_name"":""Skell"",""last_name"":""Abrahmson"",""gender"":""Male"",""address"":{""street"":""90941 Hallows Park"",""city"":""Huarong Chengguanzhen"",""country"":""China""}}",2021-12-14T23:15:43.375Z
C00008,dacheson2h@mapy.cz,"{""first_name"":""Darsey"",""last_name"":""Acheson"",""gender"":""Non-binary"",""address"":{""street"":""29579 Grim Plaza"",""city"":""Dārayyā"",""country"":""Syria""}}",2021-12-14T23:15:43.375Z
C00009,fackwoodji@gravatar.com,"{""first_name"":""Fredrick"",""last_name"":""Ackwood"",""gender"":""Male"",""address"":{""street"":""67 Dunning Plaza"",""city"":""Santo Domingo"",""country"":""Cuba""}}",2021-12-14T23:15:43.375Z
C00010,,"{""first_name"":""Doralynne"",""last_name"":""Adamkiewicz"",""gender"":""Female"",""address"":{""street"":""84126 Glendale Center"",""city"":""Ugep"",""country"":""Nigeria""}}",2021-12-14T23:15:43.375Z


In [0]:
%sql
SELECT * FROM json.`${dataset.bookstore}/customers-json/export_*.json` -- querying multiple files using * 
-- this means select information from files where name start with export_ and * represents all the files

customer_id,email,profile,updated
C00301,thomas.lane@gmail.com,"{""first_name"":""Thomas"",""last_name"":""Lane"",""gender"":""Male"",""address"":{""street"":""06 Boulevard Victor Hugo"",""city"":""Paris"",""country"":""France""}}",2021-12-14T23:15:43.375Z
C00302,ocolegatele@blogger.com,"{""first_name"":""Odilia"",""last_name"":""Colegate"",""gender"":""Female"",""address"":{""street"":""07 Sommers Parkway"",""city"":""Lyon"",""country"":""France""}}",2021-12-14T23:15:43.375Z
C00303,acolledged2@nbcnews.com,"{""first_name"":""Andros"",""last_name"":""Colledge"",""gender"":""Male"",""address"":{""street"":""342 Katie Center"",""city"":""Gort"",""country"":""Ireland""}}",2021-12-14T23:15:43.375Z
C00304,,"{""first_name"":""Iver"",""last_name"":""Collet"",""gender"":""Male"",""address"":{""street"":""12126 Union Point"",""city"":""Iguape"",""country"":""Brazil""}}",2021-12-14T23:15:43.375Z
C00305,pcollier5r@cmu.edu,"{""first_name"":""Page"",""last_name"":""Collier"",""gender"":""Male"",""address"":{""street"":""3 Farragut Lane"",""city"":""Berlin"",""country"":""Germany""}}",2021-12-14T23:15:43.375Z
C00306,,"{""first_name"":""Tally"",""last_name"":""Collins"",""gender"":""Male"",""address"":{""street"":""4 Hovde Park"",""city"":""Cairo"",""country"":""Egypt""}}",2021-12-14T23:15:43.375Z
C00307,lcollocottcm@t-online.de,"{""first_name"":""Leupold"",""last_name"":""Collocott"",""gender"":""Male"",""address"":{""street"":""917 Stephen Circle"",""city"":""Dzerzhinskiy"",""country"":""Russia""}}",2021-12-14T23:15:43.375Z
C00308,icolloughfa@prweb.com,"{""first_name"":""Inesita"",""last_name"":""Collough"",""gender"":""Female"",""address"":{""street"":""7910 Delladonna Street"",""city"":""Osoyoos"",""country"":""Canada""}}",2021-12-14T23:15:43.375Z
C00309,jcollymore4n@pcworld.com,"{""first_name"":""Joelle"",""last_name"":""Collymore"",""gender"":""Female"",""address"":{""street"":""19 Dayton Court"",""city"":""Yidu"",""country"":""China""}}",2021-12-14T23:15:43.375Z
C00310,gcolnetef@japanpost.jp,"{""first_name"":""Goldi"",""last_name"":""Colnet"",""gender"":""Female"",""address"":{""street"":""710 Knutson Place"",""city"":""Suso"",""country"":""Philippines""}}",2021-12-14T23:15:43.375Z


In [0]:
print(dataset_bookstore)

dbfs:/mnt/demo-datasets/bookstore


In [0]:
%sql
SELECT * FROM json.`${dataset.bookstore}/customers-json` -- directly querying the tables from the given directory link

customer_id,email,profile,updated
C00301,thomas.lane@gmail.com,"{""first_name"":""Thomas"",""last_name"":""Lane"",""gender"":""Male"",""address"":{""street"":""06 Boulevard Victor Hugo"",""city"":""Paris"",""country"":""France""}}",2021-12-14T23:15:43.375Z
C00302,ocolegatele@blogger.com,"{""first_name"":""Odilia"",""last_name"":""Colegate"",""gender"":""Female"",""address"":{""street"":""07 Sommers Parkway"",""city"":""Lyon"",""country"":""France""}}",2021-12-14T23:15:43.375Z
C00303,acolledged2@nbcnews.com,"{""first_name"":""Andros"",""last_name"":""Colledge"",""gender"":""Male"",""address"":{""street"":""342 Katie Center"",""city"":""Gort"",""country"":""Ireland""}}",2021-12-14T23:15:43.375Z
C00304,,"{""first_name"":""Iver"",""last_name"":""Collet"",""gender"":""Male"",""address"":{""street"":""12126 Union Point"",""city"":""Iguape"",""country"":""Brazil""}}",2021-12-14T23:15:43.375Z
C00305,pcollier5r@cmu.edu,"{""first_name"":""Page"",""last_name"":""Collier"",""gender"":""Male"",""address"":{""street"":""3 Farragut Lane"",""city"":""Berlin"",""country"":""Germany""}}",2021-12-14T23:15:43.375Z
C00306,,"{""first_name"":""Tally"",""last_name"":""Collins"",""gender"":""Male"",""address"":{""street"":""4 Hovde Park"",""city"":""Cairo"",""country"":""Egypt""}}",2021-12-14T23:15:43.375Z
C00307,lcollocottcm@t-online.de,"{""first_name"":""Leupold"",""last_name"":""Collocott"",""gender"":""Male"",""address"":{""street"":""917 Stephen Circle"",""city"":""Dzerzhinskiy"",""country"":""Russia""}}",2021-12-14T23:15:43.375Z
C00308,icolloughfa@prweb.com,"{""first_name"":""Inesita"",""last_name"":""Collough"",""gender"":""Female"",""address"":{""street"":""7910 Delladonna Street"",""city"":""Osoyoos"",""country"":""Canada""}}",2021-12-14T23:15:43.375Z
C00309,jcollymore4n@pcworld.com,"{""first_name"":""Joelle"",""last_name"":""Collymore"",""gender"":""Female"",""address"":{""street"":""19 Dayton Court"",""city"":""Yidu"",""country"":""China""}}",2021-12-14T23:15:43.375Z
C00310,gcolnetef@japanpost.jp,"{""first_name"":""Goldi"",""last_name"":""Colnet"",""gender"":""Female"",""address"":{""street"":""710 Knutson Place"",""city"":""Suso"",""country"":""Philippines""}}",2021-12-14T23:15:43.375Z


In [0]:
%sql
SELECT count(*) FROM json.`${dataset.bookstore}/customers-json` -- count of customers 

count(1)
1700


In [0]:
%sql
SELECT *,
    input_file_name() source_file -- using this command an additional column will be added with name as 'source_file' and it will contain 
    -- the path of the file's source file
  FROM json.`${dataset.bookstore}/customers-json`;



customer_id,email,profile,updated,source_file
C00301,thomas.lane@gmail.com,"{""first_name"":""Thomas"",""last_name"":""Lane"",""gender"":""Male"",""address"":{""street"":""06 Boulevard Victor Hugo"",""city"":""Paris"",""country"":""France""}}",2021-12-14T23:15:43.375Z,dbfs:/mnt/demo-datasets/bookstore/customers-json/export_002.json
C00302,ocolegatele@blogger.com,"{""first_name"":""Odilia"",""last_name"":""Colegate"",""gender"":""Female"",""address"":{""street"":""07 Sommers Parkway"",""city"":""Lyon"",""country"":""France""}}",2021-12-14T23:15:43.375Z,dbfs:/mnt/demo-datasets/bookstore/customers-json/export_002.json
C00303,acolledged2@nbcnews.com,"{""first_name"":""Andros"",""last_name"":""Colledge"",""gender"":""Male"",""address"":{""street"":""342 Katie Center"",""city"":""Gort"",""country"":""Ireland""}}",2021-12-14T23:15:43.375Z,dbfs:/mnt/demo-datasets/bookstore/customers-json/export_002.json
C00304,,"{""first_name"":""Iver"",""last_name"":""Collet"",""gender"":""Male"",""address"":{""street"":""12126 Union Point"",""city"":""Iguape"",""country"":""Brazil""}}",2021-12-14T23:15:43.375Z,dbfs:/mnt/demo-datasets/bookstore/customers-json/export_002.json
C00305,pcollier5r@cmu.edu,"{""first_name"":""Page"",""last_name"":""Collier"",""gender"":""Male"",""address"":{""street"":""3 Farragut Lane"",""city"":""Berlin"",""country"":""Germany""}}",2021-12-14T23:15:43.375Z,dbfs:/mnt/demo-datasets/bookstore/customers-json/export_002.json
C00306,,"{""first_name"":""Tally"",""last_name"":""Collins"",""gender"":""Male"",""address"":{""street"":""4 Hovde Park"",""city"":""Cairo"",""country"":""Egypt""}}",2021-12-14T23:15:43.375Z,dbfs:/mnt/demo-datasets/bookstore/customers-json/export_002.json
C00307,lcollocottcm@t-online.de,"{""first_name"":""Leupold"",""last_name"":""Collocott"",""gender"":""Male"",""address"":{""street"":""917 Stephen Circle"",""city"":""Dzerzhinskiy"",""country"":""Russia""}}",2021-12-14T23:15:43.375Z,dbfs:/mnt/demo-datasets/bookstore/customers-json/export_002.json
C00308,icolloughfa@prweb.com,"{""first_name"":""Inesita"",""last_name"":""Collough"",""gender"":""Female"",""address"":{""street"":""7910 Delladonna Street"",""city"":""Osoyoos"",""country"":""Canada""}}",2021-12-14T23:15:43.375Z,dbfs:/mnt/demo-datasets/bookstore/customers-json/export_002.json
C00309,jcollymore4n@pcworld.com,"{""first_name"":""Joelle"",""last_name"":""Collymore"",""gender"":""Female"",""address"":{""street"":""19 Dayton Court"",""city"":""Yidu"",""country"":""China""}}",2021-12-14T23:15:43.375Z,dbfs:/mnt/demo-datasets/bookstore/customers-json/export_002.json
C00310,gcolnetef@japanpost.jp,"{""first_name"":""Goldi"",""last_name"":""Colnet"",""gender"":""Female"",""address"":{""street"":""710 Knutson Place"",""city"":""Suso"",""country"":""Philippines""}}",2021-12-14T23:15:43.375Z,dbfs:/mnt/demo-datasets/bookstore/customers-json/export_002.json


In [0]:
%sql
SELECT * FROM text.`${dataset.bookstore}/customers-json`; -- selecting data in from of text from the file

value
"{""customer_id"":""C00301"",""email"":""thomas.lane@gmail.com"",""profile"":""{\""first_name\"":\""Thomas\"",\""last_name\"":\""Lane\"",\""gender\"":\""Male\"",\""address\"":{\""street\"":\""06 Boulevard Victor Hugo\"",\""city\"":\""Paris\"",\""country\"":\""France\""}}"",""updated"":""2021-12-14T23:15:43.375Z""}"
"{""customer_id"":""C00302"",""email"":""ocolegatele@blogger.com"",""profile"":""{\""first_name\"":\""Odilia\"",\""last_name\"":\""Colegate\"",\""gender\"":\""Female\"",\""address\"":{\""street\"":\""07 Sommers Parkway\"",\""city\"":\""Lyon\"",\""country\"":\""France\""}}"",""updated"":""2021-12-14T23:15:43.375Z""}"
"{""customer_id"":""C00303"",""email"":""acolledged2@nbcnews.com"",""profile"":""{\""first_name\"":\""Andros\"",\""last_name\"":\""Colledge\"",\""gender\"":\""Male\"",\""address\"":{\""street\"":\""342 Katie Center\"",\""city\"":\""Gort\"",\""country\"":\""Ireland\""}}"",""updated"":""2021-12-14T23:15:43.375Z""}"
"{""customer_id"":""C00304"",""profile"":""{\""first_name\"":\""Iver\"",\""last_name\"":\""Collet\"",\""gender\"":\""Male\"",\""address\"":{\""street\"":\""12126 Union Point\"",\""city\"":\""Iguape\"",\""country\"":\""Brazil\""}}"",""updated"":""2021-12-14T23:15:43.375Z""}"
"{""customer_id"":""C00305"",""email"":""pcollier5r@cmu.edu"",""profile"":""{\""first_name\"":\""Page\"",\""last_name\"":\""Collier\"",\""gender\"":\""Male\"",\""address\"":{\""street\"":\""3 Farragut Lane\"",\""city\"":\""Berlin\"",\""country\"":\""Germany\""}}"",""updated"":""2021-12-14T23:15:43.375Z""}"
"{""customer_id"":""C00306"",""profile"":""{\""first_name\"":\""Tally\"",\""last_name\"":\""Collins\"",\""gender\"":\""Male\"",\""address\"":{\""street\"":\""4 Hovde Park\"",\""city\"":\""Cairo\"",\""country\"":\""Egypt\""}}"",""updated"":""2021-12-14T23:15:43.375Z""}"
"{""customer_id"":""C00307"",""email"":""lcollocottcm@t-online.de"",""profile"":""{\""first_name\"":\""Leupold\"",\""last_name\"":\""Collocott\"",\""gender\"":\""Male\"",\""address\"":{\""street\"":\""917 Stephen Circle\"",\""city\"":\""Dzerzhinskiy\"",\""country\"":\""Russia\""}}"",""updated"":""2021-12-14T23:15:43.375Z""}"
"{""customer_id"":""C00308"",""email"":""icolloughfa@prweb.com"",""profile"":""{\""first_name\"":\""Inesita\"",\""last_name\"":\""Collough\"",\""gender\"":\""Female\"",\""address\"":{\""street\"":\""7910 Delladonna Street\"",\""city\"":\""Osoyoos\"",\""country\"":\""Canada\""}}"",""updated"":""2021-12-14T23:15:43.375Z""}"
"{""customer_id"":""C00309"",""email"":""jcollymore4n@pcworld.com"",""profile"":""{\""first_name\"":\""Joelle\"",\""last_name\"":\""Collymore\"",\""gender\"":\""Female\"",\""address\"":{\""street\"":\""19 Dayton Court\"",\""city\"":\""Yidu\"",\""country\"":\""China\""}}"",""updated"":""2021-12-14T23:15:43.375Z""}"
"{""customer_id"":""C00310"",""email"":""gcolnetef@japanpost.jp"",""profile"":""{\""first_name\"":\""Goldi\"",\""last_name\"":\""Colnet\"",\""gender\"":\""Female\"",\""address\"":{\""street\"":\""710 Knutson Place\"",\""city\"":\""Suso\"",\""country\"":\""Philippines\""}}"",""updated"":""2021-12-14T23:15:43.375Z""}"


In [0]:
%sql
SELECT * FROM csv.`${dataset.bookstore}/books-csv`; -- selecting data in csv fromat from file

_c0
book_id;title;author;category;price
B07;The Hundred-Page Machine Learning;Andriy Burkov;Computer Science;33
B08;Quantum Computing for Everyone;Chris Bernhardt;Computer Science;41
B09;Advanced Data Structures;Peter Brass;Computer Science;24
book_id;title;author;category;price
B10;Beginning Database Design Solutions;Rod Stephens;Computer Science;44
B11;Business Intelligence for Dummies;Swain Scheps;Computer Science;38
B12;Big Data in Practice;Bernard Marr;Computer Science;30
book_id;title;author;category;price
B01;The Soul of a New Machine;Tracy Kidder;Computer Science;49


In [0]:
%sql
SELECT * FROM binaryFile.`${dataset.bookstore}/customers-json`; -- querying in binary format 

path,modificationTime,length,content
dbfs:/mnt/demo-datasets/bookstore/customers-json/export_002.json,2025-01-01T05:17:09.000+0000,80001,eyJjdXN0b21lcl9pZCI6IkMwMDMwMSIsImVtYWlsIjoidGhvbWFzLmxhbmVAZ21haWwuY29tIiwicHJvZmlsZSI6IntcImZpcnN0X25hbWVcIjpcIlRob21hc1wiLFwibGFzdF9uYW1lXCI6XCJMYW5lXCIsXCJnZW5kZXJcIjo= (truncated)
dbfs:/mnt/demo-datasets/bookstore/customers-json/export_004.json,2025-01-01T05:17:10.000+0000,79976,eyJjdXN0b21lcl9pZCI6IkMwMDkwMSIsImVtYWlsIjoiZ2xlbmFyZDN2QG1paXRiZWlhbi5nb3YuY24iLCJwcm9maWxlIjoie1wiZmlyc3RfbmFtZVwiOlwiR3JlZ29vclwiLFwibGFzdF9uYW1lXCI6XCJMZW5hcmRcIixcImc= (truncated)
dbfs:/mnt/demo-datasets/bookstore/customers-json/export_003.json,2025-01-01T05:17:09.000+0000,79781,eyJjdXN0b21lcl9pZCI6IkMwMDYwMSIsImVtYWlsIjoic2dvbm5lbHk1YUBhb2wuY29tIiwicHJvZmlsZSI6IntcImZpcnN0X25hbWVcIjpcIlN1c2FuYVwiLFwibGFzdF9uYW1lXCI6XCJHb25uZWx5XCIsXCJnZW5kZXJcIjo= (truncated)
dbfs:/mnt/demo-datasets/bookstore/customers-json/export_005.json,2025-01-01T05:17:11.000+0000,79727,eyJjdXN0b21lcl9pZCI6IkMwMTIwMSIsImVtYWlsIjoiYXBlZHJpY2s0aUBibG9nbGluZXMuY29tIiwicHJvZmlsZSI6IntcImZpcnN0X25hbWVcIjpcIkFtYmxlXCIsXCJsYXN0X25hbWVcIjpcIlBlZHJpY2tcIixcImdlbmQ= (truncated)
dbfs:/mnt/demo-datasets/bookstore/customers-json/export_001.json,2025-01-01T05:17:08.000+0000,79378,eyJjdXN0b21lcl9pZCI6IkMwMDAwMSIsImVtYWlsIjoiZGFiYnkyeUBqYXBhbnBvc3QuanAiLCJwcm9maWxlIjoie1wiZmlyc3RfbmFtZVwiOlwiRG5pcmVuXCIsXCJsYXN0X25hbWVcIjpcIkFiYnlcIixcImdlbmRlclwiOlw= (truncated)
dbfs:/mnt/demo-datasets/bookstore/customers-json/export_006.json,2025-01-01T05:17:11.000+0000,53243,eyJjdXN0b21lcl9pZCI6IkMwMTUwMSIsImVtYWlsIjoibXN1bGxlcjMxQGNuYmMuY29tIiwicHJvZmlsZSI6IntcImZpcnN0X25hbWVcIjpcIk1pdGNoZWxsXCIsXCJsYXN0X25hbWVcIjpcIlN1bGxlclwiLFwiZ2VuZGVyXCI= (truncated)


#Registering tables from files 

##using CTAS 

In [0]:
%sql
CREATE TABLE books_csv AS
SELECT * FROM csv.`${dataset.bookstore}/books-csv`;

num_affected_rows,num_inserted_rows


In [0]:
%sql
DESCRIBE EXTENDED books_csv;

col_name,data_type,comment
_c0,string,
,,
# Detailed Table Information,,
Catalog,hive_metastore,
Database,default,
Table,books_csv,
Created Time,Thu Jan 02 09:33:48 UTC 2025,
Last Access,UNKNOWN,
Created By,Spark 3.3.2,
Statistics,8017 bytes,


In [0]:
%sql
SELECT * FROM books_csv;

_c0
book_id;title;author;category;price
B07;The Hundred-Page Machine Learning;Andriy Burkov;Computer Science;33.0
B08;Quantum Computing for Everyone;Chris Bernhardt;Computer Science;41.0
B09;Advanced Data Structures;Peter Brass;Computer Science;24.0
book_id;title;author;category;price
B10;Beginning Database Design Solutions;Rod Stephens;Computer Science;44
B11;Business Intelligence for Dummies;Swain Scheps;Computer Science;38
B12;Big Data in Practice;Bernard Marr;Computer Science;30
book_id;title;author;category;price
B01;The Soul of a New Machine;Tracy Kidder;Computer Science;49.0


The table books_csv is a delta table but it is not in well defined format. Headers are counted as first row and all the columns are merged into a single column. Therefore we need to use another format to create a delta table i.e. Create table by 'USING' keyword.

CREATING DELTA TABLE USING DATA FROM AN EXTERNAL SOURCE

In [0]:
%sql
CREATE TABLE books_csv2 
(book_id STRING, title STRING, author STRING, category STRING,price DOUBLE)
USING CSV
OPTIONS (header=true, delimiter = ";")

In [0]:
%sql
DESCRIBE EXTENDED books_csv2;

col_name,data_type,comment
book_id,string,
title,string,
author,string,
category,string,
price,double,
,,
# Detailed Table Information,,
Catalog,spark_catalog,
Database,default,
Table,books_csv2,


In [0]:
%sql
SELECT * FROM books_csv2;

book_id,title,author,category,price


In [0]:
%sql
CREATE TABLE books_csv
  (book_id STRING, title STRING, author STRING, category STRING, price DOUBLE) -- initializing schema 
USING CSV
OPTIONS(
  header = "true",
  delimiter = ";"  -- in out case delimiter is ; instead of ,
  -- path = 'specify the path where your data is present which you want to insert in file'
)
LOCATION "${dataset.bookstore}/books-csv" -- LOCATION SPECIFIES WHERE WE WANT TO STORE OUR CREATED FILE 
-- we can also create a table by specifying location. In this way the table will be an external table.


In [0]:
%sql
-- querying books_csv
SELECT * FROM books_csv;

book_id,title,author,category,price
B07,The Hundred-Page Machine Learning,Andriy Burkov,Computer Science,33.0
B08,Quantum Computing for Everyone,Chris Bernhardt,Computer Science,41.0
B09,Advanced Data Structures,Peter Brass,Computer Science,24.0
B10,Beginning Database Design Solutions,Rod Stephens,Computer Science,44.0
B11,Business Intelligence for Dummies,Swain Scheps,Computer Science,38.0
B12,Big Data in Practice,Bernard Marr,Computer Science,30.0
B01,The Soul of a New Machine,Tracy Kidder,Computer Science,49.0
B02,Learning JavaScript Design Patterns,Addy Osmani,Computer Science,28.0
B03,Make Your Own Neural Network,Tariq Rashid,Computer Science,35.0
B04,Robot Dynamics and Control,Mark W. Spong,Computer Science,20.0


In [0]:
%sql
DESCRIBE EXTENDED books_csv;

col_name,data_type,comment
book_id,string,
title,string,
author,string,
category,string,
price,double,
,,
# Detailed Table Information,,
Catalog,hive_metastore,
Database,default,
Table,books_csv,


#Creating a delta table using temp view


In [0]:
%sql
SELECT * FROM csv.`/mnt/demo-datasets/bookstore/books-csv-new/export_005.csv`;

_c0
book_id;title;author;category;price
B14;Data Communications and Networking;Behrouz A. Forouzan;Computer Science;34
B15;Inside the Java Virtual Machine;Bill Venners;Computer Science;41
B13;Linux pocket guide;Daniel J. Barrett;Computer Science;26


In [0]:
%sql
-- creating a temp view out of the csv file in above cell

CREATE TEMP VIEW export_temp_view
(book_id STRING ,title STRING,author STRING,category STRING,price DOUBLE)
USING CSV
OPTIONS (
  path = '/mnt/demo-datasets/bookstore/books-csv-new/export_005.csv',
  header = "true",
  delimiter = ";"
);


In [0]:
%sql
CREATE TABLE export_temp_table
AS SELECT * FROM export_temp_view;

num_affected_rows,num_inserted_rows


In [0]:
%sql
SELECT * FROM export_temp_table;

book_id,title,author,category,price
B14,Data Communications and Networking,Behrouz A. Forouzan,Computer Science,34.0
B15,Inside the Java Virtual Machine,Bill Venners,Computer Science,41.0
B13,Linux pocket guide,Daniel J. Barrett,Computer Science,26.0


In [0]:
files = dbutils.fs.ls(f"{dataset_bookstore}/books-csv")
display(files)

path,name,size,modificationTime
dbfs:/mnt/demo-datasets/bookstore/books-csv/_SUCCESS,_SUCCESS,0,1735812851000
dbfs:/mnt/demo-datasets/bookstore/books-csv/_committed_5900796210569617141,_committed_5900796210569617141,736,1735812851000
dbfs:/mnt/demo-datasets/bookstore/books-csv/_committed_963587617814185382,_committed_963587617814185382,376,1735730525000
dbfs:/mnt/demo-datasets/bookstore/books-csv/_committed_vacuum6687161177839201177,_committed_vacuum6687161177839201177,95,1735812852000
dbfs:/mnt/demo-datasets/bookstore/books-csv/_started_5900796210569617141,_started_5900796210569617141,0,1735812850000
dbfs:/mnt/demo-datasets/bookstore/books-csv/export_001.csv,export_001.csv,238,1735708613000
dbfs:/mnt/demo-datasets/bookstore/books-csv/export_002.csv,export_002.csv,237,1735708614000
dbfs:/mnt/demo-datasets/bookstore/books-csv/export_003.csv,export_003.csv,240,1735708615000
dbfs:/mnt/demo-datasets/bookstore/books-csv/export_004.csv,export_004.csv,223,1735708615000
dbfs:/mnt/demo-datasets/bookstore/books-csv/part-00000-tid-5900796210569617141-66f80033-7f1f-4317-8994-2aa18bdbbdd5-245-1-c000.csv,part-00000-tid-5900796210569617141-66f80033-7f1f-4317-8994-2aa18bdbbdd5-245-1-c000.csv,258,1735812851000


In [0]:
# -- writing in books-csv 

(spark.read
        .table("books_csv")
      .write
        .mode("append")
        .format("csv")
        .option('header', 'true')
        .option('delimiter', ';')
        .save(f"{dataset_bookstore}/books-csv"))

In [0]:
files = dbutils.fs.ls(f"{dataset_bookstore}/books-csv")
display(files)

path,name,size,modificationTime
dbfs:/mnt/demo-datasets/bookstore/books-csv/_SUCCESS,_SUCCESS,0,1735813321000
dbfs:/mnt/demo-datasets/bookstore/books-csv/_committed_1341138122711521162,_committed_1341138122711521162,736,1735813321000
dbfs:/mnt/demo-datasets/bookstore/books-csv/_committed_5900796210569617141,_committed_5900796210569617141,736,1735812851000
dbfs:/mnt/demo-datasets/bookstore/books-csv/_committed_963587617814185382,_committed_963587617814185382,376,1735730525000
dbfs:/mnt/demo-datasets/bookstore/books-csv/_committed_vacuum6687161177839201177,_committed_vacuum6687161177839201177,95,1735812852000
dbfs:/mnt/demo-datasets/bookstore/books-csv/_started_1341138122711521162,_started_1341138122711521162,0,1735813320000
dbfs:/mnt/demo-datasets/bookstore/books-csv/_started_5900796210569617141,_started_5900796210569617141,0,1735812850000
dbfs:/mnt/demo-datasets/bookstore/books-csv/export_001.csv,export_001.csv,238,1735708613000
dbfs:/mnt/demo-datasets/bookstore/books-csv/export_002.csv,export_002.csv,237,1735708614000
dbfs:/mnt/demo-datasets/bookstore/books-csv/export_003.csv,export_003.csv,240,1735708615000


In [0]:
%sql
SELECT COUNT(*) csv.`${dataset_bookstore}/books-csv"`;

[0;31m---------------------------------------------------------------------------[0m
[0;31mParseException[0m                            Traceback (most recent call last)
File [0;32m<command-1587427143463975>:7[0m
[1;32m      5[0m     display(df)
[1;32m      6[0m     [38;5;28;01mreturn[39;00m df
[0;32m----> 7[0m   _sqldf [38;5;241m=[39m [43m____databricks_percent_sql[49m[43m([49m[43m)[49m
[1;32m      8[0m [38;5;28;01mfinally[39;00m:
[1;32m      9[0m   [38;5;28;01mdel[39;00m ____databricks_percent_sql

File [0;32m<command-1587427143463975>:4[0m, in [0;36m____databricks_percent_sql[0;34m()[0m
[1;32m      2[0m [38;5;28;01mdef[39;00m [38;5;21m____databricks_percent_sql[39m():
[1;32m      3[0m   [38;5;28;01mimport[39;00m [38;5;21;01mbase64[39;00m
[0;32m----> 4[0m   df [38;5;241m=[39m [43mspark[49m[38;5;241;43m.[39;49m[43msql[49m[43m([49m[43mbase64[49m[38;5;241;43m.[39;49m[43mstandard_b64decode[49m[43m([49m[38;5;124;43m"[39;

In [0]:
%sql
SELECT * FROM json.`/FileStore/tables/iris.json`;

_corrupt_record,petalLength,petalWidth,sepalLength,sepalWidth,species
[,,,,,
,1.4,0.2,5.1,3.5,setosa
,1.4,0.2,4.9,3.0,setosa
,1.3,0.2,4.7,3.2,setosa
,1.5,0.2,4.6,3.1,setosa
,1.4,0.2,5.0,3.6,setosa
,1.7,0.4,5.4,3.9,setosa
,1.4,0.3,4.6,3.4,setosa
,1.5,0.2,5.0,3.4,setosa
,1.4,0.2,4.4,2.9,setosa
