## Import libraries

In [1]:
from delta import configure_spark_with_delta_pip, DeltaTable
from pyspark.sql import SparkSession

## Create a SparkSession object

In [3]:
# Spark settings for this notebook: executor memory, plus the two entries that
# register Delta Lake's SQL extension and catalog implementation.
spark_conf = {
    "spark.executor.memory": "512m",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

builder = SparkSession.builder.appName("read-delta-table").master("spark://spark-master:7077")
for conf_key, conf_value in spark_conf.items():
    builder = builder.config(conf_key, conf_value)

# Pass the builder through the delta helper, then create (or reuse) the session.
delta_builder = configure_spark_with_delta_pip(builder)
spark = delta_builder.getOrCreate()

# Keep cell output readable: only ERROR-level log messages are shown.
spark.sparkContext.setLogLevel("ERROR")



## Read the Delta Table

In [5]:
# Load the Delta table stored at this path into a DataFrame.
delta_reader = spark.read.format("delta")
df = delta_reader.load("/opt/workspace/data/delta_lake/netflix_titles")

                                                                                

## Explore the data

In [6]:
# Print each column's name, data type and nullability for the loaded table.
df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



In [7]:
# Preview the first five rows; truncate=False prints full column values.
df.show(n=5, truncate=False)

                                                                                

+-------+-------+---------------------+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+------------------+------------+------+---------+-------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+
|show_id|type   |title                |director       |cast                                                                                                                                                                                                                                                                                                           |cou

## Use SQL to explore the Delta table

In [10]:
# Query the Delta table directly by path using the delta.`<path>` syntax.
# The path must be wrapped in backticks, not quotes.
sql_query = """

SELECT * FROM delta.`/opt/workspace/data/delta_lake/netflix_titles` LIMIT 3 

"""

df_2 = spark.sql(sql_query)

# The query returns up to three rows; only the first two are displayed here.
df_2.show(n=2)

+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|       director|                cast|      country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                null|United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|           null|Ama Qamata, Khosi...| South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
+-------+-------+--------------------+---------------+--------------------+-------------+------

## Get the history of the Delta table

In [17]:
# Ask Delta for the table's history (version, timestamp, operation, ...).
# The path must be wrapped in backticks, not quotes.
sql_query = """

DESCRIBE HISTORY delta.`/opt/workspace/data/delta_lake/netflix_titles` 

"""

history = spark.sql(sql_query)

# Show up to 20 history rows (the default) with untruncated column values.
history.show(n=20, truncate=False)

+-------+-----------------------+------+--------+---------------------------------+------------------------------------------------------------------------------+----+--------+---------+-----------+--------------+-------------+-----------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp              |userId|userName|operation                        |operationParameters                                                           |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                                 |userMetadata|engineInfo                         |
+-------+-----------------------+------+--------+---------------------------------+------------------------------------------------------------------------------+----+--------+---------+-----------+--------------+-------------+-----------------------------------------------------------------+------------+--------

## Retention period for Table history

In [18]:
# Stop the Spark session now that the notebook is finished with it.

spark.stop()