<a href="https://colab.research.google.com/github/jugalpanchal/bd-chef/blob/main/spark_sql_ops.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Follow the steps to install the dependencies:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # install java
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz # spark package download
!tar xf spark-3.1.2-bin-hadoop3.2.tgz # unzip spark package
!pip install -q findspark # install spark

# Set the location of Java and Spark:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Configure a local SparkSession; getOrCreate() reuses a session that is already
# running in this kernel instead of starting a second one.
session_builder = (
    SparkSession.builder
    .master("local[*]")      # run Spark locally rather than on a cluster
    .appName("Spark_App1")
)
spark = session_builder.getOrCreate()

# The SparkContext that backs the session (used below to create RDDs).
sc = spark.sparkContext

In [3]:
from pyspark.sql import SQLContext
from pyspark.sql.types import Row
from datetime import datetime

# SQL entry point used by the cells below.
# SQLContext(sc) has been deprecated since Spark 3.0 in favour of SparkSession,
# so reuse the existing session instead of constructing a separate context;
# every `sqlc.sql(...)` call keeps working unchanged.
sqlc = spark

# Note: SQL can only query tables/views; it cannot reference a DataFrame object directly.
# To query a DataFrame with SQL, first register it as a table/view.
# A SQL query returns its result as a DataFrame.

In [4]:
# Two sample student records. Each Row mixes scalar fields with a list (clubs),
# a dictionary (subjects) and a timestamp (enrolled).
student_rows = [
    Row(
        id = 1,
        name = "Jill",
        active = True,
        clubs = ['chess', 'hockey'],
        subjects = {"math": 80, 'english': 56},
        enrolled = datetime(2014, 8, 1, 14, 1, 5),
    ),
    Row(
        id = 2,
        name = "George",
        active = False,
        clubs = ['chess', 'soccer'],
        subjects = {"math": 60, 'english': 96},
        enrolled = datetime(2015, 3, 21, 8, 2, 5),
    ),
]

# Distribute the local list of rows as an RDD.
record_rdd = sc.parallelize(student_rows)

In [5]:
# Convert the RDD of Row objects into a DataFrame; the columns in the output
# below carry the same names as the Row fields.
record_df = record_rdd.toDF()
# Print the DataFrame as a text table (long cell values are shown truncated with "...").
record_df.show()

+---+------+------+---------------+--------------------+-------------------+
| id|  name|active|          clubs|            subjects|           enrolled|
+---+------+------+---------------+--------------------+-------------------+
|  1|  Jill|  true|[chess, hockey]|{english -> 56, m...|2014-08-01 14:01:05|
|  2|George| false|[chess, soccer]|{english -> 96, m...|2015-03-21 08:02:05|
+---+------+------+---------------+--------------------+-------------------+



In [6]:
# Expose the DataFrame to SQL as the temporary view `record_tbl`.
# A temp view belongs to the current Spark session: it is gone once the session
# ends and it is not shared with other Spark sessions.
record_df.createOrReplaceTempView('record_tbl')

# With the view registered, SQL can query it; each query returns a new DataFrame.
# 1) every column of every row
record_tbl_df1 = sqlc.sql('select * from record_tbl')
# 2) element access: second entry of the `clubs` array and the "english" key of the `subjects` map
record_tbl_df2 = sqlc.sql('select id, clubs[1], subjects["english"] from record_tbl')
# 3) negation of the boolean column
record_tbl_df3 = sqlc.sql('select id, NOT active from record_tbl')
# 4) the boolean column used directly as the filter condition
record_tbl_df4 = sqlc.sql('select * from record_tbl where active')

# Print the results in the same order the queries were defined.
for query_result_df in (record_tbl_df1, record_tbl_df2, record_tbl_df3, record_tbl_df4):
    query_result_df.show()

+---+------+------+---------------+--------------------+-------------------+
| id|  name|active|          clubs|            subjects|           enrolled|
+---+------+------+---------------+--------------------+-------------------+
|  1|  Jill|  true|[chess, hockey]|{english -> 56, m...|2014-08-01 14:01:05|
|  2|George| false|[chess, soccer]|{english -> 96, m...|2015-03-21 08:02:05|
+---+------+------+---------------+--------------------+-------------------+

+---+--------+-----------------+
| id|clubs[1]|subjects[english]|
+---+--------+-----------------+
|  1|  hockey|               56|
|  2|  soccer|               96|
+---+--------+-----------------+

+---+------------+
| id|(NOT active)|
+---+------------+
|  1|       false|
|  2|        true|
+---+------------+

+---+----+------+---------------+--------------------+-------------------+
| id|name|active|          clubs|            subjects|           enrolled|
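+---+----+------+---------------+--------------------+-------------------+
|  1|Jill|  true|[chess, hockey]|{english -> 56, m...|2014-08-01 14:01:05|
+---+----+------+---------------+--------------------+-------------------+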

In [7]:
# Register the DataFrame as a global temp view. It lives in the reserved
# `global_temp` database, so queries must qualify the name with `global_temp.`.
# Per the Spark SQL guide, a global temp view is shared by all sessions of the
# same Spark application and lives until that application terminates
# (it is application-scoped, not cluster-wide).
record_df.createOrReplaceGlobalTempView('record_glb_tbl')
record_glb_td1 = sqlc.sql('select * from global_temp.record_glb_tbl')
record_glb_td1.show()

+---+------+------+---------------+--------------------+-------------------+
| id|  name|active|          clubs|            subjects|           enrolled|
+---+------+------+---------------+--------------------+-------------------+
|  1|  Jill|  true|[chess, hockey]|{english -> 56, m...|2014-08-01 14:01:05|
|  2|George| false|[chess, soccer]|{english -> 96, m...|2015-03-21 08:02:05|
+---+------+------+---------------+--------------------+-------------------+



### File - Read and Write

In [8]:
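# Example (not executed here; the path is illustrative): Spark SQL can query a file
# directly by prefixing the backtick-quoted path with the file format.
#house_df = spark.sql('select * from parquet.`/user/doc/house_dataset/file1`')
#house_df.show()

# TODO: add a saveAsTable() example that writes a DataFrame out as a table.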


### Catalog

In [13]:
# List the databases known to this session's catalog.
spark.catalog.listDatabases()

[Database(name='default', description='default database', locationUri='file:/content/spark-warehouse')]

In [14]:
# List the tables/views the catalog reports when no database name is given.
# The session temp view `record_tbl` appears as TEMPORARY; the global temp view
# `record_glb_tbl` is not listed in the output below (it was queried via `global_temp.`).
spark.catalog.listTables()

[Table(name='record_tbl', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]