<a href="https://colab.research.google.com/github/jugalpanchal/bd-chef/blob/main/spark_hive_ops.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Follow the steps to install the dependencies:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # install java
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz # spark package download
!tar xf spark-3.1.2-bin-hadoop3.2.tgz # unzip spark package
!pip install -q findspark # install spark

# Set the location of Java and Spark:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# create or get spark session
# If we don't specifiy the Hive support then it gives an error while creating a table:
# AnalysisException: Hive support is required to CREATE Hive TABLE (AS SELECT);
spark = SparkSession.builder \
        .master("local[*]") \
        .appName("Spark_App1") \
        .enableHiveSupport() \
        .getOrCreate()

sc = spark.sparkContext

In [3]:
from pyspark.sql import Row
from pyspark.sql import HiveContext
from pyspark.sql import SQLContext

hc = HiveContext(sc) # create a Hive context
sqc = SQLContext(sc) # create a SQL context

# So does diff between these two? - 
# HiveContext is still the superset of SQLContext.
# It contains certain extra properties such as it can read the configuration from hive-site.xml, in case you have hive use cases otherwise simply use SQLContext

In [4]:
hc

<pyspark.sql.context.HiveContext at 0x7f495c5ab150>

In [5]:
sqc

<pyspark.sql.context.SQLContext at 0x7f495c5ab210>

### Load Constant Data

In [6]:
people_list = [('A', 25), ('B', 20), ('C', 30), ('D', 15)]
people_rdd_pair = sc.parallelize(people_list) # create a RDD but with Tuple/Pair objects
#people_rdd_pair # ParallelCollectionRDD[2] at readRDDFromFile

people_rdd_row = people_rdd_pair.map(lambda x: Row(name=x[0], age=int(x[1]))) # create a RDD with Row objects
#people # PythonRDD[5] at RDD

people_rdd_row.collect()

[Row(name='A', age=25),
 Row(name='B', age=20),
 Row(name='C', age=30),
 Row(name='D', age=15)]

In [7]:
people_df = spark.createDataFrame(people_rdd_row)
#people_df # DataFrame[name: string, age: bigint]

people_df.collect()

[Row(name='A', age=25),
 Row(name='B', age=20),
 Row(name='C', age=30),
 Row(name='D', age=15)]

### HiveContext

In [8]:
hc.registerDataFrameAsTable(people_df, 'people_tbl') # create a table
hc.sql('show tables').show()

+--------+----------+-----------+
|database| tableName|isTemporary|
+--------+----------+-----------+
|        |people_tbl|       true|
+--------+----------+-----------+



In [9]:
hc.sql('select * from people_tbl').show()

+----+---+
|name|age|
+----+---+
|   A| 25|
|   B| 20|
|   C| 30|
|   D| 15|
+----+---+



###Load JSON Data

In [10]:
people_json_df = spark.read.json('sample_data/anscombe.json') # Series is not Pandas class here :) It is a key in JSON file.
people_json_df

DataFrame[Series: string, X: double, Y: double, _corrupt_record: string]

In [11]:
people_json_df.registerTempTable('people_json_tbl')
hc.sql('show tables').show()

+--------+---------------+-----------+
|database|      tableName|isTemporary|
+--------+---------------+-----------+
|        |people_json_tbl|       true|
|        |     people_tbl|       true|
+--------+---------------+-----------+



In [12]:
results = hc.sql('select * from people_json_tbl')
results.show() # default shows top 20 rows

+------+----+-----+---------------+
|Series|   X|    Y|_corrupt_record|
+------+----+-----+---------------+
|  null|null| null|              [|
|     I|10.0| 8.04|           null|
|     I| 8.0| 6.95|           null|
|     I|13.0| 7.58|           null|
|     I| 9.0| 8.81|           null|
|     I|11.0| 8.33|           null|
|     I|14.0| 9.96|           null|
|     I| 6.0| 7.24|           null|
|     I| 4.0| 4.26|           null|
|     I|12.0|10.84|           null|
|     I| 7.0| 4.81|           null|
|     I| 5.0| 5.68|           null|
|    II|10.0| 9.14|           null|
|    II| 8.0| 8.14|           null|
|    II|13.0| 8.74|           null|
|    II| 9.0| 8.77|           null|
|    II|11.0| 9.26|           null|
|    II|14.0|  8.1|           null|
|    II| 6.0| 6.13|           null|
|    II| 4.0|  3.1|           null|
+------+----+-----+---------------+
only showing top 20 rows



### If we need to connect Hadoop Yarn Hive then make sure the script is submited in the Yarn

In [13]:
# Set master as yarn in that case and enable hive support
spark = SparkSession.builder \
        .master("yarn") \
        .appName("Spark_App1") \
        .enableHiveSupport() \
        .getOrCreate()

### SQLContext

In [14]:
# It's a alternative way to play with data. SQLContext helps to create a DataFrame too.

df = sqc.range(5)
#df # DataFrame[id: bigint]
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



### Partition

In [15]:
# 1. It won't delete the files and folders. because it is an external table.
# Table or view 'temp_tbl1' already exists in database 'default'
spark.sql('drop table if exists temp_tbl1')
# 2. It creates a spark-warehouse/ext/temp_db/temp_tbl2/datekey=2021-06-20/_SUCCESS
# CREATE EXTERNAL TABLE must be accompanied by LOCATION
spark.sql("create external table temp_tbl1(id string, name string) partitioned by(datekey string) stored as parquet location 'ext/temp_db/temp_tbl2'")

 # 3. I ran this query 5+ times. :) It keeps adding new row because the drop table does not delete existing data due to the external table.
spark.sql("insert into temp_tbl1 values('A1', 'Phill', '2020-01-30')")

spark.sql('select * from temp_tbl1').show()
spark.sql('show partitions temp_tbl1').show()


+---+-----+----------+
| id| name|   datekey|
+---+-----+----------+
| A1|Phill|2020-01-30|
+---+-----+----------+

+------------------+
|         partition|
+------------------+
|datekey=2020-01-30|
+------------------+



In [16]:
# 1. It creates a metastore_db and derby.log
# Table or view 'temp_tbl2' already exists in database 'default'
spark.sql('drop table if exists temp_tbl2') # It drops the table and delete the table folder too
# 2. It creates a spark-warehouse/temp_tbl2/datekey=2021-06-20/_SUCCESS
spark.sql("create table temp_tbl2(id string, name string) partitioned by(datekey string) stored as parquet")
# 3. It creates a partition automatically and adds parquet files. The last param value is considered as a partition value.
spark.sql("insert into temp_tbl2 values('A1', 'Jugal', '2021-06-20')") # It adds a part-00000-GUID.c000.snappy.parquet file.
spark.sql("insert into temp_tbl2 values('A2', 'Garvik', '2021-06-20')") # It adds an another part-00000-GUID.c000.snappy.parquet file.
spark.sql("insert into temp_tbl2 values('A3', 'Jack', '2021-06-21')") # It adds an another part-00000-GUID.c000.snappy.parquet file but in a new partition.

spark.sql('select * from temp_tbl2').show() # DataFrame
spark.sql('show partitions temp_tbl2').show()

# 4 It adds new partition
spark.sql("insert overwrite table temp_tbl2 partition(datekey) values('A4', 'Jill', '2021-06-22')")
spark.sql("insert into temp_tbl2 values('A5', 'Paul', '2021-06-22')")
spark.sql('select * from temp_tbl2').show()
spark.sql('show partitions temp_tbl2').show()

# 5 It adds new partition but overwrite the existing partition too.
spark.sql("insert overwrite table temp_tbl2 partition(datekey) values('A6', 'Mark', '2021-06-22')")
spark.sql('select * from temp_tbl2').show()
spark.sql('show partitions temp_tbl2').show()


+---+------+----------+
| id|  name|   datekey|
+---+------+----------+
| A2|Garvik|2021-06-20|
| A1| Jugal|2021-06-20|
| A3|  Jack|2021-06-21|
+---+------+----------+

+------------------+
|         partition|
+------------------+
|datekey=2021-06-20|
|datekey=2021-06-21|
+------------------+

+---+------+----------+
| id|  name|   datekey|
+---+------+----------+
| A2|Garvik|2021-06-20|
| A1| Jugal|2021-06-20|
| A3|  Jack|2021-06-21|
| A4|  Jill|2021-06-22|
| A5|  Paul|2021-06-22|
+---+------+----------+

+------------------+
|         partition|
+------------------+
|datekey=2021-06-20|
|datekey=2021-06-21|
|datekey=2021-06-22|
+------------------+

+---+------+----------+
| id|  name|   datekey|
+---+------+----------+
| A2|Garvik|2021-06-20|
| A1| Jugal|2021-06-20|
| A3|  Jack|2021-06-21|
| A6|  Mark|2021-06-22|
+---+------+----------+

+------------------+
|         partition|
+------------------+
|datekey=2021-06-20|
|datekey=2021-06-21|
|datekey=2021-06-22|
+------------------+

In [18]:
df_temp_tbl2 = spark.sql('select * from temp_tbl2')
df_temp_tbl2.createOrReplaceTempView("temp_tbl3")

spark.sql('drop table if exists temp_tbl4')
spark.sql("create table temp_tbl4(id string, name string) partitioned by(datekey string) stored as parquet")

spark.sql("insert overwrite table temp_tbl4 partition(datekey) select * from temp_tbl3") # It's required to create a table first.

spark.sql("select * from temp_tbl4").show()

+---+------+----------+
| id|  name|   datekey|
+---+------+----------+
| A3|  Jack|2021-06-21|
| A6|  Mark|2021-06-22|
| A2|Garvik|2021-06-20|
| A1| Jugal|2021-06-20|
+---+------+----------+

