<a href="https://colab.research.google.com/github/jugalpanchal/bd-chef/blob/main/spark_hive_ops.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Follow the steps to install the dependencies:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # install java
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz # spark package download
!tar xf spark-3.1.2-bin-hadoop3.2.tgz # unzip spark package
!pip install -q findspark # install spark

# Set the location of Java and Spark:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# create or get spark session
spark = SparkSession.builder \
        .master("local[*]") \
        .appName("Spark_App1") \
        .getOrCreate()

sc = spark.sparkContext

In [3]:
from pyspark.sql import Row
from pyspark.sql import HiveContext

hc = HiveContext(sc) # create a hive context

### Load Constant Data

In [4]:
people_list = [('A', 25), ('B', 20), ('C', 30), ('D', 15)]
people_rdd_pair = sc.parallelize(people_list) # create a RDD but with Tuple/Pair objects
#people_rdd_pair # ParallelCollectionRDD[2] at readRDDFromFile

people_rdd_row = people_rdd_pair.map(lambda x: Row(name=x[0], age=int(x[1]))) # create a RDD with Row objects
#people # PythonRDD[5] at RDD

people_rdd_row.collect()

[Row(name='A', age=25),
 Row(name='B', age=20),
 Row(name='C', age=30),
 Row(name='D', age=15)]

In [5]:
people_df = spark.createDataFrame(people_rdd_row)
#people_df # DataFrame[name: string, age: bigint]

people_df.collect()

[Row(name='A', age=25),
 Row(name='B', age=20),
 Row(name='C', age=30),
 Row(name='D', age=15)]

In [6]:
hc.registerDataFrameAsTable(people_df, 'people_tbl') # create a table
hc.sql('show tables').show()

+--------+----------+-----------+
|database| tableName|isTemporary|
+--------+----------+-----------+
|        |people_tbl|       true|
+--------+----------+-----------+



In [7]:
hc.sql('select * from people_tbl').show()

+----+---+
|name|age|
+----+---+
|   A| 25|
|   B| 20|
|   C| 30|
|   D| 15|
+----+---+



###Load JSON Data

In [8]:
people_json_df = spark.read.json('sample_data/anscombe.json') # Series is not Pandas class here :) It is a key in JSON file.
people_json_df

DataFrame[Series: string, X: double, Y: double, _corrupt_record: string]

In [9]:
people_json_df.registerTempTable('people_json_tbl')
hc.sql('show tables').show()

+--------+---------------+-----------+
|database|      tableName|isTemporary|
+--------+---------------+-----------+
|        |people_json_tbl|       true|
|        |     people_tbl|       true|
+--------+---------------+-----------+



In [10]:
results = hc.sql('select * from people_json_tbl')
results.show() # default shows top 20 rows

+------+----+-----+---------------+
|Series|   X|    Y|_corrupt_record|
+------+----+-----+---------------+
|  null|null| null|              [|
|     I|10.0| 8.04|           null|
|     I| 8.0| 6.95|           null|
|     I|13.0| 7.58|           null|
|     I| 9.0| 8.81|           null|
|     I|11.0| 8.33|           null|
|     I|14.0| 9.96|           null|
|     I| 6.0| 7.24|           null|
|     I| 4.0| 4.26|           null|
|     I|12.0|10.84|           null|
|     I| 7.0| 4.81|           null|
|     I| 5.0| 5.68|           null|
|    II|10.0| 9.14|           null|
|    II| 8.0| 8.14|           null|
|    II|13.0| 8.74|           null|
|    II| 9.0| 8.77|           null|
|    II|11.0| 9.26|           null|
|    II|14.0|  8.1|           null|
|    II| 6.0| 6.13|           null|
|    II| 4.0|  3.1|           null|
+------+----+-----+---------------+
only showing top 20 rows



### If we need to connect Hadoop Yarn Hive then make sure the script is submited in the Yarn

In [11]:
# Set master as yarn in that case and enable hive support
spark = SparkSession.builder \
        .master("yarn") \
        .appName("Spark_App1") \
        .enableHiveSupport() \
        .getOrCreate()