# Configure Spark for Python in Jupyter

In [1]:
# Point this Jupyter kernel at the cluster's Spark installation.
import os
import sys

os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib/spark"
# Directory holding the Python archives that ship with Spark.
os.environ["PYLIB"] = os.path.join(os.environ["SPARK_HOME"], "python", "lib")
# Interpreter for the PySpark workers and for the driver; change both paths
# together (e.g. to /usr/bin/python2.7) to switch to another Python binary.
os.environ.update({
    "PYSPARK_PYTHON": "/home/hadoop/anaconda2/bin/python",
    "PYSPARK_DRIVER_PYTHON": "/home/hadoop/anaconda2/bin/python",
})
# Put the bundled archives at the front of the import path:
# pyspark.zip first, then the py4j sources.
sys.path[:0] = [
    os.path.join(os.environ["PYLIB"], "pyspark.zip"),
    os.path.join(os.environ["PYLIB"], "py4j-0.10.7-src.zip"),
]

In [2]:
# Let findspark locate the Spark installation (presumably via the SPARK_HOME
# set in the first cell -- confirm) so that pyspark can be imported.
import findspark
findspark.init()

<a href='https://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=explode#pyspark.sql.SparkSession.Builder.enableHiveSupport'>PySpark API Docs</a>

# Create a SparkSession object to connect to the Spark cluster

In [3]:
from pyspark.sql import SparkSession

# Hive's HBase handler jar (per its file name). It is shipped with the job via
# spark.jars and put on both the driver and executor classpaths.
HIVE_HBASE_HANDLER_JAR = "/opt/cloudera/parcels/CDH-6.2.0-1.cdh6.2.0.p0.967373/lib/hive/lib/hive-hbase-handler-2.1.1-cdh6.2.0.jar"

# NOTE: an earlier version also set "spark.executor.extraLibrary". That is not
# a Spark property (the closest real key, spark.executor.extraLibraryPath, is
# for native libraries, not jars), so the setting had no effect and was dropped.
spark = (
    SparkSession.builder
    .appName("Read HBase Table using PySpark Demo")
    .config("spark.jars", HIVE_HBASE_HANDLER_JAR)
    .config("spark.executor.extraClassPath", HIVE_HBASE_HANDLER_JAR)
    .config("spark.driver.extraClassPath", HIVE_HBASE_HANDLER_JAR)
    .enableHiveSupport()  # needed so spark.sql can see tables in the Hive metastore
    .getOrCreate()
)

In [4]:
# Echo the application name of the running SparkContext; it should match the
# appName passed to the builder.
print(spark.sparkContext.appName)

Read HBase Table using PySpark Demo


# Read the HBase table using the SparkSession object

In [5]:
# Make `default` the current database. The result of the USE statement is not
# the table data, so it is deliberately not bound to transaction_detail_df.
spark.sql("use default")
# Load the Hive table (presumably the Hive mapping of the HBase table -- confirm)
# as a DataFrame. Alternative without switching databases: query the qualified
# name default.transaction_detail_hive_tbl.
transaction_detail_df = spark.sql("select * from transaction_detail_hive_tbl")

In [6]:
# Print the column names, types and nullability of the loaded DataFrame.
transaction_detail_df.printSchema()

root
 |-- transaction_id: integer (nullable = true)
 |-- transaction_card_type: string (nullable = true)
 |-- transaction_ecommerce_website_name: string (nullable = true)
 |-- transaction_product_name: string (nullable = true)
 |-- transaction_datetime: string (nullable = true)
 |-- transaction_amount: double (nullable = true)
 |-- transaction_city_name: string (nullable = true)
 |-- transaction_country_name: string (nullable = true)



In [7]:
# Preview the first two rows without truncating long cell values.
transaction_detail_df.show(n=2, truncate=False)

+--------------+---------------------+----------------------------------+------------------------+--------------------+------------------+---------------------+------------------------+
|transaction_id|transaction_card_type|transaction_ecommerce_website_name|transaction_product_name|transaction_datetime|transaction_amount|transaction_city_name|transaction_country_name|
+--------------+---------------------+----------------------------------+------------------------+--------------------+------------------+---------------------+------------------------+
|1             |MasterCard           |www.ebay.com                      |Laptop                  |2019-05-14 15:24:12 |50.85             |Mumbai               |India                   |
|2             |MasterCard           |www.amazon.com                    |Wrist Band              |2019-05-14 15:24:13 |259.12            |Pune                 |India                   |
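+--------------+---------------------+----------------------------------+------------------------+--------------------+------------------+---------------------+------------------------+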

In [8]:
# Number of rows returned by the query.
transaction_detail_df.count()

5

In [9]:
# Keep only the columns needed for the per-card-type aggregation.
transaction_detail_df_stg1 = transaction_detail_df.select(
    "transaction_card_type",
    "transaction_country_name",
    "transaction_amount",
)

In [10]:
# Display up to five rows of the projected DataFrame, untruncated.
transaction_detail_df_stg1.show(n=5, truncate=False)

+---------------------+------------------------+------------------+
|transaction_card_type|transaction_country_name|transaction_amount|
+---------------------+------------------------+------------------+
|MasterCard           |India                   |50.85             |
|MasterCard           |India                   |259.12            |
|MasterCard           |United States           |328.16            |
|Visa                 |India                   |399.06            |
|Visa                 |Italy                   |194.52            |
+---------------------+------------------------+------------------+



In [11]:
# Total transaction amount per card type. The amounts are doubles, so the
# displayed sums can carry binary floating-point artifacts.
(
    transaction_detail_df_stg1
    .groupby("transaction_card_type")
    .agg({"transaction_amount": "sum"})
    .show()
)

+---------------------+-----------------------+
|transaction_card_type|sum(transaction_amount)|
+---------------------+-----------------------+
|                 Visa|                 593.58|
|           MasterCard|      638.1300000000001|
+---------------------+-----------------------+



In [12]:
# Stop the SparkSession at the end of the notebook.
spark.stop()