In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.conf import SparkConf
config = SparkConf()
# config.set("property", "value")
config.setMaster("local").setAppName("SparkHiveDatabase")

# centarlized hive meta server
# hdfs hive for data warehouse
# enable hive support must for sql database
 
config.set("spark.local.dir", "/home/ubuntu/spark-temp")

# while using hive.metastore.warehouse.dir, we should not use spark warehouse dir

config.set("hive.metastore.uris", "thrift://localhost:9083")
config.set("hive.metastore.warehouse.dir", "hdfs://localhost:9000/user/hive/warehouse")


from pyspark.sql import SparkSession
# spark Session, entry point for Spark SQL, DataFrame

# enableHiveSupport() now using hive meta server running as server
# multiple notebooks can share hive meta server, work in parallel
# we use hive warehouse directory for spark too, this way hive and spark can co-exists
# metastore shall have meta data: database, tables, columns, data types, where exactly
# data located in hdfs or file system or s3

spark = SparkSession.builder\
                    .config(conf=config)\
                    .enableHiveSupport()\
                    .getOrCreate()

sc = spark.sparkContext

22/05/16 21:01:24 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.174.129 instead (on interface ens33)
22/05/16 21:01:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/05/16 21:01:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/16 21:01:25 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
22/05/16 21:01:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
spark


In [5]:
spark.sql("SHOW DATABASES").show()


+----------+
| namespace|
+----------+
|   default|
|   moviedb|
|  ordersdb|
|productsdb|
+----------+



In [8]:
spark.sql("SHOW TABLES IN moviedb").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| moviedb|   movies|      false|
| moviedb|  ratings|      false|
+--------+---------+-----------+



In [12]:
spark.sql("SELECT * from moviedb.movies LIMIT 11").show()

+--------+--------------------+--------------------+
|movie_id|               title|              genres|
+--------+--------------------+--------------------+
|    null|               title|              genres|
|       1|    Toy Story (1995)|Adventure|Animati...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|Comedy|Drama|Romance|
|       5|Father of the Bri...|              Comedy|
|       6|         Heat (1995)|Action|Crime|Thri...|
|       7|      Sabrina (1995)|      Comedy|Romance|
|       8| Tom and Huck (1995)|  Adventure|Children|
|       9| Sudden Death (1995)|              Action|
|      10|    GoldenEye (1995)|Action|Adventure|...|
+--------+--------------------+--------------------+



In [14]:
spark.sql("SELECT * from moviedb.ratings LIMIT 11").show()

+-------+--------+------+----------------+
|user_id|movie_id|rating|rating_timestamp|
+-------+--------+------+----------------+
|   null|    null|  null|            null|
|      1|       1|     4|       964982703|
|      1|       3|     4|       964981247|
|      1|       6|     4|       964982224|
|      1|      47|     5|       964983815|
|      1|      50|     5|       964982931|
|      1|      70|     3|       964982400|
|      1|     101|     5|       964980868|
|      1|     110|     4|       964982176|
|      1|     151|     5|       964984041|
|      1|     157|     5|       964984100|
+-------+--------+------+----------------+



In [25]:
spark.sql("SELECT movie_id, AVG(rating) as avg_rating, \
          COUNT(user_id) as user_id_count from moviedb.ratings WHERE rating >= 1.0 group by movie_id").show()

+--------+----------+-------------+
|movie_id|avg_rating|user_id_count|
+--------+----------+-------------+
|    1580|    3.6364|          165|
|    2366|    3.7200|           25|
|    3175|    3.6933|           75|
|    1088|    3.5714|           42|
|   32460|    4.5000|            4|
|   44022|    3.4783|           23|
|   96488|    4.5000|            4|
|    1238|    4.1111|            9|
|    1342|    2.6364|           11|
|    1591|    2.7692|           26|
|    1645|    3.5490|           51|
|    4519|    3.4444|            9|
|    2142|    2.8000|           10|
|     471|    3.6500|           40|
|    3997|    2.0000|           12|
|     833|    2.1667|            6|
|    3918|    3.3333|            9|
|    7982|    3.7500|            4|
|    1959|    3.8000|           15|
|   68135|    3.9000|           10|
+--------+----------+-------------+
only showing top 20 rows



In [41]:
spark.sql(""" CREATE OR REPLACE TEMP VIEW most_popular_temp_table AS
        SELECT movie_id, AVG(rating) as avg_ratings, COUNT(user_id) as total_ratings FROM moviedb.ratings
        group by movie_id
        HAVING avg_ratings >= 3.5 and total_ratings > 100
""")

DataFrame[]

In [42]:
spark.sql("SELECT * from most_popular_temp_table").show(10)

                                                                                

+--------+-----------+-------------+
|movie_id|avg_ratings|total_ratings|
+--------+-----------+-------------+
|    1580|     3.6364|          165|
|    1721|     3.5571|          140|
|     858|     4.3958|          192|
|    1270|     4.1696|          171|
|    1265|     4.0699|          143|
|     588|     3.8689|          183|
|     296|     4.2964|          307|
|   68954|     4.1810|          105|
|   58559|     4.4295|          149|
|     593|     4.2688|          279|
+--------+-----------+-------------+
only showing top 10 rows



In [63]:
spark.sql("""SELECT mp.movie_id, title, avg_ratings, total_ratings FROM most_popular_temp_table mp INNER JOIN
        moviedb.movies m ON mp.movie_id = m.movie_id""").show(10)

22/05/16 23:54:06 WARN LazyStruct: Extra bytes detected at the end of the row! Ignoring similar problems.


+--------+--------------------+-----------+-------------+
|movie_id|               title|avg_ratings|total_ratings|
+--------+--------------------+-----------+-------------+
|    1580|Men in Black (a.k...|     3.6364|          165|
|    1721|      Titanic (1997)|     3.5571|          140|
|     858|          "Godfather|     4.3958|          192|
|    1270|Back to the Futur...|     4.1696|          171|
|    1265|Groundhog Day (1993)|     4.0699|          143|
|     588|      Aladdin (1992)|     3.8689|          183|
|     296| Pulp Fiction (1994)|     4.2964|          307|
|   68954|           Up (2009)|     4.1810|          105|
|   58559|        "Dark Knight|     4.4295|          149|
|     593|"Silence of the L...|     4.2688|          279|
+--------+--------------------+-----------+-------------+
only showing top 10 rows



In [66]:
spark.sql(""" CREATE TABLE moviedb.popular_movies AS
        SELECT mp.movie_id, title, avg_ratings, total_ratings FROM most_popular_temp_table mp
        INNER JOIN moviedb.movies m ON m.movie_id= mp.movie_id
""").show(10)

22/05/17 00:04:41 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.
22/05/17 00:04:41 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
22/05/17 00:04:41 WARN LazyStruct: Extra bytes detected at the end of the row! Ignoring similar problems.
22/05/17 00:04:52 ERROR KeyProviderCache: Could not find uri with key [dfs.encryption.key.provider.uri] to create a keyProvider !!


++
||
++
++



In [67]:
spark.sql("DESC FORMATTED moviedb.popular_movies").show(truncate=False)

+----------------------------+-------------------------------------------------------------------+-------+
|col_name                    |data_type                                                          |comment|
+----------------------------+-------------------------------------------------------------------+-------+
|movie_id                    |int                                                                |null   |
|title                       |string                                                             |null   |
|avg_ratings                 |decimal(14,4)                                                      |null   |
|total_ratings               |bigint                                                             |null   |
|                            |                                                                   |       |
|# Detailed Table Information|                                                                   |       |
|Database                    |moviedb

In [68]:
movieDf = spark.table("moviedb.movies")
movieDf.printSchema()

root
 |-- movie_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [69]:
movieDf.show()

+--------+--------------------+--------------------+
|movie_id|               title|              genres|
+--------+--------------------+--------------------+
|    null|               title|              genres|
|       1|    Toy Story (1995)|Adventure|Animati...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|Comedy|Drama|Romance|
|       5|Father of the Bri...|              Comedy|
|       6|         Heat (1995)|Action|Crime|Thri...|
|       7|      Sabrina (1995)|      Comedy|Romance|
|       8| Tom and Huck (1995)|  Adventure|Children|
|       9| Sudden Death (1995)|              Action|
|      10|    GoldenEye (1995)|Action|Adventure|...|
|      11| "American President|         The (1995)"|
|      12|Dracula: Dead and...|       Comedy|Horror|
|      13|        Balto (1995)|Adventure|Animati...|
|      14|        Nixon (1995)|               Drama|
|      15|Cutthroat Island ...|Action|Adventur

22/05/17 00:07:51 WARN LazyStruct: Extra bytes detected at the end of the row! Ignoring similar problems.
