In [1]:
# findspark.init() is called before any `pyspark` import in the cells below so
# that the pyspark package can be found by this kernel (presumably by locating
# the Spark installation via SPARK_HOME -- confirm for this machine).
import findspark
findspark.init()

In [2]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Storage locations used by this notebook, kept together so they are easy to change.
# Scratch space for Spark's temporary files: a directory on the LOCAL file system.
SPARK_LOCAL_DIR = "/home/ubuntu/spark-temp"
# Root directory for table data (data only, not metadata): a directory in HDFS.
SPARK_WAREHOUSE_DIR = "hdfs://localhost:9000/spark-warehouse"

# Embedded, simple, single-machine Spark database/warehouse setup.
# General form for any further setting: config.set("property", "value")
config = SparkConf()
config.setMaster("local").setAppName("SparkDatabase")

# Spark temp data goes to the local directory above. Spark warns at start-up
# that a cluster manager may override spark.local.dir (SPARK_LOCAL_DIRS /
# LOCAL_DIRS); this notebook runs with master "local".
config.set("spark.local.dir", SPARK_LOCAL_DIR)
# Spark table data [not metadata] goes into the HDFS warehouse directory above.
config.set("spark.sql.warehouse.dir", SPARK_WAREHOUSE_DIR)

# SparkSession is the entry point for Spark SQL and the DataFrame API.
#
# enableHiveSupport() is required for SQL databases: it creates a metadata
# catalog backed by an embedded Derby database inside the current working
# directory of this notebook. Because the catalog is embedded, multiple
# notebooks cannot share it at the same time.
# Inside the notebook directory you will see:
#   metastore_db - the metastore: databases, tables, columns, data types, and
#                  where exactly the data is located (HDFS, file system or S3)
#   derby.log    - the Derby database log
spark = (
    SparkSession.builder
    .config(conf=config)
    .enableHiveSupport()
    .getOrCreate()
)

sc = spark.sparkContext

22/05/16 20:22:48 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.174.129 instead (on interface ens33)
22/05/16 20:22:48 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/05/16 20:22:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/16 20:22:49 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [3]:
# List the databases recorded in the metastore (this is answered from
# metadata). On a fresh metastore only `default` is listed.
df = spark.sql("SHOW DATABASES")

df.show()

22/05/16 20:22:56 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
22/05/16 20:22:56 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
22/05/16 20:23:01 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
22/05/16 20:23:01 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore ubuntu@127.0.1.1
22/05/16 20:23:01 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException


+---------+
|namespace|
+---------+
|  default|
+---------+



In [4]:
# The metadata is stored locally, but the database's data directory is created
# under the HDFS warehouse root configured via spark.sql.warehouse.dir:
# hdfs://localhost:9000/spark-warehouse/stocklocaldb.db
spark.sql("CREATE DATABASE IF NOT EXISTS stocklocaldb")

22/05/16 20:23:02 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
22/05/16 20:23:02 WARN ObjectStore: Failed to get database stocklocaldb, returning NoSuchObjectException


DataFrame[]

In [5]:
# Create a Spark-managed table called `stocks` inside the stocklocaldb database.
# Data has to be added with Spark SQL statements such as INSERT
# (UPDATE and DELETE won't work on Spark 2.x).
# No table provider is specified, so Spark creates a Hive serde table and logs
# a warning saying so.
# The table data is stored at:
# hdfs://localhost:9000/spark-warehouse/stocklocaldb.db/stocks
spark.sql("CREATE TABLE  IF NOT EXISTS stocklocaldb.stocks(symbol STRING, industry STRING)")

22/05/16 20:29:09 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.
22/05/16 20:29:09 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
22/05/16 20:29:09 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
22/05/16 20:29:09 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
22/05/16 20:29:09 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
22/05/16 20:29:09 WARN HiveMetaStore: Location: hdfs://localhost:9000/spark-warehouse/stocklocaldb.db/stocks specified for non-external table:stocks


DataFrame[]

In [6]:
# Add one row (symbol 'INFY', industry 'IT') to the managed table.
# NOTE: INSERT INTO appends, so re-running this cell adds the same row again.
spark.sql("""
 INSERT INTO stocklocaldb.stocks VALUES('INFY', 'IT')
"""
         )

22/05/16 20:31:34 ERROR KeyProviderCache: Could not find uri with key [dfs.encryption.key.provider.uri] to create a keyProvider !!


DataFrame[]

In [7]:
# Read the table back to confirm that the inserted row is stored.
spark.sql("SELECT * FROM stocklocaldb.stocks").show()


+------+--------+
|symbol|industry|
+------+--------+
|  INFY|      IT|
+------+--------+



In [8]:
# This command drops the managed table from the metadata store and also
# deletes its data from the Spark warehouse directory.

spark.sql("DROP TABLE IF EXISTS stocklocaldb.stocks")

DataFrame[]

In [9]:
# Verify the drop by listing the tables that remain in stocklocaldb.
# A bare "SHOW TABLES" lists the session's current database instead; no USE
# statement has been issued in this notebook, so that would be `default`,
# which never contained the stocks table.
spark.sql("SHOW TABLES IN stocklocaldb").show()


+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [10]:
# Drop the database and its metadata. Without CASCADE the drop only succeeds
# when the database is empty (no tables, no data inside); the stocks table was
# already dropped in an earlier cell.
spark.sql("DROP DATABASE IF EXISTS stocklocaldb")

22/05/16 20:34:21 WARN TxnHandler: Cannot perform cleanup since metastore table does not exist


DataFrame[]

In [11]:
# Confirm the clean-up: stocklocaldb should no longer be listed.
spark.sql("SHOW DATABASES").show()

+---------+
|namespace|
+---------+
|  default|
+---------+

