In [23]:
# Create the `plants` database if it is not there yet, rooted at the explicit
# directory /user/yashiro/plants/ instead of the default warehouse location.
spark.sql("""CREATE DATABASE IF NOT EXISTS plants
          LOCATION "/user/yashiro/plants/"
          """)

DataFrame[]

In [22]:
# Register plants.subkingdoms as an external, Parquet-backed table with four flat
# columns (id, link, name, slug) stored under the database directory.
# IF NOT EXISTS makes the statement safe to re-run.
spark.sql("""CREATE EXTERNAL TABLE IF NOT EXISTS plants.subkingdoms(
    id BIGINT,
    link STRING,
    name STRING,
    slug STRING
)
    STORED AS PARQUET
    LOCATION "/user/yashiro/plants/subkingdoms"
     """)

DataFrame[]

In [51]:
# Register plants.kingdoms as an external, Parquet-backed table. Each kingdom row
# embeds its subkingdoms as an array of (id, link, name, slug) structs.
# The LOCATION is absolute, like every other plants.* table, so the data directory
# does not depend on how a relative path happens to be resolved by the session.
# (Plain string: the statement has no placeholders, so no f-prefix is needed.)
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS plants.kingdoms(
    id BIGINT ,
    name STRING ,
    slug STRING ,
    subkingdoms array<
          struct <
             id: BIGINT,
             link: STRING,
             name: STRING,
             slug: STRING
           >
     >
)
    STORED AS PARQUET
    LOCATION "/user/yashiro/plants/kingdoms"
""")

DataFrame[]

In [55]:
# Register plants.divisions as an external, Parquet-backed table.
# A division carries an array of its classes plus the parent kingdom and subkingdom
# as nested (id, link, name, slug) structs.
# NOTE: the notebook later loads this table with DataFrameWriter.insertInto(), which
# resolves columns by position, so the declaration order below matters.
# (Plain string: the statement has no placeholders, so no f-prefix is needed.)
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS plants.divisions(
    division_classes array<
         struct<
            id: BIGINT,
            link: STRING,
            name: STRING,
            slug: STRING
             >
         >,
    id BIGINT,
    kingdom struct<
        id: BIGINT,
        link: STRING,
        name: STRING,
        slug: STRING
         >,
    name STRING,
    slug STRING,
    subkingdom struct<
        id: BIGINT,
        link: STRING,
        name: STRING,
        slug: STRING
        >
)
    STORED AS PARQUET
    LOCATION "/user/yashiro/plants/divisions"
""")

DataFrame[]

In [56]:
# List the tables of the `plants` database explicitly, so the result does not depend
# on which database happens to be the session's current one.
spark.sql("SHOW TABLES IN plants").show()

+--------+-----------+-----------+
|database|  tableName|isTemporary|
+--------+-----------+-----------+
|  plants|  divisions|      false|
|  plants|   kingdoms|      false|
|  plants|subkingdoms|      false|
+--------+-----------+-----------+



In [57]:
# Show the full metadata of the kingdoms table. The name is database-qualified so
# the cell works regardless of the session's current database.
spark.sql("DESCRIBE FORMATTED plants.kingdoms").show(truncate=False)

+----------------------------+--------------------------------------------------------------+-------+
|col_name                    |data_type                                                     |comment|
+----------------------------+--------------------------------------------------------------+-------+
|id                          |bigint                                                        |null   |
|name                        |string                                                        |null   |
|slug                        |string                                                        |null   |
|subkingdoms                 |array<struct<id:bigint,link:string,name:string,slug:string>>  |null   |
|                            |                                                              |       |
|# Detailed Table Information|                                                              |       |
|Database                    |plants                                              

In [63]:
import os

# Move the working directory two levels up and remember it as BASE_DIR.
# Later cells use BASE_DIR for input paths and rely on the new working directory
# for relative paths. The guard makes the cell idempotent: without it, every
# re-run would climb another two directories and silently change BASE_DIR.
if "BASE_DIR" not in globals():
    os.chdir(os.path.join(os.pardir, os.pardir))
    BASE_DIR = os.getcwd()

In [68]:
# Inspect the HDFS home directory before cleaning up.
!hdfs dfs -ls /user/yashiro

20/05/26 23:39:41 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 3 items
drwxr-xr-x   - yashiro supergroup          0 2020-05-26 23:30 /user/yashiro/kingdoms
drwxr-xr-x   - yashiro supergroup          0 2020-05-16 17:51 /user/yashiro/lr1.model
drwxr-xr-x   - yashiro supergroup          0 2020-05-26 23:32 /user/yashiro/plants


In [69]:
# Remove the /user/yashiro/kingdoms directory, which sits outside the plants
# database directory (/user/yashiro/plants).
!hdfs dfs -rmdir /user/yashiro/kingdoms

20/05/26 23:40:06 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [70]:
# List the home directory again to confirm the directory was removed.
!hdfs dfs -ls /user/yashiro

20/05/26 23:40:15 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 2 items
drwxr-xr-x   - yashiro supergroup          0 2020-05-16 17:51 /user/yashiro/lr1.model
drwxr-xr-x   - yashiro supergroup          0 2020-05-26 23:32 /user/yashiro/plants


In [71]:
!hdfs dfs -mkdir /user/yashiro/data

20/05/26 23:40:43 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [76]:
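# Disabled on purpose: direct upload of the raw input files into HDFS.
# The data is loaded through Spark (spark.read ... insertInto) further down instead.
# !hdfs dfs -copyFromLocal 00_input/data/* /user/yashiro/plants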

In [75]:
# Print the HDFS capacity/usage summary and the status of the live datanodes.
!hdfs dfsadmin -report

20/05/26 23:42:44 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Configured Capacity: 250685575168 (233.47 GB)
Present Capacity: 29408750441 (27.39 GB)
DFS Remaining: 29395537920 (27.38 GB)
DFS Used: 13212521 (12.60 MB)
DFS Used%: 0.04%
Under replicated blocks: 0
Blocks with corrupt replicas: 0
Missing blocks: 0
Missing blocks (with replication factor 1): 0
Pending deletion blocks: 0

-------------------------------------------------
Live datanodes (1):

Name: 127.0.0.1:50010 (localhost)
Hostname: 10.0.0.111
Decommission Status : Normal
Configured Capacity: 250685575168 (233.47 GB)
DFS Used: 13212521 (12.60 MB)
Non DFS Used: 206186669207 (192.03 GB)
DFS Remaining: 29395537920 (27.38 GB)
DFS Used%: 0.01%
DFS Remaining%: 11.73%
Configured Cache Capacity: 0 (0 B)
Cache Used: 0 (0 B)
Cache Remaining: 0 (0 B)
Cache Used%: 100.00%
Cache Remaining%: 0.00%
Xceivers: 2
Last contact: Tue May 26 23:42:43 EDT 2020



In [78]:
# Load the subkingdoms JSON file(s) from the local filesystem (file:// scheme)
# under BASE_DIR. The schema is inferred by the JSON reader.
# The former "header"/"inferSchema" options were removed: they are CSV reader
# options and have no effect on the JSON source.
sk_df = spark.read.json(f"file://{BASE_DIR}/00_input/data/subkingdoms*")

In [79]:
# Preview the loaded subkingdoms rows.
sk_df.show()

+---+--------------------+-------------+-------------+
| id|                link|         name|         slug|
+---+--------------------+-------------+-------------+
|  1|http://trefle.io/...|Tracheobionta|tracheobionta|
+---+--------------------+-------------+-------------+



In [80]:
# insertInto() resolves columns by position and ignores their names, so pin the
# column order to the one declared for plants.subkingdoms before writing.
# NOTE: mode("append") adds the rows again every time this cell is executed.
sk_df.select("id", "link", "name", "slug") \
    .write.mode("append") \
    .insertInto("plants.subkingdoms")

In [86]:
# Load the divisions JSON file(s) from the local filesystem (file:// scheme)
# under BASE_DIR. The schema, including the nested structs, is inferred by the
# JSON reader.
# The former "header"/"inferSchema" options were removed: they are CSV reader
# options and have no effect on the JSON source.
div_df = spark.read.json(f"file://{BASE_DIR}/00_input/data/divisions*")

In [90]:
# Number of division records read from the JSON input.
div_df.count()

9

In [88]:
# insertInto() resolves columns by position and ignores their names, so pin the
# column order to the one declared for plants.divisions before writing.
# NOTE: mode("append") adds the rows again every time this cell is executed.
div_df.select("division_classes", "id", "kingdom", "name", "slug", "subkingdom") \
    .write.mode("append") \
    .insertInto("plants.divisions")

In [89]:
# Row count of the table itself, to compare with the number of records read.
spark.sql("SELECT COUNT(*) FROM plants.divisions").show()

+--------+
|count(1)|
+--------+
|       9|
+--------+



In [114]:
# Peek at three rows to check that the nested columns were stored as expected.
spark.sql("SELECT * FROM plants.divisions LIMIT 3").show()

+--------------------+---+--------------------+-------------+-------------+--------------------+
|    division_classes| id|             kingdom|         name|         slug|          subkingdom|
+--------------------+---+--------------------+-------------+-------------+--------------------+
|[[1, http://trefl...|  1|[1, http://trefle...|Magnoliophyta|magnoliophyta|[1, http://trefle...|
|[[2, http://trefl...|  2|[1, http://trefle...|Coniferophyta|coniferophyta|[1, http://trefle...|
|[[8, http://trefl...|  3|[1, http://trefle...| Pteridophyta| pteridophyta|[1, http://trefle...|
+--------------------+---+--------------------+-------------+-------------+--------------------+



In [100]:
ls SparkHiveExample/hql/

create_tables.sql


In [101]:
# Execute the DDL script with the spark-sql CLI. Per the log output below, it
# touches subkingdoms and kingdoms and creates the families, genuses and plants
# tables in the `plants` database. The path is relative to the working directory.
!spark-sql -f SparkHiveExample/hql/create_tables.sql

Ivy Default Cache set to: /Users/yashiro/.ivy2/cache
The jars for the packages stored in: /Users/yashiro/.ivy2/jars
:: loading settings :: url = jar:file:/opt/spark-2.4.5-bin-hadoop2.7/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
com.datastax.spark#spark-cassandra-connector_2.11 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f88beca0-b3d3-49b6-a0b4-b182675215c5;1.0
	confs: [default]
	found com.datastax.spark#spark-cassandra-connector_2.11;2.5.0 in spark-list
	found com.datastax.spark#spark-cassandra-connector-driver_2.11;2.5.0 in spark-list
	found com.datastax.oss#java-driver-core-shaded;4.5.0 in spark-list
	found com.datastax.oss#native-protocol;1.4.9 in spark-list
	found com.datastax.oss#java-driver-shaded-guava;25.1-jre in spark-list
	found com.typesafe#config;1.3.4 in spark-list
	found com.github.jnr#jnr-ffi;2.1.10 in spark-list
	found com.github.jnr#jffi;1.2.19 in spark-list
	found org.ow2.asm#asm;7.1 in spark-list
	foun

20/05/27 00:00:57 INFO DataNucleus.Datastore: The class "org.apache.hadoop.hive.metastore.model.MFieldSchema" is tagged as "embedded-only" so does not have its own datastore table.
20/05/27 00:00:57 INFO DataNucleus.Datastore: The class "org.apache.hadoop.hive.metastore.model.MOrder" is tagged as "embedded-only" so does not have its own datastore table.
20/05/27 00:00:57 INFO DataNucleus.Datastore: The class "org.apache.hadoop.hive.metastore.model.MFieldSchema" is tagged as "embedded-only" so does not have its own datastore table.
20/05/27 00:00:57 INFO DataNucleus.Datastore: The class "org.apache.hadoop.hive.metastore.model.MOrder" is tagged as "embedded-only" so does not have its own datastore table.
20/05/27 00:00:58 INFO DataNucleus.Query: Reading in results for query "org.datanucleus.store.rdbms.query.SQLQuery@0" since the connection used is closing
20/05/27 00:00:58 INFO metastore.MetaStoreDirectSql: Using direct SQL, underlying DB is DERBY
20/05/27 00:00:58 INFO metastore.Object

20/05/27 00:00:59 INFO executor.Executor: Starting executor ID driver on host localhost
20/05/27 00:00:59 INFO util.Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 54885.
20/05/27 00:00:59 INFO netty.NettyBlockTransferService: Server created on 10.0.0.111:54885
20/05/27 00:00:59 INFO storage.BlockManager: Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy
20/05/27 00:00:59 INFO storage.BlockManagerMaster: Registering BlockManager BlockManagerId(driver, 10.0.0.111, 54885, None)
20/05/27 00:00:59 INFO storage.BlockManagerMasterEndpoint: Registering block manager 10.0.0.111:54885 with 366.3 MB RAM, BlockManagerId(driver, 10.0.0.111, 54885, None)
20/05/27 00:00:59 INFO storage.BlockManagerMaster: Registered BlockManager BlockManagerId(driver, 10.0.0.111, 54885, None)
20/05/27 00:00:59 INFO storage.BlockManager: Initialized BlockManager: BlockManagerId(driver, 10.0.0.111, 54885, None)
20/05/27 00:00

20/05/27 00:01:02 INFO hive.log: Updating table stats fast for subkingdoms
20/05/27 00:01:02 INFO hive.log: Updated size of table subkingdoms to 0
Time taken: 0.371 seconds
20/05/27 00:01:02 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.371 seconds
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: get_database: plants
20/05/27 00:01:02 INFO HiveMetaStore.audit: ugi=yashiro	ip=unknown-ip-addr	cmd=get_database: plants	
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: get_table : db=plants tbl=kingdoms
20/05/27 00:01:02 INFO HiveMetaStore.audit: ugi=yashiro	ip=unknown-ip-addr	cmd=get_table : db=plants tbl=kingdoms	
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: get_database: plants
20/05/27 00:01:02 INFO HiveMetaStore.audit: ugi=yashiro	ip=unknown-ip-addr	cmd=get_database: plants	
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: get_table : db=plants tbl=kingdoms
20/05/27 00:01:02 INFO HiveMetaStore.audit: ugi=yashiro	ip=unknown-ip-addr	cmd=get_table : db=plants tbl=kingdoms

Time taken: 0.044 seconds
20/05/27 00:01:02 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.044 seconds
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: get_database: plants
20/05/27 00:01:02 INFO HiveMetaStore.audit: ugi=yashiro	ip=unknown-ip-addr	cmd=get_database: plants	
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: get_table : db=plants tbl=families
20/05/27 00:01:02 INFO HiveMetaStore.audit: ugi=yashiro	ip=unknown-ip-addr	cmd=get_table : db=plants tbl=families	
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: get_database: plants
20/05/27 00:01:02 INFO HiveMetaStore.audit: ugi=yashiro	ip=unknown-ip-addr	cmd=get_database: plants	
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: get_table : db=plants tbl=families
20/05/27 00:01:02 INFO HiveMetaStore.audit: ugi=yashiro	ip=unknown-ip-addr	cmd=get_table : db=plants tbl=families	
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: create_table: Table(tableName:families, dbName:plants, owner:yashiro, createTime:159

Time taken: 0.047 seconds
20/05/27 00:01:02 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.047 seconds
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: get_database: plants
20/05/27 00:01:02 INFO HiveMetaStore.audit: ugi=yashiro	ip=unknown-ip-addr	cmd=get_database: plants	
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: get_table : db=plants tbl=genuses
20/05/27 00:01:02 INFO HiveMetaStore.audit: ugi=yashiro	ip=unknown-ip-addr	cmd=get_table : db=plants tbl=genuses	
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: get_database: plants
20/05/27 00:01:02 INFO HiveMetaStore.audit: ugi=yashiro	ip=unknown-ip-addr	cmd=get_database: plants	
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: get_table : db=plants tbl=genuses
20/05/27 00:01:02 INFO HiveMetaStore.audit: ugi=yashiro	ip=unknown-ip-addr	cmd=get_table : db=plants tbl=genuses	
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: create_table: Table(tableName:genuses, dbName:plants, owner:yashiro, createTime:15905520

Time taken: 0.066 seconds
20/05/27 00:01:02 INFO thriftserver.SparkSQLCLIDriver: Time taken: 0.066 seconds
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: get_database: plants
20/05/27 00:01:02 INFO HiveMetaStore.audit: ugi=yashiro	ip=unknown-ip-addr	cmd=get_database: plants	
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: get_table : db=plants tbl=plants
20/05/27 00:01:02 INFO HiveMetaStore.audit: ugi=yashiro	ip=unknown-ip-addr	cmd=get_table : db=plants tbl=plants	
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: get_database: plants
20/05/27 00:01:02 INFO HiveMetaStore.audit: ugi=yashiro	ip=unknown-ip-addr	cmd=get_database: plants	
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: get_table : db=plants tbl=plants
20/05/27 00:01:02 INFO HiveMetaStore.audit: ugi=yashiro	ip=unknown-ip-addr	cmd=get_table : db=plants tbl=plants	
20/05/27 00:01:02 INFO metastore.HiveMetaStore: 0: create_table: Table(tableName:plants, dbName:plants, owner:yashiro, createTime:1590552062, lastAccessT

In [112]:
# List the per-table directories under the plants database location.
!hdfs dfs -ls /user/yashiro/plants

20/05/27 00:08:32 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 6 items
drwxr-xr-x   - yashiro supergroup          0 2020-05-26 23:49 /user/yashiro/plants/divisions
drwxr-xr-x   - yashiro supergroup          0 2020-05-26 23:32 /user/yashiro/plants/families
drwxr-xr-x   - yashiro supergroup          0 2020-05-27 00:01 /user/yashiro/plants/genuses
drwxr-xr-x   - yashiro supergroup          0 2020-05-26 23:30 /user/yashiro/plants/kingdoms
drwxr-xr-x   - yashiro supergroup          0 2020-05-27 00:01 /user/yashiro/plants/plants
drwxr-xr-x   - yashiro supergroup          0 2020-05-26 23:47 /user/yashiro/plants/subkingdoms
