In [None]:
import pixiedust

# Install the GraphFrames build matching the running Spark version, plus the
# scala-logging jars installed alongside it.
# NOTE(review): `sc` is not defined in this cell; presumably the kernel
# pre-defines a SparkContext under that name -- confirm before Run All.
spark_version = sc.version

if spark_version.startswith('1.6.'):  # Spark 1.6
    pixiedust.installPackage("graphframes:graphframes:0.5.0-spark1.6-s_2.11")
elif spark_version.startswith('2.'):  # Spark 2.1, 2.0
    pixiedust.installPackage("graphframes:graphframes:0.5.0-spark2.1-s_2.11")
else:
    # Without this branch an unmatched Spark version skipped the GraphFrames
    # install silently and the cell still reported "done".
    print("WARNING: no GraphFrames package configured for Spark "
          + spark_version + "; GraphFrames was not installed")


pixiedust.installPackage("com.typesafe.scala-logging:scala-logging-api_2.11:2.1.2")
pixiedust.installPackage("com.typesafe.scala-logging:scala-logging-slf4j_2.11:2.1.2")

print("done")


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

# import os

# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages graphframes:graphframes:0.5.0-spark2.1-s_2.11 pyspark-shell'


# Create (or reuse) the SparkSession for this notebook: local master,
# application name "sparkPlot", executor memory setting of "2gb".
spark = (
    SparkSession.builder
    .master("local")
    .appName("sparkPlot")
    .config("spark.executor.memory", "2gb")
    .getOrCreate()
)

# spark.conf.set("spark.jars.packages", "graphframes:graphframes:0.5.0-spark2.1-s_2.11")

# Handles derived from the session, kept under their conventional names.
sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [4]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

# Schema for the class-hierarchy CSV: three columns, all read as strings.
schema = StructType([
    StructField(name, StringType())
    for name in ("ONT_NAME", "CLASS_IRI", "PARENT_CLASS_IRI")
])

# Read the hierarchy file from HDFS using the explicit schema. The file has a
# header row; the reader mode is DROPMALFORMED (per Spark's CSV options this
# discards rows that fail to parse, so the row count may be below the file's).
df = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .option("mode", "DROPMALFORMED")
    .csv("hdfs://localhost:9000/ONT_CLASS_HIERARCHY.csv")
)

In [5]:
# Print a preview of the DataFrame (show()'s defaults, spelled out).
df.show(n=20, truncate=True)


+-----------+--------------------+--------------------+
|   ONT_NAME|           CLASS_IRI|    PARENT_CLASS_IRI|
+-----------+--------------------+--------------------+
|cheminf.owl|http://semanticsc...|http://semanticsc...|
|cheminf.owl|http://semanticsc...|http://semanticsc...|
|cheminf.owl|http://semanticsc...|http://semanticsc...|
|cheminf.owl|http://purl.oboli...|http://purl.oboli...|
|cheminf.owl|http://purl.oboli...|http://purl.oboli...|
|cheminf.owl|http://purl.oboli...|http://purl.oboli...|
|cheminf.owl|http://purl.oboli...|http://purl.oboli...|
|cheminf.owl|http://purl.oboli...|http://purl.oboli...|
|cheminf.owl|http://purl.oboli...|http://purl.oboli...|
|cheminf.owl|http://www.ifomis...|http://www.ifomis...|
|cheminf.owl|http://purl.oboli...|http://purl.oboli...|
|cheminf.owl|http://purl.oboli...|http://purl.oboli...|
|cheminf.owl|http://purl.oboli...|http://purl.oboli...|
|cheminf.owl|http://purl.oboli...|http://www.w3.org...|
|cheminf.owl|http://semanticsc...|http://semanti

In [6]:
# Print the column names, types and nullability of `df` as a tree.
df.printSchema()

root
 |-- ONT_NAME: string (nullable = true)
 |-- CLASS_IRI: string (nullable = true)
 |-- PARENT_CLASS_IRI: string (nullable = true)



In [11]:
# Number of rows per CLASS_IRI, most frequent first.
(
    df.groupBy("CLASS_IRI")
    .count()
    .sort("count", ascending=False)
    .show()
)

+--------------------+-----+
|           CLASS_IRI|count|
+--------------------+-----+
|http://www.biopax...|   77|
|http://chem2bio2r...|   76|
|http://www.owl-on...|   33|
|http://www.owl-on...|   33|
|http://www.owl-on...|   33|
|http://www.owl-on...|   33|
|http://www.owl-on...|   33|
|http://www.owl-on...|   33|
|http://www.owl-on...|   33|
|http://www.owl-on...|   33|
|http://www.owl-on...|   33|
|http://www.owl-on...|   33|
|http://www.owl-on...|   33|
|http://www.owl-on...|   33|
|http://www.owl-on...|   33|
|http://www.owl-on...|   33|
|http://www.owl-on...|   33|
|http://www.owl-on...|   33|
|http://www.owl-on...|   33|
|http://www.owl-on...|   33|
+--------------------+-----+
only showing top 20 rows



In [12]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

# Schema for the class-label CSV: three columns, all read as strings.
# NOTE: this cell rebinds `schema` and `df`; from here on `df` is the labels
# data read below, not the frame loaded from ONT_CLASS_HIERARCHY.csv.
schema = StructType([
    StructField(name, StringType())
    for name in ("ONT_NAME", "CLASS_IRI", "CLASS_LABEL")
])

# Read the labels file from HDFS with the explicit schema, a header row and
# the DROPMALFORMED reader mode.
df = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .option("mode", "DROPMALFORMED")
    .csv("hdfs://localhost:9000/ONT_CLASS_LABELS.csv")
)

df.printSchema()

# Number of label rows per ontology, largest first.
(
    df.groupBy("ONT_NAME")
    .count()
    .sort("count", ascending=False)
    .show()
)

root
 |-- ONT_NAME: string (nullable = true)
 |-- CLASS_IRI: string (nullable = true)
 |-- CLASS_LABEL: string (nullable = true)

+-----------+------+
|   ONT_NAME| count|
+-----------+------+
|   dron.owl|434664|
|  chebi.obo|126431|
|  ccont.owl| 20622|
|   cseo.owl| 20086|
|   doid.obo| 12490|
|    bao.owl|  7126|
|    ddo.owl|  6445|
|    ato.obo|  6136|
|dermlex.owl|  6107|
|apaonto.owl|  6038|
|    bto.obo|  5903|
|  ctcae.owl|  3877|
|    dcm.owl|  3735|
|    bdo.owl|  3668|
|  cogat.owl|  3640|
|birnlex.owl|  3581|
|  dermo.obo|  3522|
|   edam.owl|  3295|
|   chmo.owl|  2966|
|    cmo.obo|  2762|
+-----------+------+
only showing top 20 rows

