In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

import os

# Maven coordinates the session needs.  `spark.jars.packages` is a single
# comma-separated property: calling .config() several times with the same key
# keeps only the last value, so all coordinates are joined into one string.
GRAPHFRAMES_PACKAGE = 'graphframes:graphframes:0.5.0-spark2.1-s_2.11'
SPARK_PACKAGES = ",".join([
    GRAPHFRAMES_PACKAGE,
    "com.typesafe.scala-logging:scala-logging-api_2.11:2.1.2",
    "com.typesafe.scala-logging:scala-logging-slf4j_2.11:2.1.2",
])

# Must be set before the session (and therefore the JVM) is created below.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages {} pyspark-shell'.format(GRAPHFRAMES_PACKAGE)


# Build the SparkSession
spark = (
    SparkSession.builder
    .master("spark://128.235.40.174:7077")
    .appName("sparkPlot_testing2")
    .config("spark.cores.max", "4")
    .config("spark.executor.cores", "1")
    .config("spark.executor.memory", "2G")
    .config('spark.driver.extraClassPath', '/home/hao/pixiedust/data/libs/*')
    .config('spark.jars', 'file:/home/hao/pixiedust/bin/cloudant-spark-v2.0.0-185.jar')
    .config("spark.jars.packages", SPARK_PACKAGES)
    .getOrCreate()
)

#    .config("spark.executor.instances","1")\
# spark.conf.set("spark.jars.packages", "graphframes:graphframes:0.5.0-spark2.1-s_2.11")
# Convenience handles derived from the session created above.
sc = spark.sparkContext
sqlContext = SQLContext(sc)

# Last expression of the cell: the (key, value) pairs of the context's conf.
sc.getConf().getAll()

# from graphframes import *

In [2]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

import time

start_time = time.time()

# Root of the HDFS namespace that holds the exported ontology CSV files.
HDFS_ROOT = "hdfs://128.235.40.174:9000"


def load_ontology_csv(file_name, columns, view_name):
    """Read one CSV from HDFS, report on it, de-duplicate it and register a view.

    Args:
        file_name: name of the CSV file directly under ``HDFS_ROOT``.
        columns: ordered list of ``(column_name, data_type)`` pairs used to
            build the explicit read schema.
        view_name: name of the temporary view registered for the result.

    Returns:
        The de-duplicated DataFrame (``distinct()`` of the rows read).

    Side effects:
        Prints the row count of the frame as read (before ``distinct()``),
        prints its schema, and creates or replaces the temp view ``view_name``.
    """
    schema = StructType([StructField(name, dtype) for name, dtype in columns])
    raw = (
        spark.read
        .schema(schema)
        .option("header", "true")
        .option("mode", "DROPMALFORMED")
        .csv("{}/{}".format(HDFS_ROOT, file_name))
    )
    # The count is taken before de-duplication, so it can exceed the number
    # of rows in the returned frame.
    print(raw.count())
    raw.printSchema()
    deduped = raw.distinct()
    deduped.createOrReplaceTempView(view_name)
    return deduped


df_class_hier = load_ontology_csv(
    "ONT_CLASS_HIERARCHY.csv",
    [
        ("ONT_NAME", StringType()),
        ("CLASS_IRI", StringType()),
        ("PARENT_CLASS_IRI", StringType()),
    ],
    "class_hier",
)

df_class_labels = load_ontology_csv(
    "ONT_CLASS_LABELS.csv",
    [
        ("ONT_NAME", StringType()),
        ("CLASS_IRI", StringType()),
        ("CLASS_LABEL", StringType()),
    ],
    "class_labels",
)

df_tax_areas = load_ontology_csv(
    "TAX_AREAS.csv",
    [
        ("ONT_NAME", StringType()),
        ("TAX_TYPE", StringType()),
        ("AREA_ID", StringType()),
        ("AREA_NAME", StringType()),
        ("AREA_LEVEL", IntegerType()),
    ],
    "tax_areas",
)

df_tax_areas_concepts = load_ontology_csv(
    "TAX_AREAS_CONCEPTS.csv",
    [
        ("ONT_NAME", StringType()),
        ("TAX_TYPE", StringType()),
        ("AREA_ID", StringType()),
        ("CLASS_IRI", StringType()),
    ],
    "tax_areas_concepts",
)

df_tax_areas_pareas = load_ontology_csv(
    "TAX_AREAS_PAREAS.csv",
    [
        ("ONT_NAME", StringType()),
        ("TAX_TYPE", StringType()),
        ("PAREA_ROOT_IRI", StringType()),
        ("CLASS_IRI", StringType()),
    ],
    "tax_areas_pareas",
)


# Wall-clock time spent loading all five tables.
print(time.time() - start_time, "seconds")

# Inputs (loaded earlier in this cell): df_class_labels, df_class_hier,
# df_tax_areas, df_tax_areas_concepts, df_tax_areas_pareas.
# Outputs: df_hier (edge list) and df_union (one row per labelled class with
# its area and partial-area attributes).

# Edge list: class -> parent class, tagged with the ontology it comes from.
# Columns are picked by name, so the result does not depend on the column
# order of df_class_hier.
df_hier = df_class_hier.select(
    df_class_hier["CLASS_IRI"].alias("src"),
    df_class_hier["PARENT_CLASS_IRI"].alias("dst"),
    df_class_hier["ONT_NAME"].alias("ont"),
).distinct()

# find which area it belongs to
labelled_concepts = df_class_labels.join(df_tax_areas_concepts, ['CLASS_IRI', 'ONT_NAME'])
labelled_concepts.printSchema()

# find which area level it belongs to
with_area = labelled_concepts.join(df_tax_areas, ['ONT_NAME', 'TAX_TYPE', 'AREA_ID'])
with_area.printSchema()

# find which parea it belongs to
result = with_area.join(df_tax_areas_pareas, ['CLASS_IRI', 'ONT_NAME', 'TAX_TYPE'])
result.printSchema()

# Vertex table.  Selecting by name replaces the previous positional
# rdd.map(...).toDF(...) round trip, which silently depended on the column
# order produced by the joins above.  AREA_LEVEL is cast to long so the
# resulting schema matches what Python-side type inference produced before.
df_union = result.select(
    result["CLASS_IRI"].alias("id"),
    result["ONT_NAME"].alias("ont"),
    result["TAX_TYPE"].alias("type"),
    result["AREA_ID"].alias("area_id"),
    result["CLASS_LABEL"].alias("label"),
    result["AREA_NAME"].alias("area_name"),
    result["AREA_LEVEL"].cast("long").alias("area_level"),
    result["PAREA_ROOT_IRI"].alias("parea_root_id"),
).distinct()


# Cumulative wall-clock time since start_time was set at the top of the cell.
print(time.time() - start_time, "seconds")

809619
root
 |-- ONT_NAME: string (nullable = true)
 |-- CLASS_IRI: string (nullable = true)
 |-- PARENT_CLASS_IRI: string (nullable = true)

711444
root
 |-- ONT_NAME: string (nullable = true)
 |-- CLASS_IRI: string (nullable = true)
 |-- CLASS_LABEL: string (nullable = true)

1258
root
 |-- ONT_NAME: string (nullable = true)
 |-- TAX_TYPE: string (nullable = true)
 |-- AREA_ID: string (nullable = true)
 |-- AREA_NAME: string (nullable = true)
 |-- AREA_LEVEL: integer (nullable = true)

343737
root
 |-- ONT_NAME: string (nullable = true)
 |-- TAX_TYPE: string (nullable = true)
 |-- AREA_ID: string (nullable = true)
 |-- CLASS_IRI: string (nullable = true)

381471
root
 |-- ONT_NAME: string (nullable = true)
 |-- TAX_TYPE: string (nullable = true)
 |-- PAREA_ROOT_IRI: string (nullable = true)
 |-- CLASS_IRI: string (nullable = true)

7.609169960021973 seconds
root
 |-- CLASS_IRI: string (nullable = true)
 |-- ONT_NAME: string (nullable = true)
 |-- CLASS_LABEL: string (nullable = true)

In [80]:
# Explicit import instead of a wildcard: GraphFrame is the only name used.
from graphframes import GraphFrame

# Full graph: vertices are the joined class rows, edges the class hierarchy.
g = GraphFrame(df_union, df_hier)
g.inDegrees.show(20, False)

print(g.vertices.count())
print(g.edges.count())

# Subgraph: vertices restricted to one area name and one partial-area root.
v2 = g.vertices.filter("area_name = 'has part, is conjugate base of' and parea_root_id = 'http://purl.obolibrary.org/obo/chebi_29067'")
# v2 = g.vertices.filter("ont = 'chebi.obo'")
# Edges are restricted by ontology only, so e2 can still contain edges whose
# endpoints are not in v2.
e2 = g.edges.filter("ont = 'chebi.obo'")
g2 = GraphFrame(v2, e2)
print(g2.vertices.count())
print(g2.edges.count())





+-------------------------------------------------------------------------------------+--------+
|id                                                                                   |inDegree|
+-------------------------------------------------------------------------------------+--------+
|http://purl.obolibrary.org/obo/chebi_64517                                           |6       |
|http://scai.fraunhofer.de/ndduo#mortility_rate                                       |9       |
|http://ncicb.nci.nih.gov/xml/owl/evs/thesaurus.owl#brain_oligodendroglioma           |1       |
|http://ncicb.nci.nih.gov/xml/owl/evs/thesaurus.owl#female_urethral_malignant_neoplasm|1       |
|http://purl.obolibrary.org/obo/chebi_46844                                           |20      |
|http://purl.obolibrary.org/obo/dron_00023640                                         |28      |
|http://purl.obolibrary.org/obo/dron_00020574                                         |6       |
|http://purl.obolibrary.org/ob

In [84]:
def removeEdgesNotInVertices(g):
    """Return a GraphFrame with the same vertices as ``g`` and only the edges
    whose ``src`` and ``dst`` both appear in the vertex ``id`` column.

    The filtering is done with two left-semi joins, so the vertex ids stay in
    Spark instead of being collected to the driver and embedded in a literal
    ``isin`` list.  Edges with a null ``src`` or ``dst`` are dropped, as they
    were by the ``isin`` filter.

    Args:
        g: a GraphFrame.

    Returns:
        A new GraphFrame built from ``g.vertices`` and the filtered edges; the
        edge columns keep their original order.
    """
    vertices = g.vertices
    edges = g.edges
    vertex_ids = vertices.select('id').distinct()
    kept_edges = (
        edges
        .join(vertex_ids.withColumnRenamed('id', 'src'), on='src', how='left_semi')
        .join(vertex_ids.withColumnRenamed('id', 'dst'), on='dst', how='left_semi')
        # Joining on a column name moves that column to the front; restore
        # the original edge column order.
        .select(edges.columns)
    )
    return GraphFrame(vertices, kept_edges)
    
    
    
# Drop edges that point outside the subgraph, then report its size
# (vertex count first, edge count second).
g2 = removeEdgesNotInVertices(g2)
for frame in (g2.vertices, g2.edges):
    print(frame.count())

1692
2155


In [None]:
import time
from pyspark.sql.functions import *
import pixiedust

In [87]:
# Time a PageRank run with a fixed number of iterations on the subgraph.
start_time = time.time()
results = g2.pageRank(maxIter=3, resetProbability=0.15)
elapsed = time.time() - start_time

print(elapsed, "seconds")

14.55365538597107 seconds


In [91]:
# Time a PageRank run that iterates until convergence to tolerance "tol".
start_time = time.time()
results = g2.pageRank(tol=0.1, resetProbability=0.15)
elapsed = time.time() - start_time

print(elapsed, "seconds")

160.7638611793518 seconds


In [90]:
# Time a PageRank run personalized for the vertex whose id is stored in "a".
start_time = time.time()
a = 'http://purl.obolibrary.org/obo/chebi_35693'
results = g2.pageRank(sourceId=a, maxIter=3, resetProbability=0.15)
elapsed = time.time() - start_time
print(elapsed, "seconds")

KeyboardInterrupt: 

In [92]:
# Show the PageRank output held in `results`: first the edges (src, dst,
# weight) ordered by descending weight, then the vertices (id, pagerank)
# ordered by descending pagerank.
# NOTE(review): `display` and `desc` are not defined in this cell; presumably
# they come from the cell that runs `import pixiedust` and
# `from pyspark.sql.functions import *` - confirm that cell has been executed.
# NOTE(review): `results` is whatever the most recently executed cell assigned
# to it, so this cell depends on execution order.
display(results.edges.select("src", "dst", "weight").orderBy(desc("weight")))
display(results.vertices.select("id","pagerank").orderBy(desc("pagerank")))

id,pagerank
http://purl.obolibrary.org/obo/chebi_29067,296.3211650785135
http://purl.obolibrary.org/obo/chebi_35757,177.89892115112855
http://purl.obolibrary.org/obo/chebi_28868,78.46936258729082
http://purl.obolibrary.org/obo/chebi_28965,41.01514183871766
http://purl.obolibrary.org/obo/chebi_35693,34.893612128036175
http://purl.obolibrary.org/obo/chebi_2580,29.00286098489238
http://purl.obolibrary.org/obo/chebi_37022,24.757164684234837
http://purl.obolibrary.org/obo/chebi_36059,24.47657164280014
http://purl.obolibrary.org/obo/chebi_33558,21.19977105799912
http://purl.obolibrary.org/obo/chebi_76567,21.050803205249707


In [83]:
start_time = time.time()

# Landmark vertex: distances are computed from every vertex to this id.
a = 'http://purl.obolibrary.org/obo/chebi_28868'
# b = 'http://purl.obolibrary.org/obo/chebi_33549'
results = g2.shortestPaths(landmarks=[a])

print(time.time() - start_time, "seconds")

results.select("id", "distances").show()

# Pull the distance to landmark `a` out of the `distances` map column.
# This is derived from `results`; the previous `df = df.withColumn(...)` read
# `df` before any cell defined it, so it only worked with leftover kernel state.
df = results.withColumn('dist', results["distances"].getItem(a))

# Rows with a null in any column are dropped, which includes vertices whose
# `distances` map has no entry for `a` (their `dist` is null).
df.na.drop().show(20, False)
# df.filter(df.testing !="null").show()


56.397923946380615 seconds
+--------------------+--------------------+
|                  id|           distances|
+--------------------+--------------------+
|http://purl.oboli...|               Map()|
|http://purl.oboli...|Map(http://purl.o...|
|http://purl.oboli...|               Map()|
|http://purl.oboli...|Map(http://purl.o...|
|http://purl.oboli...|Map(http://purl.o...|
|http://purl.oboli...|               Map()|
|http://purl.oboli...|               Map()|
|http://purl.oboli...|Map(http://purl.o...|
|http://purl.oboli...|Map(http://purl.o...|
|http://purl.oboli...|Map(http://purl.o...|
|http://purl.oboli...|               Map()|
|http://purl.oboli...|Map(http://purl.o...|
|http://purl.oboli...|               Map()|
|http://purl.oboli...|               Map()|
|http://purl.oboli...|               Map()|
|http://purl.oboli...|Map(http://purl.o...|
|http://purl.oboli...|               Map()|
|http://purl.oboli...|Map(http://purl.o...|
|http://purl.oboli...|Map(http://purl.o...|
|http