In [None]:
# !pip install --user --upgrade pixiedust

In [None]:
# import findspark
# findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

# import os

# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages graphframes:graphframes:0.5.0-spark2.1-s_2.11 pyspark-shell'


# Build (or reuse) the SparkSession against the standalone cluster master.
# NOTE(review): the master URL is hard-coded here; running elsewhere means editing this cell.
# The executor memory is requested at build time via the "spark.executor.memory" setting.
spark = SparkSession.builder \
   .master("spark://128.235.40.174:7077") \
   .appName("sparkPlot3") \
   .config("spark.executor.memory", "4gb") \
   .getOrCreate()

# spark.conf.set("spark.jars.packages", "graphframes:graphframes:0.5.0-spark2.1-s_2.11")
# SparkContext handle; later cells use it for sc.version and sc.emptyRDD().
sc = spark.sparkContext

# SQLContext wrapper; later cells use it for createDataFrame() and read.parquet().
sqlContext=SQLContext(sc)

In [None]:
spark.sparkContext.getConf().getAll()

In [None]:
spark.conf.set("spark.executor.memory", "4gb")

In [None]:
sc.getConf().getAll()

In [3]:
import pixiedust

# Pick the GraphFrames build that matches the running Spark major version.
# NOTE(review): a Spark version that starts with neither '1.6.' nor '2.' installs
# no GraphFrames package at all and this cell still prints "done".
if sc.version.startswith('1.6.'):  # Spark 1.6
    pixiedust.installPackage("graphframes:graphframes:0.5.0-spark1.6-s_2.11")
elif sc.version.startswith('2.'):  # Spark 2.1, 2.0
    pixiedust.installPackage("graphframes:graphframes:0.5.0-spark2.1-s_2.11")


# scala-logging artifacts installed alongside GraphFrames
# (presumably runtime dependencies of GraphFrames - TODO confirm).
pixiedust.installPackage("com.typesafe.scala-logging:scala-logging-api_2.11:2.1.2")
pixiedust.installPackage("com.typesafe.scala-logging:scala-logging-slf4j_2.11:2.1.2")

print("done")


Package already installed: graphframes:graphframes:0.5.0-spark2.1-s_2.11
Package already installed: com.typesafe.scala-logging:scala-logging-api_2.11:2.1.2
Package already installed: com.typesafe.scala-logging:scala-logging-slf4j_2.11:2.1.2
done


In [4]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

# Schema of ONT_CLASS_HIERARCHY.csv: three string columns.
schema = StructType(
    [
        StructField(column_name, StringType())
        for column_name in ("ONT_NAME", "CLASS_IRI", "PARENT_CLASS_IRI")
    ]
)

# Read the class-hierarchy CSV from HDFS with a header row; the reader runs in
# DROPMALFORMED mode.
df_class_hier = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .option("mode", "DROPMALFORMED")
    .csv("hdfs://localhost:9000/ONT_CLASS_HIERARCHY.csv")
)

# Show the row count (before de-duplication) and the schema.
print(df_class_hier.count())
df_class_hier.printSchema()

# De-duplicate the rows and register the result for Spark SQL as "class_hier".
df_class_hier = df_class_hier.distinct()
df_class_hier.createOrReplaceTempView("class_hier")

# iri = "http://purl.obolibrary.org/obo/iao_0000030"
# print(spark.sql("SELECT * from class_hier where CLASS_IRI = '" + iri +"'").collect())
# print(df_class_hier.count())


809619
root
 |-- ONT_NAME: string (nullable = true)
 |-- CLASS_IRI: string (nullable = true)
 |-- PARENT_CLASS_IRI: string (nullable = true)



In [5]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

# Schema of ONT_CLASS_LABELS.csv: three string columns.
schema = StructType(
    [
        StructField(column_name, StringType())
        for column_name in ("ONT_NAME", "CLASS_IRI", "CLASS_LABEL")
    ]
)

# Read the class-label CSV from HDFS with a header row; the reader runs in
# DROPMALFORMED mode.
df_class_labels = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .option("mode", "DROPMALFORMED")
    .csv("hdfs://localhost:9000/ONT_CLASS_LABELS.csv")
)

# Show the row count (before de-duplication) and the schema.
print(df_class_labels.count())
df_class_labels.printSchema()

# De-duplicate the rows and register the result for Spark SQL as "class_labels".
df_class_labels = df_class_labels.distinct()
df_class_labels.createOrReplaceTempView("class_labels")
    

711444
root
 |-- ONT_NAME: string (nullable = true)
 |-- CLASS_IRI: string (nullable = true)
 |-- CLASS_LABEL: string (nullable = true)



In [6]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

# Schema of TAX_AREAS.csv: four string columns plus an integer AREA_LEVEL.
schema = StructType(
    [
        StructField(column_name, column_type)
        for column_name, column_type in (
            ("ONT_NAME", StringType()),
            ("TAX_TYPE", StringType()),
            ("AREA_ID", StringType()),
            ("AREA_NAME", StringType()),
            ("AREA_LEVEL", IntegerType()),
        )
    ]
)

# Read the taxonomy-area CSV from HDFS with a header row; the reader runs in
# DROPMALFORMED mode.
df_tax_areas = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .option("mode", "DROPMALFORMED")
    .csv("hdfs://localhost:9000/TAX_AREAS.csv")
)

# Show the row count (before de-duplication) and the schema.
print(df_tax_areas.count())
df_tax_areas.printSchema()

# De-duplicate the rows and register the result for Spark SQL as "tax_areas".
df_tax_areas = df_tax_areas.distinct()
df_tax_areas.createOrReplaceTempView("tax_areas")

1258
root
 |-- ONT_NAME: string (nullable = true)
 |-- TAX_TYPE: string (nullable = true)
 |-- AREA_ID: string (nullable = true)
 |-- AREA_NAME: string (nullable = true)
 |-- AREA_LEVEL: integer (nullable = true)



In [7]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

# Schema of TAX_AREAS_CONCEPTS.csv: four string columns.
schema = StructType(
    [
        StructField(column_name, StringType())
        for column_name in ("ONT_NAME", "TAX_TYPE", "AREA_ID", "CLASS_IRI")
    ]
)

# Read the area-to-concept CSV from HDFS with a header row; the reader runs in
# DROPMALFORMED mode.
df_tax_areas_concepts = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .option("mode", "DROPMALFORMED")
    .csv("hdfs://localhost:9000/TAX_AREAS_CONCEPTS.csv")
)

# Show the row count (before de-duplication) and the schema.
print(df_tax_areas_concepts.count())
df_tax_areas_concepts.printSchema()

# De-duplicate the rows and register the result for Spark SQL as "tax_areas_concepts".
df_tax_areas_concepts = df_tax_areas_concepts.distinct()
df_tax_areas_concepts.createOrReplaceTempView("tax_areas_concepts")

343737
root
 |-- ONT_NAME: string (nullable = true)
 |-- TAX_TYPE: string (nullable = true)
 |-- AREA_ID: string (nullable = true)
 |-- CLASS_IRI: string (nullable = true)



In [8]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

# Schema of TAX_AREAS_PAREAS.csv: four string columns.
schema = StructType(
    [
        StructField(column_name, StringType())
        for column_name in ("ONT_NAME", "TAX_TYPE", "PAREA_ROOT_IRI", "CLASS_IRI")
    ]
)

# Read the partial-area CSV from HDFS with a header row; the reader runs in
# DROPMALFORMED mode.
df_tax_areas_pareas = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .option("mode", "DROPMALFORMED")
    .csv("hdfs://localhost:9000/TAX_AREAS_PAREAS.csv")
)

# Show the row count (before de-duplication) and the schema.
print(df_tax_areas_pareas.count())
df_tax_areas_pareas.printSchema()

# De-duplicate the rows and register the result for Spark SQL as "tax_areas_pareas".
df_tax_areas_pareas = df_tax_areas_pareas.distinct()
df_tax_areas_pareas.createOrReplaceTempView("tax_areas_pareas")


381471
root
 |-- ONT_NAME: string (nullable = true)
 |-- TAX_TYPE: string (nullable = true)
 |-- PAREA_ROOT_IRI: string (nullable = true)
 |-- CLASS_IRI: string (nullable = true)



In [None]:
# Scratch cell: compares a DataFrame-API join with the equivalent Spark SQL join.
# An empty, column-less DataFrame is created first and then immediately replaced
# by the SQL result on the next line.
schema = StructType([])
iri_class_labels = sqlContext.createDataFrame(sc.emptyRDD(), schema)
iri_class_labels = spark.sql("SELECT * from class_labels where ONT_NAME = 'chembio.owl'")
# iri_class_labels.describe()

# Ontologies whose label table mentions the given IRI.
sql = "SELECT distinct ONT_NAME from class_labels where CLASS_IRI = 'http://chem2bio2rdf.org/chem2bio2rdf.owl#ubiquitination'"
df = spark.sql(sql)
df.show()
# Registered under the view name "people" so the SQL join below can refer to it.
df.createOrReplaceTempView("people")

# DataFrame-API variant: keep only the parea rows of those ontologies.
df_tax_areas_pareas.join(df, 'ONT_NAME').show()

# SQL variant of the same kind of join, against class_labels.
sql2 = "SELECT * FROM class_labels INNER JOIN people ON class_labels.ONT_NAME=people.ONT_NAME"
dff = spark.sql(sql2)
dff.show()


In [None]:
spark.stop()

In [None]:
df1=df_tax_areas_pareas.select('ONT_NAME').limit(5)
df1.show()
df2=df_tax_areas_pareas.select('ONT_NAME').limit(3)
df2.show()
df2.subtract(df1).show()
print( df2.subtract(df1).count()==0)

a = set()
if not a:
    print('empty set')
else:
    print('not empty')

In [61]:
spark.stop()

sc.stop()
# conf = SparkConf().setAppName("testing").setMaster("spark://128.235.40.174:7077")
# sc = SparkContext(conf=conf)

In [None]:
# Initialise the five per-IRI working tables as empty, column-less DataFrames.
# IRIRelatedTables.initializeTables() replaces them on first use.
# NOTE(review): in file order this cell follows cells that call spark.stop()/sc.stop();
# it needs a live session, so it relies on a different execution order.
schema = StructType([])
iri_class_labels = sqlContext.createDataFrame(sc.emptyRDD(), schema)
iri_class_hier = sqlContext.createDataFrame(sc.emptyRDD(), schema)
iri_tax_areas = sqlContext.createDataFrame(sc.emptyRDD(), schema)
iri_tax_areas_concepts = sqlContext.createDataFrame(sc.emptyRDD(), schema)
iri_tax_areas_pareas = sqlContext.createDataFrame(sc.emptyRDD(), schema)

# Ontology names already loaded into the tables above; empty means "nothing loaded yet".
global_ontList=sqlContext.createDataFrame(sc.emptyRDD(), schema)


class IRIRelatedTables:
    """Extend the module-level ``iri_*`` working tables for one more IRI.

    Constructing an instance looks up which ontologies (``ONT_NAME`` values of
    the ``class_labels`` view) contain ``iri`` and then makes sure the rows of
    those ontologies are present in the five module-level DataFrames
    ``iri_class_labels``, ``iri_class_hier``, ``iri_tax_areas``,
    ``iri_tax_areas_concepts`` and ``iri_tax_areas_pareas``.  The ontologies
    loaded so far are tracked in the module-level ``global_ontList``.
    """

    def __init__(self, iri):
        """Store ``iri``, resolve its ontologies and load or extend the tables."""
        self.iri = iri
        self.local_ontList = self.getOntNames()
        self.initializeTables()

    def initializeTables(self):
        """Populate the working tables on first use, otherwise add only new ontologies.

        Rebinds the module-level table variables and ``global_ontList``.
        """
        global iri_class_labels, iri_class_hier, iri_tax_areas, \
            iri_tax_areas_concepts, iri_tax_areas_pareas, global_ontList

        if global_ontList.count() == 0:
            # Nothing loaded yet: build all five tables from this IRI's ontologies.
            print('init')
            iri_class_labels = self.initializeTable(df_class_labels)
            iri_class_hier = self.initializeTable(df_class_hier)
            iri_tax_areas = self.initializeTable(df_tax_areas)
            iri_tax_areas_concepts = self.initializeTable(df_tax_areas_concepts)
            iri_tax_areas_pareas = self.initializeTable(df_tax_areas_pareas)
            global_ontList = self.local_ontList
        else:
            print('update')
            # Ontologies of this IRI that have not been loaded yet.
            ontNames = self.local_ontList.subtract(global_ontList)
            if ontNames.count() != 0:
                print('inside update')
                iri_class_labels = self.updateTable(iri_class_labels, df_class_labels, ontNames)
                iri_class_hier = self.updateTable(iri_class_hier, df_class_hier, ontNames)
                iri_tax_areas = self.updateTable(iri_tax_areas, df_tax_areas, ontNames)
                iri_tax_areas_concepts = self.updateTable(iri_tax_areas_concepts, df_tax_areas_concepts, ontNames)
                iri_tax_areas_pareas = self.updateTable(iri_tax_areas_pareas, df_tax_areas_pareas, ontNames)
                # Record only the names that were actually new, so the list holds
                # each ontology once and its query plan does not grow on calls
                # that add nothing.
                global_ontList = global_ontList.union(ontNames)

    def initializeTable(self, df):
        """Return the distinct rows of ``df`` that belong to this IRI's ontologies."""
        try:
            ontNames = self.getOntNames()
            return df.join(ontNames, 'ONT_NAME').distinct()
        except Exception:
            print('iri has no corresponding ontology found!')
            raise

    def updateTable(self, df, dff, ontNames):
        """Return ``df`` extended with the distinct rows of ``dff`` for ``ontNames``.

        ``df`` is the current working table, ``dff`` the full source table and
        ``ontNames`` a DataFrame with an ``ONT_NAME`` column.
        """
        try:
            dff = dff.join(ontNames, 'ONT_NAME').distinct()
            return df.union(dff)
        except Exception:
            print('iri has no corresponding ontology found!')
            raise

    def getOntNames(self):
        """Return a DataFrame of the distinct ``ONT_NAME`` values containing ``self.iri``.

        The IRI is compared as a column value rather than spliced into a SQL
        string, so IRIs containing quotes or backslashes are matched literally.
        """
        labels = spark.table("class_labels")
        return labels.filter(labels.CLASS_IRI == self.iri).select("ONT_NAME").distinct()

iri= 'http://purl.obolibrary.org/obo/iao_0000030'
test = IRIRelatedTables(iri)
# print(iri_class_labels.count())
# print(iri_class_hier.count())
print(global_ontList)

iri2 = 'http://chem2bio2rdf.org/chem2bio2rdf.owl#bioassay'
test2 = IRIRelatedTables(iri2)
print(global_ontList)

def getChildrenFast(iri):
    """Return the set of CLASS_IRI values whose PARENT_CLASS_IRI equals ``iri``.

    The lookup runs against the module-level ``iri_class_hier`` DataFrame and
    collects the matching rows to the driver.
    """
    has_parent = iri_class_hier.PARENT_CLASS_IRI == iri
    child_rows = iri_class_hier.filter(has_parent).distinct().collect()
    return {child_row['CLASS_IRI'] for child_row in child_rows}

print(getChildrenFast(iri))
print(getChildrenFast(iri2))

In [None]:
def getAllChildren(iri, visited=None):
    """Collect every descendant of ``iri`` by recursive depth-first search.

    Args:
        iri: IRI whose descendants are wanted.
        visited: set of IRIs already expanded; created on the first call and
            shared (mutated) across the recursion.

    Returns:
        (result, pair): ``result`` is the list of descendant IRIs, each listed
        once; ``pair`` is the list of ``(child_iri, parent_iri)`` edges found.
    """
    print("search children for ", iri)
    if visited is None:
        visited = set()
    visited.add(iri)
    result = []
    pair = []
    for child_iri in getChildrenFast(iri) - visited:
        print("get child: ", child_iri)
        if not child_iri:
            continue
        pair.append((child_iri, iri))
        # The candidate set above was computed before the loop started.  A child
        # can meanwhile have been expanded while recursing into an earlier
        # sibling; keep the edge, but do not list or query that vertex again.
        if child_iri in visited:
            continue
        result.append(child_iri)
        result1, pair1 = getAllChildren(child_iri, visited)
        result += result1
        pair += pair1

    return result, pair

iri= 'http://purl.obolibrary.org/obo/chebi_62943'
test = IRIRelatedTables(iri)
c_vertices, c_edges = getAllChildren(iri)


In [None]:
# iri = 'http://chem2bio2rdf.org/chem2bio2rdf.owl#bioassay'
iri= 'http://purl.obolibrary.org/obo/iao_0000030'
df_hier = df_class_hier.rdd.map(lambda x : (x[1], x[2], x[0])).distinct().toDF(['src', 'dst', 'ont'])

df_label = df_class_labels.rdd.map(lambda x: (x[1], x[2], x[0])).distinct().toDF(['id', 'label','ont'])
# df_hier.reduceByKey(lambda x, y: x +y ).take(2)

In [None]:
print(df_hier.count())
print(df_hier.distinct().count())
print(df_label.count())
print(df_label.distinct().count())

In [None]:
df_hier.printSchema()
df_label.printSchema()
df_hier.show()

In [9]:
# df_class_labels
# df_class_hier
# df_tax_areas
# df_tax_areas_concepts
# df_tax_areas_pareas

# return unioned df_union

# Edge table for the graph: (CLASS_IRI, PARENT_CLASS_IRI, ONT_NAME) re-ordered
# positionally and renamed to src / dst / ont.
df_hier = df_class_hier.rdd.map(lambda x : (x[1], x[2], x[0])).distinct().toDF(['src', 'dst', 'ont'])

# find which ont it belongs to

# Step 1: attach the taxonomy area of each labelled class
# (inner join on class IRI and ontology).
result = df_class_labels.join(df_tax_areas_concepts, ['CLASS_IRI', 'ONT_NAME'])
# result.show(20 ,False)
result.printSchema()

# Step 2: attach the area name and level for that area.
result = result.join(df_tax_areas, ['ONT_NAME', 'TAX_TYPE', 'AREA_ID'])
# result.show(20 ,False)
result.printSchema()


# Step 3: attach the partial-area root the class belongs to.
result = result.join(df_tax_areas_pareas, ['CLASS_IRI', 'ONT_NAME', 'TAX_TYPE'])
# result.show(20 ,False)
result.printSchema()



# Vertex table for the graph.  The new names are assigned by position, so they
# depend on the column order produced by the last join (shown by the final
# printSchema above): CLASS_IRI, ONT_NAME, TAX_TYPE, AREA_ID, CLASS_LABEL,
# AREA_NAME, AREA_LEVEL, PAREA_ROOT_IRI.
df_union = result.rdd.map(lambda x: (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7])).distinct().toDF(['id', 'ont','type', 'area_id','label', 'area_name', 'area_level', 'parea_root_id'])

root
 |-- CLASS_IRI: string (nullable = true)
 |-- ONT_NAME: string (nullable = true)
 |-- CLASS_LABEL: string (nullable = true)
 |-- TAX_TYPE: string (nullable = true)
 |-- AREA_ID: string (nullable = true)

root
 |-- ONT_NAME: string (nullable = true)
 |-- TAX_TYPE: string (nullable = true)
 |-- AREA_ID: string (nullable = true)
 |-- CLASS_IRI: string (nullable = true)
 |-- CLASS_LABEL: string (nullable = true)
 |-- AREA_NAME: string (nullable = true)
 |-- AREA_LEVEL: integer (nullable = true)

root
 |-- CLASS_IRI: string (nullable = true)
 |-- ONT_NAME: string (nullable = true)
 |-- TAX_TYPE: string (nullable = true)
 |-- AREA_ID: string (nullable = true)
 |-- CLASS_LABEL: string (nullable = true)
 |-- AREA_NAME: string (nullable = true)
 |-- AREA_LEVEL: integer (nullable = true)
 |-- PAREA_ROOT_IRI: string (nullable = true)



In [None]:
import os, errno

def silentRemove(filename):
    """Delete ``filename`` if it exists; a missing path is silently ignored.

    Works for plain files and for directory trees (the parquet outputs handled
    in this notebook are directories, which ``os.remove`` alone cannot delete).

    Raises:
        OSError: for any failure other than "no such file or directory".
    """
    import shutil  # local import so the helper does not depend on a later cell line

    try:
        print("try to remove")
        if os.path.isdir(filename) and not os.path.islink(filename):
            shutil.rmtree(filename)
        else:
            os.remove(filename)
        print('remove done')
    except OSError as e: # this would be "except OSError, e:" before Python 2.6
        if e.errno != errno.ENOENT: # errno.ENOENT = no such file or directory
            raise # re-raise exception if a different error occurred

# Remove previously written parquet output directories, if they exist.
import shutil
for stale_dir in ("./data/df_label.parquet", "./data/df_hier.parquet"):
    # shutil.rmtree raises when the directory is missing (e.g. on a fresh
    # checkout), so only delete what is actually there.
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)
# write parquet

In [None]:
# Persist the vertex and edge tables.  "overwrite" replaces an existing output
# directory; the default save mode raises when the path already exists, which
# made this cell fail on every re-run.
# NOTE(review): do not run this while df_hier/df_union are themselves DataFrames
# read from these same parquet paths.
df_union.write.mode("overwrite").parquet("./data/df_union.parquet")
df_hier.write.mode("overwrite").parquet("./data/df_hier.parquet")

In [2]:
# read union parquet
df_union = sqlContext.read.parquet("./data/df_union.parquet")
pixiedust.display(df_union)

# read hier parquet
df_hier = sqlContext.read.parquet("./data/df_hier.parquet")
pixiedust.display(df_hier)


src,dst,ont
http://bioontology.org/projects/ontologies/birnlex#birnlex_7346,http://bioontology.org/projects/ontologies/birnlex#birnlex_471,birnlex.owl
http://purl.obolibrary.org/obo/chebi_103512,http://purl.obolibrary.org/obo/chebi_24995,chebi.obo
http://purl.obolibrary.org/obo/doid_13450,http://purl.obolibrary.org/obo/doid_0050292,doid.obo
http://purl.obolibrary.org/obo/chebi_16162,http://purl.obolibrary.org/obo/chebi_36084,chebi.obo
http://purl.obolibrary.org/obo/dron_00745389,http://purl.obolibrary.org/obo/dron_00000027,dron.owl
http://purl.obolibrary.org/obo/dron_00011383,http://purl.obolibrary.org/obo/obi_0000047,dron.owl
http://purl.obolibrary.org/obo/caro_0000068,http://purl.obolibrary.org/obo/caro_0000073,aeo.obo
http://purl.obolibrary.org/obo/chebi_70769,http://purl.obolibrary.org/obo/chebi_84194,chebi.obo
http://purl.obolibrary.org/obo/dron_00508557,http://purl.obolibrary.org/obo/dron_00000027,dron.owl
http://purl.obolibrary.org/obo/dron_00129981,http://purl.obolibrary.org/obo/dron_00000027,dron.owl


In [3]:
# remove duplicated label
import re
def isIRIEqualLabel(iri, label):
    """Return True when ``label`` equals the text after the last '/' of ``iri``
    with every '_' replaced by a space."""
    last_segment = iri.rsplit('/', 1)[-1]
    spaced_segment = last_segment.replace('_', ' ')
    return label == spaced_segment


# df_class_labels
# df_class_hier
# df_tax_areas
# df_tax_areas_concepts
# df_tax_areas_pareas

# work with df_union

label = 'information content entity'
iri= 'http://purl.obolibrary.org/obo/iao_0000030'

# get iri
def getIRI(label):
    """Return the ids in ``df_union`` whose label equals ``label``.

    Rows for which ``isIRIEqualLabel(id, label)`` is true are left out.
    """
    labelled = df_union.filter(df_union.label==label)
    candidates = labelled.select('id','label').distinct().collect()
    return [
        candidate["id"]
        for candidate in candidates
        if not isIRIEqualLabel(candidate["id"], candidate['label'])
    ]

print(getIRI(label))

# get label
def getLabel(iri):
    """Return the labels recorded in ``df_union`` for the id ``iri``.

    Rows for which ``isIRIEqualLabel(id, label)`` is true are left out.
    """
    for_iri = df_union.filter(df_union.id==iri)
    candidates = for_iri.select('id','label').distinct().collect()
    return [
        candidate["label"]
        for candidate in candidates
        if not isIRIEqualLabel(candidate["id"], candidate['label'])
    ]

print(getLabel(iri))

# get ont
def getOnts(iri):
    """Return the distinct ``ont`` values of the ``df_union`` rows whose id is ``iri``."""
    ont_rows = df_union.filter(df_union.id==iri).select('ont').distinct().collect()
    return [ont_row["ont"] for ont_row in ont_rows]

print(getOnts(iri))

# get area info DF
def getAreaInfoDF(iri):
    """Return the distinct ``df_union`` rows whose id equals ``iri`` (as a DataFrame)."""
    rows_for_iri = df_union.filter(df_union.id==iri)
    return rows_for_iri.distinct()

# get area id from areaInfoDF
def getAreaDetail(areaInfoDF, ont, tax_type):
    """Return the first row of ``areaInfoDF`` matching ``ont`` and ``tax_type``.

    The result is whatever ``.first()`` yields on the filtered frame.
    """
    same_ont = areaInfoDF.ont==ont
    same_type = areaInfoDF.type== tax_type
    return areaInfoDF.filter(same_ont & same_type).first()

def getAreaID(areaDetailDF):
    """Return ``areaDetailDF['area_id']``, or 'Not applicable' when the argument is falsy.

    Despite the name, callers pass the single row returned by ``getAreaDetail``.
    """
    return areaDetailDF['area_id'] if areaDetailDF else 'Not applicable'

def getAreaName(areaDetailDF):
    """Return ``areaDetailDF['area_name']``, or 'Not applicable' when the argument is falsy."""
    if not areaDetailDF:
        return 'Not applicable'
    return areaDetailDF['area_name']

def getAreaLevel(areaDetailDF):
    """Return ``areaDetailDF['area_level']``, or 'Not applicable' when the argument is falsy."""
    no_detail = not areaDetailDF
    if no_detail:
        return 'Not applicable'
    return areaDetailDF['area_level']
    
areaInfoDF = getAreaInfoDF(iri)
areaDetailDF=getAreaDetail(areaInfoDF, 'aero.owl', 'op_restriction')
print(getAreaID(areaDetailDF))
print(getAreaName(areaDetailDF))
print(getAreaLevel(areaDetailDF))


# get area name

# get tax type

# get parea root id

# get area level






['http://purl.obolibrary.org/obo/iao_0000030']
['information content entity']
['chmo.owl', 'cno_acronym.owl', 'aero.owl', 'ddo.owl', 'cheminf.owl', 'ccont.owl', 'cdao.owl', 'dideo.owl', 'bco.owl', 'cogpo.owl', 'apollo-sv.owl']
[empty set]
[empty set]
0


In [4]:
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import Button, Layout

# Module-level UI state: the ontology options for the dropdown and the IRI
# currently being inspected (set by the search-button handler).
ontList =[]
iri =None

# Toggle that selects how the search-box text is interpreted:
# as an IRI, as a label, or "Other" (which the click handler treats as no result).

searchType=widgets.ToggleButtons(
    options=['IRI', 'Label', 'Other'],
    description='Search by:',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltips=['Type in IRI in format of URL', 'Type in concept name', 'Feel Lucky'],
#     icons=['check'] * 3
)
display(searchType)

# Free-text search box; its initial value is a prompt string, not an empty box.
searchbox = widgets.Text(value='Search concept here', disabled=False)
display(searchbox)

# Search button; its click handler is attached after the handler is defined.
searchButton = Button(description='Search')
searchButton.style.button_color = 'lightgreen'
display(searchButton)

def on_searchButton_clicked(b):
    """Click handler for the search button.

    Hides both dropdowns, resolves the search text to a single IRI according to
    the selected search type, stores it in the module-level ``iri`` and, when
    that IRI occurs in at least one ontology, fills and shows the dropdowns.

    Args:
        b: the button instance passed by ipywidgets (unused).
    """
    global iri
    ont_dropdown.layout.visibility = 'hidden'
    type_dropdown.layout.visibility = 'hidden'
    if not searchbox.value:
        return

    text = searchbox.value
    if searchType.value == "IRI":
        iri = text
        ontList = getOnts(text)
    elif searchType.value == "Label":
        matches = getIRI(text)
        # getIRI returns a list.  The dropdown handlers compare ``iri`` against
        # a single id, so keep one IRI string (the first match) rather than the
        # list, and treat "no match" as an empty result instead of indexing
        # into an empty list.
        iri = matches[0] if matches else None
        ontList = getOnts(iri) if iri is not None else []
    else:
        ontList = []

    if ontList:
        ont_dropdown.options = ontList
        ont_dropdown.layout.visibility = 'visible'
        type_dropdown.layout.visibility = 'visible'
    
# Run the search handler whenever the button is clicked.
searchButton.on_click(on_searchButton_clicked)

# Read-only output fields that the dropdown handlers fill in.

# 'initial' description width so the long descriptions are not cut off
# (presumably - TODO confirm against the ipywidgets styling docs).
style = {'description_width': 'initial'}
areaIDLabel = widgets.Textarea(
    value='',
    placeholder='Area ID',
    description='The Area ID is: ',
    layout=Layout(width='70%'),
    disabled=True,
    style=style
)

areaNameLabel =  widgets.Textarea(
    value='',
    placeholder='Area Name',
    description='The Area Name is: ',
    layout=Layout(width='70%'),
    disabled=True,
    style=style
)
areaLevelLabel =  widgets.Text(
    value='',
    placeholder='Area Level',
    description='The Area Level is: ',
    disabled=True,
    style=style
)
display(areaIDLabel)
display(areaNameLabel)
display(areaLevelLabel)
# iri= 'http://purl.obolibrary.org/obo/iao_0000030'
# print(getOnts(iri)[0])
# Ontology selector; starts hidden with the (initially empty) module-level
# ontList as options, and is populated and shown by the search handler.
ont_dropdown = widgets.Dropdown(
    options = ontList,
    value = None,
    description='Ontology:',

)
ont_dropdown.layout.visibility = 'hidden'
display(ont_dropdown)

def on_ont_dropdonw_change(change):
    """Observer for the ontology dropdown.

    Looks up the area row for the module-level ``iri`` in the newly selected
    ontology (``change['new']``) and the current tax type, then writes its id,
    name and level into the three output fields.
    """
    iri_rows = getAreaInfoDF(iri)
    detail_row = getAreaDetail(iri_rows, change['new'], type_dropdown.value)
    # ", " separated values are shown one per line in the text areas.
    areaIDLabel.value = getAreaID(detail_row).replace(", ", "\n")
    areaNameLabel.value = getAreaName(detail_row).replace(", ", "\n")
    areaLevelLabel.value = str(getAreaLevel(detail_row))
    
# Refresh the output fields whenever the selected ontology changes.
ont_dropdown.observe(on_ont_dropdonw_change, names= 'value')



# Tax-type selector; starts hidden and is shown by the search handler together
# with the ontology dropdown.
type_dropdown=widgets.Dropdown(
    options=['op_restriction', 'op_domain', 'dp_restriction', 'dp_domain'],
    value='op_restriction',
    description='Tax Type:',
    disabled= False
)
type_dropdown.layout.visibility = 'hidden'
display(type_dropdown)

def on_type_dropdown_change(change):
    """Observer for the tax-type dropdown.

    Looks up the area row for the module-level ``iri`` in the currently
    selected ontology and the newly selected tax type (``change['new']``),
    then writes its id, name and level into the three output fields.
    """
    area_rows = getAreaInfoDF(iri)
    selected_row = getAreaDetail(area_rows, ont_dropdown.value, change['new'])
    area_id_text = getAreaID(selected_row).replace(", ", "\n")
    areaIDLabel.value = area_id_text
    area_name_text = getAreaName(selected_row).replace(", ", "\n")
    areaNameLabel.value = area_name_text
    area_level_text = str(getAreaLevel(selected_row))
    areaLevelLabel.value = area_level_text
    
type_dropdown.observe(on_type_dropdown_change, names= 'value')
    
# print(ont_dropdown.value)
# print(type_dropdown.value)

In [5]:

from graphframes import *
g = GraphFrame(df_union, df_hier)
g.inDegrees.show(20, False)
# g = GraphFrame(vertex, edges)

+--------------------+--------+
|                  id|inDegree|
+--------------------+--------+
|http://purl.oboli...|      10|
|http://purl.oboli...|      56|
|http://purl.oboli...|       1|
|http://purl.oboli...|      19|
|http://purl.oboli...|       4|
|http://purl.oboli...|       4|
|http://purl.oboli...|       7|
|http://purl.oboli...|       4|
|http://purl.oboli...|       1|
|http://purl.oboli...|       1|
|http://purl.oboli...|       3|
|http://purl.oboli...|       3|
|http://purl.oboli...|       1|
|http://purl.oboli...|       3|
|http://www.ifomis...|      25|
|http://purl.oboli...|      16|
|http://purl.oboli...|      59|
|http://ncicb.nci....|      15|
|http://edamontolo...|      14|
|http://purl.oboli...|       3|
+--------------------+--------+
only showing top 20 rows



In [6]:
# Vertices of one partial area: fixed area name and partial-area root.
v2 = g.vertices.filter("area_name = 'has part, is conjugate base of' and parea_root_id = 'http://purl.obolibrary.org/obo/chebi_29067'")
# Edges are filtered by ontology only, not by whether their endpoints are in v2,
# so g2 can hold edges whose endpoints are not among its vertices.
e2 = g.edges.filter("ont = 'chebi.obo'")
g2 = GraphFrame(v2, e2)
# Sizes of the subgraph's vertex and edge tables.
print(g2.vertices.count())
print(g2.edges.count())
# pixiedust.display(g2)

1692
181470


In [None]:
g2.vertices.select('id').show(10)
g2.vertices.show(10)
# print(g2.vertices.select('id').rdd.map(lambda x: x.id).collect())
# print(g2.vertices.select('label').rdd.map(lambda x: x.label).collect())
print(g2.vertices.rdd.map(lambda x: x.label).collect())

In [None]:
# print(g2.vertices.dtypes)

vertexDF=g2.vertices.toPandas()
# display(vertexDF)
# from pyspark.sql.functions import explode
# pixiedust.display(g2.vertices)
# g2.vertices.select('*').show(20)
# g2.vertices.select('id', explode('label')).show(20)

# i=0
# for vertex in vertexDF.iterrows():
#     i += 1
#     print(vertex[1])
#     if i==10:
#         break
# Print the first 10 vertex tuples of the pandas copy; `i` is a manual row counter.
i = 0        
for vertex in vertexDF.itertuples():
    i += 1
    print(vertex)
#     print(vertex.id, vertex.label)
    if i==10:
        break       
# Same preview for the edges; the whole edge table is first converted to pandas.
i=0
edgesDF=g2.edges.toPandas()
for edge in edgesDF.itertuples():
    i += 1
    print(edge.src, edge.dst, edge.ont)
    if i==10:
        break

In [51]:
import igraph as ig
# NOTE(review): this rebinds `g`, which earlier cells use for the GraphFrame, to
# an igraph graph; cells that call g.bfs / g.find / g.vertices need the
# GraphFrame binding instead.
g = ig.Graph()

# Collect the vertex rows once so that the name, label and area_level of each
# igraph vertex come from the same row.  Collecting each column with a separate
# Spark job gives no guarantee that the three lists share one row order.
vertex_rows = g2.vertices.select('id', 'label', 'area_level').collect()
g.add_vertices([vertex_row['id'] for vertex_row in vertex_rows])
g.vs["label"] = [vertex_row['label'] for vertex_row in vertex_rows]
g.vs["area_level"] = [vertex_row['area_level'] for vertex_row in vertex_rows]

# Quick sanity check of what was imported.
N=g.vcount()
print('total number of vertices imported: ' , N)
print(g.vs[0])
print(g.vs[0]['label'])
print(g.vs[1])
print(g.vs[1]['area_level'])

total number of vertices imported:  1692
igraph.Vertex(<igraph.Graph object at 0x7fbb7c2c1408>, 0, {'label': 'amoxicilloate', 'area_level': 2, 'name': 'http://purl.obolibrary.org/obo/chebi_133943'})
amoxicilloate
igraph.Vertex(<igraph.Graph object at 0x7fbb7c2c1408>, 1, {'label': '5-oxo-d-prolinate', 'area_level': 2, 'name': 'http://purl.obolibrary.org/obo/chebi_57948'})
2


In [52]:
iri = 'http://purl.obolibrary.org/obo/chebi_103512' #False
# iri = 'http://purl.obolibrary.org/obo/chebi_29067' #True
# print(g.vs['name'])
# print(iri in g.vs['name'])

def checkInVertices(edgeEnd):
    """Return True when ``edgeEnd`` is the name of a vertex of the igraph graph ``g``."""
    vertex_names = g.vs['name']
    return edgeEnd in vertex_names
    

def edgesPair(x):
    """Return True only when both ``x.src`` and ``x.dst`` pass ``checkInVertices``."""
    both_ends_known = checkInVertices(x.src) and checkInVertices(x.dst)
    return bool(both_ends_known)

# g2.edges.rdd.filter(lambda x: edgesPair(x)==True).count()
# g2.edges.rdd.filter(lambda x: edgesPair(x)==True).map(lambda x: (x.src,x.dst)).collect()
# print(g2.edges.rdd.filter(lambda x: edgesPair(x)==True).map(lambda x: (x.src,x.dst)).count())
# Keep only the edges whose two endpoints are igraph vertices, and collect
# (src, dst, ont) together.  The "ont" attribute was previously collected from
# the unfiltered edge table, so its values did not correspond to the edges
# actually added.
kept_edges = g2.edges.rdd.filter(lambda x: edgesPair(x)==True).map(lambda x: (x.src, x.dst, x.ont)).collect()
g.add_edges([(edge_src, edge_dst) for edge_src, edge_dst, _ in kept_edges])
g.es["ont"] = [edge_ont for _, _, edge_ont in kept_edges]
E=g.ecount()
print('total number of edges imported: ' , E)
print(g.es[0].tuple)
# print(g.es[0]['label'])
print(g.es[1].tuple)

total number of edges imported:  2155
(1553, 1570)
(649, 1634)


In [48]:
print(g.es[101]['ont'])
# g2.edges.rdd.filter(lambda x: x.src in g.vs['name'] and x.dst in g.vs['name']).collect()
# g2.edges.rdd.filter(lambda x: x.src in g.vs['name'] and x.dst in g.vs['name']).count()
# g2.edges.rdd.map(lambda x: x.dst).collect()

chebi.obo


In [None]:
# Add a root vertex plus a previously computed vertex list to the igraph graph.
# NOTE(review): p_vertices and p_edges are not defined in any visible cell (the
# getAllChildren cell produces c_vertices / c_edges) - confirm which names are meant.
g.add_vertex(name = 'http://purl.obolibrary.org/obo/chebi_62943')
g.add_vertices(p_vertices)

# for vertex in vertices:
#     g.add_vertex(name=vertex)

N=g.vcount()
print('total number of vertices imported: ' , N)
print(p_edges)

# Edges are added after the vertices they refer to.
g.add_edges(p_edges)

# The printed value is the graph's total edge count after the addition.
L= g.ecount()
print('added # of edges: ', L)

In [None]:
# Scratch lookups for a single IRI.
iri = 'http://purl.obolibrary.org/obo/chebi_62943'

# Rows of `result` for the IRI.
# NOTE(review): `result` is rebound by several cells; the CLASS_IRI column used
# here exists in the joined table and in the later IRI/label selection.
iri_res = result.filter(result.CLASS_IRI==iri)

iri_res.show()

# NOTE(review): the only df_label definition visible in this notebook has the
# columns id / label / ont, so the 'area_id', 'parea_root_id' and index-5
# lookups below appear to expect a frame shaped like df_union - confirm.
iri_res2 = df_label.filter(df_label.id==iri)

iri_res2.show()

# Each collect() below re-runs the query; [0] fails if no row matched.
aid = iri_res2.collect()[0]['area_id']
print(aid)

pid = iri_res2.collect()[0]['parea_root_id']
print(pid)

pid2 = iri_res2.take(1)
print(pid2[0][5])
# iri= 'http://purl.obolibrary.org/obo/iao_0000030'
# http://chem2bio2rdf.org/chem2bio2rdf.owl#bioassay
# http://purl.obolibrary.org/obo/chebi_62943
# http://purl.obolibrary.org/obo/chebi_133771
# http://www.w3.org/2002/07/owl#thing

# paths = g.bfs(fromExpr="id = 'http://purl.obolibrary.org/obo/chebi_133771'", toExpr ="id = 'http://purl.obolibrary.org/obo/chebi_62943'", edgeFilter = "ont = 'chebi.obo'")
# paths.show(10, False)

# # Specify edge filters or max path lengths.
# g.bfs("name = 'Esther'", "age < 32",\
#   edgeFilter="relationship != 'friend'", maxPathLength=3)

In [None]:
# iri= 'http://purl.obolibrary.org/obo/iao_0000030'
# http://chem2bio2rdf.org/chem2bio2rdf.owl#bioassay
# http://purl.obolibrary.org/obo/chebi_62943
# http://purl.obolibrary.org/obo/chebi_133771
# http://www.w3.org/2002/07/owl#thing
paths = g.bfs(fromExpr="id = 'http://purl.obolibrary.org/obo/chebi_62943'", toExpr ="area_name = 'has part, is conjugate base of'", edgeFilter = "ont = 'chebi.obo'")
# paths.show(10, False)
display(paths)

# # Specify edge filters or max path lengths.
# g.bfs("name = 'Esther'", "age < 32",\
#   edgeFilter="relationship != 'friend'", maxPathLength=3)

In [None]:
paths = g.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[cd]->(d)")\
  .filter("b.area_level = 2 and c.ont = 'chebi.obo' and d.area_level >= a.area_level ")\
  .filter("a.id = 'http://purl.obolibrary.org/obo/chebi_62943'")
display(paths)

In [None]:
# iri= 'http://purl.obolibrary.org/obo/iao_0000030'
# http://chem2bio2rdf.org/chem2bio2rdf.owl#bioassay
# http://purl.obolibrary.org/obo/chebi_62943
# http://purl.obolibrary.org/obo/chebi_133771
# http://www.w3.org/2002/07/owl#thing
paths = g.bfs(fromExpr="id = 'http://purl.obolibrary.org/obo/chebi_62943'",\
              toExpr =" id  != 'http://purl.obolibrary.org/obo/chebi_62943'",\
              edgeFilter = "ont = 'chebi.obo'",\
              maxPathLength=5)
paths.show(10, False)

# # Specify edge filters or max path lengths.
# g.bfs("name = 'Esther'", "age < 32",\
#   edgeFilter="relationship != 'friend'", maxPathLength=3)

In [None]:
paths = g.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[cd]->(d)")\
  .filter("ab.ont = 'chebi.obo' and bc.ont = 'chebi.obo' and cd.ont = 'chebi.obo' ")\
  .filter("b.id = 'http://purl.obolibrary.org/obo/chebi_62943'")
display(paths)

In [None]:
# Three-hop motif a -> b -> c -> d starting at a fixed vertex; only the first
# hop (e) is restricted to the chebi.obo ontology.
# NOTE(review): requires `g` to be the GraphFrame, not the igraph graph that a
# different cell binds to the same name.
paths = g.find("(a)-[e]->(b); (b)-[e1]->(c); (c)-[e2]->(d)")\
  .filter("e.ont = 'chebi.obo'")\
  .filter("a.id = 'http://purl.obolibrary.org/obo/chebi_133771'")
# "paths" contains vertex info. Extract the edges.
# NOTE(review): only the first (e) and third (e2) hops are unioned here; the
# middle hop (e1) is not part of the extracted edge set - confirm this is intended.
e2 = paths.select("e.src", "e.dst", "e.ont").union(paths.select("e2.src", "e2.dst", "e2.ont")).distinct()
# In Spark 1.5+, the user may simplify this call:
#  val e2 = paths.select("e.*")
v2 = g.vertices.filter("ont = 'chebi.obo'")
# Construct the subgraph
g2 = GraphFrame(v2, e2)
g2.vertices.show()
g2.edges.show(20, False)
# Sizes of the subgraph and, for comparison, the vertex count of the full graph.
print(g2.edges.count())
print(g2.vertices.count())
print(g.vertices.count())

In [None]:
from pyspark.sql.functions import *
degrees = g.degrees.sort(desc("degree"))
degrees.show()

In [None]:
# Display the vertex and edge DataFrames
g.vertices.show()
# +--+-------+---+
# |id|   name|age|
# +--+-------+---+
# | a|  Alice| 34|
# | b|    Bob| 36|
# | c|Charlie| 30|
# | d|  David| 29|
# | e| Esther| 32|
# | f|  Fanny| 36|
# | g|  Gabby| 60|
# +--+-------+---+

g.edges.show()
# +---+---+------------+
# |src|dst|relationship|
# +---+---+------------+
# |  a|  b|      friend|
# |  b|  c|      follow|
# |  c|  b|      follow|
# |  f|  c|      follow|
# |  e|  f|      follow|
# |  e|  d|      friend|
# |  d|  a|      friend|
# |  a|  e|      friend|
# +---+---+------------+

# Get a DataFrame with columns "id" and "inDegree" (in-degree)
vertexInDegrees = g.inDegrees

# Find the youngest user's age in the graph.
# This queries the vertex DataFrame.
# g.vertices.groupBy().min("age").show()

# Count the number of "follows" in the graph.
# This queries the edge DataFrame.
# numFollows = g.edges.filter("relationship = 'follow'").count()

In [None]:
iri= 'http://purl.obolibrary.org/obo/chebi_133771'
iri2 = 'http://chem2bio2rdf.org/chem2bio2rdf.owl#bioassay'
df_hier.filter(df_hier.src == iri).show()
df_label.filter(df_label.id == iri2).show()
# def findChildrenDF(iri):
#     return df_hier.filter(lambda x: x[0] == iri)

# df = sc.parallelize(iri)

In [None]:
df_class_labels.createOrReplaceTempView("class_labels")

iri = "http://purl.obolibrary.org/obo/iao_0000030"
print(spark.sql("SELECT * from class_labels where CLASS_IRI = '" + iri +"'").collect())


In [None]:
df_IRI_LABEL=df_class_hier.join(df_class_labels, 'CLASS_IRI').select(df_class_hier.CLASS_IRI, df_class_labels.CLASS_LABEL)

In [None]:
df_IRI_LABEL.count()
df_IRI_LABEL.printSchema()

In [None]:
df_IRI_LABEL.groupBy("CLASS_IRI").count().sort("count", ascending=False).limit(20).toPandas()

In [None]:
df_IRI_LABEL.show(10,truncate= True)
result = df_IRI_LABEL.where(df_IRI_LABEL.CLASS_IRI == 'http://purl.obolibrary.org/obo/iao_0000030').distinct()

In [None]:
print(result.count())
result.collect()[1]['CLASS_LABEL']

In [None]:
for row in result.collect():
    print(row['CLASS_IRI'], row['CLASS_LABEL'])

In [None]:
import re
def isIRIEqualLabel(iri, label):
    """Tell whether `label` is just the IRI's last path segment.

    The text after the final '/' of `iri` is taken, its underscores are
    replaced by spaces, and the result is compared with `label`.

    Returns:
        True when the two strings are equal, False otherwise.
    """
    local_name = iri.rsplit('/', 1)[-1]
    return label == local_name.replace('_', ' ')

In [None]:
# Print only the pairs whose label is more than a restatement of the IRI.
# `result` has exactly the columns CLASS_IRI, CLASS_LABEL (in that order).
for row in result.collect():
    class_iri, class_label = row['CLASS_IRI'], row['CLASS_LABEL']
    if isIRIEqualLabel(class_iri, class_label):
        continue
    print(class_iri, class_label)

In [None]:
def getIRI(label):
    """Return the class IRIs in df_class_labels that carry `label`.

    Rows whose label merely restates the IRI's last path segment (as judged
    by isIRIEqualLabel) are left out.

    Returns:
        A list of CLASS_IRI values; empty when nothing matches.
    """
    matches = (
        df_class_labels
        .filter(df_class_labels.CLASS_LABEL==label)
        .select('CLASS_IRI','CLASS_LABEL')
        .distinct()
        .collect()
    )
    return [
        row["CLASS_IRI"]
        for row in matches
        if not isIRIEqualLabel(row["CLASS_IRI"], row['CLASS_LABEL'])
    ]

def getLabel(iri):
    """Return the labels recorded in df_class_labels for the class `iri`.

    Labels that merely restate the IRI's last path segment (as judged by
    isIRIEqualLabel) are left out.

    Returns:
        A list of CLASS_LABEL values; empty when nothing matches.
    """
    matches = (
        df_class_labels
        .filter(df_class_labels.CLASS_IRI==iri)
        .select('CLASS_IRI','CLASS_LABEL')
        .distinct()
        .collect()
    )
    return [
        row["CLASS_LABEL"]
        for row in matches
        if not isIRIEqualLabel(row["CLASS_IRI"], row['CLASS_LABEL'])
    ]

def getOnts(iri):
    """Return the distinct ONT_NAME values that df_class_labels holds for `iri`.

    Returns:
        A list of ontology names; empty when the IRI is not present.
    """
    rows = (
        df_class_labels
        .filter(df_class_labels.CLASS_IRI==iri)
        .select('ONT_NAME')
        .distinct()
        .collect()
    )
    return [row["ONT_NAME"] for row in rows]
            

def getParents(iri):
    """Return the set of direct parent IRIs of `iri` from df_class_hier.

    owl#thing is treated as the top of the hierarchy: it yields an empty set
    without querying Spark. For every other IRI a progress line is printed
    before the lookup.
    """
    if iri == 'http://www.w3.org/2002/07/owl#thing':
        return set()
    print("get parent for: ", iri) 
    matches = df_class_hier.filter(df_class_hier.CLASS_IRI==iri).distinct()
    return {row['PARENT_CLASS_IRI'] for row in matches.collect()}

def getChildren(iri):
    """Return the set of direct child IRIs of `iri` from df_class_hier.

    A child is any CLASS_IRI whose PARENT_CLASS_IRI equals `iri`.
    """
    matches = df_class_hier.filter(df_class_hier.PARENT_CLASS_IRI==iri).distinct()
    return {row['CLASS_IRI'] for row in matches.collect()}

def getArea(iri, tax_type='op_restriction'):
    """Return the taxonomy-area rows of class `iri` for one taxonomy type.

    df_tax_areas_concepts is filtered to rows with the given TAX_TYPE and
    CLASS_IRI whose AREA_ID is not the literal '[empty set]', then joined to
    df_tax_areas on AREA_ID (dropping df_tax_areas' own TAX_TYPE column).

    Returns:
        The collected rows with columns TAX_TYPE, AREA_ID, AREA_NAME and
        AREA_LEVEL.
    """
    concepts = df_tax_areas_concepts
    matching = concepts.filter(
        (concepts.TAX_TYPE==tax_type)
        & (concepts.CLASS_IRI==iri)
        & (concepts.AREA_ID!='[empty set]')
    )
    joined = matching.join(df_tax_areas,'AREA_ID').drop(df_tax_areas.TAX_TYPE)
    return joined.select('TAX_TYPE','AREA_ID', 'AREA_NAME', 'AREA_LEVEL').collect()

def getOntName(iri):
    """Return an ontology name recorded for the class `iri`, or None.

    The previous body returned the free name `ont`, which is not defined in
    this function and ignored `iri` altogether. The lookup is now delegated
    to getOnts().

    NOTE(review): when an IRI appears in several ontologies, the first name
    returned by getOnts() is used - confirm that is the intended choice.

    Returns:
        The first ONT_NAME found for `iri`, or None when there is none.
    """
    names = getOnts(iri)
    return names[0] if names else None

def getPArea(iri, tax_type='op_restriction'):
    """Return the partial-area rows of class `iri` for one taxonomy type.

    df_tax_areas_pareas is filtered to rows with the given TAX_TYPE and
    CLASS_IRI whose PAREA_ROOT_IRI is not the literal '[empty set]'.

    Returns:
        The distinct matching rows, without the ONT_NAME column, collected
        to the driver.
    """
    pareas = df_tax_areas_pareas
    is_match = (
        (pareas.TAX_TYPE==tax_type)
        & (pareas.CLASS_IRI==iri)
        & (pareas.PAREA_ROOT_IRI!='[empty set]')
    )
    return pareas.filter(is_match).drop('ONT_NAME').distinct().collect()

def getAreaLevel(iri, tax_type='op_restriction'):
    """Return the AREA_LEVEL of the first area found for class `iri`.

    The concept rows for (`tax_type`, `iri`) with a non-'[empty set]'
    AREA_ID are joined to df_tax_areas on AREA_ID; the first distinct joined
    row supplies the level.

    Returns:
        That row's AREA_LEVEL, or 0 when no row is found.
    """
    concepts = df_tax_areas_concepts
    concepts_for_iri = concepts.filter(
        (concepts.TAX_TYPE==tax_type)
        & (concepts.CLASS_IRI == iri)
        & (concepts.AREA_ID!='[empty set]')
    )
    first_row = (
        df_tax_areas
        .join(concepts_for_iri, 'AREA_ID')
        .drop('ONT_NAME')
        .distinct()
        .first()
    )
    return first_row['AREA_LEVEL'] if first_row else 0


In [None]:
# Spot-check: area level of a single class under the default taxonomy type.
getAreaLevel("http://purl.obolibrary.org/obo/apollo_sv_00000144")

In [None]:
# Spot-check both lookup directions: label -> IRIs, then IRI -> labels.
print(getIRI("information content entity"))
print(getLabel("http://purl.obolibrary.org/obo/iao_0000030"))

In [None]:
# Only the last expression of a cell is displayed; print the intermediate
# lookups so their Spark jobs are not run for nothing.
print(getArea('http://chem2bio2rdf.org/chem2bio2rdf.owl#bioassay'))
print(getChildren('http://chem2bio2rdf.org/chem2bio2rdf.owl#bioassay'))
# result = df_tax_areas_concepts.filter(df_tax_areas_concepts.CLASS_IRI=='http://chem2bio2rdf.org/chem2bio2rdf.owl#bioassay')

# result.filter(df_tax_areas_concepts.AREA_ID!='[empty set]').show()
getParents('http://chem2bio2rdf.org/chem2bio2rdf.owl#bioassay')

In [None]:
# Spot-check: partial-area rows for one class under the default taxonomy type.
getPArea('http://chem2bio2rdf.org/chem2bio2rdf.owl#bioassay')

In [None]:
# Show the column summary of every loaded DataFrame. A bare name is only
# displayed when it is the last expression of the cell, so print the others.
print(df_class_hier)
print(df_class_labels)
print(df_tax_areas)
print(df_tax_areas_concepts)
df_tax_areas_pareas

In [None]:
# Behaviour at the top of the hierarchy. Only the last expression of a cell
# is displayed, so print the earlier results explicitly.
print(getPArea('http://www.w3.org/2002/07/owl#thing'))
print(getParents('http://www.w3.org/2002/07/owl#thing'))
getParents('http://www.ifomis.org/bfo/1.1#entity')

In [None]:
def getPAreaParent(iri, depth=1):
    """Collect (class IRI, partial-area root IRI) pairs walking up the hierarchy.

    For `iri`, one pair is emitted per row returned by getPArea(). When
    `depth` is greater than 1, every parent of each partial-area root (as
    given by getParents()) is processed the same way with `depth - 1`.

    Fixes over the previous body: the recursive results are accumulated
    (`result + ...` used to discard them), a list is returned (it used to
    fall off the end and return None), each parent IRI is passed on
    individually instead of the whole set, and `depth` now bounds the
    recursion instead of repeating identical work.

    Args:
        iri: class IRI to start from.
        depth: number of levels to follow; values below 1 yield no pairs.

    Returns:
        A list of (iri, PAREA_ROOT_IRI) tuples; empty when nothing is found.
    """
    result = []
    if depth < 1:
        return result
    for row in getPArea(iri):
        root_iri = row['PAREA_ROOT_IRI']
        result.append((iri, root_iri))
        if depth > 1:
            for parent_iri in getParents(root_iri):
                # Skip missing/empty parent values.
                if parent_iri:
                    result += getPAreaParent(parent_iri, depth - 1)
    return result
            
def getAreaParent(iri, depth):
    """Unimplemented placeholder: ignores both arguments and returns None."""
    parents = list()  # never read; TODO(review): implement or remove this stub
    return None

    
    
def getPAreaChildren(iri, depth=1):
    """Collect (partial-area root IRI, class IRI) pairs walking down the hierarchy.

    For `iri`, one pair is emitted per row returned by getPArea(). When
    `depth` is greater than 1, every child of each partial-area root (as
    given by getChildren()) is processed the same way with `depth - 1`.

    Fixes over the previous body: the misspelled recursive call
    (`getPAreaChilren`) raised NameError, the recursive results were
    discarded, nothing was returned, the whole child set was passed as one
    IRI, and `depth` only repeated identical work.

    Args:
        iri: class IRI to start from.
        depth: number of levels to follow; values below 1 yield no pairs.

    Returns:
        A list of (PAREA_ROOT_IRI, iri) tuples; empty when nothing is found.
    """
    result = []
    if depth < 1:
        return result
    for row in getPArea(iri):
        root_iri = row['PAREA_ROOT_IRI']
        result.append((root_iri, iri))
        if depth > 1:
            for child_iri in getChildren(root_iri):
                # Skip missing/empty child values.
                if child_iri:
                    result += getPAreaChildren(child_iri, depth - 1)
    return result
    

In [None]:
def getAllChildren(iri, visited=None):
    """Collect every descendant of `iri` together with the hierarchy edges.

    The hierarchy is walked depth-first through getChildren(). Each node is
    expanded at most once, so the walk terminates even if the data contains
    cycles.

    The previous version computed `getChildren(iri) - visited` once before
    the loop. In a hierarchy where a class has several parents this either
    listed the class twice or dropped one of its edges, depending on set
    iteration order. Now every (child, parent) edge of an expanded node is
    recorded, while a child that was already visited is neither listed nor
    expanded again.

    Args:
        iri: IRI of the class to start from.
        visited: set of IRIs already expanded; callers normally omit it.
            It is updated in place.

    Returns:
        (result, pair): `result` is the list of descendant IRIs without
        duplicates, `pair` the list of (child_iri, parent_iri) tuples.
    """
    if visited is None:
        visited = set()
    visited.add(iri)
    result = []
    pair = []
    for child_iri in getChildren(iri):
        if not child_iri:
            # Ignore missing/empty child values.
            continue
        print("get child: ",child_iri)
        pair.append((child_iri, iri))
        if child_iri in visited:
            # Reached earlier through another parent: keep the edge only.
            continue
        result.append(child_iri)
        sub_result, sub_pair = getAllChildren(child_iri, visited)
        result += sub_result
        pair += sub_pair

    return result, pair
# Descendant IRIs of the chosen class and the (child, parent) edge pairs,
# consumed by the igraph cells below.
c_vertices, c_edges = getAllChildren('http://purl.obolibrary.org/obo/chebi_62943')


In [None]:
def getAllParents(iri, visited = None):
    """Collect every ancestor of `iri` together with the hierarchy edges.

    The hierarchy is walked depth-first through getParents(); owl#thing is
    never expanded. Each node is expanded at most once, so the walk
    terminates even if the data contains cycles.

    The previous version computed `getParents(iri) - visited` once before
    the loop. When two paths lead to the same ancestor this either listed
    the ancestor twice or dropped one of its edges, depending on set
    iteration order. Now every (child, parent) edge of an expanded node is
    recorded, while a parent that was already visited is neither listed nor
    expanded again.

    Args:
        iri: IRI of the class to start from.
        visited: set of IRIs already expanded; callers normally omit it.
            It is updated in place.

    Returns:
        (result, pair): `result` is the list of ancestor IRIs without
        duplicates, `pair` the list of (child_iri, parent_iri) tuples.
    """
    if visited is None:
        visited = set()
    visited.add(iri)

    result = []
    pair = []
    if iri == 'http://www.w3.org/2002/07/owl#thing':
        return result, pair
    for parent_iri in getParents(iri):
        if not parent_iri:
            # Ignore missing/empty parent values.
            continue
        print("get parent: ", parent_iri)
        pair.append((iri, parent_iri))
        if parent_iri in visited:
            # Reached earlier through another child: keep the edge only.
            continue
        result.append(parent_iri)
        sub_result, sub_pair = getAllParents(parent_iri, visited)
        result += sub_result
        pair += sub_pair
    return result, pair

# Ancestor IRIs of the chosen class and the (child, parent) edge pairs,
# consumed by the igraph cells below.
p_vertices, p_edges = getAllParents('http://purl.obolibrary.org/obo/chebi_62943')


In [None]:
# Debug aid: show the interpreter's module search path.
import sys
print(sys.path)

# Start an empty igraph graph to hold the class hierarchy.
# NOTE(review): this rebinds `g`, which an earlier cell uses for a graph with
# `.edges.show()` - re-running that earlier cell afterwards will fail.
import igraph as ig
g = ig.Graph()


In [None]:
# Add the starting class and all of its ancestors as vertices, then the
# ancestor edges.
# NOTE(review): this cell is not idempotent - running it twice adds the same
# vertices and edges again.
g.add_vertex(name = 'http://purl.obolibrary.org/obo/chebi_62943')
g.add_vertices(p_vertices)

# for vertex in vertices:
#     g.add_vertex(name=vertex)

# N (vertex count) is read again by the plotting cells further down.
N=g.vcount()
print('total number of vertices imported: ' , N)
print(p_edges)

g.add_edges(p_edges)

L= g.ecount()
print('added # of edges: ', L)

In [None]:
# Add all descendants of the starting class as vertices, then the descendant
# edges. The starting class itself is expected to be in the graph already.
# NOTE(review): this cell is not idempotent - running it twice adds the same
# vertices and edges again.
# g.add_vertex(name = 'http://purl.obolibrary.org/obo/chebi_62943')
g.add_vertices(c_vertices)

# for vertex in vertices:
#     g.add_vertex(name=vertex)

# N (vertex count) is read again by the plotting cells further down.
N=g.vcount()
print('total number of vertices imported: ' , N)
print(c_edges)

g.add_edges(c_edges)

L= g.ecount()
print('added # of edges: ', L)

In [53]:
# Hover text (`labels`) and colour value (`group`) for every vertex.
# The vertices are created above with a name only, so reading the `label` /
# `area_level` vertex attributes directly fails when nothing has set them.
# Use the attribute when it is present and fall back to the lookup helpers
# (getLabel / getAreaLevel) otherwise.
labels=[]
group=[]
for node in g.vs:
    node_attrs = node.attributes()
    node_iri = node['name']

    node_label = node_attrs.get('label')
    if node_label is None:
        found_labels = getLabel(node_iri)
        # No usable label: show the IRI itself.
        node_label = found_labels[0] if found_labels else node_iri

    node_level = node_attrs.get('area_level')
    if node_level is None:
        node_level = getAreaLevel(node_iri)

    labels.append(node_label)
    group.append(node_level)

In [None]:
# Debug aid: print every vertex of the graph.
for i in g.vs:
    print(i)

In [54]:
# Compute a 3-D layout for the graph and peek at the coordinates of vertex 5.
# NOTE(review): `layt[5]` raises IndexError when the graph has fewer than six
# vertices.
layt=g.layout_auto(dim=3)
layt[5]

[4.297656059265137, -0.5522169470787048, 1.1587646007537842]

In [55]:
# Node coordinates, one list per axis, for the first N vertices of the layout.
Xn = [layt[idx][0] for idx in range(N)]
Yn = [layt[idx][1] for idx in range(N)]
Zn = [layt[idx][2] for idx in range(N)]

# Edge coordinates: for every edge, the two end points followed by None so
# that each segment is drawn separately.
Xe, Ye, Ze = [], [], []
for edge in g.es:
    src, dst = edge.tuple
    Xe.extend([layt[src][0], layt[dst][0], None])
    Ye.extend([layt[src][1], layt[dst][1], None])
    Ze.extend([layt[src][2], layt[dst][2], None])

In [56]:
import plotly as py
from plotly.graph_objs import *

In [57]:
# Edge trace: thin grey line segments, no hover text.
# Plain dicts replace the deprecated Line/Marker wrapper classes.
trace1=Scatter3d(x=Xe,
               y=Ye,
               z=Ze,
               mode='lines',
               line=dict(color='rgb(125,125,125)', width=1),
               hoverinfo='none'
               )
# Node trace: one marker per class, coloured by `group`, hover text from
# `labels`. 'circle' is used because 'dot' is not a scatter3d marker symbol.
trace2=Scatter3d(x=Xn,
               y=Yn,
               z=Zn,
               mode='markers',
               name='classes',
               marker=dict(symbol='circle',
                           size=6,
                           color=group,
                           colorscale='Viridis',
                           line=dict(color='rgb(50,50,50)', width=0.5)
                           ),
               text=labels,
               hoverinfo='text'
               )

In [58]:
# Shared settings for the three scene axes: hide every axis decoration.
axis = {
    'showbackground': False,
    'showline': False,
    'zeroline': False,
    'showgrid': False,
    'showticklabels': False,
    'title': '',
}

In [59]:
# Figure layout. Plain dicts/lists replace the deprecated Scene, XAxis, YAxis,
# ZAxis, Margin, Annotations, Annotation and Font wrapper classes.
# The title and the data-source note previously still described the
# "Les Miserables" example this cell was copied from; they now describe the
# graph that is actually plotted.
layout = Layout(
    title="Ancestors and descendants of http://purl.obolibrary.org/obo/chebi_62943<br>(3D visualization)",
    width=1000,
    height=1000,
    showlegend=False,
    scene=dict(
        xaxis=dict(axis),
        yaxis=dict(axis),
        zaxis=dict(axis),
    ),
    margin=dict(
        t=100
    ),
    hovermode='closest',
    annotations=[
        dict(
            showarrow=False,
            text="Data source: ONT_CLASS_HIERARCHY.csv",
            xref='paper',
            yref='paper',
            x=0,
            y=0.1,
            xanchor='left',
            yanchor='bottom',
            font=dict(
                size=14
            )
        )
    ],
)

In [60]:
# Assemble the figure from the edge and node traces. A plain list replaces
# the deprecated Data wrapper class.
data=[trace1, trace2]
fig=Figure(data=data, layout=layout)

# Render inline in the notebook without contacting the plotly service.
py.offline.init_notebook_mode(connected=True)

py.offline.iplot(fig, filename='Les-Miserables')