# Part 3 - Nobel Laureates who are also Writers

Tags for nobel prizes look like these: `'<Nobel_Prize_in_Chemistry>`, `<Nobel_Prize_in_Physics>'`, `<Nobel_Prize>` or `<Nobel_Prize>` etc.
We are also counting this one: `'<Nobel_Memorial_Prize_in_Economic_Sciences>'`.

The tag for writers is `'<wordnet_writer_110794014>'`.

You will need to use `'<hasWonPrize>'` as a predicate.

Please sort the output alphabetically by the person column.

In [2]:
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from datetime import datetime
sqlContext = SQLContext(sc)

In [3]:
# After having looked at the head of the file in the terminal, one can see that there is no header for the columns
# so we'll have to create the schema ourselves. Also, the first line is an explanation of what the database contains,
# which has to be removed before converting it into a df, so we'll upload the data to an RDD to do that removal
# with the following function:
def remove_first(itr_index, itr):
    return iter(list(itr)[1:]) if itr_index == 0 else itr

In [4]:
# Import Data to an RDD and remove first line
file = sc.textFile("/yago/yagoFacts.tsv")
data = file.mapPartitionsWithIndex(remove_first) # Maybe this is smarter: rdd.zipWithIndex().filter(lambda tup: tup[1] > 0).map(lambda tup: tup[0])

In [5]:
# Split by columns
rdd_map = data.map(lambda x: x.split("\t"))   
#rdd_map.take(3)

In [6]:
# We've already seen the data and datatypes it contains, because there's no header describing the dataset
# we create a schema
schema = StructType([
                StructField("id",StringType()),
                StructField("subject", StringType()),
                StructField("predicate", StringType()),
                StructField("object", StringType()),
                StructField("value", StringType())#DoubleType
                    ])
# It didn't work with the value as a DoubleType, so we'll create it with that column as a string
# and change the datatype afterwards
df = sqlContext.createDataFrame(rdd_map, schema)
#df.take(3)

In [7]:
# Change value Datatype
df = df.withColumn("value", df["value"].cast(DoubleType()))
#df.take(3)

Now for the Transitive Type Dataset

In [8]:
file = sc.textFile("/yago/yagoTransitiveType.tsv")
data2 = file.mapPartitionsWithIndex(remove_first) # Maybe this is smarter:
#data = file.zipWithIndex().filter(lambda tup: tup[1] > 0).map(lambda tup: tup[0])
rdd_map2 = data2.map(lambda x: x.split("\t"))   
schema = StructType([
                StructField("id",StringType()),
                StructField("subject", StringType()),
                StructField("predicate", StringType()),
                StructField("object", StringType()),
                StructField("value", StringType())#DoubleType
                    ])
# It didn't work with the value as a DoubleType, so we'll create it with that column as a string
# and change the datatype afterwards
df_subclasses = sqlContext.createDataFrame(rdd_map2, schema)
df_subclasses = df_subclasses.withColumn("value", df_subclasses["value"].cast(DoubleType()))
#df_subclasses.show(3)

In [9]:
# Import necessary libraries
import graphframes
from graphframes import *

import matplotlib.pyplot as plt
%matplotlib inline

from pyspark.sql.functions import col, lit, when
from pyspark.sql import Row

from datetime import datetime
import re
import numpy as np

## Part 3 specific

## Create Separate Dataframes

In [11]:
nobels = df.select("subject", "object", "predicate").filter("predicate = '<hasWonPrize>'")\
.filter("object = '<Nobel_Prize_in_Chemistry>' OR object = '<Nobel_Prize_in_Physics>' \
OR object = '<Nobel_Prize_in_Literature>' OR object = '<Nobel_Prize>' \
OR object = '<Nobel_Memorial_Prize_in_Economic_Sciences>' OR object = '<Nobel_Prize_in_Physiology_or_Medicine>'")



In [12]:
writers = df_subclasses.select("subject", "object", "predicate")\
.filter("object = '<wordnet_writer_110794014>'").filter("predicate = 'rdf:type'")

Started at 07:23:02 
 Ended at 07:23:02
('Took', '-1 day, 23:59:59.957057')


In [13]:
nobels.createOrReplaceTempView("nobels")
writers.createOrReplaceTempView("writers")

In [14]:
nobel_writers = spark.sql("SELECT n.subject, n.object, n.predicate, w.object AS writer_obj, w.predicate AS writer_pred \
FROM nobels n JOIN writers w ON n.subject = w.subject")

### Checking Entries and Count are correct

In [None]:
beginning = datetime.now()

nobel_writers.show(10)
print(nobel_writers.count())

ending = datetime.now()
beginning_time = beginning.strftime("%H:%M:%S")
ending_time = ending.strftime("%H:%M:%S")

print("Started at {} \n Ended at {}".format(beginning_time, ending_time))
elapsed = (beginning-ending)
print("Took", str(elapsed))

+--------------------+--------------------+-------------+--------------------+-----------+
|             subject|              object|    predicate|          writer_obj|writer_pred|
+--------------------+--------------------+-------------+--------------------+-----------+
|    <Grazia_Deledda>|<Nobel_Prize_in_L...|<hasWonPrize>|<wordnet_writer_1...|   rdf:type|
|        <John_Hicks>|<Nobel_Memorial_P...|<hasWonPrize>|<wordnet_writer_1...|   rdf:type|
|<Aleksandr_Solzhe...|<Nobel_Prize_in_L...|<hasWonPrize>|<wordnet_writer_1...|   rdf:type|
|       <W._B._Yeats>|<Nobel_Prize_in_L...|<hasWonPrize>|<wordnet_writer_1...|   rdf:type|
|        <Lev_Landau>|<Nobel_Prize_in_P...|<hasWonPrize>|<wordnet_writer_1...|   rdf:type|
| <Thomas_J._Sargent>|<Nobel_Memorial_P...|<hasWonPrize>|<wordnet_writer_1...|   rdf:type|
| <Tjalling_Koopmans>|<Nobel_Memorial_P...|<hasWonPrize>|<wordnet_writer_1...|   rdf:type|
|        <Henry_Yule>|<Nobel_Prize_in_L...|<hasWonPrize>|<wordnet_writer_1...|   rdf:type|

### Creating Vertex, Edges and Graph

In [15]:
v_nw = nobel_writers.select("subject").withColumnRenamed("subject", "id")\
.union(nobel_writers.select("writer_obj").withColumnRenamed("writer_obj", "id"))\
.union(nobel_writers.select("object").withColumnRenamed("object", "id")).distinct()

v_nw

DataFrame[id: string]

In [16]:
e_l = nobel_writers.select("subject", "object", "predicate").withColumnRenamed("subject", "src")\
.withColumnRenamed("object", "dst").withColumnRenamed("predicate", "pred")

e_w = nobel_writers.select("subject", "writer_obj", "writer_pred").withColumnRenamed("subject", "src")\
.withColumnRenamed("writer_obj", "dst").withColumnRenamed("writer_pred", "pred")

e_nw = e_w.union(e_l)
e_nw.columns

['src', 'dst', 'pred']

In [17]:
g_nw = GraphFrame(v_nw, e_nw)

In [18]:
g_nw

GraphFrame(v:[id: string], e:[src: string, dst: string ... 1 more field])

### Query

In [None]:
beginning = datetime.now()

g_nw.find("(a)-[e]->(b); (a)-[e2]->(c)").filter("b.id = '<wordnet_writer_110794014>'")\
.filter("c.id = '<Nobel_Prize_in_Chemistry>' OR c.id = '<Nobel_Prize_in_Physics>' \
OR c.id = '<Nobel_Prize_in_Literature>' OR c.id = '<Nobel_Prize>' \
OR c.id = '<Nobel_Memorial_Prize_in_Economic_Sciences>' OR c.id = '<Nobel_Prize_in_Physiology_or_Medicine>'")\
.sort("a").show(20)


ending = datetime.now()
beginning_time = beginning.strftime("%H:%M:%S")
ending_time = ending.strftime("%H:%M:%S")

print("Started at {} \n Ended at {}".format(beginning_time, ending_time))
elapsed = (beginning-ending)
print("Took", str(elapsed))

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                   a|                   e|                   b|                  e2|                   c|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|[<Adrienne_Clarks...|[<Adrienne_Clarks...|[<wordnet_writer_...|[<Adrienne_Clarks...|[<Nobel_Prize_in_...|
|    [<Albert_Camus>]|[<Albert_Camus>, ...|[<wordnet_writer_...|[<Albert_Camus>, ...|[<Nobel_Prize_in_...|
| [<Albert_Einstein>]|[<Albert_Einstein...|[<wordnet_writer_...|[<Albert_Einstein...|[<Nobel_Prize_in_...|
|[<Aleksandr_Solzh...|[<Aleksandr_Solzh...|[<wordnet_writer_...|[<Aleksandr_Solzh...|[<Nobel_Prize_in_...|
|[<Alexander_Prokh...|[<Alexander_Prokh...|[<wordnet_writer_...|[<Alexander_Prokh...|[<Nobel_Prize_in_...|
|[<Alexei_Alexeyev...|[<Alexei_Alexeyev...|[<wordnet_writer_...|[<Alexei_Alexeyev...|[<Nobel_Prize_in_...|
|   [<Alexis_Carrel>]|[<Alexis_Carrel