## 5. Politicians that are affiliated with a right-wing party

We are looking for all connections of the form `politician -> party`, where the party is a right-wing party and politicians are defined as above. If a politician is associated with several right-wing parties, you may count them several times.

Use `'<isAffiliatedTo>'` to find membership in organisations and `'<wikicat_Right-wing_parties>'` to identify right-wing party organisations.

There are multiple ways to do this.

Please sort the output alphabetically by the person (politician) column.

In [25]:
from pyspark.sql import SQLContext
from pyspark.sql.types import *
# Wrap the SparkContext in an SQLContext; it is used below to build DataFrames from RDDs.
# NOTE(review): `sc` is not defined in this section -- presumably it is supplied by the notebook kernel.
sqlContext = SQLContext(sc)

In [26]:
# After having looked at the head of the file in the terminal, one can see that there is no header for the columns
# so we'll have to create the schema ourselves. Also, the first line is an explanation of what the database contains,
# which has to be removed before converting it into a df, so we'll upload the data to an RDD to do that removal
# with the following function:
def remove_first(itr_index, itr):
    """Drop the first record of partition 0; pass every other partition through.

    Written for ``RDD.mapPartitionsWithIndex``, where it is used to strip the
    leading descriptive line of a text file.

    Args:
        itr_index: Index of the partition being processed.
        itr: Iterable over the records of that partition.

    Returns:
        For ``itr_index == 0``, an iterator over the records without the first
        one (empty if the partition has zero or one record). For any other
        index, ``itr`` itself, unchanged.
    """
    if itr_index != 0:
        return itr
    # Skip one record lazily instead of copying the whole partition into a
    # list just to slice off its head.
    records = iter(itr)
    next(records, None)  # the default keeps an empty partition from raising
    return records

In [27]:
# Load the facts file as an RDD of text lines and drop its first line via remove_first.
file = sc.textFile("/yago/yagoFacts.tsv")
data = file.mapPartitionsWithIndex(remove_first) # Possible alternative: file.zipWithIndex().filter(lambda tup: tup[1] > 0).map(lambda tup: tup[0])

In [28]:
# Split every line on tab characters, giving one list of column values per row.
rdd_map = data.map(lambda x: x.split("\t"))
# Uncomment to inspect the first rows:
#rdd_map.take(3)

In [29]:
# The file carries no header row describing its columns, so the schema is
# declared by hand: five columns, all read as strings at first.
schema = StructType([
                StructField("id",StringType()),
                StructField("subject", StringType()),
                StructField("predicate", StringType()),
                StructField("object", StringType()),
                StructField("value", StringType())# intended type: DoubleType (cast in a later cell)
                    ])
# Declaring "value" as DoubleType directly did not work, so the column is
# created as a string here and its datatype is changed afterwards.
df = sqlContext.createDataFrame(rdd_map, schema)
# Uncomment to inspect the first rows:
#df.take(3)

In [30]:
# Replace the string "value" column with the same column cast to DoubleType.
df = df.withColumn("value", df["value"].cast(DoubleType()))
# Uncomment to inspect the first rows:
#df.take(3)

Now for the Transitive Type Dataset

In [31]:
# Same loading steps as for the facts file, applied to the transitive-type file:
# read the lines, drop the first one, split on tabs, apply the schema, cast "value".
file = sc.textFile("/yago/yagoTransitiveType.tsv")
data2 = file.mapPartitionsWithIndex(remove_first) # Possible alternative to remove_first:
#data = file.zipWithIndex().filter(lambda tup: tup[1] > 0).map(lambda tup: tup[0])
rdd_map2 = data2.map(lambda x: x.split("\t"))
# Note: this rebinds the global `schema` to an identical five-column definition.
schema = StructType([
                StructField("id",StringType()),
                StructField("subject", StringType()),
                StructField("predicate", StringType()),
                StructField("object", StringType()),
                StructField("value", StringType())# intended type: DoubleType (cast below)
                    ])
# Declaring "value" as DoubleType directly did not work, so the column is
# created as a string here and its datatype is changed afterwards.
df_subclasses = sqlContext.createDataFrame(rdd_map2, schema)
df_subclasses = df_subclasses.withColumn("value", df_subclasses["value"].cast(DoubleType()))
# Uncomment to inspect the first rows:
#df_subclasses.show(3)

In [32]:
# Import necessary libraries
import graphframes
from graphframes import *

import matplotlib.pyplot as plt
%matplotlib inline

from pyspark.sql.functions import col, lit, when
from pyspark.sql import Row

from datetime import datetime
import re
import numpy as np

In [33]:
def vertices_edges_split(df, condition1):
    """Build vertex and edge DataFrames from the rows of ``df`` matching a filter.

    Args:
        df: DataFrame with ``subject``, ``object`` and ``predicate`` columns.
        condition1: Condition handed to ``df.filter`` to select the rows.

    Returns:
        A ``(v, e)`` tuple. ``v`` has a single ``id`` column holding the
        distinct subject and object values of the matching rows. ``e`` has
        the columns ``src``, ``dst`` and ``pred``, one row per matching row.
    """
    matching = df.filter(condition1)

    # Every subject and every object of a matching row becomes a vertex.
    subject_ids = matching.select("subject").withColumnRenamed("subject", "id")
    object_ids = matching.select("object").withColumnRenamed("object", "id")
    v = subject_ids.union(object_ids).distinct()

    # Each matching row becomes one edge subject -> object, labelled by its predicate.
    e = (
        matching.select("subject", "object", "predicate")
        .withColumnRenamed("subject", "src")
        .withColumnRenamed("object", "dst")
        .withColumnRenamed("predicate", "pred")
    )
    return v, e

def vertices_edges_split2(df, condition1, condition2):
    """Build vertex and edge DataFrames from rows of ``df`` matching two filters.

    Vertices and edges are both derived from the same filtered rows, so every
    edge endpoint is guaranteed to appear in the vertex set. (Previously the
    edges were filtered by ``condition1`` only and could reference ids that
    were missing from ``v``.)

    NOTE: a later cell in this notebook defines another function with the
    same name and a different signature, which replaces this one once run.

    Args:
        df: DataFrame with ``subject``, ``object`` and ``predicate`` columns.
        condition1: First condition handed to ``DataFrame.filter``.
        condition2: Second condition, applied on top of the first.

    Returns:
        A ``(v, e)`` tuple. ``v`` has a single ``id`` column holding the
        distinct subject and object values of the matching rows. ``e`` has
        the columns ``src``, ``dst`` and ``pred``, one row per matching row.
    """
    matching = df.filter(condition1).filter(condition2)

    sub = matching.select("subject").withColumnRenamed("subject", "id")
    obj = matching.select("object").withColumnRenamed("object", "id")
    v = sub.union(obj).distinct()

    e = (
        matching.select("subject", "object", "predicate")
        .withColumnRenamed("subject", "src")
        .withColumnRenamed("object", "dst")
        .withColumnRenamed("predicate", "pred")
    )
    return v, e

In [34]:
# Rows of the type table that assign the WordNet politician class via rdf:type.
politicians = (
    df_subclasses
    .select("subject", "object", "predicate")
    .filter("object = '<wordnet_politician_110450303>'")
    .filter("predicate = 'rdf:type'")
)

In [35]:
# Register the DataFrames as temporary views so the SQL queries below can refer to them by name.
df.createOrReplaceTempView("df")
df_subclasses.createOrReplaceTempView("df_subclasses")
politicians.createOrReplaceTempView("politicians")

In [36]:
# Join each politician with the facts in which they are the subject of an
# '<isAffiliatedTo>' relation, keeping both the type row and the affiliation row.
affiliated_politicians = spark.sql(
    "SELECT p.subject AS politician, p.object AS pol_cat, p.predicate AS pol_pred, "
    "d.object AS party, d.predicate AS affiliated_to "
    "FROM politicians p JOIN df d ON p.subject = d.subject "
    "WHERE d.predicate = '<isAffiliatedTo>'"
)

In [37]:
# Expose the join result under the view name used by the right-wing query below.
affiliated_politicians.createOrReplaceTempView("affiliated_politicians")

In [38]:
# Rows of the type table that assign the right-wing parties category via rdf:type.
right_wing = (
    df_subclasses
    .select("subject", "object", "predicate")
    .filter("object = '<wikicat_Right-wing_parties>'")
    .filter("predicate = 'rdf:type'")
)

In [39]:
# Expose the right-wing party rows under the view name used by the next query.
right_wing.createOrReplaceTempView("right_wing")

In [40]:
# Keep only the affiliations whose party appears among the right-wing parties;
# the party's category and predicate are carried along as rw_obj / rw_pred.
politicians_right_wing = spark.sql(
    "SELECT p.politician, p.pol_cat, p.pol_pred, p.party, p.affiliated_to, "
    "r.object AS rw_obj, r.predicate AS rw_pred "
    "FROM affiliated_politicians p JOIN right_wing r ON p.party = r.subject"
)

In [47]:
def vertices_edges_split2(df, subject, object1, predicate1, object2, predicate2, object3, predicate3):
    """Turn a wide, already-joined DataFrame into vertex and edge DataFrames.

    Each row of ``df`` contributes three edges:
    ``subject -> object1``, ``subject -> object2`` and ``object2 -> object3``.

    Args:
        df: DataFrame containing all the columns named by the other arguments.
        subject: Name of the column used as source of the first two edges.
        object1: Name of the destination column of the first edge.
        predicate1: Name of the column labelling the first edge.
        object2: Name of the destination column of the second edge; also the
            source column of the third edge.
        predicate2: Name of the column labelling the second edge.
        object3: Name of the destination column of the third edge.
        predicate3: Name of the column labelling the third edge.

    Returns:
        A ``(v, e)`` tuple. ``v`` has a single ``id`` column with the distinct
        values of the four entity columns. ``e`` has the columns ``src``,
        ``dst`` and ``pred`` with the distinct edges.
    """
    def ids_of(column):
        # One-column DataFrame holding the values of `column` under the name "id".
        return df.select(column).withColumnRenamed(column, "id")

    def edges_between(src_col, dst_col, pred_col):
        # Project one (source, destination, label) triple onto src/dst/pred.
        return (
            df.select(src_col, dst_col, pred_col)
            .withColumnRenamed(src_col, "src")
            .withColumnRenamed(dst_col, "dst")
            .withColumnRenamed(pred_col, "pred")
        )

    v = (
        ids_of(subject)
        .union(ids_of(object1))
        .union(ids_of(object2))
        .union(ids_of(object3))
        .distinct()
    )

    e = (
        edges_between(subject, object1, predicate1)
        .union(edges_between(subject, object2, predicate2))
        .union(edges_between(object2, object3, predicate3))
        .distinct()
    )

    return v, e

In [48]:
# Derive the graph's vertices and edges from the joined politician / party table.
v, e = vertices_edges_split2(
    politicians_right_wing,
    subject="politician",
    object1="pol_cat",
    predicate1="pol_pred",
    object2="party",
    predicate2="affiliated_to",
    object3="rw_obj",
    predicate3="rw_pred",
)

In [49]:
# Build the graph from the vertex ("id") and edge ("src", "dst", "pred") DataFrames.
g = GraphFrame(v, e)

In [50]:
# Record the wall-clock start time; it is printed next to the end time after the motif query.
beginning = datetime.now()

In [None]:
# Motif: a is typed as c, a is connected to p, and p is connected to r.
# The filters pin c to the politician class, the a -> p edge to '<isAffiliatedTo>'
# and r to the right-wing parties category; rows are sorted by a and the first 20 shown.
(
    g.find("(a)-[e]->(c); (a)-[e2]->(p); (p)-[e3]->(r) ")
    .filter("c.id = '<wordnet_politician_110450303>'")
    .filter("e2.pred = '<isAffiliatedTo>'")
    .filter("r.id = '<wikicat_Right-wing_parties>'")
    .sort("a")
    .show(20)
)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                   a|                   e|                   c|                  e2|                   p|                  e3|                   r|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|[<A.N.M._Ehsanul_...|[<A.N.M._Ehsanul_...|[<wordnet_politic...|[<A.N.M._Ehsanul_...|[<Bangladesh_Nati...|[<Bangladesh_Nati...|[<wikicat_Right-w...|
|[<A._A._Wijethunga>]|[<A._A._Wijethung...|[<wordnet_politic...|[<A._A._Wijethung...|[<United_National...|[<United_National...|[<wikicat_Right-w...|
|    [<A._B._Colton>]|[<A._B._Colton>, ...|[<wordnet_politic...|[<A._B._Colton>, ...|[<Republican_Part...|[<Republican_Part...|[<wikicat_Right-w...|
|   [<A._C._Clemons>]|[<A._C._Clemons>,...|[<wordnet_politic...|[<A._C._Clemons>,...|[<Republican_Part...|

In [None]:
# Record the end time and print start and end as HH:MM:SS.
# Only the time of day is printed (no date and no elapsed duration).
ending = datetime.now()
beginning_time = beginning.strftime("%H:%M:%S")
ending_time = ending.strftime("%H:%M:%S")

print("Started at {} \n Ended at {}".format(beginning_time, ending_time))

Started at 09:40:13 
 Ended at 11:49:16


In [None]:
# NOTE(review): trivial cell; presumably a quick check that the kernel responds -- confirm or remove.
print(1)

1
