# Join Operation on RDD in Spark


In [1]:
# Build four small RDDs: two holding string values and two holding the
# integers that later serve as keys when the RDDs are zipped and joined.

valueRDDA = sc.parallelize(["a", "b", "c", "d", "e"])
valueRDDB = sc.parallelize(["AA", "BB", "CC", "DD"])

rddB = sc.parallelize([1, 1, 5, 2, 3])
rddC = sc.parallelize([1, 5, 5, 6])

# Show the contents of every RDD, one collected list per line.
print(
    *(each.collect() for each in (valueRDDA, valueRDDB, rddB, rddC)),
    sep="\n",
)

['a', 'b', 'c', 'd', 'e']
['AA', 'BB', 'CC', 'DD']
[1, 1, 5, 2, 3]
[1, 5, 5, 6]


# Zip operation on RDD

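We can zip the RDDs above into key–value (pair) RDDs: the numbers in `rddB` and `rddC` become the keys, and the strings in `valueRDDA` and `valueRDDB` become the values.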

In [2]:
# Pair each key RDD with a value RDD, element by element: the numbers become
# the keys and the strings become the values of the resulting pair RDDs.
# NOTE: zip matches elements by position, so both inputs must line up
# (same number of partitions and of elements per partition -- see the
# RDD.zip documentation).

rdd1, rdd2 = rddB.zip(valueRDDA), rddC.zip(valueRDDB)

# One line per RDD, formatted as "<label> <collected pairs>".
print("\n".join(
    "{} {}".format(label, pairs.collect())
    for label, pairs in (("RDD 1 : ", rdd1), ("RDD 2 : ", rdd2))
))


RDD 1 :  [(1, 'a'), (1, 'b'), (5, 'c'), (2, 'd'), (3, 'e')]
RDD 2 :  [(1, 'AA'), (5, 'BB'), (5, 'CC'), (6, 'DD')]


In [3]:
# Compare what the different join operations return, starting with the
# plain (inner) join.
rdd1.join(rdd2).collect()

# join keeps only the keys that occur in both RDDs (here 1 and 5) and emits
# one (key, (left_value, right_value)) pair per matching combination.

[(1, ('a', 'AA')), (1, ('b', 'AA')), (5, ('c', 'BB')), (5, ('c', 'CC'))]

In [4]:
rdd1.leftOuterJoin(rdd2).collect()

# leftOuterJoin keeps every key of the left RDD (rdd1): matched keys behave
# as in join, and keys missing from rdd2 (here 2 and 3) get None as the
# right-hand value.

[(1, ('a', 'AA')),
 (1, ('b', 'AA')),
 (2, ('d', None)),
 (3, ('e', None)),
 (5, ('c', 'BB')),
 (5, ('c', 'CC'))]

In [5]:
rdd1.rightOuterJoin(rdd2).collect()
# rightOuterJoin keeps every key of the right RDD (rdd2): matched keys behave
# as in join, and keys missing from rdd1 (here 6) get None as the left-hand
# value.

[(1, ('a', 'AA')),
 (1, ('b', 'AA')),
 (5, ('c', 'BB')),
 (5, ('c', 'CC')),
 (6, (None, 'DD'))]

In [6]:

rdd1.fullOuterJoin(rdd2).collect()

# fullOuterJoin keeps every key found in either RDD; None fills in whichever
# side has no matching value (here the right side for keys 2 and 3, and the
# left side for key 6).

[(1, ('a', 'AA')),
 (1, ('b', 'AA')),
 (2, ('d', None)),
 (3, ('e', None)),
 (5, ('c', 'BB')),
 (5, ('c', 'CC')),
 (6, (None, 'DD'))]

In [7]:
# If the keys are no longer needed after the join, drop them and keep only
# the (left_value, right_value) pairs. values() is the built-in shorthand for
# the map transformation map(lambda kv: kv[1]).

result = rdd1.join(rdd2).values()
result.collect()


[('a', 'AA'), ('b', 'AA'), ('c', 'BB'), ('c', 'CC')]