Skip to content

Commit

Permalink
SPARK-1165 Implemented RDD.intersection in python.
Browse files Browse the repository at this point in the history
  • Loading branch information
ScrapCodes committed Mar 5, 2014
1 parent 0283665 commit d6effee
Showing 1 changed file with 13 additions and 0 deletions.
13 changes: 13 additions & 0 deletions python/pyspark/rdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,19 @@ def union(self, other):
return RDD(self_copy._jrdd.union(other_copy._jrdd), self.ctx,
self.ctx.serializer)

def intersection(self, other):
"""
Return the intersection of this RDD and another one.
>>> rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5])
>>> rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8])
>>> rdd1.intersection(rdd2).collect()
[1, 2, 3]
"""
return self.map(lambda v: (v, None)).cogroup(
other.map(lambda v: (v, None))).filter(
lambda x: (len(x[1][0]) != 0) and (len(x[1][1]) != 0)).keys()

def _reserialize(self):
if self._jrdd_deserializer == self.ctx.serializer:
return self
Expand Down

0 comments on commit d6effee

Please sign in to comment.