Permalink
Browse files

SPARK-1165 Implemented RDD.intersection in Python.

  • Loading branch information...
1 parent 0283665 commit d6effee4ee967f15210d0d57526beab4e3f9c8e2 @ScrapCodes ScrapCodes committed Mar 5, 2014
Showing with 13 additions and 0 deletions.
  1. +13 −0 python/pyspark/rdd.py
View
@@ -319,6 +319,19 @@ def union(self, other):
return RDD(self_copy._jrdd.union(other_copy._jrdd), self.ctx,
self.ctx.serializer)
def intersection(self, other):
    """
    Return a new RDD holding the elements present in both this RDD
    and the other one.

    Every element of each RDD is paired with a placeholder value so it
    can act as a key; the two keyed RDDs are cogrouped, and only keys
    whose groups are non-empty on both sides are returned.

    >>> rdd1 = sc.parallelize([1, 10, 2, 3, 4, 5])
    >>> rdd2 = sc.parallelize([1, 6, 2, 3, 7, 8])
    >>> rdd1.intersection(rdd2).collect()
    [1, 2, 3]
    """
    def as_key(element):
        # The value half is never read; only the key matters.
        return (element, None)

    def seen_on_both_sides(entry):
        # entry is (key, groups): groups[0] comes from this RDD,
        # groups[1] from the other one.
        groups = entry[1]
        return not (len(groups[0]) == 0 or len(groups[1]) == 0)

    keyed_self = self.map(as_key)
    keyed_other = other.map(as_key)
    grouped = keyed_self.cogroup(keyed_other)
    return grouped.filter(seen_on_both_sides).keys()
+
def _reserialize(self):
if self._jrdd_deserializer == self.ctx.serializer:
return self

0 comments on commit d6effee

Please sign in to comment.