In [3]:
# import libraries from pyspark 
from pyspark import SparkConf, SparkContext

# Spark configuration: run against the "local" master on this machine and
# label the application so it is easy to identify in the Spark UI and logs.
conf = SparkConf().setMaster("local").setAppName("transformations")

# Reuse the SparkContext that is already running in this kernel, if any;
# otherwise start a new one using the configuration above.
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
# Base RDD for the single-value examples: the integers 1 through 10.
rdd = sc.parallelize(list(range(1, 11)))

In [5]:
# Evaluating the bare name shows the RDD object's repr, not its elements.
rdd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:289

In [6]:
# collect() is an action: it returns the RDD's elements to the driver as a list.
rdd.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [7]:
# map(): apply the function to every element, one output per input (squares).
mapped_rdd = rdd.map(lambda n: n ** 2)
print(mapped_rdd.collect())

[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]


In [8]:
# map() keeps whatever the function returns as a single element, so returning
# a one-item list yields an RDD of lists (contrast with flatMap further down).
mapped_rdd = rdd.map(lambda n: [n ** 2])
print(mapped_rdd.collect())

[[1], [4], [9], [16], [25], [36], [49], [64], [81], [100]]


In [9]:
# filter(): keep only the elements for which the predicate is truthy (evens).
filtered_rdd = rdd.filter(lambda n: not n % 2)
print(filtered_rdd.collect())

[2, 4, 6, 8, 10]


In [10]:
# flatMap(): the function returns an iterable per element and the results are
# flattened into one RDD - here each number is followed by its square.
flatmapped_rdd = rdd.flatMap(lambda n: (n, n ** 2))
print(flatmapped_rdd.collect())

[1, 1, 2, 4, 3, 9, 4, 16, 5, 25, 6, 36, 7, 49, 8, 64, 9, 81, 10, 100]


In [11]:
# flatMap() with a one-item iterable: the wrapper is flattened away, so the
# result matches the plain map() of squares rather than the map() of lists.
flatmapped_rdd = rdd.flatMap(lambda n: (n ** 2,))
print(flatmapped_rdd.collect())

[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]


In [12]:
# sample(withReplacement, fraction, seed): draw a random subset without
# replacement. `fraction` is a per-element probability, not an exact size,
# and the fixed seed makes the draw repeatable.
sampled_rdd = rdd.sample(False, 0.3, 42)
print(sampled_rdd.collect())

[1, 2, 4, 5, 8]


In [13]:
# Union: concatenate two RDDs (duplicates, if any, are kept).
# `a + b` on RDDs is shorthand for `a.union(b)`.
another_rdd = sc.parallelize(list(range(11, 15)))
union_rdd = rdd + another_rdd
print(union_rdd.collect())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]


In [14]:
# distinct(): drop repeated elements so that each value appears once.
rdd_with_duplicates = sc.parallelize(
    [1, 2, 2, 3, 3, 3, 4, 5, 5, 6]
)
distinct_rdd = rdd_with_duplicates.distinct()
print(distinct_rdd.collect())

[1, 2, 3, 4, 5, 6]


In [15]:
# NOTE: from here on `rdd` is rebound to an RDD of (key, value) pairs, which
# is the shape the key-based transformations below operate on.
rdd = sc.parallelize(
    [
        ("a", 1),
        ("b", 2),
        ("a", 3),
        ("b", 4),
        ("c", 5),
        ("c", 6),
    ]
)

In [16]:
# groupByKey(): gather every value that shares a key into one iterable.
grouped_rdd = rdd.groupByKey()

# Each collect() call launches a separate Spark job, so collect once and reuse
# the driver-side result for all three printouts. mapValues(list) turns each
# group's iterable into a plain list before it is returned.
grouped = grouped_rdd.mapValues(list).collect()

# The whole result as a single list of (key, values) tuples.
print(grouped)

# One line per key: key followed by its list of values.
for k, v in grouped:
    print(k, v)

# One line per key, shown as a two-item list.
for k, v in grouped:
    print([k, v])

[('a', [1, 3]), ('b', [2, 4]), ('c', [5, 6])]
a [1, 3]
b [2, 4]
c [5, 6]
['a', [1, 3]]
['b', [2, 4]]
['c', [5, 6]]


In [17]:
# reduceByKey(): merge the values of each key with a binary function (a sum).
reduced_rdd = rdd.reduceByKey(lambda total, value: total + value)
print(reduced_rdd.collect())

[('a', 4), ('b', 6), ('c', 11)]


In [18]:
# sortByKey(): order the pairs by key; ascending order is the default.
sorted_rdd = rdd.sortByKey()
print(sorted_rdd.collect())

[('a', 1), ('a', 3), ('b', 2), ('b', 4), ('c', 5), ('c', 6)]


In [19]:
# Lookup RDD holding exactly one value per key.
rdd2 = sc.parallelize(
    [
        ("a", "valuefora"),
        ("b", "valueforb"),
        ("c", "valueforc"),
    ]
)

# join(): inner join on key - one (key, (left, right)) pair for every
# combination of matching values from the two RDDs.
joined_rdd = rdd.join(rdd2)
print(joined_rdd.collect())

[('b', (2, 'valueforb')), ('b', (4, 'valueforb')), ('c', (5, 'valueforc')), ('c', (6, 'valueforc')), ('a', (1, 'valuefora')), ('a', (3, 'valuefora'))]


In [20]:
# Second pair RDD; note that key "a" appears twice.
rdd3 = sc.parallelize(
    [
        ("a", 100),
        ("b", 200),
        ("c", 300),
        ("a", 400),
    ]
)

# cogroup(): for each key, pair up the values from `rdd` with the values
# from `rdd3` as (key, (values_from_rdd, values_from_rdd3)).
cogroup_rdd = rdd.cogroup(rdd3)

for key, groups in cogroup_rdd.collect():
    left_values, right_values = map(list, groups)
    print(key, left_values, right_values)

b [2, 4] [200]
c [5, 6] [300]
a [1, 3] [100, 400]


In [21]:
# cartesian(): every element of `rdd` paired with every element of `rdd4`.
rdd4 = sc.parallelize(list("xy"))

cartesian_rdd = rdd.cartesian(rdd4)
print(cartesian_rdd.collect())

[(('a', 1), 'x'), (('a', 1), 'y'), (('b', 2), 'x'), (('b', 2), 'y'), (('a', 3), 'x'), (('a', 3), 'y'), (('b', 4), 'x'), (('b', 4), 'y'), (('c', 5), 'x'), (('c', 5), 'y'), (('c', 6), 'x'), (('c', 6), 'y')]


In [22]:
# Shut the SparkContext down when finished. Referencing `sc` raises NameError
# if the setup cell never ran, in which case there is nothing to stop.
try:
    sc
except NameError:
    print("spark context does not exist - nothing to stop")
else:
    sc.stop()