In [1]:
import findspark
findspark.init()
findspark.find()
import pyspark

In [2]:
import os
import numpy as np
from pyspark.sql import SparkSession
from pyspark import SparkContext

In [3]:
# Obtain the SparkSession for this notebook, registering the application
# under the name "MapReduce" (getOrCreate reuses a session if one exists).
spark = (
    SparkSession.builder
    .appName("MapReduce")
    .getOrCreate()
)

#### You can set the default number of partitions by changing the `N` in the `local[N]` master value.

Spark can run one concurrent task for every partition of an RDD (up to the number of cores in the cluster).
If your cluster has 20 cores, you should have at least 20 partitions (in practice 2–3 times more).
On the other hand, a single partition typically shouldn't contain more than 128 MB, and a single
shuffle block cannot be larger than 2 GB.



In [4]:
# To change SparkContext properties, or to create a new SparkContext, the
# existing one has to be stopped first, e.g.:
#   sc.stop()                      # stop the existing SparkContext
#   sc = SparkContext("local[3]")  # master URL "local[3]": run locally with 3 worker threads
# Here the context already owned by the SparkSession is reused instead.
sc = spark.sparkContext

In [5]:
# Default level of parallelism of this SparkContext, i.e. the number of
# partitions used when none is requested explicitly (shown as the cell output).
sc.defaultParallelism

8

In [6]:
# Sample of the input text, already split into words. It is kept as a bare
# string literal, which has no effect when the cell runs.
# NOTE(review): the original heading called this the content of "demo.txt",
# but the file loaded below is "check.txt" - confirm which file this sample describes.
'''
['Matrix', 'factorization', 'works', 'great', 'for', 'building', 
'recommender', 'systems.', 'I', 'think', 'it', 'got', 'pretty', 
'popular', 'after', 'the', 'Netflix', 'prize', 'competition.', 
'All', 'you', 'need', 'to', 'build', 'one', 'is', 'information', 
'about', 'which', 'user', 'bought', 'or', 'rated', 'which', 'items',
'and', 'youre', 'good', 'to', 'go.', 'And', 'I', 'was', 'surprised',
'how', 'amazingly', 'simple', 'to', 'build', 'one', 'with', 'Pyspark', 
'ML', 'libraries.', 'So', "I'll", 'demonstrate', 'how', 'to', 'code', 
'one', 'up', 'quickly', 'using', 'RDDs', 'and', 'DataFrames', 'separately.']
'''
# Load the input file as an RDD with one element per line of text.
text_file = sc.textFile("check.txt")

<img src="https://miro.medium.com/max/1386/1*fs9qiYrqphSuFuKKVkEmaQ.png">

In [7]:
# Characters treated as punctuation when words are cleaned later on.
# Written as a raw string: the previous literal contained "\,", an invalid
# escape sequence that newer Python versions warn about. The characters in
# the resulting string are unchanged (the backslash is still part of it).
punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
# Turn the RDD of lines into an RDD of words with flatMap.
# split() with no argument splits on any run of whitespace, so blank lines,
# tabs and repeated spaces do not produce empty-string "words"
# (split(" ") would emit '' for each of those).
flat_map_result = text_file.flatMap(lambda line: line.split())

### Splitting Input

In [8]:
# Bring the words produced by flatMap back to the driver as a Python list
words = flat_map_result.collect()
# Redistribute that list across the available partitions as a new RDD
words_rdd = sc.parallelize(words)
# Bare last expression: the notebook displays the RDD's repr as the cell output
words_rdd

ParallelCollectionRDD[3] at readRDDFromFile at PythonRDD.scala:247

In [9]:
# Report how many partitions the words were distributed over.
print("Number of partitions: {}".format(words_rdd.getNumPartitions()))

# glom() groups the elements of each partition into one list and collect()
# brings those lists to the driver: one list of words per partition.
# Every word is then cleaned by stripping punctuation from both of its ends.
# The previous loop removed the LAST character once for every punctuation
# character found anywhere in the word, so "I'll" became "I'l" and "(word"
# became "(wor"; str.strip only removes punctuation that is actually at the
# edges and leaves the rest of the word intact.
# Note: only this driver-side copy is cleaned; words_rdd itself is unchanged.
partitions = [
    [word.strip(punctuations) for word in partition]
    for partition in words_rdd.glom().collect()
]

print()
# Print one header per partition, numbered from 1 (the cleaned words are kept
# in `partitions` but, as before, not printed).
for partition_no in range(1, len(partitions) + 1):
    print("Partition -", partition_no)
    print()

Number of partitions: 8

Partition - 1

Partition - 2

Partition - 3

Partition - 4

Partition - 5

Partition - 6

Partition - 7

Partition - 8



### Map Phase

The map phase runs in parallel and, where possible, locally on each data block. Instead of delivering terabytes of data to a program, a small user-defined program is copied onto the data servers and does everything with the data that does not require moving it between nodes (a shuffle).

In the map phase, the map function is applied to every element of the RDD. In the case below, each word in the RDD is converted into a (word, length, 1) tuple.

In [17]:
# Map step: turn every word into a (word, length of the word, 1) triple.
map_words_rdd = words_rdd.map(lambda token: (token, len(token), 1))
print("Number of partitions: {}".format(map_words_rdd.getNumPartitions()))
# One list of triples per partition, gathered on the driver
partitions = map_words_rdd.glom().collect()

print()
# Show the content of each partition, numbering partitions from 1;
# the empty string at the end produces the blank separator line.
for position, partition in enumerate(partitions, start=1):
    print("Partition - {}".format(position), partition, "", sep="\n")

Number of partitions: 8

Partition - 1
[]

Partition - 2
[('world!', 6, 1)]

Partition - 3
[]

Partition - 4
[('text.', 5, 1)]

Partition - 5
[]

Partition - 6
[('text', 4, 1)]

Partition - 7
[]

Partition - 8
[('okay', 4, 1)]



### Reduce Phase

This phase complements Map with aggregate operations.

In the reduce phase, we calculate count value by adding values of the same key.

In [15]:
# map_words_rdd holds (word, length, 1) triples, but reduceByKey operates on
# (key, value) pairs, so calling it directly on the triples fails when the
# notebook is run top to bottom on a fresh kernel. Re-key every record as
# (length, 1) first, then sum the 1s of all records that share a key.
# The result is the number of words per word length.
reduce_words_rdd = (
    map_words_rdd
    .map(lambda record: (record[1], record[2]))
    .reduceByKey(lambda a, b: a + b)
)
print("Number of partitions: {}".format(reduce_words_rdd.getNumPartitions()))
# One list of (length, count) pairs per partition, gathered on the driver
partitions = reduce_words_rdd.glom().collect()
print()
# Show the content of each partition, numbering partitions from 1
for partition_no, partition in enumerate(partitions, start=1):
    print("Partition -", partition_no)
    print(partition)
    print()

Number of partitions: 8

Partition - 1
[]

Partition - 2
[]

Partition - 3
[]

Partition - 4
[]

Partition - 5
[(4, 2)]

Partition - 6
[(5, 1)]

Partition - 7
[(6, 1)]

Partition - 8
[]



### Collect all partitions

In this phase, all partitions are combined into one.

In [16]:
# combine all partitions into one partition
counts = reduce_words_rdd.coalesce(1)

print("Number of partitions: {}".format(counts.getNumPartitions()))
partitions = counts.glom().collect()
print()
count = 1
for partition in partitions:
    print("Partition -",count)
    print(partition)
    print()
    count+=1

Number of partitions: 1

Partition - 1
[(4, 2), (5, 1), (6, 1)]

