# Parallel and Distributed Processing

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.0.1.tar.gz (204.2 MB)
Collecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py): started
  Building wheel for pyspark (setup.py): still running...
  Building wheel for pyspark (setup.py): finished with status 'done'
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612248 sha256=c33ec8e108be548934baf0a128962bb4387c9ceafd1ebec2c34f006d8416cc9f
  Stored in directory: c:\users\kenec\appdata\local\pip\cache\wheels\5e\34\fa\b37b5cef503fc5148b478b2495043ba61b079120b7ff379f9b
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


In [2]:
import pyspark

In [3]:
from pyspark import SparkContext

In [4]:
sc = SparkContext()

In [5]:
nums = sc.parallelize([1,2,3,4])

In [6]:
nums.take(1)

[1]

In [9]:
squared = nums.map(lambda x: x*x).collect()

for num in squared:
    print('%i'%(num))

1
4
9
16


### Dataframes

In [10]:
from pyspark.sql import Row
from pyspark.sql import SQLContext

In [11]:
sqlContext = SQLContext(sc)

Create list of Tuple with Names and their ages

In [12]:
list_p = [('Kene',26), ('Alo', 24), ('Chisom', 30), ('Akudo', 25)]

#### Build an RDD

In [13]:
rdd = sc.parallelize(list_p)

#### Convert the Tuples

In [14]:
ppl = rdd.map(lambda x: Row(name=x[0], age= int(x[1])))

#### Create a Dataframe

In [15]:
DF_ppl = sqlContext.createDataFrame(ppl)

##### Print Schema

In [16]:
DF_ppl.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



## Word Count Exercise

#### For this exercise, we will use the works of Shakespeare. These are provided in the file shakespeare.txt

In [18]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

#sc = SparkContext(appName = "wordcount")
spark = SparkSession.Builder().getOrCreate()

text_file = sc.textFile("shakespeare.txt").map(lambda x: x.replace(',',' ').replace('.',' ').replace('-',' ').lower())

counts = text_file.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(lambda a,b: a + b)


sorted_counts = counts.sortBy(lambda wordCounts: wordCounts[1], ascending = False)

i = 0

for word, count in sorted_counts.collect()[0:30]:
    print("{} : {} : {} ".format(i, word, count))
    i += 1

0 :  : 191593 
1 : the : 17670 
2 : and : 15983 
3 : i : 13810 
4 : to : 12426 
5 : ; : 10882 
6 : of : 10787 
7 : a : 9774 
8 : you : 9429 
9 : my : 7972 
10 : : : 7339 
11 : in : 7067 
12 : that : 6973 
13 : ? : 6535 
14 : ! : 6211 
15 : is : 6153 
16 : not : 5506 
17 : me : 5206 
18 : for : 5165 
19 : it : 5098 
20 : with : 4679 
21 : be : 4665 
22 : your : 4634 
23 : this : 4338 
24 : his : 4204 
25 : but : 4138 
26 : he : 4111 
27 : as : 3935 
28 : have : 3861 
29 : will : 3455 
