# CS 5683 - Big Data Analytics
## Assignment - 1: Intro. to Spark and RDD

###### Use Google Colab to run this notebook
###### Let's set up Spark first

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 64 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 64.7 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=bdc8e8b6637fa769027f953b2702653702a6860f3e55cd127ccfa3e6eced40a1
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


###### Import required libraries now

In [3]:
import sys
import re
 
from pyspark import SparkContext, SparkConf

# Mount Google Drive at /content/drive (Colab only) so that the input files
# referenced later under /content/drive/MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


###### Let's initialize Spark context now

In [4]:
# Create the Spark context with the necessary configuration (local master, app name).
# getOrCreate() returns the context that is already running instead of building a
# second one, so re-executing this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
spark_conf = SparkConf().setMaster("local").setAppName("PySpark - CS5683 - Assignment-1")
sc = SparkContext.getOrCreate(spark_conf)

###### Follow the tutorial to mount your Google Drive, then give the paths of your files on the mounted Drive below

In [10]:
# Paths of the two input **.txt files** on the mounted Drive.
# Point them at your own files if those suit your application better.
# ALL of the questions below take their input(s) from these two paths.
file1, file2 = (
    '/content/drive/MyDrive/file1.txt',
    '/content/drive/MyDrive/file2.txt',
)

### Example Spark program

In [None]:
# Example Spark application: a simple word count.
# Word count = for a given file, the frequency of every word that appears in it.

# Step-1: Load the input file (file1 here; file2 works the same way).
# NOTE: The SparkContext `sc` created above is reused; only one SparkContext can be
#       initialized per notebook, so it must not be created again.
fileRDD = sc.textFile(file1)

# Step-2: Every line of the file is a sentence, so split it on ' ' to obtain its words.
#   map() would return RDD[list]; flatMap() flattens those lists into the RDD[string] we need.
wordsRDD = fileRDD.flatMap(lambda sentence: sentence.split(" "))  # <----------- TEST what happens when you use map()

# Step-3: Turn each word into a (K, V) pair with K = the word and V = 1.
pairRDD = wordsRDD.map(lambda token: (token, 1))
pairRDD.collect()

# Step-4: Sum the 1's of each word.
# NOTE: A word may be present in several data partitions, so reduceByKey() is used
#       to group by key and add the values up.
# countRDD = pairRDD.reduceByKey(lambda a,b: a+b)

# Step-5: Save the results in a text file.
# countRDD.saveAsTextFile('/content/test') # <----------- GIVE FILE PATH

### Question - 1 (10 points) 

In [None]:
# YOUR CODE for Question-1 HERE

In [29]:
# Question 1
# For file1, count how often each special character appears as the FIRST character
# of a word. Emoticons are excluded from the count.

# Step-1: Read file1 (reuses the SparkContext `sc` created earlier in the notebook).
fileRDD = sc.textFile(file1)

# Step-2: Split every line on ' ' to get the words as RDD[string] using flatMap().
wordsRDD = fileRDD.flatMap(lambda line: line.split(" "))

# Step-3: Keep only the first character of each word ('' for an empty token).
lettersRDD = wordsRDD.map(lambda word: word[:1])

# Step-4: Drop empty tokens and emoticons.
# filter() states the intent directly; the earlier flatMap() over the substituted
# string only worked because every element is at most one character long.
EMOJI_Kill = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
noemojiRDD = lettersRDD.filter(lambda ch: ch != "" and EMOJI_Kill.match(ch) is None)

# Step-5: Keep only special characters, i.e. characters matched by \W
# (anything that is not a letter, a digit or an underscore).
# The pattern is a raw string: "\W" in a normal string is an invalid escape sequence.
SPECIAL_CHAR = re.compile(r"\W")
filteredRDD = noemojiRDD.filter(lambda ch: SPECIAL_CHAR.match(ch) is not None)

# Step-6: Make a (K, V) pair for each character, where K is the character and V is 1.
pairRDD = filteredRDD.map(lambda ch: (ch, 1))

# Step-7: Use reduceByKey() to group by key and sum the 1's.
countRDD = pairRDD.reduceByKey(lambda a, b: a + b)
# countRDD.collect()

# Step-8: Save the results in a text file.
# NOTE: saveAsTextFile() raises an error when the output directory already exists;
#       remove it (or choose another path) before running this cell again.
countRDD.saveAsTextFile('/content/test1') # <----------- GIVE FILE PATH

In [None]:
# PRINT THE OUTPUT HERE
[('#', 9), ('@', 5), ('!', 2), ('[', 1), (']', 1), ('"', 4)]

### Question - 2 (10 points)

In [None]:
# YOUR CODE for Question-2 HERE

In [38]:
# Question 2
# For every non-empty line of file1, count its distinct words (case-insensitive)
# and emit (line number, number of distinct words) pairs.

# Step-1: Read file1 (reuses the SparkContext `sc` created earlier in the notebook).
fileRDD = sc.textFile(file1)

# Step-2: textFile() already yields one record per line, so no further split on '\n'
# is needed; only the empty lines are dropped.
linesplitRDD = fileRDD.filter(lambda line: line)

# Step-3: Lower-case each line and split it on ' ' to get [[words], [words], ...].
# Empty tokens (produced by repeated, leading or trailing spaces) are discarded so
# that '' is never counted as a word.
spacesplitRDD = linesplitRDD.map(
    lambda line: [word for word in line.lower().split(" ") if word]
)

# Step-4: Convert each word list to a set to remove duplicated words.
setRDD = spacesplitRDD.map(set)

# Step-5: Count the unique words of each line.
distRDD = setRDD.map(len)

# Step-6: Attach a 1-based line number: (count, index) -> (index + 1, count).
# NOTE: the numbering covers only the non-empty lines kept in Step-2.
indexRDD = distRDD.zipWithIndex().map(lambda pair: (pair[1] + 1, pair[0]))

# indexRDD.collect()
# Step-7: Save the results in a text file.
# NOTE: saveAsTextFile() raises an error when the output directory already exists;
#       remove it (or choose another path) before running this cell again.
indexRDD.saveAsTextFile('/content/test2') # <----------- GIVE FILE PATH

In [None]:
# PRINT THE OUTPUT HERE

[(1, 5),
 (2, 3),
 (3, 10),
 (4, 4),
 (5, 4),
 (6, 4),
 (7, 4),
 (8, 4),
 (9, 4),
 (10, 4),
 (11, 4),
 (12, 4),
 (13, 4),
 (14, 4),
 (15, 4),
 (16, 4),
 (17, 4),
 (18, 4),
 (19, 4),
 (20, 4),
 (21, 4),
 (22, 4),
 (23, 4),
 (24, 4),
 (25, 4),
 (26, 4),
 (27, 4),
 (28, 4),
 (29, 4),
 (30, 4),
 (31, 4),
 (32, 4),
 (33, 4),
 (34, 4),
 (35, 4),
 (36, 4),
 (37, 4),
 (38, 4),
 (39, 4),
 (40, 4),
 (41, 4),
 (42, 4),
 (43, 4),
 (44, 4),
 (45, 4),
 (46, 4),
 (47, 4),
 (48, 4),
 (49, 4),
 (50, 4),
 (51, 4),
 (52, 4),
 (53, 4),
 (54, 4),
 (55, 4),
 (56, 4),
 (57, 4),
 (58, 4),
 (59, 4),
 (60, 4),
 (61, 4),
 (62, 4),
 (63, 4),
 (64, 4),
 (65, 4),
 (66, 4),
 (67, 4),
 (68, 4),
 (69, 4),
 (70, 4),
 (71, 4),
 (72, 4),
 (73, 4),
 (74, 4),
 (75, 4),
 (76, 4),
 (77, 4),
 (78, 4),
 (79, 4),
 (80, 4),
 (81, 4),
 (82, 4),
 (83, 4),
 (84, 4),
 (85, 4),
 (86, 4),
 (87, 4),
 (88, 4),
 (89, 4),
 (90, 3),
 (91, 4)]

### Question - 3 (10 points)

In [None]:
# YOUR CODE for Question-3 HERE

In [45]:
# Question 3
# For every word that occurs in BOTH files, produce
# (word, (count in file1, count in file2)), sorted by word in ascending order.
# Words are compared case-insensitively.


def count_words(path):
    """Return an RDD of (lower-cased word, occurrences) for the text file at `path`.

    Lines are split on ' ' and empty tokens are dropped before counting.
    Uses the notebook-wide SparkContext `sc`.
    """
    return (
        sc.textFile(path)
        # Split every line on ' ' to get the words as RDD[string].
        .flatMap(lambda line: line.split(" "))
        # Drop the empty tokens produced by repeated spaces or empty lines.
        .filter(lambda word: word)
        # Make a (K, V) pair per word, where K is the lower-cased word and V is 1.
        .map(lambda word: (word.lower(), 1))
        # Group by key and sum the 1's.
        .reduceByKey(lambda a, b: a + b)
    )


# Step-1: Build the word counts of file1 and file2 with the same pipeline.
countRDD1 = count_words(file1)
countRDD2 = count_words(file2)

# Step-2: Join the two RDDs (join() keeps only the keys present in both) and sort
# them in ascending order by key.
sortRDD = countRDD1.join(countRDD2).sortByKey()

# sortRDD.collect()
# Step-3: Save the results in a text file.
# NOTE: saveAsTextFile() raises an error when the output directory already exists;
#       remove it (or choose another path) before running this cell again.
sortRDD.saveAsTextFile('/content/test3') # <----------- GIVE FILE PATH

In [None]:
# PRINT THE OUTPUT HERE
[('!d', (1, 1)),
 ('!test', (1, 1)),
 ('#', (1, 1)),
 ('#test', (4, 4)),
 ('@', (1, 1)),
 ('@@', (1, 1)),
 ('@f', (1, 1)),
 ('@hard', (1, 1)),
 ('@life', (1, 1)),
 ('[', (1, 1)),
 (']', (1, 1)),
 ('abandon', (1, 2)),
 ('ability', (1, 2)),
 ('able', (1, 2)),
 ('abortion', (1, 2)),
 ('about', (1, 2)),
 ('above', (1, 2)),
 ('abroad', (1, 2)),
 ('absence', (1, 2)),
 ('absolute', (1, 2)),
 ('absolutely', (1, 2)),
 ('absorb', (1, 2)),
 ('abuse', (1, 2)),
 ('academic', (1, 2)),
 ('accept', (1, 2)),
 ('access', (1, 2)),
 ('accident', (1, 2)),
 ('accompany', (1, 2)),
 ('accomplish', (1, 2)),
 ('according', (1, 2)),
 ('account', (1, 2)),
 ('accurate', (1, 2)),
 ('accuse', (1, 2)),
 ('achieve', (1, 2)),
 ('achievement', (1, 2)),
 ('acid', (1, 2)),
 ('acknowledge', (1, 2)),
 ('acquire', (1, 2)),
 ('across', (1, 2)),
 ('act', (1, 2)),
 ('action', (1, 2)),
 ('active', (1, 2)),
 ('activist', (1, 2)),
 ('activity', (1, 2)),
 ('actor', (1, 2)),
 ('actress', (1, 2)),
 ('actual', (1, 2)),
 ('actually', (1, 2)),
 ('ad', (1, 2)),
 ('adapt', (1, 2)),
 ('add', (1, 2)),
 ('addition', (1, 2)),
 ('additional', (1, 2)),
 ('address', (1, 2)),
 ('adequate', (1, 2)),
 ('adjust', (1, 2)),
 ('adjustment', (1, 2)),
 ('administration', (1, 2)),
 ('administrator', (1, 2)),
 ('admire', (1, 2)),
 ('admission', (1, 2)),
 ('admit', (1, 2)),
 ('adolescent', (1, 2)),
 ('adopt', (1, 2)),
 ('adult', (1, 2)),
 ('advance', (1, 2)),
 ('advanced', (1, 2)),
 ('advantage', (1, 2)),
 ('adventure', (1, 2)),
 ('advertising', (1, 2)),
 ('advice', (1, 2)),
 ('advise', (1, 2)),
 ('adviser', (1, 2)),
 ('advocate', (1, 2)),
 ('affair', (1, 2)),
 ('affect', (1, 2)),
 ('afford', (1, 2)),
 ('afraid', (1, 2)),
 ('african', (1, 2)),
 ('african-american', (1, 2)),
 ('after', (1, 2)),
 ('afternoon', (1, 2)),
 ('again', (1, 2)),
 ('against', (1, 2)),
 ('age', (1, 2)),
 ('agency', (1, 2)),
 ('agenda', (1, 2)),
 ('agent', (1, 2)),
 ('aggressive', (1, 2)),
 ('ago', (1, 2)),
 ('agree', (1, 2)),
 ('agreement', (1, 2)),
 ('agricultural', (1, 2)),
 ('ah', (1, 2)),
 ('ahead', (1, 2)),
 ('aid', (1, 2)),
 ('aide', (1, 2)),
 ('amazing', (1, 1)),
 ('american', (1, 1)),
 ('among', (1, 1)),
 ('amount', (1, 1)),
 ('analysis', (1, 1)),
 ('analyst', (1, 1)),
 ('analyze', (1, 1)),
 ('ancient', (1, 1)),
 ('and', (1, 1)),
 ('anger', (1, 1)),
 ('angle', (1, 1)),
 ('angry', (1, 1)),
 ('animal', (1, 1)),
 ('anniversary', (1, 1)),
 ('announce', (1, 1)),
 ('annual', (1, 1)),
 ('another', (1, 1)),
 ('answer', (1, 1)),
 ('anticipate', (1, 1)),
 ('anxiety', (1, 1)),
 ('any', (1, 1)),
 ('anybody', (1, 1)),
 ('anymore', (1, 1)),
 ('anyone', (1, 1)),
 ('anything', (1, 1)),
 ('anyway', (1, 1)),
 ('anywhere', (1, 1)),
 ('apart', (1, 1)),
 ('apartment', (1, 1)),
 ('apparent', (1, 1)),
 ('apparently', (1, 1)),
 ('appeal', (1, 1)),
 ('appear', (1, 1)),
 ('appearance', (1, 1)),
 ('apple', (1, 1)),
 ('application', (1, 1)),
 ('apply', (1, 1)),
 ('appoint', (1, 1)),
 ('appointment', (1, 1)),
 ('appreciate', (1, 1)),
 ('approach', (1, 1)),
 ('appropriate', (1, 1)),
 ('approval', (1, 1)),
 ('approve', (1, 1)),
 ('approximately', (1, 1)),
 ('arab', (1, 1)),
 ('architect', (1, 1)),
 ('area', (1, 1)),
 ('argue', (1, 1)),
 ('argument', (1, 1)),
 ('arise', (1, 1)),
 ('arm', (1, 1)),
 ('armed', (1, 1)),
 ('army', (1, 1)),
 ('around', (1, 1)),
 ('arrange', (1, 1)),
 ('arrangement', (1, 1)),
 ('arrest', (1, 1)),
 ('arrival', (1, 1)),
 ('arrive', (1, 1)),
 ('art', (1, 1)),
 ('article', (2, 2)),
 ('artist', (1, 1)),
 ('artistic', (1, 1)),
 ('asian', (2, 1)),
 ('aside', (2, 1)),
 ('ask', (2, 1)),
 ('asleep', (2, 1)),
 ('aspect', (2, 1)),
 ('assault', (2, 1)),
 ('assert', (2, 1)),
 ('assess', (2, 1)),
 ('assessment', (2, 1)),
 ('asset', (2, 1)),
 ('assign', (2, 1)),
 ('assignment', (2, 1)),
 ('assist', (2, 1)),
 ('assistance', (2, 1)),
 ('assistant', (2, 1)),
 ('associate', (2, 1)),
 ('association', (2, 1)),
 ('assume', (2, 1)),
 ('assumption', (2, 1)),
 ('assure', (2, 1)),
 ('at', (2, 1)),
 ('athlete', (1, 1)),
 ('athletic', (1, 1)),
 ('atmosphere', (1, 1)),
 ('attach', (1, 1)),
 ('attack', (1, 1)),
 ('attempt', (1, 1)),
 ('attend', (1, 1)),
 ('attention', (1, 1)),
 ('attitude', (1, 1)),
 ('attorney', (1, 1)),
 ('attract', (1, 1)),
 ('attractive', (1, 1)),
 ('attribute', (1, 1)),
 ('audience', (1, 1)),
 ('author', (1, 1)),
 ('authority', (1, 1)),
 ('auto', (1, 1)),
 ('available', (1, 1)),
 ('average', (1, 1)),
 ('avoid', (1, 1)),
 ('award', (1, 1)),
 ('aware', (1, 1)),
 ('awareness', (1, 1)),
 ('away', (1, 1)),
 ('awful', (1, 1)),
 ('baby', (1, 1)),
 ('back', (1, 1)),
 ('background', (1, 1)),
 ('bad', (1, 1)),
 ('badly', (1, 1)),
 ('bag', (1, 1)),
 ('bake', (1, 1)),
 ('balance', (1, 1)),
 ('ball', (1, 1)),
 ('ban', (1, 1)),
 ('band', (1, 1)),
 ('bank', (1, 1)),
 ('bar', (1, 1)),
 ('barely', (1, 1)),
 ('barrel', (1, 1)),
 ('barrier', (1, 1)),
 ('base', (1, 1)),
 ('baseball', (1, 1)),
 ('basic', (1, 1)),
 ('basically', (1, 1)),
 ('basis', (1, 1)),
 ('basket', (1, 1)),
 ('basketball', (1, 1)),
 ('bathroom', (1, 1)),
 ('battery', (1, 1)),
 ('battle', (1, 1)),
 ('be', (1, 1)),
 ('beach', (1, 1)),
 ('bean', (1, 1)),
 ('bear', (1, 1)),
 ('beat', (1, 1)),
 ('beautiful', (1, 1)),
 ('beauty', (1, 1)),
 ('because', (1, 1)),
 ('become', (1, 1)),
 ('bed', (1, 1)),
 ('bedroom', (1, 1)),
 ('beer', (1, 1)),
 ('before', (1, 1)),
 ('s#df', (1, 1)),
 ('sdfs', (1, 1)),
 ('test!test', (1, 1)),
 ('wyed', (1, 1))]

### WHAT TO TURN-IN IN CANVAS

# Due Date: Sept. 1 at 11:59pm