In [None]:
#Step01 -- Initializing Spark

from pyspark import SparkContext, SparkConf 
from pyspark.sql import SparkSession

# Describe the context we want *before* asking for one, so the settings are
# actually used whenever a new context has to be created.
# local[*]: run Spark locally with as many worker threads as logical cores on
# this machine. To run with 'k' worker threads instead, use `local[k]`.
# The application name is the name shown on the Spark cluster UI.
conf = SparkConf().setMaster("local[*]").setAppName("FIT5202 Assignment 1 Part-A")

# getOrCreate() avoids the error "Cannot run multiple SparkContexts at once":
# an already running context is reused, otherwise a new one is created from
# `conf`. It never returns None, so no follow-up `if sc is None` check is needed.
sc = SparkContext.getOrCreate(conf=conf)

spark = SparkSession(sparkContext=sc)

In [None]:
#Step02 -- Create RDD's

#Renamed "Agile Processess in Software Engineering and Extreme Programming" as Agile.txt
#Renamed "Scrum Handbook" as Scrum.txt

# Load each renamed book through the Spark context as a text-file RDD
agileRdd = sc.textFile('Agile.txt')
scrumRdd = sc.textFile('Scrum.txt')

# Report the number of lines found in each book
agileLineCount = agileRdd.count()
print("Number of lines in Agile Processess in Software Engineering and Extreme Programming are ", agileLineCount)

scrumLineCount = scrumRdd.count()
print("Number of lines in Scrum Handbook are ", scrumLineCount)

In [None]:
#Step 03 -- Cleaning and Manipulating text

#Import re package to use regex
import re  

def function(inputRDD):
    """Normalise one line of text so it can be split into words.

    Despite its name, `inputRDD` is a single string (one element of the RDD
    this function is mapped over), not an RDD.

    Steps:
      1. drop every character that is neither an ASCII letter nor whitespace,
      2. convert the remaining text to lower case,
      3. strip leading and trailing whitespace.

    Lines that contain nothing usable (including a line that is just " ")
    come back as the empty string, which the caller filters out.

    Args:
        inputRDD: the line of text to clean.

    Returns:
        The cleaned, lower-cased, stripped line (possibly '').
    """
    # Raw string so that `\s` reaches the regex engine as written; the `re`
    # module caches the compiled pattern, so calling this per line is cheap.
    lettersOnly = re.sub(r'[^a-zA-Z\s]', '', inputRDD)
    return lettersOnly.lower().strip()
     
# Clean every line of the Agile book, then drop the lines that ended up empty
newAgileRDD = agileRdd.map(function)
newAgileRDD1 = newAgileRDD.filter(lambda line: len(line) > 0)

agileCleanLines = newAgileRDD1.collect()
print("New Agile result: \n", agileCleanLines)

In [None]:
#Step 03 -- Cleaning and Manipulating text for Scrum file

# Clean every line of the Scrum book, then drop the lines that ended up empty
newScrumRDD = scrumRdd.map(function)
newScrumRDD1 = newScrumRDD.filter(lambda line: len(line) > 0)

scrumCleanLines = newScrumRDD1.collect()
print("New Scrum result: \n", scrumCleanLines)

In [None]:
#Step 04 -- Transformation and Action for Agile

# Break each cleaned line into space-separated tokens and discard empty tokens
agileWords = newAgileRDD1.flatMap(lambda line: line.split(" "))
agileNewWords = agileWords.filter(lambda token: len(token) > 0)

# Pair every word with 1 and add the ones up per word to get its frequency
agileResult = (
    agileNewWords
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)

# Order the (word, count) pairs from the highest count to the lowest
agileSortedResult = agileResult.sortBy(lambda pair: pair[1], ascending=False)

print("The 20 most frequently used words in Agile Processess in Software Engineering and Extreme Programming are: ")
print("*****************************************************************************")

# Cell output: the first 20 (word, count) pairs
agileSortedResult.take(20)

In [None]:
#Step 04 -- Transformation and Action for Scrum

# Break each cleaned line into space-separated tokens and discard empty tokens
scrumWords = newScrumRDD1.flatMap(lambda line: line.split(" "))
scrumNewWords = scrumWords.filter(lambda token: len(token) > 0)

# Pair every word with 1 and add the ones up per word to get its frequency
scrumResult = (
    scrumNewWords
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)

# Order the (word, count) pairs from the highest count to the lowest
scrumSortedResult = scrumResult.sortBy(lambda pair: pair[1], ascending=False)

print("The 20 most frequently used words in Scrum Handbook are: ")
print("*****************************************************************************")

# Cell output: the first 20 (word, count) pairs from the Scrum file
scrumSortedResult.take(20)

In [None]:
#Step 05 - Removing stop words

# Import the nltk package; its English stop-word list is used in the following
# cells to remove stop words from the word RDDs.

import nltk        
# Request the 'stopwords' resource that `nltk.corpus.stopwords` reads later on.
nltk.download('stopwords')

In [None]:
#Step 05 - Removing stop words 

#Agile txt file

from nltk.corpus import stopwords
# English stop words held in a set for the membership test below
englishWords = set(stopwords.words('english'))

# Tokenise the cleaned Agile lines and discard empty tokens
unfilteredAgile = newAgileRDD1.flatMap(lambda line: line.split(" "))
tempAgile = unfilteredAgile.filter(lambda token: len(token) > 0)

# Keep only the non-empty tokens that are not stop words
agileStopWords = tempAgile.filter(lambda token: not (token == '' or token in englishWords))

# Display the count first, then the individual words
agileKeptTotal = agileStopWords.count()
print("Word count after removing stop words in Agile Processess in Software Engineering and Extreme Programming is :", agileKeptTotal)
print("*****************************************************************************")
agileKeptWords = agileStopWords.collect()
print("Individual words after removing stop words in Agile Processess in Software Engineering and Extreme Programming are :\n", agileKeptWords)

In [None]:
#Step 05 - Displaying distinct words

#Agile txt file

# Reduce the stop-word-free Agile words to one entry per distinct word
distinctAgileWords = agileStopWords.distinct()

agileUniqueTotal = distinctAgileWords.count()
print("Total unique words count: ", agileUniqueTotal)
print("*****************************************************************************")
print("Unique words in Agile Processess in Software Engineering and Extreme Programming are as follows: \n")

# Cell output: the list of distinct words
distinctAgileWords.collect()

In [None]:
#Step 05 - Removing stop words 

#Scrum txt file

# Tokenise the cleaned Scrum lines and discard empty tokens
unfilteredScrum = newScrumRDD1.flatMap(lambda line: line.split(" "))
tempScrum = unfilteredScrum.filter(lambda token: len(token) > 0)

# Keep only the non-empty tokens that are not stop words
scrumStopWords = tempScrum.filter(lambda token: not (token == '' or token in englishWords))

# Display the count first, then the individual words
scrumKeptTotal = scrumStopWords.count()
print("Number of individual word count in Scrum Handbook is :", scrumKeptTotal)
print("*****************************************************************************")
scrumKeptWords = scrumStopWords.collect()
print("Individual words in Scrum Handbook are :\n", scrumKeptWords)


In [None]:
#Step 05 - Displaying distinct words

#Scrum txt file

# Reduce the stop-word-free Scrum words to one entry per distinct word
distinctScrumWords = scrumStopWords.distinct()

scrumUniqueTotal = distinctScrumWords.count()
print("Total unique words:", scrumUniqueTotal)
print("Unique words in Scrum Handbook are as follows: \n")

# Cell output: the list of distinct words
distinctScrumWords.collect()

In [None]:
#Step 06 - Find average occurrence

#Agile file

# Average occurrence = all remaining words divided by the distinct words
agileWordTotal = agileStopWords.count()
agileDistinctTotal = distinctAgileWords.count()
avgAgileCount = agileWordTotal / agileDistinctTotal

print("Average occurrence of each word in Agile Processess in Software Engineering and Extreme Programming is:", round(avgAgileCount, 2))


In [None]:
#Step 06 - Find average occurrence

#Scrum file

# Average occurrence = all remaining words divided by the distinct words
scrumWordTotal = scrumStopWords.count()
scrumDistinctTotal = distinctScrumWords.count()
avgScrumCount = scrumWordTotal / scrumDistinctTotal

print("Average occurrence of each word in Scrum Handbook is:", round(avgScrumCount, 2))


In [None]:
#Step 07 -- Exploratory Data Analysis

#Book 1 is Agile
#Book 2 is Scrum Handbook 
#1 -- Distribution of top 30 words in Book 1 and Book 2 using log scale(base 10) 

import matplotlib.pyplot as plt
%matplotlib inline

# (word, count) pairs for Book 1, sorted from most to least frequent.
# agileStopWords already holds single, non-empty words, so no further
# splitting/filtering is needed. The result is cached because several actions
# (here and in the following cell) read it; without the cache every take()
# would redo the whole shuffle and sort.
book1Count = agileStopWords.map(lambda word: (word, 1))\
.reduceByKey(lambda x, y: x + y)\
.sortBy(lambda pair: pair[1], ascending=False)\
.cache()

book1FirstVal = book1Count.map(lambda pair: pair[0])  # the words of Book 1
book1SecondVal = book1Count.map(lambda pair: pair[1]) # the matching counts

# Same frequency table for Book 2
book2Count = scrumStopWords.map(lambda word: (word, 1))\
.reduceByKey(lambda x, y: x + y)\
.sortBy(lambda pair: pair[1], ascending=False)\
.cache()

book2FirstVal = book2Count.map(lambda pair: pair[0])
book2SecondVal = book2Count.map(lambda pair: pair[1])

# Fetch the top 30 pairs once per book and split them locally, so a word and
# its count always come from the same Spark job.
book1Top = book1Count.take(30)
book2Top = book2Count.take(30)

plt.subplots(figsize=(20,5))
# semilogy draws the counts on a base-10 logarithmic y axis
plt.semilogy([word for word, _ in book1Top], [count for _, count in book1Top], label='Book 1')
plt.semilogy([word for word, _ in book2Top], [count for _, count in book2Top], label='Book 2')
plt.xticks(rotation=50, ha="right",fontsize=17)
plt.yticks(fontsize=17)
plt.xlabel('Frequent Words',fontsize=17)
plt.ylabel('Count',fontsize=17)
plt.title('DISTRIBUTION OF BOOK 1 AND BOOK 2')
plt.legend(loc='best',fontsize=17)
plt.show()

## ANALYSIS:

A logarithmic (base-10) scale is used for the counts so that the occurrences of words in both the Agile and the Scrum book can be compared on a single chart. We can see that there are quite a lot of common words in both books. The word Software has appeared a higher number of times in book 1 than in book 2, and the same holds for the other words. In this graph, I've taken only the first 30 frequently used words instead of displaying all of them, to avoid congestion.

In [None]:
#Step 07 -- Exploratory Data Analysis
#2 -- Comparing 15 frequent words from Agile and Scrum file

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
bar_width = 0.5


def plot_top_words(word_counts, title):
    """Draw one bar chart of word frequencies on a new 10x5 figure.

    Args:
        word_counts: list of (word, count) pairs, in the order to be drawn.
        title: title shown above the chart.
    """
    words = [word for word, _ in word_counts]
    counts = [count for _, count in word_counts]
    plt.subplots(figsize=(10,5))
    plt.bar(words, counts, bar_width, align='center', color='C0')
    plt.xticks(rotation=45, ha="right",fontsize=12)
    plt.xlabel('Frequent Words')
    plt.ylabel('Count')
    plt.title(title)


# A single take(15) per book keeps each word together with its own count and
# halves the number of Spark jobs compared with fetching words and counts
# separately.

# Graph for Book 2 (Scrum Handbook)
plot_top_words(book2Count.take(15), 'Scrum Handbook Graph')

# Graph for Book 1 (Agile)
plot_top_words(book1Count.take(15), 'Agile Handbook Graph')
plt.show()

## ANALYSIS

### SCRUM HANDBOOK

The first bar chart illustrates the top 15 frequently used words in the Scrum Handbook. It is evident that only three words (**Scrum, team, product**) have occurred more than 230 times, while the rest of the words are in the range of 50-150.

### AGILE 

The second bar chart depicts the occurrences of the top 15 frequently used words in the Agile file. It is clear that the word **Software** has appeared 850 times, which is four times the average occurrences of the other words. This implies that there are more words in the Agile file than in the Scrum Handbook file.