# EPFL Big Data Analysis Week 1 Assignment

## Objective
Produce a rudimentary metric using Wikipedia articles to provide a popularity ranking for programming languages.  
Determine whether this ranking bear any relation to the popular Red Monk ranking.

## Steps
1. The file wikipedia.dat contains full text articles from Wikipedia.  Read in the source data as a distributed dataset
2. Produce a simple metric - this simple metric is a ranking based on the number of articles that mention the language at least once
3. Create an inverted index - maps the language name to the collection of Wikipedia articles mentioned that language

In [1]:
import time 

# Credits to Fahim Sakri 
# Source (https://medium.com/pythonhive/python-decorator-to-measure-the-execution-time-of-methods-fa04cb6bb36d)
# An annotation for timing a python function
def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print ("%r  %2.2f ms" % (method.__name__, (te - ts) * 1000))
        return result
    return timed


In [51]:
## Constants and Helper functions

langs = ["JavaScript", "Java", "PHP", "Python", "C#", "C++", "Ruby", "CSS",
    "Objective-C", "Perl", "Scala", "Haskell", "MATLAB", "Clojure", "Groovy"]

class WikiArticle:
    def __init__(self, title, text):
        self.title = title
        self.text = text
        
def readWikiArticle(filepath):
    subs = "</title><text>"
    l = len(subs)
    articles = []
    z = 0
    with open(filepath) as f:
         for line in f:
            i = int(line.find(subs))
            title = line[14:i]
            text  = line[(i + l): (len(line)-17)]
            if ("Across the many fields" in text):
                print ("z" + str(z))
            z = z + 1
            articles.append(WikiArticle(title, text))
    return articles


data = [a.text for a in readWikiArticle('/data/epfl-big-data-analysis/wikipedia.dat')]
print(len(data))
print(any("JavaScript" in s for s in data[1542].split()))
print("small" in "smalla")
print(mentionedLang('Javascript', data[1542]))

z1542
4086
True
True
True


In [54]:
## Setup
from pyspark.sql import SparkSession
from pyspark.sql import Row

spark = SparkSession \
    .builder \
    .appName("EPFL Wk1 Assignment") \
    .getOrCreate()

# Create an RDD
articles = spark.sparkContext.parallelize(data[1200:1600])
type(articles)

pyspark.rdd.RDD

In [57]:
# True if the article mentioned lang, otherwise false
def mentionedLang(lang, article):
    return any(lang in s for s in article.split())

# Test the helper functions in driver program
print("First article contains the word Taxobox:" + str(mentionedLang('Taxobox', data[0])))

print("Test addition using boolean:" + str(0 + mentionedLang('Taxobox', data[0])))

First article contains the word Taxobox:True
Test addition using boolean:1


In [58]:
articles.filter(lambda a: mentionedLang("JavaScript", a)).count()
articles.aggregate(0, lambda acc, a: acc + mentionedLang("JavaScript", a), lambda acc1, acc2: acc1 + acc2)

17

In [64]:
#@timeit
def occurrencesOfLangAggregate(lang, articles):
    return articles.aggregate(0, lambda acc, a: acc + mentionedLang(lang, a), lambda acc1, acc2: acc1 + acc2)
    
#@timeit
def occurrencesOfLangCount(lang, articles):
    return articles.filter(lambda a: mentionedLang(lang, a)).count()

# Testing performance per call
for l in langs:
    print('Counting ' + l)
    print(occurrencesOfLangAggregate(l, articles))
    print(occurrencesOfLangCount(l, articles))

Counting JavaScript
17
17
Counting Java
17
17
Counting PHP
5
5
Counting Python
1
1
Counting C#
0
0
Counting C++
3
3
Counting Ruby
2
2
Counting CSS
3
3
Counting Objective-C
0
0
Counting Perl
1
1
Counting Scala
0
0
Counting Haskell
1
1
Counting MATLAB
1
1
Counting Clojure
0
0
Counting Groovy
0
0


In [73]:
# Return a list of pairs (k, v) where k is the language 
# and v is the count of articles mentioned that language
# sorted in descending order of count

@timeit
def rankLangsBaseline(langs, data):
    result = []
    for l in langs:
        result.append((l, sum(mentionedLang(l, a) for a in data)))
    result.sort(key=lambda t: t[1], reverse=True)
    return result
result = rankLangsBaseline(langs, data)
print(result)

@timeit
def rankLangsBaselineSpark(langs, articles):
    result = []
    for l in langs:
        result.append((l, occurrencesOfLangCount(l, articles)))
    result.sort(key=lambda t: t[1], reverse=True)
    return result
result = rankLangsBaselineSpark(langs, articles)
print(result)


'rankLangsBaseline'  27196.28 ms
'rankLangsBaselineSpark'  1044.83 ms
[('JavaScript', 17), ('Java', 17), ('PHP', 5), ('C++', 3), ('CSS', 3), ('Ruby', 2), ('Python', 1), ('Perl', 1), ('Haskell', 1), ('MATLAB', 1), ('C#', 0), ('Objective-C', 0), ('Scala', 0), ('Clojure', 0), ('Groovy', 0)]


In [96]:
def occurred(langs, a):    
   return [(l, int(mentionedLang(l, a))) for l in langs]
        
@timeit
def rankLangsUsingReduceByKey(langs, articles):
    return articles.flatMap(lambda a: occurred(langs, a)).reduceByKey(lambda c1, c2: c1 + c2).collect()
print(rankLangsUsingReduceByKey(langs, articles))

'rankLangsUsingReduceByKey'  355.76 ms
[('JavaScript', 17), ('Java', 17), ('Python', 1), ('CSS', 3), ('Groovy', 0), ('C++', 3), ('Objective-C', 0), ('Haskell', 1), ('Ruby', 2), ('Scala', 0), ('MATLAB', 1), ('Clojure', 0), ('PHP', 5), ('Perl', 1), ('C#', 0)]
