# Popularity ranking for Programming Languages based on presences in Wikipedia articles

EPFL Big Data Analysis Week 1 Assignment
https://www.coursera.org/learn/scala-spark-big-data/home/info

The objective of the assignment is to produce a rudimentary metric using Wikipedia articles to provide a popularity ranking for programming languages using Scala and Spark.  Here, I have attempted the exercise using PySpark.  

Data file download link: http://alaska.epfl.ch/~dockermoocs/bigdata/wikipedia.dat

In [1]:
import time 

# Credits to Fahim Sakri 
# Source (https://medium.com/pythonhive/python-decorator-to-measure-the-execution-time-of-methods-fa04cb6bb36d)
# An annotation for timing a python function
def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print ("%r  %2.2f ms" % (method.__name__, (te - ts) * 1000))
        return result
    return timed


In [2]:
## Constants and Helper functions

langs = ["JavaScript", "Java", "PHP", "Python", "C#", "C++", "Ruby", "CSS",
    "Objective-C", "Perl", "Scala", "Haskell", "MATLAB", "Clojure", "Groovy"]

class WikiArticle:
    def __init__(self, title, text):
        self.title = title
        self.text = text
        
def readWikiArticle(filepath):
    subs = "</title><text>"
    l = len(subs)
    articles = []
    z = 0
    with open(filepath) as f:
         for line in f:
            i = int(line.find(subs))
            title = line[14:i]
            text  = line[(i + l): (len(line)-17)]
            if ("Across the many fields" in text):
            z = z + 1
            articles.append(WikiArticle(title, text))
    return articles


data = [a.text for a in readWikiArticle('/data/epfl-big-data-analysis/wikipedia.dat')]
print(len(data))

z1542
4086


In [4]:
## Setup
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark import SparkConf, SparkContext

spark = SparkSession \
    .builder \
    .appName("EPFL Wk1 Assignment") \
    .getOrCreate()
        
spark.conf.set("spark.executor.instances", 1)
spark.conf.set("spark.executor.cores", 1)
spark.conf.set("spark.cores.max", 1)

# Create RDD
some = data[1000:2000]
some_articles = spark.sparkContext.parallelize(some)
articles = spark.sparkContext.parallelize(data)

In [5]:
my_data = data
my_articles = articles

In [6]:
# True if the article mentioned lang, otherwise false
def mentionedLang(lang, article):
    return any(lang in s for s in article.split())

# Test the helper functions in driver program
print("First article contains the word Taxobox:" + str(mentionedLang('Taxobox', data[0])))

print("Test addition using boolean:" + str(0 + mentionedLang('Taxobox', data[0])))

First article contains the word Taxobox:True
Test addition using boolean:1


In [7]:
# Baseline for providing a count of articles for each language sorted in descending order of count (ranking)
def occurrencesOfLangAggregate(lang, articles):
    return articles.aggregate(0, lambda acc, a: acc + mentionedLang(lang, a), lambda acc1, acc2: acc1 + acc2)
    
def occurrencesOfLangCount(lang, articles):
    return articles.filter(lambda a: mentionedLang(lang, a)).count()

# using for loop
@timeit
def rankLangsBaseline(langs, data):
    result = []
    for l in langs:
        result.append((l, sum(mentionedLang(l, a) for a in data)))
    result.sort(key=lambda t: t[1], reverse=True)
    return result

@timeit
def rankLangsBaselineSpark(langs, articles):
    result = []
    for l in langs:
        result.append((l, occurrencesOfLangCount(l, articles)))
    result.sort(key=lambda t: t[1], reverse=True)
    return result

print(rankLangsBaseline(langs, my_data))
print(rankLangsBaselineSpark(langs, my_articles))

'rankLangsBaseline'  26665.69 ms
[('Java', 2017), ('JavaScript', 1738), ('C#', 849), ('CSS', 555), ('C++', 554), ('Python', 545), ('PHP', 452), ('MATLAB', 324), ('Perl', 300), ('Ruby', 287), ('Scala', 161), ('Haskell', 128), ('Objective-C', 112), ('Clojure', 60), ('Groovy', 55)]
'rankLangsBaselineSpark'  13319.24 ms
[('Java', 2017), ('JavaScript', 1738), ('C#', 849), ('CSS', 555), ('C++', 554), ('Python', 545), ('PHP', 452), ('MATLAB', 324), ('Perl', 300), ('Ruby', 287), ('Scala', 161), ('Haskell', 128), ('Objective-C', 112), ('Clojure', 60), ('Groovy', 55)]


`'rankLangsBaseline'  26665.69 ms
[('Java', 2017), ('JavaScript', 1738), ('C#', 849), ('CSS', 555), ('C++', 554), ('Python', 545), ('PHP', 452), ('MATLAB', 324), ('Perl', 300), ('Ruby', 287), ('Scala', 161), ('Haskell', 128), ('Objective-C', 112), ('Clojure', 60), ('Groovy', 55)]
'rankLangsBaselineSpark'  13319.24 ms
[('Java', 2017), ('JavaScript', 1738), ('C#', 849), ('CSS', 555), ('C++', 554), ('Python', 545), ('PHP', 452), ('MATLAB', 324), ('Perl', 300), ('Ruby', 287), ('Scala', 161), ('Haskell', 128), ('Objective-C', 112), ('Clojure', 60), ('Groovy', 55)]`

In [9]:
# Attempt 2 - Second approach of calculating the ranking using an inverted index

# Step 1: Computed an inverted index that maps the language to a list (Iterable) 
# of wikipedia articles mentioning that language, result should be of type RDD[(String, Iterable[WikipediaArticle])]
# Hint, use flatMap and groupByKey

def mentionedInArticle(langs, a):    
   return [(l, a) for l in langs if mentionedLang(l, a)]

@timeit
def rankLangsUsingIndex(langs, articles):
    return articles \
    .flatMap(lambda a: mentionedInArticle(langs, a)) \
    .filter(lambda x: x != []) \
    .groupByKey() \
    .mapValues(len) \
    .sortBy(lambda a: a[1], ascending=False) \
    .collect()

print(rankLangsUsingIndex(langs, my_articles))

'rankLangsUsingIndex'  8956.59 ms
[('Java', 2017), ('JavaScript', 1738), ('C#', 849), ('CSS', 555), ('C++', 554), ('Python', 545), ('PHP', 452), ('MATLAB', 324), ('Perl', 300), ('Ruby', 287), ('Scala', 161), ('Haskell', 128), ('Objective-C', 112), ('Clojure', 60), ('Groovy', 55)]


`'rankLangsUsingIndex'  8956.59 ms
[('Java', 2017), ('JavaScript', 1738), ('C#', 849), ('CSS', 555), ('C++', 554), ('Python', 545), ('PHP', 452), ('MATLAB', 324), ('Perl', 300), ('Ruby', 287), ('Scala', 161), ('Haskell', 128), ('Objective-C', 112), ('Clojure', 60), ('Groovy', 55)]`

In [10]:
# Attempt 3 - Use reduceByKey
def mentioned(langs, a):    
   return [(l, int(mentionedLang(l, a))) for l in langs]
        
@timeit
def rankLangsReduceByKey(langs, articles):
    return articles \
    .flatMap(lambda a: mentioned(langs, a)) \
    .reduceByKey(lambda c1, c2: c1 + c2) \
    .sortBy(lambda a: a[1], ascending=False) \
    .collect()

print(rankLangsUsingIndex(langs, my_articles))

'rankLangsUsingIndex'  8529.96 ms
[('Java', 2017), ('JavaScript', 1738), ('C#', 849), ('CSS', 555), ('C++', 554), ('Python', 545), ('PHP', 452), ('MATLAB', 324), ('Perl', 300), ('Ruby', 287), ('Scala', 161), ('Haskell', 128), ('Objective-C', 112), ('Clojure', 60), ('Groovy', 55)]


In [None]:
`'rankLangsUsingIndex'  8529.96 ms
[('Java', 2017), ('JavaScript', 1738), ('C#', 849), ('CSS', 555), ('C++', 554), ('Python', 545), ('PHP', 452), ('MATLAB', 324), ('Perl', 300), ('Ruby', 287), ('Scala', 161), ('Haskell', 128), ('Objective-C', 112), ('Clojure', 60), ('Groovy', 55)]`