In [2]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

# (8 cores, 16gb per machine) x 5 = 40 cores

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.1.153:7077") \
        .appName("hadoop_example")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",4)\
        .getOrCreate()

# Old API (RDD)
spark_context = spark_session.sparkContext

In [3]:
# Reading the English transcript with Spark and count the number of lines
lines_en = spark_context.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.da-en.en")
print(lines_en.count())

1968800


In [4]:
# Reading the Swedish transcript with Spark and count the number of lines
lines_da = spark_context.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.da-en.da")
print(lines_da.count())

1968800


In [37]:
# Verifying the line counts are the same for the two languages
if lines_en.count() == lines_da.count():
    print("line counts are same")
else:
    print("line counts are not same")

line counts are same


In [38]:
# Counting the number of partitions
lines_en.getNumPartitions()

3

In [39]:
lines_da.getNumPartitions()

3

In [6]:
# Reading 10 lines from an English transcript 
lines_en = spark_context.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.da-en.en")
print(lines_en.take(10))

['Resumption of the session', 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.', "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.", 'You have requested a debate on this subject in the course of the next few days, during this part-session.', "In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.", "Please rise, then, for this minute' s silence.", "(The House rose and observed a minute' s silence)", 'Madam President, on a point of order.', 'You will be aware from the press and television that there have been a num

In [41]:
# Reading 10 lines from an other transcript
lines_da = spark_context.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.da-en.da")
print(lines_da.take(10))

['Genoptagelse af sessionen', 'Jeg erklærer Europa-Parlamentets session, der blev afbrudt fredag den 17. december, for genoptaget. Endnu en gang vil jeg ønske Dem godt nytår, og jeg håber, De har haft en god ferie.', 'Som De kan se, indfandt det store "år 2000-problem" sig ikke. Til gengæld har borgerne i en del af medlemslandene været ramt af meget forfærdelige naturkatastrofer.', 'De har udtrykt ønske om en debat om dette emne i løbet af mødeperioden.', 'I mellemtiden ønsker jeg - som også en del kolleger har anmodet om - at vi iagttager et minuts stilhed til minde om ofrene for bl.a. stormene i de medlemslande, der blev ramt.', 'Jeg opfordrer Dem til stående at iagttage et minuts stilhed.', '(Parlamentet iagttog stående et minuts stilhed', 'Fru formand, en bemærkning til forretningsordenen.', 'Gennem pressen og tv vil De være bekendt med en række bombeeksplosioner og drab i Sri Lanka.', 'En af de personer, der blev myrdet for ganske nylig i Sri Lanka, var hr. Kumar Ponnambalam, der 

In [5]:
# Pre-processing the text: Lowercase the text, Tokenize the text (split on space)
def prepro_txt(line):
    line = line.lower()
    line = line.split(' ')
    return line
# mapping the files to a defined function for pre-processing
file1 = lines_en.map(prepro_txt)
file2 = lines_da.map(prepro_txt)
# taking 10 entries from each RDD
print(file1.take(10))
print(file2.take(10))
# Verifying the line counts
print(file1.count())
print(file2.count())


[['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'on', 'behalf', 'of', '

In [9]:
# 10 most frequently according words in English transcript
words_en = lines_en.flatMap(lambda l: l.split())
word_counts_to_sum_en = words_en.map(lambda w: (w, 1))
word_freqt_en = word_counts_to_sum_en.reduceByKey(lambda f1, f2: f1 + f2)
top10 = word_freqt_en.top(10, lambda wf: wf[1])
for word, freq in top10:
    print(word, freq)

the 3463316
of 1758135
to 1625165
and 1359258
in 1064804
that 807555
a 805207
is 800321
for 546323
I 532905


In [43]:
# 10 most frequently according words in Swedish transcript
words_da = lines_da.flatMap(lambda l: l.split())
word_counts_to_sum_da = words_da.map(lambda w: (w, 1))
word_freqt_da = word_counts_to_sum_da.reduceByKey(lambda f1, f2: f1 + f2)
top10 = word_freqt_da.top(10, lambda wf: wf[1])
for word, freq in top10:
    print(word, freq)

at 1542543
og 1418244
i 1175970
er 1023391
af 905133
for 885747
til 744325
en 658567
det 654194
de 552178


In [8]:
# Key the lines by their line number 
en_1 = file1.zipWithIndex()
da_1 = file2.zipWithIndex()
print(en_1.take(10))
print(da_1.take(10))

[(['resumption', 'of', 'the', 'session'], 0), (['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], 1), (['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], 2), (['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], 3), (['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'o

In [10]:
# Swap the key and value - so that the line number is the key 
en_2 = en_1.map(lambda x: (x[1], x[0]))
da_2 = da_1.map(lambda x: (x[1], x[0]))
print(en_2.take(10))
print(da_2.take(10))

[(0, ['resumption', 'of', 'the', 'session']), (1, ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.']), (2, ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.']), (3, ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.']), (4, ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,',

In [11]:
# Join the two RDDs together according to the line number key
en_da = en_2.join(da_2)
print(en_da.take(10))

[(902406, (['be', 'that', 'as', 'it', 'may,', 'we', 'can', 'only', 'rejoice', 'wholeheartedly', 'at', 'the', 'outcome.'], ['men', 'vi', 'vil', 'dog', 'se', 'meget,', 'meget', 'positivt', 'på', 'resultatet.'])), (902934, (['the', 'following', 'amendments', 'have', 'been', 'requested', 'to', 'this', 'draft', 'agenda:'], ['der', 'er', 'indleveret', 'følgende', 'ændringsforslag', 'til', 'dagsordensforslaget:'])), (917994, (['the', 'crime', 'could', 'only', 'have', 'been', 'authorised', 'at', 'the', 'highest', 'level', 'in', 'the', 'russian', 'political', 'establishment.'], ['forbrydelsen', 'kunne', 'kun', 'have', 'været', 'bemyndiget', 'fra', 'højeste', 'niveau', 'i', 'de', 'politiske', 'kredse', 'i', 'rusland.'])), (973854, (['on', 'a', 'number', 'of', 'occasions,', 'i', 'have', 'put', 'the', 'question', 'to', 'commissioner', 'fischer', 'boel', 'in', 'this', 'house,', 'and', 'each', 'time', 'she', 'has', 'reassured', 'me', 'that', 'commissioner', 'mandelson', 'is', 'staying', 'within', 'h

In [12]:
# Filter to exclude line pairs that have an empty/missing “corresponding” sentence
en_da1 = en_da.filter(lambda x: len(x[1][0]) > 0  and len(x[1][1]) > 0)
print(en_da1.take(10))

[(1809060, (['in', 'some', 'cases', 'non-governmental', 'organisations', 'are', 'also', 'involved', 'somewhere', 'along', 'the', 'line.'], ['i', 'nogle', 'tilfælde', 'kommer', 'også', 'ikke-statslige', 'organisationer', 'ind', 'i', 'billedet.'])), (1809300, (['it', 'is', 'demanding', 'of', 'them,', 'so', 'it', 'should', 'also', 'be', 'demanding', 'of', 'itself.'], ['kommissionen', 'er', 'krævende,', 'hvad', 'angår', 'medlemslandene.', 'den', 'skal', 'være', 'krævende.'])), (1810032, (['discussion', 'has', 'already', 'begun', 'on', 'introducing', 'certain', 'minimum', 'standards', 'for', 'this', 'type', 'of', 'protection.'], ['man', 'har', 'allerede', 'indledt', 'forhandlinger', 'om', 'indførelse', 'af', 'visse', 'mindstestandarder', 'i', 'unionen', 'for', 'også', 'disse', 'former', 'for', 'beskyttelse.'])), (1810218, (['at', 'the', 'same', 'time', 'the', 'wto', 'can', 'only', 'play', 'its', 'proper', 'role', 'if', 'the', 'concerns', 'of', 'all', 'participating', 'states', 'are', 'met.'

In [13]:
# Filter to leave only pairs of sentences with a small number of words per sentence 
# For example 10 
en_da2 = en_da1.filter(lambda x: len(x[1][0]) <= 10  and len(x[1][1]) <= 10)
print(en_da2.take(10))

[(2862, (['mr', 'florenz', 'also', 'raised', 'the', 'question', 'of', 'anonymity.'], ['hr.', 'florenz', 'rejste', 'ligeledes', 'spørgsmålet', 'om', 'anonymitet.'])), (263862, ([''], ['lad', 'os', 'prioritere', 'det', 'højere', 'end', 'alle', 'andre', 'anliggender.'])), (387990, (['this', 'we', 'have', 'always', 'supported.'], ['det', 'har', 'vi', 'også', 'altid', 'støttet.'])), (524778, (['the', 'plenary', 'is', 'to', 'decide', 'on', 'this', 'today.'], ['det', 'skal', 'plenarforsamlingen', 'tage', 'stilling', 'til', 'i', 'dag.'])), (548034, (['there', 'will', 'be', 'more', 'enlargements;', 'indeed,', 'they', 'are', 'already', 'imminent.'], ['vi', 'får', 'flere', 'udvidelser,', 'de', 'står', 'jo', 'allerede', 'for', 'døren.'])), (10716, (['the', 'framework', 'directive', 'on', 'water', 'is', 'a', 'necessary', 'initiative.'], ['rammedirektivet', 'for', 'vandpolitiske', 'foranstaltninger', 'er', 'et', 'nødvendigt', 'initiativ.'])), (230046, (['question', 'no', '2', 'by', '(h-0862/01):'], 

In [15]:
# Filter to leave only pairs of sentences with the same number of words in each sentence
# For example 3
en_da3 = en_da2.filter(lambda x: len(x[1][0]) == 3  and len(x[1][1]) == 3)
print(en_da3.take(10))

[(529140, (['with', 'both', 'things.'], ['med', 'begge', 'dele.'])), (837078, (['-', 'report:', 'gauzès'], ['betænkning', 'af', 'gauzès'])), (1908084, (['thank', 'you,', 'commissioner.'], ['tak,', 'hr.', 'kommissær.'])), (965934, (['it', 'was', 'painful.'], ['det', 'var', 'pinefuldt.'])), (612222, (['who', 'feeds', 'it?'], ['hvem', 'nærer', 'den?'])), (157086, (['is', 'that', 'clear?'], ['er', 'det', 'klart?'])), (848892, (['who', 'does', 'what?'], ['hvem', 'gør', 'hvad?'])), (339828, (['temporary', 'means', 'temporary.'], ['midlertidig', 'betyder', 'midlertidig.'])), (293694, (['for', 'what', 'transpires?'], ['hvad', 'foregår', 'der?'])), (860202, (['-', 'report:', 'roure'], ['betænkning', 'af', 'roure']))]


In [22]:
# For each sentence pair, map in order to pair each word in the two sentences
# No line numbers
def en_da_pair(x):
    a = zip(x[1][0],x[1][1])
    a = list(a)
    return a
    
en_da4 = en_da3.flatMap(lambda x: en_da_pair(x))
print(en_da4.take(10))

[('-', 'betænkning'), ('report:', 'af'), ('herczog', 'herczog'), ('-', 'betænkning'), ('report:', 'af'), ('herczog', 'herczog'), ('this', 'det'), ('is', 'er'), ('unacceptable.', 'uacceptabelt.'), ('it', 'den')]


In [23]:
# Use reduce to count the number of occurrences of the word-translation-pairs
en_da5 = en_da4.map(lambda w: (w, 1)).reduceByKey(lambda f1, f2: f1 + f2)
print(en_da5.take(10))

[(('herczog', 'herczog'), 16), (('it', 'den'), 36), (('of', 'af'), 83), (('me', 'mig'), 8), (('that', 'det'), 381), (('true.', 'sandt.'), 14), (('court', 'straffedomstol'), 4), (('my', 'mine'), 3), (('this', 'denne'), 4), (('smoke-free', 'røgfrit'), 1)]


In [24]:
# Print some of the most frequently occurring pairs of words
en_da6 = en_da5.top(10, lambda wf: wf[1])
for word, freq in en_da6:
    print(word, freq)

('is', 'er') 791
('that', 'det') 381
('thank', 'tak,') 259
('commissioner.', 'kommissær.') 236
('-', 'betænkning') 210
('this', 'det') 205
('it', 'det') 160
('you,', 'hr.') 148
('report:', 'af') 126
('are', 'er') 95


In [None]:
spark_session.stop()