In [2]:
from pyspark.sql import SparkSession

# (8 cores, 16gb per machine) x 5 = 40 cores

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.1.153:7077") \
        .appName("linneaeriksson_A2.1")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",4)\
        .getOrCreate()

# Old API (RDD)
spark_context = spark_session.sparkContext

In [3]:
#part 1
lines_eng = spark_context.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.en")
print("Number of lines for english: ",lines_eng.count())
print("Number of partitions for english: ",lines_eng.getNumPartitions())
print(lines_eng.take(10))
                                                                   
     
lines_swe = spark_context.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.sv")                                   
print("Number of lines for swedish: ",lines_swe.count())
print("Number of partitions for swedish: ",lines_swe.getNumPartitions())
print(lines_swe.take(10))

Number of lines for english:  1862234
Number of partitions for english:  2
['Resumption of the session', 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.', "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.", 'You have requested a debate on this subject in the course of the next few days, during this part-session.', "In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.", "Please rise, then, for this minute' s silence.", "(The House rose and observed a minute' s silence)", 'Madam President, on a point of order.', 

In [4]:
 #part 2
def lower_split(text):
        text = text.map(lambda text: text.lower())
        text = text.map(lambda text: text.split(' '))
        return(text)
    
preprocessed_eng = lower_split(lines_eng)
print(preprocessed_eng.take(5))
print(preprocessed_eng.count())

preprocessed_swe = lower_split(lines_swe)
print(preprocessed_swe.take(5))
print(preprocessed_swe.count())


[['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'on', 'behalf', 'of', '

In [5]:
#part 3
from operator import add

map_eng = preprocessed_eng.flatMap(lambda x: x).map(lambda x: (x,1))
map_swe = preprocessed_swe.flatMap(lambda x: x).map(lambda x: (x,1))

count_en = map_eng.reduceByKey(add)
count_sv = map_swe.reduceByKey(add)

sorted_en = count_en.sortBy(lambda x: -x[1])
sorted_sv = count_sv.sortBy(lambda x: -x[1])

print("10 most common english words: ",sorted_en.take(10),"\n\n10 most common swedish words: ",sorted_sv.take(10))

10 most common english words:  [('the', 3498375), ('of', 1659758), ('to', 1539760), ('and', 1288401), ('in', 1085993), ('that', 797516), ('a', 773522), ('is', 758050), ('for', 534242), ('we', 522849)] 

10 most common swedish words:  [('att', 1706293), ('och', 1344830), ('i', 1050774), ('det', 924866), ('som', 913276), ('för', 908680), ('av', 738068), ('är', 694381), ('en', 620310), ('vi', 539797)]


In [6]:
# part 4 (1-4) - zips the english and swedish together, changes places of keys and values, removes empty rows. 

rdds_zipped = preprocessed_eng.zipWithIndex()\
.map(lambda x: (x[1],x[0]))\
.join(preprocessed_swe.zipWithIndex()\
     .map(lambda x: (x[1],x[0])))

rdds_zipped_nonempty = rdds_zipped.filter(lambda x: (x[1][0]!=[''] and x[1][1]!=['']))
print("Nr of rows: ",rdds_zipped_nonempty.count())

Nr of rows:  1848423


In [8]:
# part 5, Keeping only sentences with a specific length. 

shorter_sentences = rdds_zipped_nonempty.filter(lambda x: (len(x[1][0]) < 7 and len(x[1][1]) < 7))
print("Nr of rows: ",shorter_sentences.count())

Nr of rows:  90219


In [10]:
# part 6, keeping only senteces with the same length. 

same_sentence_length = shorter_sentences.filter(lambda x: (len(x[1][0])==len(x[1][1])))
print("Nr of rows: ",same_sentence_length.count())


Nr of rows:  44531


In [11]:
# part 7, mapping words at the same spot. 

word_pairs_translated = same_sentence_length.map(lambda x: set(zip(x[1][0],x[1][1])))
print(word_pairs_translated.take(10))

[{('9.', '9.')}, {('no', 'nr'), ('35', '35'), ('(h-0778/99):', '(h-0778/99):'), ('question', 'fråga'), ('by', 'från')}, {('transparency', 'öppenheten'), ('is', 'är'), ('such', 'sådan'), ('one', 'en'), ('issue.', 'fråga.')}, {('would', 'rekommenderar'), ('i', 'jag'), ('recommend...', '...')}, {('orders,', 'order,'), ('europe', 'europa'), ("pays!'", 'betalar!”'), ('we', 'vi'), ('give', 'ger')}, {('much,', 'herr'), ('thank', 'tack'), ('very', 'mycket,'), ('you', 'så'), ('commissioner.', 'kommissionär.')}, {('(continuation)', '(fortsättning)'), ('vote', 'omröstning')}, {('way.', 'sätt.'), ('better', 'bättre'), ('is', 'finns'), ('there', 'det'), ('a', 'ett')}, {('women.', 'kvinnor.'), ('of', 'av'), ('are', 'är'), ('half', 'hälften'), ('these', 'dessa')}, {('i', 'jag'), ('these', 'dessa'), ('proposals.', 'förslag.'), ('support', 'stöder')}]


In [14]:
# part 8 & 9, number of occurece of the word-translation pairs. 

common_translation_pairs = word_pairs_translated.flatMap(lambda x: x)\
.map(lambda w: (w,1))\
.reduceByKey(add)\
.sortBy(lambda x: -x[1])

print("The most frequently occuring words and their count: ",common_translation_pairs.take(10))

The most frequently occuring words and their count:  [(('is', 'är'), 6239), (('closed.', 'avslutad.'), 2958), (('(applause)', '(applåder)'), 2546), (('we', 'vi'), 2366), (('.', '.'), 2107), (('i', 'jag'), 1887), (('that', 'det'), 1881), (('this', 'detta'), 1618), (('it', 'det'), 1477), (('\xa0\xa0', '\xa0\xa0'), 1353)]


In [15]:
# release the cores for another application!
spark_context.stop()