In [1]:
from pyspark.sql import SparkSession

# Cluster sizing: (8 cores, 16gb per machine) x 5 = 40 cores

# Session built through the SparkSession builder API ("new" API).
# Dynamic allocation is switched on together with the shuffle service,
# the executor idle timeout is set to 30s, and each executor gets 4 cores.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.1.153:7077")
    .appName("erik_partA")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

# Handle to the underlying SparkContext, used for the RDD work below.
spark_context = spark_session.sparkContext

In [2]:
# A1
spark_context.setLogLevel("DEBUG")

# Read the two sides of the sv-en Europarl corpus from HDFS, one RDD element per line.
lines_english = spark_context.textFile(
    "hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.en"
)
lines_swedish = spark_context.textFile(
    "hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.sv"
)

# count() is an action, so each call triggers a Spark job over its file.
number_of_english_rows = lines_english.count()
number_of_swedish_rows = lines_swedish.count()

print("Number of lines for english, sve-eng: ", number_of_english_rows)
print("Number of lines for swedish, sve-eng: ", number_of_swedish_rows)

# The two files are meant to be line-aligned, so the counts should agree.
if number_of_english_rows != number_of_swedish_rows:
    print("The number of rows are NOT the same!")
else:
    print("The number of rows are the same!")

print(
    "The number of partitions for the english language: ",
    lines_english.getNumPartitions(),
)
print(
    "The number of partitions for the swedish language: ",
    lines_swedish.getNumPartitions(),
)


Number of lines for english, sve-eng:  1862234
Number of lines for swedish, sve-eng:  1862234
The number of rows are the same!
The number of partitions for the english language 2
The number of partitions for the swedish language 3


In [3]:
# A2
def remove_uppercase(line):
    """Return a copy of `line` in which every cased character is lowercase."""
    lowered = line.lower()
    return lowered

def split_line(line, sep=' '):
    """Tokenize `line` into a list of words.

    Args:
        line: The text to split.
        sep: Delimiter passed to ``str.split``. The default, a single space,
            keeps the original behaviour: an empty line yields ``['']`` and a
            run of consecutive spaces yields ``''`` tokens. The empty-row
            filter later in this notebook compares against ``['']``, so the
            default is deliberately left unchanged. Pass ``None`` to split on
            runs of any whitespace (tabs and non-breaking spaces included)
            and get no empty tokens.

    Returns:
        A list of string tokens.
    """
    return line.split(sep)

# Lowercase every line of both corpora.
lines_english_lowercase, lines_swedish_lowercase = (
    corpus.map(remove_uppercase) for corpus in (lines_english, lines_swedish)
)

# Tokenize: every line becomes a list of words.
lines_eng_lower_token, lines_swe_lower_token = (
    corpus.map(split_line)
    for corpus in (lines_english_lowercase, lines_swedish_lowercase)
)

# Print the first 10 English entries, before and after tokenization.
print(lines_english_lowercase.take(10))
print(lines_eng_lower_token.take(10))

# Verify that the line counts still match after pre-processing
# (map() produces exactly one output element per input element).
nmbr_eng_rows_post_process = lines_eng_lower_token.count()
nmbr_swe_rows_post_process = lines_swe_lower_token.count()

if not (nmbr_eng_rows_post_process == nmbr_swe_rows_post_process == number_of_english_rows):
    print("The number of rows are NOT still the same!")
else:
    print("The number of rows are still the same!")




['resumption of the session', 'i declare resumed the session of the european parliament adjourned on friday 17 december 1999, and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.', "although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.", 'you have requested a debate on this subject in the course of the next few days, during this part-session.', "in the meantime, i should like to observe a minute' s silence, as a number of members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the european union.", "please rise, then, for this minute' s silence.", "(the house rose and observed a minute' s silence)", 'madam president, on a point of order.', 'you will be aware from the press and television that there have been a num

In [4]:
# A3 - Counting the 10 most common words from the English language. 

from operator import add

# Word frequencies: flatten the token lists into one stream of words, pair
# each word with a 1, sum the ones per word, then sort by descending count.
most_common_english = (
    lines_eng_lower_token
    .flatMap(lambda tokens: tokens)
    .map(lambda word: (word, 1))
    .reduceByKey(add)
    .sortBy(lambda word_count: -word_count[1])
)

print(most_common_english.take(10))

# This result seems reasonable because the words are common stop words.

[('the', 3498375), ('of', 1659758), ('to', 1539760), ('and', 1288401), ('in', 1085993), ('that', 797516), ('a', 773522), ('is', 758050), ('for', 534242), ('we', 522849)]


In [5]:
# A3 - counting the 10 most common words from the Swedish language. 

# Word frequencies: flatten the token lists into one stream of words, pair
# each word with a 1, sum the ones per word, then sort by descending count.
most_common_swedish = (
    lines_swe_lower_token
    .flatMap(lambda tokens: tokens)
    .map(lambda word: (word, 1))
    .reduceByKey(add)
    .sortBy(lambda word_count: -word_count[1])
)

print(most_common_swedish.take(10))

# This result also seems reasonable: these are common Swedish stop words.

[('att', 1706293), ('och', 1344830), ('i', 1050774), ('det', 924866), ('som', 913276), ('för', 908680), ('av', 738068), ('är', 694381), ('en', 620310), ('vi', 539797)]


In [6]:
# A4 (1-4) - Pair each English line with the Swedish line at the same
# position, then drop every pair in which either side is an empty line.

# zipWithIndex() yields (tokens, index); the elements are swapped to
# (index, tokens) so that the line number becomes the join key.
zipped_rdd = (
    lines_eng_lower_token
    .zipWithIndex()
    .map(lambda pair: (pair[1], pair[0]))
    .join(
        lines_swe_lower_token
        .zipWithIndex()
        .map(lambda pair: (pair[1], pair[0]))
    )
)

# After the join each element is (index, (english_tokens, swedish_tokens)).
# A row is kept only when neither token list equals [''].
zipped_nonempty_rdd = zipped_rdd.filter(
    lambda row: not (row[1][0] == [''] or row[1][1] == [''])
)
print("The number of rows after removing empty rows: ", zipped_nonempty_rdd.count())


The number of rows after removing empty rows:  1848423


In [7]:
# part 5 - Keep only the pairs where both sentences have fewer than 7 tokens.

shorter_sentences = zipped_nonempty_rdd.filter(
    # row is (index, (english_tokens, swedish_tokens)); the longer of the two
    # sentences must be below the limit, which means both are.
    lambda row: max(len(row[1][0]), len(row[1][1])) < 7
)
print("Number of rows left: ", shorter_sentences.count())

Number of rows left:  90219


In [8]:
# part 6 - Keep only the pairs whose two sentences have the same number of tokens.

equal_sentence_length = shorter_sentences.filter(
    lambda row: len(row[1][0]) == len(row[1][1])
)
print(
    "Number of rows left after filtering to only keep those with the same length: ",
    equal_sentence_length.count(),
)


Number of rows left after filtering to only keep those with the same length:  44531


In [9]:
# part 7 - Pair up the words that sit at the same position in the two languages.

# row[1] is (english_tokens, swedish_tokens); unpacking it into zip() walks
# both sentences in step. Each sentence pair becomes a set of
# (english_word, swedish_word) tuples, so a pair repeated within one
# sentence is kept only once.
translated_word_pairs = equal_sentence_length.map(lambda row: set(zip(*row[1])))
print(translated_word_pairs.take(10))

[{('i', 'varje'), ('every', 'där'), ('used', 'ord'), ('was', 'nu'), ('important.', 'viktigt.'), ('word', 'är')}, {('small', 'låg'), ('it', 'det'), ('relatively', 'relativt'), ('percentage.', 'procentsats.'), ('a', 'en'), ('is', 'är')}, {('.', '.')}, {('bangladesh', 'bangladesh')}, {('.', '.')}, {('enthusiasm.', 'entusiasm.'), ('involvement,', 'engagemang,'), ('they', 'de'), ('have', 'har')}, {('.', '.')}, {('2.', '2.')}, {('commission', 'att'), ('will', 'kommer'), ('what', 'vilken'), ('the', 'kommissionen'), ('role', 'roll'), ('play?', 'spela?')}, {('i', 'jag'), ('think', 'tror'), ('absolutely', 'helt'), ('are', 'har'), ('right.', 'rätt.'), ('you', 'ni')}]


In [10]:
# part 8-9 - Number of occurrences of each word-translation pair.

# Flatten the per-sentence sets into one stream of pairs, count each pair,
# then sort by descending count.
most_common_translation = (
    translated_word_pairs
    .flatMap(lambda pairs: pairs)
    .map(lambda pair: (pair, 1))
    .reduceByKey(add)
    .sortBy(lambda pair_count: -pair_count[1])
)

print(
    "The most frequently occuring words and their count: ",
    most_common_translation.take(10),
)

The most frequently occuring words and their count:  [(('is', 'är'), 6239), (('closed.', 'avslutad.'), 2958), (('(applause)', '(applåder)'), 2546), (('we', 'vi'), 2366), (('.', '.'), 2107), (('i', 'jag'), 1887), (('that', 'det'), 1881), (('this', 'detta'), 1618), (('it', 'det'), 1477), (('\xa0\xa0', '\xa0\xa0'), 1353)]


In [11]:
# Stop the SparkContext once the analysis cells above have finished.
spark_context.stop()