In [1]:
from pyspark.sql import SparkSession

# (8 cores, 16gb per machine) x 5 = 40 cores

# New API: build (or reuse) the session against the standalone master.
# Dynamic allocation together with the external shuffle service lets
# executors that have been idle for 30s be handed back to the cluster.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.1.153:7077")
    .appName("linneaeriksson_A2.1")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

# Old API (RDD): the context behind the session, used for all RDD work below.
spark_context = spark_session.sparkContext

In [2]:
#A1
# Read the two Europarl files from HDFS as RDDs with one element per line
# (".en" = English side, ".sv" = Swedish side, judging by the file suffixes).
lines_eng = spark_context.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.en")
lines_swe = spark_context.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.sv")

# Per language, English first: number of lines, number of partitions,
# and the first ten lines as a quick sanity check.
for _lines in (lines_eng, lines_swe):
    print(_lines.count())
    print(_lines.getNumPartitions())
    print(_lines.take(10))

1862234
2
['Resumption of the session', 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.', "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.", 'You have requested a debate on this subject in the course of the next few days, during this part-session.', "In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.", "Please rise, then, for this minute' s silence.", "(The House rose and observed a minute' s silence)", 'Madam President, on a point of order.', 'You will be aware from the press and television that there have 

In [3]:
 #A2
def lower_split(text):
    """Lowercase every line and break it into whitespace-separated tokens.

    Args:
        text: a collection exposing ``map`` whose elements are strings
            (in this notebook, the RDDs of corpus lines).

    Returns:
        The mapped collection, where each element is the list of tokens
        of the corresponding input line. Punctuation attached to a word
        is kept as part of the token.
    """
    def tokenize(line):
        # str.split() with no argument splits on runs of whitespace and
        # never yields empty strings.
        return line.lower().split()

    return text.map(tokenize)
    
# Tokenise both corpora with the same helper.
preprocessed_eng = lower_split(lines_eng)
preprocessed_swe = lower_split(lines_swe)

# Per language, English first: preview ten tokenised lines, then print
# the number of lines.
for _tokenised in (preprocessed_eng, preprocessed_swe):
    print(_tokenised.take(10))
    print(_tokenised.count())


[['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'on', 'behalf', 'of', '

In [4]:
#A3
from operator import add

def _ranked_word_counts(token_lists):
    """Build the word-count pipeline for one RDD of token lists.

    Returns a 3-tuple of RDDs:
        pairs  -- one (word, 1) record per word occurrence
        counts -- (word, total) after summing the ones per word
        ranked -- ``counts`` sorted by total, largest first
    """
    pairs = token_lists.flatMap(lambda tokens: tokens).map(lambda word: (word, 1))
    counts = pairs.reduceByKey(add)
    # Sorting on the negated count puts the most frequent words first.
    ranked = counts.sortBy(lambda entry: -entry[1])
    return pairs, counts, ranked


map_eng, count_en, sorted_en = _ranked_word_counts(preprocessed_eng)
map_swe, count_sv, sorted_sv = _ranked_word_counts(preprocessed_swe)



In [6]:
# Aliases for the frequency-sorted (word, count) RDDs; the A4 cells use these names.
en_1, sv_1 = sorted_en, sorted_sv

# Top ten entries of each frequency-sorted RDD.
print(
    "10 most common english words: ",
    sorted_en.take(10),
    "\n\n10 most common swedish words: ",
    sorted_sv.take(10),
)

10 most common english words:  [('the', 3498574), ('of', 1659884), ('to', 1539823), ('and', 1288620), ('in', 1086089), ('that', 797576), ('a', 773812), ('is', 758087), ('for', 534270), ('we', 522879)] 

10 most common swedish words:  [('att', 1706309), ('och', 1344895), ('i', 1050989), ('det', 924878), ('som', 913302), ('för', 908703), ('av', 738102), ('är', 694389), ('en', 620347), ('vi', 539808)]


In [None]:
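#A3 (abandoned draft, never executed: everything below is commented out, and it refers to the undefined names `mapred` and `swe`)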
#def map_reduce(text):
#    text = text.flatMap(lambda line: line.split())
#    text = text.map(lambda line: (line, 1)).reduceByKey(lambda a, b: a + b)
#    return text


#eng_1 = map_reduce(preprocessed_eng)

#print(eng_1.takeOrdered(10, key = lambda x: -x[1]))

#swe_1 = mapred(preprocessed_swe)
#print(swe.takeOrdered(10, key = lambda x: -x[1]))

In [7]:
#A4.1 and A4.2
def _index_as_key(item_with_index):
    """Turn an (item, index) pair from zipWithIndex into (index, item)."""
    item, index = item_with_index
    return (index, item)


# Number the records of each frequency-sorted RDD, then make that number the key.
en_2 = en_1.zipWithIndex().map(_index_as_key)
sv_2 = sv_1.zipWithIndex().map(_index_as_key)



In [8]:
#A4.3
# Join on the index key: every record becomes
# (index, (english_entry, swedish_entry)).
joined_rdds = en_2.join(sv_2)

# Show one joined record as a sanity check.
first_joined = joined_rdds.first()
print(first_joined)


(0, (('the', 3498574), ('att', 1706309)))


In [13]:
#A4.4
# A joined record looks like (index, ((en_word, en_count), (sv_word, sv_count))).
# Taking len() of the (word, count) tuples always gives 2, so that check could
# never reject a record; measure the word strings themselves so that pairs with
# an empty word on either side are actually dropped.
joined_rdds_2 = joined_rdds.filter(
    lambda record: len(record[1][0][0]) > 0 and len(record[1][1][0]) > 0
)
print(joined_rdds_2.take(10))

[(0, (('the', 3498574), ('att', 1706309))), (5, (('that', 797576), ('för', 908703))), (10, (('i', 501678), ('de', 513915))), (15, (('are', 337454), ('på', 478223))), (20, (('european', 268758), ('detta', 236637))), (25, (('mr', 178733), ('europeiska', 144474))), (30, (('should', 142827), ('skulle', 124616))), (35, (('all', 131242), ('herr', 118389))), (40, (('president,', 108314), ('sig', 96981))), (45, (('more', 100651), ('andra', 86938)))]


In [2]:
# release the cores for another application!
# Stops the SparkContext taken from spark_session; any later cell that uses
# spark_context or an RDD derived from it needs the session to be created again.
spark_context.stop()