In [1]:
from pyspark.sql import SparkSession
from operator import add

# New API: build (or reuse) the SparkSession. Dynamic allocation is switched on
# with shuffle tracking, and the external shuffle service is switched off.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.119:7077")
    .appName("Jonathan-A.1")
    .config("spark.executor.cores", 2)
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Old API (RDD): the SparkContext that belongs to the session.
spark_context = spark_session.sparkContext

# Restrict logging to errors from this point on.
spark_context.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/02 21:19:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/02 21:19:13 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [2]:
# A.1.1
# A.1.2
# Read both halves of the Swedish-English Europarl corpus from HDFS as RDDs
# of text lines (.en = English, .sv = Swedish).
lines_eng = spark_context.textFile(name="hdfs://192.168.2.119:9000/europarl/europarl-v7.sv-en.en")
lines_sv = spark_context.textFile(name="hdfs://192.168.2.119:9000/europarl/europarl-v7.sv-en.sv")

In [3]:
# Number of lines in each corpus; count() is a Spark action, so the files are
# actually read here.
eng_count, sv_count = lines_eng.count(), lines_sv.count()

print(f"ENG Count: {eng_count}")
print(f"SWE Count: {sv_count}")
# A.1.3
print(f"Difference: {sv_count - eng_count}")



ENG Count: 1862234
SWE Count: 1862234
Difference: 0


                                                                                

In [4]:
# A.1.4
# Report how many partitions each line RDD has.
print(f"SWE Partitions: {lines_sv.getNumPartitions()}")
print(f"ENG Partitions: {lines_eng.getNumPartitions()}")

SWE Partitions: 3
ENG Partitions: 2


In [5]:
# A.2.1
def preprocess(lines):
    """Lowercase every line and tokenize it on single spaces.

    Args:
        lines: RDD (or any object exposing ``map``) of text lines.

    Returns:
        The result of the two chained ``map`` calls: one list of lowercase
        tokens per input line. The split is on ``' '`` exactly, so an empty
        line becomes ``['']`` and consecutive spaces produce ``''`` tokens.
    """
    return (
        lines
        .map(lambda text: text.lower())
        .map(lambda text: text.split(' '))
    )

In [6]:
# Tokenize both corpora with the same preprocessing step.
eng_processed, sv_processed = preprocess(lines_eng), preprocess(lines_sv)

In [7]:
# A.2.2
# Inspect the first ten tokenized English sentences.
eng_processed.take(num=10)

[['resumption', 'of', 'the', 'session'],
 ['i',
  'declare',
  'resumed',
  'the',
  'session',
  'of',
  'the',
  'european',
  'parliament',
  'adjourned',
  'on',
  'friday',
  '17',
  'december',
  '1999,',
  'and',
  'i',
  'would',
  'like',
  'once',
  'again',
  'to',
  'wish',
  'you',
  'a',
  'happy',
  'new',
  'year',
  'in',
  'the',
  'hope',
  'that',
  'you',
  'enjoyed',
  'a',
  'pleasant',
  'festive',
  'period.'],
 ['although,',
  'as',
  'you',
  'will',
  'have',
  'seen,',
  'the',
  'dreaded',
  "'millennium",
  "bug'",
  'failed',
  'to',
  'materialise,',
  'still',
  'the',
  'people',
  'in',
  'a',
  'number',
  'of',
  'countries',
  'suffered',
  'a',
  'series',
  'of',
  'natural',
  'disasters',
  'that',
  'truly',
  'were',
  'dreadful.'],
 ['you',
  'have',
  'requested',
  'a',
  'debate',
  'on',
  'this',
  'subject',
  'in',
  'the',
  'course',
  'of',
  'the',
  'next',
  'few',
  'days,',
  'during',
  'this',
  'part-session.'],
 ['in',
  

In [8]:
# Inspect the first ten tokenized Swedish sentences.
sv_processed.take(num=10)

[['återupptagande', 'av', 'sessionen'],
 ['jag',
  'förklarar',
  'europaparlamentets',
  'session',
  'återupptagen',
  'efter',
  'avbrottet',
  'den',
  '17',
  'december.',
  'jag',
  'vill',
  'på',
  'nytt',
  'önska',
  'er',
  'ett',
  'gott',
  'nytt',
  'år',
  'och',
  'jag',
  'hoppas',
  'att',
  'ni',
  'haft',
  'en',
  'trevlig',
  'semester.'],
 ['som',
  'ni',
  'kunnat',
  'konstatera',
  'ägde',
  '"den',
  'stora',
  'år',
  '2000-buggen"',
  'aldrig',
  'rum.',
  'däremot',
  'har',
  'invånarna',
  'i',
  'ett',
  'antal',
  'av',
  'våra',
  'medlemsländer',
  'drabbats',
  'av',
  'naturkatastrofer',
  'som',
  'verkligen',
  'varit',
  'förskräckliga.'],
 ['ni',
  'har',
  'begärt',
  'en',
  'debatt',
  'i',
  'ämnet',
  'under',
  'sammanträdesperiodens',
  'kommande',
  'dagar.'],
 ['till',
  'dess',
  'vill',
  'jag',
  'att',
  'vi,',
  'som',
  'ett',
  'antal',
  'kolleger',
  'begärt,',
  'håller',
  'en',
  'tyst',
  'minut',
  'för',
  'offren',
  'f

In [9]:
# A.2.3
# Line counts after preprocessing (these rebind eng_count / sv_count).
eng_count, sv_count = eng_processed.count(), sv_processed.count()

                                                                                

In [10]:
# Show the post-preprocessing line counts and their difference.
print(f"ENG Count: {eng_count}")
print(f"SWE Count: {sv_count}")
print(f"Difference: {eng_count - sv_count}")

ENG Count: 1862234
SWE Count: 1862234
Difference: 0


In [11]:
# A.3.1
def most_frequent(words):
    """Count word occurrences and order them from most to least frequent.

    Args:
        words: RDD whose elements are lists of tokens (one list per sentence).

    Returns:
        RDD of ``(count, word)`` tuples sorted by ``count`` in descending
        order. Empty-string tokens are not counted.
    """
    tokens = words.flatMap(lambda sentence: sentence)
    # Splitting on ' ' leaves '' tokens for blank lines and repeated spaces;
    # those are not words and must not show up in the frequency table.
    real_words = tokens.filter(lambda token: token != '')
    # Uses operator.add (imported at the top of the notebook) instead of a
    # local variable named `add` that shadowed it.
    counts = real_words.map(lambda token: (token, 1)).reduceByKey(add)
    # Swap to (count, word) so the count becomes the sort key.
    return counts.map(lambda pair: (pair[1], pair[0])).sortByKey(ascending=False)

In [12]:
# Word-frequency tables for both languages.
eng_most_freq, sv_most_freq = most_frequent(eng_processed), most_frequent(sv_processed)

                                                                                

In [13]:
# Ten most frequent English words as (count, word).
eng_most_freq.take(num=10)

[(3498375, 'the'),
 (1659758, 'of'),
 (1539760, 'to'),
 (1288401, 'and'),
 (1085993, 'in'),
 (797516, 'that'),
 (773522, 'a'),
 (758050, 'is'),
 (534242, 'for'),
 (522849, 'we')]

In [14]:
# Ten most frequent Swedish words as (count, word).
sv_most_freq.take(num=10)

                                                                                

[(1706293, 'att'),
 (1344830, 'och'),
 (1050774, 'i'),
 (924866, 'det'),
 (913276, 'som'),
 (908680, 'för'),
 (738068, 'av'),
 (694381, 'är'),
 (620310, 'en'),
 (539797, 'vi')]

In [15]:
# A.3.2
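# The most frequent words found in each corpus are all extremely common everyday
# words (e.g. 'the', 'of', 'and' in English; 'att', 'och', 'i' in Swedish),
# which is what one would expect.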

In [16]:
# A.4.1
# Attach the line number to every tokenized sentence: (tokens, index).
sv_1, en_1 = sv_processed.zipWithIndex(), eng_processed.zipWithIndex()

                                                                                

In [17]:
# Flip (tokens, index) into (index, tokens) so the line number becomes the key.
sv_2 = sv_1.map(lambda pair: pair[::-1])
en_2 = en_1.map(lambda pair: pair[::-1])

In [25]:
# 3
# Join on the line number: (index, (swedish_tokens, english_tokens)).
joined_3 = sv_2.join(other=en_2)

In [49]:
# Drop pairs where either sentence is missing. preprocess() tokenizes with
# split(' '), which turns an empty line into [''] rather than [], so comparing
# against [] alone never removes anything. any() is False both for [] and for
# lists that contain only '' tokens.
filter_4 = joined_3.filter(lambda pair: any(pair[1][0]) and any(pair[1][1]))

In [50]:
# Keep only pairs whose first (Swedish) sentence has at most 8 tokens.
filter_5 = filter_4.filter(lambda row: 8 >= len(row[1][0]))

In [51]:
# Keep only pairs where both sentences have the same number of tokens.
filter_6 = filter_5.filter(lambda row: len(row[1][1]) == len(row[1][0]))

In [52]:
# Pair the tokens position by position: one (swedish_word, english_word) per slot.
zip_7 = filter_6.flatMap(lambda row: zip(*row[1]))

In [53]:
# Count every word pair: emit (pair, 1), then sum the ones per distinct pair.
r = zip_7.map(lambda word_pair: (word_pair, 1))
reduce_8 = r.reduceByKey(add)

In [54]:
# Flip to (count, word_pair) and sort with the highest count first.
most_freq_9 = reduce_8.map(lambda kv: kv[::-1]).sortByKey(ascending=False)

                                                                                

In [57]:
# Eight most frequent (count, (swedish_word, english_word)) entries.
most_freq_9.take(num=8)

[(8820, ('är', 'is')),
 (4530, ('vi', 'we')),
 (3918, ('jag', 'i')),
 (2963, ('avslutad.', 'closed.')),
 (2714, ('detta', 'this')),
 (2567, ('det', 'that')),
 (2548, ('(applåder)', '(applause)')),
 (2351, ('det', 'it'))]

In [None]:
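# The translations seem to be identical in meaning: each of the most frequent
# word pairs is a plausible Swedish-English translation (e.g. 'är' -> 'is').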