In [1]:
from pyspark.sql import SparkSession
import re
from collections import OrderedDict
from operator import itemgetter 
import itertools

In [2]:
# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.119:7077") \
        .appName("haodong_zhao_partA")\
        .config("spark.executor.cores",2)\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled", True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.driver.port",9998)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

# Old API (RDD)
spark_context = spark_session.sparkContext

spark_context.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/25 16:40:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/02/25 16:40:21 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


---

# Question A.1

## A.1.1 

Read the English transcripts with Spark, and count the number of lines

In [None]:
lines_en = spark_context.textFile("hdfs://host-192-168-2-119-de1:9000/europarl/europarl-v7.sv-en.en")
lines_en_num = lines_en.count()
print (lines_en_num)

[Stage 0:>                                                          (0 + 2) / 2]

1862234


                                                                                

## A.1.2 

Do the same with the other language (so that you have a separate lineage of RDDs for each).

In [None]:
lines_sv = spark_context.textFile("hdfs://host-192-168-2-119-de1:9000/europarl/europarl-v7.sv-en.sv")
lines_sv_num = lines_sv.count()
print (lines_sv_num)



1862234


                                                                                

## A.1.3 

Verify that the line counts are the same for the two languages.

In [None]:
lines_en_num == lines_sv_num

True

## A.1.4 

Count the number of partitions.

In [None]:
en_partitions_num = lines_en.getNumPartitions()
sv_partitions_num = lines_sv.getNumPartitions()
print('#Partitions EN: {}\n#Partitions SV: {}'.format(en_partitions_num, sv_partitions_num))

#Partitions EN: 2
#Partitions SV: 3


---

# Question A.2

## A.2.1

Pre-process the text from both RDDs by doing the following:
- Lowercase the text
- Tokenize the text (split on space)

Hint: define a function to run in your driver application to avoid writing this code twice

In [None]:
def preprocess(rdd):
    rdd = rdd.map(lambda line: line.lower())
    pattern = r'[^\w\s]' # matches chars other than alphanumeric chars and whitespace chars
    rdd = rdd.map(lambda line: re.sub(pattern,'',line)) # replace
    rdd = rdd.map(lambda line: line.split(' '))
    return rdd
    

In [None]:
tokenize_en = preprocess(lines_en)
tokenize_sv = preprocess(lines_sv)

# A 2.2

Inspect 10 entries from each of your RDDs to verify your pre-processing

In [None]:
tokenize_en.take(10)

[['resumption', 'of', 'the', 'session'],
 ['i',
  'declare',
  'resumed',
  'the',
  'session',
  'of',
  'the',
  'european',
  'parliament',
  'adjourned',
  'on',
  'friday',
  '17',
  'december',
  '1999',
  'and',
  'i',
  'would',
  'like',
  'once',
  'again',
  'to',
  'wish',
  'you',
  'a',
  'happy',
  'new',
  'year',
  'in',
  'the',
  'hope',
  'that',
  'you',
  'enjoyed',
  'a',
  'pleasant',
  'festive',
  'period'],
 ['although',
  'as',
  'you',
  'will',
  'have',
  'seen',
  'the',
  'dreaded',
  'millennium',
  'bug',
  'failed',
  'to',
  'materialise',
  'still',
  'the',
  'people',
  'in',
  'a',
  'number',
  'of',
  'countries',
  'suffered',
  'a',
  'series',
  'of',
  'natural',
  'disasters',
  'that',
  'truly',
  'were',
  'dreadful'],
 ['you',
  'have',
  'requested',
  'a',
  'debate',
  'on',
  'this',
  'subject',
  'in',
  'the',
  'course',
  'of',
  'the',
  'next',
  'few',
  'days',
  'during',
  'this',
  'partsession'],
 ['in',
  'the',
  'm

In [None]:
tokenize_sv.take(10)

[['återupptagande', 'av', 'sessionen'],
 ['jag',
  'förklarar',
  'europaparlamentets',
  'session',
  'återupptagen',
  'efter',
  'avbrottet',
  'den',
  '17',
  'december',
  'jag',
  'vill',
  'på',
  'nytt',
  'önska',
  'er',
  'ett',
  'gott',
  'nytt',
  'år',
  'och',
  'jag',
  'hoppas',
  'att',
  'ni',
  'haft',
  'en',
  'trevlig',
  'semester'],
 ['som',
  'ni',
  'kunnat',
  'konstatera',
  'ägde',
  'den',
  'stora',
  'år',
  '2000buggen',
  'aldrig',
  'rum',
  'däremot',
  'har',
  'invånarna',
  'i',
  'ett',
  'antal',
  'av',
  'våra',
  'medlemsländer',
  'drabbats',
  'av',
  'naturkatastrofer',
  'som',
  'verkligen',
  'varit',
  'förskräckliga'],
 ['ni',
  'har',
  'begärt',
  'en',
  'debatt',
  'i',
  'ämnet',
  'under',
  'sammanträdesperiodens',
  'kommande',
  'dagar'],
 ['till',
  'dess',
  'vill',
  'jag',
  'att',
  'vi',
  'som',
  'ett',
  'antal',
  'kolleger',
  'begärt',
  'håller',
  'en',
  'tyst',
  'minut',
  'för',
  'offren',
  'för',
  'bl

# A.2.3

Verify that the line counts still match after the pre-processing

In [None]:
tokenize_en.count() == tokenize_sv.count()

                                                                                

True

---

# Question A.3

## A.3.1

Use Spark to compute the 10 most frequently according words in the English language corpus. 
Repeat for the other language.


In [14]:
def word_count(rdd):
    rdd = rdd.flatMap(lambda line: line)\
            .map(lambda word: (word, 1))\
            .reduceByKey(lambda v1, v2 : v1+v2)\
            .map(lambda kv: (kv[1], kv[0])) \
            .sortByKey(False)\
            .map(lambda vk: (vk[1], vk[0]))
    return rdd

In [15]:
word_count(tokenize_en).take(10)

                                                                                

[('the', 3505085),
 ('of', 1662002),
 ('to', 1543739),
 ('and', 1318342),
 ('in', 1088891),
 ('that', 839072),
 ('is', 774939),
 ('a', 774536),
 ('for', 538191),
 ('we', 526480)]

In [16]:
word_count(tokenize_sv).take(10)

                                                                                

[('att', 1709936),
 ('och', 1350369),
 ('i', 1054248),
 ('det', 952985),
 ('som', 917580),
 ('för', 915079),
 ('av', 740724),
 ('är', 701840),
 ('en', 636827),
 ('vi', 546068)]

## A.3.2

***Verify that your results are reasonable***

According to the studies related to this topic, as shown in the links below, the results are reasonable.
1. [Most common words in English](https://en.wikipedia.org/wiki/Most_common_words_in_English)
2. [1000 MOST COMMON SWEDISH WORDS](https://1000mostcommonwords.com/1000-most-common-swedish-words/)

---

# A.4

## A.4.1

Use this parallel corpus to mine some translations in the form of word pairs, for the two languages. Do this by pairing words found on short lines with the same number of words respectively. We (incorrectly) assume the words stay in the same order when translated.

1. Key the lines by their line number (hint: ZipWithIndex()).

In [17]:
lines_kv_en = tokenize_en.zipWithIndex()
lines_kv_sv = tokenize_sv.zipWithIndex()

                                                                                

2. Swap the key and value - so that the line number is the key.

In [18]:
lines_vk_en = lines_kv_en.map(lambda kv: (kv[1], kv[0]))
lines_vk_sv = lines_kv_sv.map(lambda kv: (kv[1], kv[0]))

3. Join the two RDDs together according to the line number key, so you have pairs of matching lines.

In [19]:
joined_rdd = lines_vk_en.join(lines_vk_sv)

4. Filter to exclude line pairs that have an empty/missing “corresponding” sentence.

In [21]:
joined_rdd = joined_rdd.filter(lambda vk: len(vk[1]) == 2)

5. Filter to leave only pairs of sentences with a small number of words per sentence, this should give a more reliable translation (you can experiment).

In [22]:
joined_rdd.take(5)

                                                                                

[(901935,
  (['with',
    'the',
    'exception',
    'of',
    'compliance',
    'with',
    'the',
    'new',
    'comitology',
    'decision',
    'through',
    'the',
    'introduction',
    'of',
    'the',
    'regulatory',
    'procedure',
    'with',
    'scrutiny',
    'the',
    'only',
    'remaining',
    'major',
    'component',
    'of',
    'this',
    'proposal',
    'is',
    'the',
    'issue',
    'of',
    'railway',
    'vehicle',
    'maintenance',
    'and',
    'the',
    'role',
    'of',
    'vehicle',
    'keepers'],
   ['med',
    'undantag',
    'för',
    'förenligheten',
    'med',
    'det',
    'nya',
    'kommittébeslutet',
    'genom',
    'införandet',
    'av',
    'det',
    'föreskrivande',
    'förfarandet',
    'med',
    'kontroll',
    'är',
    'den',
    'enda',
    'kvarstående',
    'delen',
    'av',
    'detta',
    'förslag',
    'frågan',
    'om',
    'underhåll',
    'av',
    'järnvägsfordon',
    'och',
    'fordonsinnehavarnas',

6. Filter to leave only pairs of sentences with the same number of words in each sentence.

In [23]:
same_words_sentences = joined_rdd.filter(lambda lines: len(lines[1][0])==len(lines[1][1]))

7. For each sentence pair, map so that you pair each (in order) word in the two sentences. We no longer need the line numbers. (hint: use python’s built in zip() function)

In [50]:
word_pairs = same_words_sentences.mapValues(lambda s: zip(s[0], s[1]))

wp_list = word_pairs.mapValues(lambda wp: list(wp))

pure_pairs = wp_list.values()

wt_pairs = pure_pairs.flatMap(lambda lst: lst)

In [51]:
wt_pairs.take(10)

                                                                                

[('and', 'och'),
 ('when', 'när'),
 ('they', 'de'),
 ('get', 'kommer'),
 ('home', 'hem'),
 ('they', 'kommer'),
 ('will', 'de'),
 ('doubtless', 'säkert'),
 ('continue', 'att'),
 ('the', 'fortsätta')]

8. Use reduce to count the number of occurrences of the word-translation-pairs.

In [53]:
wp_occurance = wt_pairs.map(lambda word:(word,1))
occurance = wp_occurance.reduceByKey(lambda a, b: a + b)

In [54]:
occurance.take(4)

                                                                                

[(('the', 'handlande'), 2),
 (('comments', 'synpunkter'), 274),
 (('protest', 'protester'), 10),
 (('mainly', 'beror'), 2)]

9. Print some of the most frequently occurring pairs of words.

In [55]:
occurrance_vk = occurance.map(lambda p: (p[1], p[0]))
word_rank = occurrance_vk .sortByKey(False)
word_rank = word_rank.map(lambda p: (p[1], p[0]))

                                                                                

In [57]:
word_rank.take(10)

                                                                                

[(('and', 'och'), 35860),
 (('is', 'är'), 28373),
 (('i', 'jag'), 27290),
 (('we', 'vi'), 26201),
 (('in', 'i'), 18646),
 (('to', 'att'), 18617),
 (('that', 'att'), 16997),
 (('it', 'det'), 15613),
 (('a', 'en'), 15474),
 (('of', 'av'), 13026)]

In [58]:
spark_session.stop()