# Part 1


## Section A - Working with the RDD API

In [1]:
from pyspark.sql import SparkSession
#from pyspark.ml import Pipeline


# (8 cores, 16gb per machine) x 5 = 40 cores


# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.87:7077")\
        .appName("Julie_Rajkumar_Amarwani_Lab3")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",4)\
        .getOrCreate()

# Old API (RDD)
spark_context = spark_session.sparkContext

## Question A.1

###  1.1 Reading the English transcripts with Spark and count the number of lines

In [2]:
eng_lines = spark_context.textFile("hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.en")

#Count number of rows 
eng_count = eng_lines.count()
print("Total number of lines in the English transcript is :",eng_count)

Total number of lines in the English transcript is : 1862234


###  1.2 Reading the Swedish transcript with Spark and count the number of lines

In [3]:
swe_lines = spark_context.textFile("hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.sv")

#Count number of rows 
swe_count = swe_lines.count()
print("Total number of lines in the Swedish transcript is :",swe_count)

Total number of lines in the Swedish transcript is : 1862234


###  1.3 Verifying that the line counts are the same for the two languages.

In [4]:
if eng_count==swe_count:
    print("The line counts are same for the two languages:")
else:
    print("The line counts are not same for the two languages:")

The line counts are same for the two languages:


### 1.4 Count the number of partitions

In [5]:
print("Partitions for English RDD are :",eng_lines.getNumPartitions())
print("Partitions for Swedish RDD are :",swe_lines.getNumPartitions())

Partitions for English RDD are : 2
Partitions for Swedish RDD are : 3


## Question A.2

### 2.1 Pre-process the text from both RDDs by doing the following:
- Lowercase the text
- Tokenize the text (split on space)


In [6]:
def pre_process_text(line):   
    line = line.lower()    #1. Lowercase the text     
    line = line.split(' ') #2. Tokenize the text (split on space)
    return line

# Mapping the files to a defined function for pre-processing

eng_RDD = eng_lines.map(pre_process_text) #Mapping for English RDD .

swe_RDD = swe_lines.map(pre_process_text) #Mapping for Swedish RDD


### 2.2  Inspect 10 entries from each of your RDDs to verify pre-processing.

In [7]:
print(eng_RDD.take(10))   #Inspecting 10 entries for English RDD
print("\n")
print(swe_RDD.take(10))   #Inspecting 10 entries for Swedish RDD

[['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'on', 'behalf', 'of', '

### 2.3 Verify that the line counts still match after the pre-processing.

In [8]:
#Counting the number of lines after pre-processing

#counting lines for English language RDD
pre_eng_count = eng_RDD.count()     
print("\nTotal number of lines in the English RDD:- \t Before Pre-Processing :",pre_eng_count,\
      "\t After Pre-Processing : ",eng_count) 

#counting lines for Swedish language RDD
pre_swe_count = swe_RDD.count()     
print("\nTotal number of lines in the Swedish RDD:- \t Before Pre-Processing :",pre_swe_count,\
      "\t After Pre-Processing : ",swe_count)

#Verify that the line counts still match after the pre-processing
if eng_count==pre_eng_count and swe_count==pre_swe_count:
    print("\nThe line counts are the same after pre-processing:")
else:
    print("\nThe line counts are not the same after pre-processing:")


Total number of lines in the English RDD:- 	 Before Pre-Processing : 1862234 	 After Pre-Processing :  1862234

Total number of lines in the Swedish RDD:- 	 Before Pre-Processing : 1862234 	 After Pre-Processing :  1862234

The line counts are the same after pre-processing:


## Question A.3

### 3.1 Use Spark to compute the 10 most frequently according words in the English language corpus. Repeat for the other language.

#####  Most frequently occurring words in the English language corpus.

In [9]:
# eng_lines = spark_context.textFile("hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.en")

# flatMap() to output multiple elements for each input value, split on based of all white space charatcter
eng_rdd = eng_lines.flatMap(lambda x: x.lower().split())

# Map a tuple and append int 1 for each word 
eng_rdd = eng_rdd.map(lambda x:(x,1))

# Perform aggregation (sum) all the int values for each unique key
eng_rdd = eng_rdd.reduceByKey(lambda x, y: x+y)

# Here I used lambda to sort by descending order so I can grab the top 10 outputs and print them

print("\n",eng_rdd.takeOrdered(10, key=lambda x: -x[1]),"\n")



 [('the', 3498574), ('of', 1659884), ('to', 1539823), ('and', 1288620), ('in', 1086089), ('that', 797576), ('a', 773812), ('is', 758087), ('for', 534270), ('we', 522879)] 



#####  Most frequently occurring words in the Swedish language corpus.

In [10]:
# swe_lines = spark_context.textFile("hdfs://192.168.2.87:9000/europarl/europarl-v7.sv-en.sv")

# flatMap() to output multiple elements for each input value, split on based of all white space charatcter
swe_rdd = swe_lines.flatMap(lambda x: x.lower().split())

# Map a tuple and append int 1 for each word
swe_rdd = swe_rdd.map(lambda a:(a,1))

# Perform aggregation (sum) all the int values for each unique key
swe_rdd = swe_rdd.reduceByKey(lambda a, b: a+b)

# This is where I need a function or lambda to sort by descending order so I can grab the top 10 elements, 
# Then print them out below with for loop

top_10 = swe_rdd.top(10, lambda w: w[1])
for word, occur in top_10:
    print(word, occur)

att 1706309
och 1344895
i 1050989
det 924878
som 913302
för 908703
av 738102
är 694389
en 620347
vi 539808


###  3.2 Verify that your results are reasonable.

- After translation Swedish to English, it was found that many words matched with the frequent English words. Considering the same it seems that results are reasonable

## Question A.4

###  4.1 Use this parallel corpus to mine some translations in the form of word pairs, for the two languages. We’ll achieve this by looking for pairs of words that frequently occur in the same position within lines.

#### Work with the pair of RDDs you created in question A.2. 
#### Hint: make a new pair of RDDs for each step, sv_1, en_1, sv_2, en_2, ...


### 1. Key the lines by their line number (hint: ZipWithIndex()).

In [11]:
#eng_RDD and swe_RDD from A2
en_1 = eng_RDD.zipWithIndex()
print(en_1.take(10))

print("\n")

sv_1 = swe_RDD.zipWithIndex()
print(sv_1.take(10))

[(['resumption', 'of', 'the', 'session'], 0), (['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], 1), (['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], 2), (['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], 3), (['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,', 'o

### 2. Swap the key and value - so that the line number is the key.

In [12]:
en_2 = en_1.map(lambda a: (a[1], a[0]))
sv_2 = sv_1.map(lambda b: (b[1], b[0]))
print("\n",en_2.take(10))
print("\n\n",sv_2.take(10))


 [(0, ['resumption', 'of', 'the', 'session']), (1, ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.']), (2, ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.']), (3, ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.']), (4, ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 'members', 'have', 'requested,

### 3. Join the two RDDs together according to the line number key, so you have pairs of lines with the same line number.

In [13]:
en_sv = en_2.join(sv_2)
print(en_sv.takeOrdered(10))

[(0, (['resumption', 'of', 'the', 'session'], ['återupptagande', 'av', 'sessionen'])), (1, (['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['jag', 'förklarar', 'europaparlamentets', 'session', 'återupptagen', 'efter', 'avbrottet', 'den', '17', 'december.', 'jag', 'vill', 'på', 'nytt', 'önska', 'er', 'ett', 'gott', 'nytt', 'år', 'och', 'jag', 'hoppas', 'att', 'ni', 'haft', 'en', 'trevlig', 'semester.'])), (2, (['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['som', 'ni', 'kunnat',

### 4. Filter to exclude line pairs that have an empty/missing “corresponding” sentence.

In [14]:
en_sv1 = en_sv.filter(lambda x: len(x[1][0]) >0 and len(x[1][1]) >0)
print(en_sv1.top(10))

[(1862233, (['(the', 'sitting', 'was', 'closed', 'at', '10.50', 'a.m.)'], ['(sammanträdet', 'avslutades', 'kl.10.50.)'])), (1862232, (['i', 'declare', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned.'], ['jag', 'förklarar', 'europaparlamentets', 'session', 'avbruten.'])), (1862231, (['adjournment', 'of', 'the', 'session'], ['avbrytande', 'av', 'sessionen'])), (1862230, (['i', 'am', 'not', 'going', 'to', 're-open', 'the', "'millennium", 'or', 'not', 'the', "millennium'", 'debate,', 'but', 'i', 'am', 'going', 'to', 'wish', 'all', 'of', 'you,', 'and', 'by', 'extension,', 'all', 'the', 'citizens', 'of', 'europe', 'which', 'we', 'represent,', 'a', 'happy', 'year', '2000.'], ['jag', 'kommer', 'inte', 'att', 'åter', 'igen', 'sätta', 'igång', 'debatten', 'om', '"', 'millennium', 'ja,', 'millennium', 'nej"', 'men', 'jag', 'önskar', 'er', 'alla', 'och', 'alla', 'europas', 'medborgare,', 'som', 'vi', 'företräder', 'här,', 'ett', 'gott', 'nytt', 'år', '2000.'])), (1862229, (['i

### 5. Filter to leave only pairs of sentences with a small number of words per sentence, this should give a more reliable translation (you can experiment).

In [15]:
# I am taking here less than or equals to 8 words

en_sv2 = en_sv1.filter(lambda x: len(x[1][0]) <= 8 and len(x[1][1]) <= 8)
print(en_sv2.takeOrdered(10))

[(0, (['resumption', 'of', 'the', 'session'], ['återupptagande', 'av', 'sessionen'])), (7, (['madam', 'president,', 'on', 'a', 'point', 'of', 'order.'], ['fru', 'talman!', 'det', 'gäller', 'en', 'ordningsfråga.'])), (13, (['madam', 'president,', 'on', 'a', 'point', 'of', 'order.'], ['fru', 'talman!', 'det', 'gäller', 'en', 'ordningsfråga.'])), (21, ([''], ['fru', 'talman!'])), (28, (['it', 'is', 'the', 'case', 'of', 'alexander', 'nikitin.'], ['det', 'gäller', 'fallet', 'alexander', 'nikitin.'])), (46, (['mr', 'berenguer', 'fuster,', 'we', 'shall', 'check', 'all', 'this.'], ['kära', 'kollega!', 'vi', 'skall', 'kontrollera', 'allt', 'detta.'])), (50, (['agenda'], ['arbetsplan'])), (53, (['relating', 'to', 'wednesday:'], ['beträffande', 'onsdag:'])), (69, (['(applause', 'from', 'the', 'pse', 'group)'], ['(applåder', 'från', 'pse)'])), (71, (['(parliament', 'rejected', 'the', 'request)', 'president.'], ['(parlamentet', 'avslog', 'begäran.)', 'talmannen.']))]


### 6. Filter to leave only pairs of sentences with the same number of words in each sentence.

In [16]:
en_sv3 = en_sv2.filter(lambda x: len(x[1][0]) == 4 and len(x[1][1]) == 4)
print(en_sv3.takeOrdered(10))

[(96, (['that', 'did', 'not', 'happen.'], ['så', 'blev', 'inte', 'fallet.'])), (183, (['the', 'debate', 'is', 'closed.'], ['jag', 'förklarar', 'debatten', 'avslutad.'])), (255, (['the', 'debate', 'is', 'closed.'], ['jag', 'förklarar', 'debatten', 'avslutad.'])), (468, (['the', 'debate', 'is', 'closed.'], ['jag', 'förklarar', 'debatten', 'avslutad.'])), (633, (['the', 'debate', 'is', 'closed.'], ['jag', 'förklarar', 'debatten', 'avslutad.'])), (638, (['are', 'there', 'any', 'comments?'], ['finns', 'det', 'några', 'synpunkter?'])), (1521, (['nothing', 'has', 'changed', 'there.'], ['det', 'har', 'inte', 'ändrats.'])), (1951, (['many', 'thanks', 'for', 'this.'], ['tusen', 'tack', 'för', 'detta.'])), (2189, (['the', 'debate', 'is', 'closed.'], ['jag', 'förklarar', 'debatten', 'avslutad.'])), (2338, (['it', 'was', 'very', 'small.'], ['den', 'var', 'mycket', 'liten.']))]


### 7. For each sentence pair, map to give a list of word pairs (in order) from the two sentences. We no longer need the line numbers.  (hint: use python’s built in zip() function)

In [17]:
en_sv4 = en_sv3.flatMap(lambda x: zip(x[1][0],x[1][1]))
print("\n",en_sv4.takeOrdered(10))
#eng_swe4.collect()


 [('"women', '"kvinnor'), ("'cdu,", '”cdu,'), ("'commercial'", 'begreppet'), ("'compensated'.", '”gottgörs”.'), ("'ecological", '”ekologiskt'), ("'excessive'", '?överdrivna?'), ("'for'.", '”ja”.'), ("'good", '”lycka'), ("'how", '"hur'), ("'i", 'jag')]


### 8. Use reduce to count the number of occurrences of the word-translation-pairs.

In [18]:
# Ref:- https://pythonexamples.org/pyspark-word-count-example/

en_sv5 = en_sv4.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
print(en_sv5.take(10))

[(('acknowledged.', 'enhälligt.'), 1), (('the', 'dagens'), 1), (('received:', 'dokument:'), 226), (('subject:', 'angående:'), 57), (('cattle', 'tillväxthormoner'), 1), (('that', 'det'), 561), (('requires', 'kräver'), 10), (('written', 'skriftliga'), 847), (('closed.', 'avslutad.'), 2534), (('voted', 'röstade'), 66)]


### 9. Print some of the most frequently occurring pairs of words.

In [19]:
print("\n",en_sv5.takeOrdered(10, key=lambda x: -x[1]),"\n")


 [(('closed.', 'avslutad.'), 2534), (('is', 'är'), 1588), (('is', 'debatten'), 1324), (('the', 'jag'), 1324), (('debate', 'förklarar'), 1317), (('the', 'debatten'), 1225), (('is', 'härmed'), 1215), (('debate', 'är'), 1187), (('(rule', '(artikel'), 893), (('written', 'skriftliga'), 847)] 



### Do your translations seem reasonable? Use a dictionary to check a few 
#### (don’t worry, you won’t be marked down for incorrect translations!).

While manually comparing with google translate, the translation seemed reasonable.

In [20]:
spark_session.stop()