In [2]:
from pyspark.sql import SparkSession

# Spark settings applied to the session builder, in the order they are set.
# NOTE(review): dynamic allocation, the external shuffle service and
# executor cores look cluster-oriented; with a "local[1]" master they
# presumably have no effect -- confirm which master this notebook targets.
spark_settings = {
    "spark.dynamicAllocation.enabled": True,
    "spark.shuffle.service.enabled": True,
    "spark.dynamicAllocation.executorIdleTimeout": "30s",
    "spark.executor.cores": 4,
}

spark_builder = SparkSession.builder.master("local[1]").appName("Kolonskopi")
for setting_name, setting_value in spark_settings.items():
    spark_builder = spark_builder.config(setting_name, setting_value)

# Reuse an already running session if there is one, otherwise create it.
spark_session = spark_builder.getOrCreate()

# Old API (RDD): the SparkContext is the entry point for RDD operations.
spark_context = spark_session.sparkContext

### Question A1

In [3]:
# Both halves of the parallel corpus live under the same HDFS directory.
europarl_base = 'hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en'

# textFile yields one RDD element per line of the file.
en_tF = spark_context.textFile(europarl_base + '.en')
sv_tF = spark_context.textFile(europarl_base + '.sv')

# Count the lines of each document and report them, English first.
en_lc = en_tF.count()
print("Number of lines in English document {}\n".format(en_lc))

sv_lc = sv_tF.count()
print("Number of lines in Swedish document {}\n".format(sv_lc))

# Nothing is printed when the two line counts differ.
same_line_count = (en_lc == sv_lc)
if same_line_count:
    print("There are equal number of lines in the two documents\n")

# Report how many partitions each file was split into.
en_partitions = en_tF.getNumPartitions()
sv_partitions = sv_tF.getNumPartitions()
print("Number of partitions in files:\n EN:\t{}\n SV:\t{}".format(en_partitions, sv_partitions))



Number of lines in English document 1862234

Number of lines in Swedish document 1862234

There are equal number of lines in the two documents

Number of partitions in files:
 EN:	2
 SV:	3


### Question A2 

In [4]:
def lowercase_tokens(line):
    """Lower-case a line and split it on single space characters."""
    return line.lower().split(' ')


# Apply the same pre-processing to both languages: one token list per line.
en_low_tF = en_tF.map(lowercase_tokens)
sv_low_tF = sv_tF.map(lowercase_tokens)

print("The first 10 objects in English and then Swedish file:\n")
for tokenised_lines in (en_low_tF, sv_low_tF):
    print(tokenised_lines.take(10))

# Confirm that pre-processing kept the two documents line-aligned.
en_low_lc = en_low_tF.count()
sv_low_lc = sv_low_tF.count()

if en_low_lc == sv_low_lc:
    print("After pre-processing the files have the same number of lines")

The first 10 objects in English and then Swedish file:

[['resumption', 'of', 'the', 'session'], ['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999,', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period.'], ['although,', 'as', 'you', 'will', 'have', 'seen,', 'the', 'dreaded', "'millennium", "bug'", 'failed', 'to', 'materialise,', 'still', 'the', 'people', 'in', 'a', 'number', 'of', 'countries', 'suffered', 'a', 'series', 'of', 'natural', 'disasters', 'that', 'truly', 'were', 'dreadful.'], ['you', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days,', 'during', 'this', 'part-session.'], ['in', 'the', 'meantime,', 'i', 'should', 'like', 'to', 'observe', 'a', "minute'", 's', 'silence,', 'as', 'a', 'number', 'of', 

### Question A3

In [73]:
# Flatten the per-line token lists into one RDD of individual words per language.
en_words = en_low_tF.flatMap(lambda line: line)
sv_words = sv_low_tF.flatMap(lambda line: line)

# The chains below are wrapped in parentheses instead of being continued with
# backslashes: a comment placed after a trailing "\" is a SyntaxError in
# Python, whereas inside parentheses each step can carry its own comment.
en_word_counts = (
    en_words
    .map(lambda word: (word, 1))                # tuplify word with count
    .reduceByKey(lambda x, y: x + y)            # aggregate word counts
    .sortBy(lambda word_count: -word_count[1])  # sort by descending occurrences
    .take(10)                                   # keep the ten most frequent words
)

sv_word_counts = (
    sv_words
    .map(lambda word: (word, 1))                # tuplify word with count
    .reduceByKey(lambda x, y: x + y)            # aggregate word counts
    .sortBy(lambda word_count: -word_count[1])  # sort by descending occurrences
    .take(10)                                   # keep the ten most frequent words
)


print(f"Most commonly used words in English carpus, and their count:\n {en_word_counts}\n\n"
    f"Most commonly used words in Swedish carpus, and their count:\n {sv_word_counts}\n\n"
    f"Seems reasonable that articles, conjunctions, determiners and prepositions are most ocuring in texts."
     )

Most commonly used words in English carpus, and their count:
 [('the', 3498375), ('of', 1659758), ('to', 1539760), ('and', 1288401), ('in', 1085993), ('that', 797516), ('a', 773522), ('is', 758050), ('for', 534242), ('we', 522849)]
Most commonly used words in Swedish carpus, and their count:
 [('att', 1706293), ('och', 1344830), ('i', 1050774), ('det', 924866), ('som', 913276), ('för', 908680), ('av', 738068), ('är', 694381), ('en', 620310), ('vi', 539797)]



### Question A4

In [None]:
# Only sentences with fewer words than this are used for word pairing.
# The value is an arbitrary starting point and can be tuned.
MAX_SENTENCE_WORDS = 5

# Index lines, then swap key and value so the line number becomes the join key:
# (tokens, index) -> (index, tokens).
indx_en_low_tF = (
    en_low_tF
    .zipWithIndex()
    .map(lambda line: (line[1], line[0]))
)

indx_sv_low_tF = (
    sv_low_tF
    .zipWithIndex()
    .map(lambda line: (line[1], line[0]))
)

# Inner join on the line number, then keep only line pairs that have the same
# number of words in both languages and are short enough.
# Records have the shape (index, (english_tokens, swedish_tokens)).
# This stays an RDD so that it can still be inspected with .take() afterwards;
# the previous trailing .reduce() collapsed it into a plain Python tuple.
j_sv_en_indx = (
    indx_en_low_tF
    .join(indx_sv_low_tF)
    .filter(lambda line: len(line[1][0]) == len(line[1][1]))
    .filter(lambda line: len(line[1][0]) < MAX_SENTENCE_WORDS)
)

# Pair the words of each sentence position by position and count how often
# every (english_word, swedish_word) pair occurs, most frequent pairs first.
# reduceByKey aggregates per pair; a plain reduce with "+" would only
# concatenate the tuples instead of summing the counts.
en_sv_top_word_pairs = (
    j_sv_en_indx
    .flatMap(lambda line: zip(line[1][0], line[1][1]))
    .map(lambda word_pair: (word_pair, 1))
    .reduceByKey(lambda x, y: x + y)
    .sortBy(lambda pair_count: -pair_count[1])
    .take(10)
)


In [139]:
# Preview the first ten elements of j_sv_en_indx.
j_sv_en_indx.take(10)

[(63065, (['vote'], ['omröstning'])),
 (225205, (['the', 'balkan', 'crisis.'], ['krisen', 'på', 'balkan.'])),
 (663330, (['that', 'is', 'absurd.'], ['det', 'är', 'orimligt.'])),
 (812935, (['childcare', '(debate)'], ['barnomsorg', '(debatt)'])),
 (836445, (['via', 'baltica', '(debate)'], ['via', 'baltica', '(debatt)'])),
 (856635, (['(applause)'], ['(applåder)'])),
 (20160, (['we', 'welcome', 'that!'], ['den', 'är', 'välkommen!'])),
 (174590, ([''], ['.'])),
 (277570,
  (['are', 'there', 'any', 'comments?'],
   ['finns', 'det', 'några', 'synpunkter?'])),
 (40160, (['zimbabwe'], ['zimbabwe']))]

In [None]:
# Shut down the SparkContext now that all cells above have run.
spark_context.stop()