In [1]:
from pyspark.sql import SparkSession

# Cluster sizing note: (8 cores, 16gb per machine) x 5 = 40 cores
# NOTE(review): the master below is "local[1]", which does not match the
# 40-core note above, and the dynamic-allocation / shuffle-service /
# executor-cores settings look intended for a cluster master -- confirm
# which master URL this notebook is really meant to run against.

# New API: build (or reuse) the SparkSession
spark_session = (
    SparkSession.builder
    .master("local[1]")
    .appName("Part-A-Hamza_Imran_Saeed")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

# Old API (RDD): the SparkContext that backs the session
spark_context = spark_session.sparkContext

In [2]:
## A1.1

#counting lines
def countLines(myrdd):
    """Return the number of elements (lines) in an RDD.

    Args:
        myrdd: an RDD-like object exposing ``count()``; in this notebook,
            a PySpark RDD with one element per input line.

    Returns:
        int: the element count, ``0`` for an empty RDD.
    """
    # count() replaces the earlier map-to-1 / persist / reduce sequence:
    # that cached a throw-away RDD which was never reused or unpersisted,
    # and reduce() raises ValueError on an empty RDD.
    return myrdd.count()

# Read the English file of the Europarl sv-en corpus from HDFS as an RDD
# and print how many elements (lines) it holds.
en_1 = spark_context.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.en")
totalLinesEN = countLines(en_1)
print(totalLinesEN)

1862234


In [3]:
## A1.2
# Read the Swedish file of the Europarl sv-en corpus from HDFS as an RDD
# and print how many elements (lines) it holds.
sv_1 = spark_context.textFile("hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.sv")
totalLinesSV = countLines(sv_1)
print(totalLinesSV)

1862234


In [4]:
## A1.3
# Compare the raw English and Swedish line counts computed above.
print("Is line count same for the two laguages? "+ str(totalLinesEN==totalLinesSV) )

Is line count same for the two laguages? True


In [5]:
## A1.4
# Report how many partitions each input RDD was split into.
print("Partitions for English transcripts: "+ str(en_1.getNumPartitions()))
print("Partitions for Swedish transcripts: "+ str(sv_1.getNumPartitions()))

Partitions for English transcripts: 2
Partitions for Swedish transcripts: 3


In [6]:
## A2.1
# Text preprocessing function
def preProcess(myrdd):
    """Lowercase one line of text and split it into whitespace-separated tokens.

    Args:
        myrdd: despite the name, a single string (one line), not an RDD;
            the function is applied per element via ``rdd.map``.

    Returns:
        list of str: the lowercased tokens. Punctuation stays attached to
        the tokens, and an empty or whitespace-only line yields ``[]``.
    """
    return myrdd.lower().split()

# Tokenise every line of both corpora; each RDD element becomes a list of tokens.
en_2 = en_1.map(preProcess)
sv_2 = sv_1.map(preProcess)


In [7]:
## A2.2 Part 1
# Inspect the first 10 tokenised English lines.
en_2.take(10)

[['resumption', 'of', 'the', 'session'],
 ['i',
  'declare',
  'resumed',
  'the',
  'session',
  'of',
  'the',
  'european',
  'parliament',
  'adjourned',
  'on',
  'friday',
  '17',
  'december',
  '1999,',
  'and',
  'i',
  'would',
  'like',
  'once',
  'again',
  'to',
  'wish',
  'you',
  'a',
  'happy',
  'new',
  'year',
  'in',
  'the',
  'hope',
  'that',
  'you',
  'enjoyed',
  'a',
  'pleasant',
  'festive',
  'period.'],
 ['although,',
  'as',
  'you',
  'will',
  'have',
  'seen,',
  'the',
  'dreaded',
  "'millennium",
  "bug'",
  'failed',
  'to',
  'materialise,',
  'still',
  'the',
  'people',
  'in',
  'a',
  'number',
  'of',
  'countries',
  'suffered',
  'a',
  'series',
  'of',
  'natural',
  'disasters',
  'that',
  'truly',
  'were',
  'dreadful.'],
 ['you',
  'have',
  'requested',
  'a',
  'debate',
  'on',
  'this',
  'subject',
  'in',
  'the',
  'course',
  'of',
  'the',
  'next',
  'few',
  'days,',
  'during',
  'this',
  'part-session.'],
 ['in',
  

In [8]:
## A2.2 Part 2
# Inspect the first 10 tokenised Swedish lines.
sv_2.take(10)

[['återupptagande', 'av', 'sessionen'],
 ['jag',
  'förklarar',
  'europaparlamentets',
  'session',
  'återupptagen',
  'efter',
  'avbrottet',
  'den',
  '17',
  'december.',
  'jag',
  'vill',
  'på',
  'nytt',
  'önska',
  'er',
  'ett',
  'gott',
  'nytt',
  'år',
  'och',
  'jag',
  'hoppas',
  'att',
  'ni',
  'haft',
  'en',
  'trevlig',
  'semester.'],
 ['som',
  'ni',
  'kunnat',
  'konstatera',
  'ägde',
  '"den',
  'stora',
  'år',
  '2000-buggen"',
  'aldrig',
  'rum.',
  'däremot',
  'har',
  'invånarna',
  'i',
  'ett',
  'antal',
  'av',
  'våra',
  'medlemsländer',
  'drabbats',
  'av',
  'naturkatastrofer',
  'som',
  'verkligen',
  'varit',
  'förskräckliga.'],
 ['ni',
  'har',
  'begärt',
  'en',
  'debatt',
  'i',
  'ämnet',
  'under',
  'sammanträdesperiodens',
  'kommande',
  'dagar.'],
 ['till',
  'dess',
  'vill',
  'jag',
  'att',
  'vi,',
  'som',
  'ett',
  'antal',
  'kolleger',
  'begärt,',
  'håller',
  'en',
  'tyst',
  'minut',
  'för',
  'offren',
  'f

In [9]:
## A2.3

# Recount after tokenisation and check the two corpora still have the same
# number of lines.
totalLinesEN2 = countLines(en_2)
print(totalLinesEN2)
totalLinesSV2 = countLines(sv_2)
print(totalLinesSV2)
print("Is line count same for the two laguages after pre processing? "+ str(totalLinesEN2==totalLinesSV2) )

1862234
1862234
Is line count same for the two laguages after pre processing? True


In [10]:
## A3.1 English

# Ten most frequent English tokens: flatten the per-line token lists,
# emit (token, 1), sum per token, then order by count, highest first.
(
    en_2
    .flatMap(lambda tokens: tokens)
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda entry: entry[1], ascending=False)
    .take(10)
)


[('the', 3498574),
 ('of', 1659884),
 ('to', 1539823),
 ('and', 1288620),
 ('in', 1086089),
 ('that', 797576),
 ('a', 773812),
 ('is', 758087),
 ('for', 534270),
 ('we', 522879)]

In [11]:
## A3.1 Swedish

# Ten most frequent Swedish tokens: flatten the per-line token lists,
# emit (token, 1), sum per token, then order by count, highest first.
(
    sv_2
    .flatMap(lambda tokens: tokens)
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda entry: entry[1], ascending=False)
    .take(10)
)


[('att', 1706309),
 ('och', 1344895),
 ('i', 1050989),
 ('det', 924878),
 ('som', 913302),
 ('för', 908703),
 ('av', 738102),
 ('är', 694389),
 ('en', 620347),
 ('vi', 539808)]

In [12]:
## A4

# Key each tokenised line by its position: zipWithIndex() produces
# (tokens, index), which is swapped to (index, tokens) so that the two
# corpora can be joined on the line index.
en_3 = (
    en_2
    .zipWithIndex()
    .map(lambda pair: (pair[1], pair[0]))
)
sv_3 = (
    sv_2
    .zipWithIndex()
    .map(lambda pair: (pair[1], pair[0]))
)


In [13]:
# Check the (line_index, tokens) layout on the first two English records.
en_3.take(2)

[(0, ['resumption', 'of', 'the', 'session']),
 (1,
  ['i',
   'declare',
   'resumed',
   'the',
   'session',
   'of',
   'the',
   'european',
   'parliament',
   'adjourned',
   'on',
   'friday',
   '17',
   'december',
   '1999,',
   'and',
   'i',
   'would',
   'like',
   'once',
   'again',
   'to',
   'wish',
   'you',
   'a',
   'happy',
   'new',
   'year',
   'in',
   'the',
   'hope',
   'that',
   'you',
   'enjoyed',
   'a',
   'pleasant',
   'festive',
   'period.'])]

In [14]:
# Check the (line_index, tokens) layout on the first two Swedish records.
sv_3.take(2)

[(0, ['återupptagande', 'av', 'sessionen']),
 (1,
  ['jag',
   'förklarar',
   'europaparlamentets',
   'session',
   'återupptagen',
   'efter',
   'avbrottet',
   'den',
   '17',
   'december.',
   'jag',
   'vill',
   'på',
   'nytt',
   'önska',
   'er',
   'ett',
   'gott',
   'nytt',
   'år',
   'och',
   'jag',
   'hoppas',
   'att',
   'ni',
   'haft',
   'en',
   'trevlig',
   'semester.'])]

In [15]:
# Pair the corpora on the line index: each record is
# (line_index, (english_tokens, swedish_tokens)).
joined = en_3.join(sv_3)
# (optional sanity check) joined.take(2)



In [16]:
# Drop pairs where either side is an empty token list, so that both the
# English and the Swedish sentence are present.
filterMissing = joined.filter(
    lambda row: len(row[1][0]) > 0 and len(row[1][1]) > 0
)

In [17]:
# Peek at three (line_index, (english_tokens, swedish_tokens)) records.
filterMissing.take(3)

[(163840,
  (['mr',
    'blak,',
    'as',
    'i',
    'will',
    'be',
    'visiting',
    'denmark',
    'in',
    'a',
    'few',
    'days,',
    'i',
    'shall',
    'certainly',
    'make',
    'it',
    'clear',
    'that',
    'there',
    'was',
    'no',
    'discrimination.'],
   ['herr',
    'blak!',
    'eftersom',
    'jag',
    'om',
    'några',
    'dagar',
    'kommer',
    'att',
    'besöka',
    'danmark',
    'kan',
    'jag',
    'klargöra',
    'att',
    'det',
    'inte',
    'handlar',
    'om',
    'någon',
    'diskriminering,',
    'tro',
    'mig.'])),
 (674475,
  (['mr',
    'president,',
    'mr',
    'bartenstein,',
    'mr',
    'barroso,',
    'ladies',
    'and',
    'gentlemen,',
    'today,',
    'we',
    'come',
    'to',
    'the',
    'final',
    'round',
    'of',
    'what',
    'is',
    '–',
    'next',
    'to',
    'the',
    'constitution',
    'for',
    'europe',
    '–',
    'the',
    'european',
    'union’s',
    'most',
    '

In [18]:
# Keep only short sentence pairs: both sides have the same number of
# tokens and that number is below 5. Equal lengths mean a single bound
# covers both sides.
filterWordsPerSentence = filterMissing.filter(
    lambda row: len(row[1][0]) == len(row[1][1]) and len(row[1][0]) < 5
)

In [19]:
# Inspect ten of the short, equal-length sentence pairs.
filterWordsPerSentence.take(10)

[(376960,
  (['thank', 'you', 'commissioner.'], ['tack,', 'herr', 'kommissionär.'])),
 (1835140,
  (['the', 'debate', 'is', 'closed.'],
   ['jag', 'förklarar', 'debatten', 'avslutad.'])),
 (1090050, (['12.'], ['12.'])),
 (1131135, (['17.'], ['17.'])),
 (623300,
  (['this', 'is', 'our', 'duty.'], ['detta', 'är', 'vår', 'skyldighet.'])),
 (263300,
  (['thank', 'you', 'commissioner.'],
   ['tack,', 'herr', 'kommissionsledamot.'])),
 (1262765, (['(applause)'], ['(applåder)'])),
 (689795,
  (['the', 'debate', 'is', 'closed.'], ['–', 'debatten', 'är', 'avslutad.'])),
 (67310,
  (['this', 'concludes', 'my', 'response.'],
   ['detta', 'var', 'mitt', 'svar.'])),
 (378665,
  (['this', 'is', 'not', 'unilateralism.'],
   ['detta', 'är', 'inte', 'unilateralism.']))]

In [20]:
# Align tokens position by position: zip the English and Swedish token
# lists of each sentence pair, flatten, and emit ((en_word, sv_word), 1).
pairs = (
    filterWordsPerSentence
    .flatMap(lambda row: zip(row[1][0], row[1][1]))
    .map(lambda word_pair: (word_pair, 1))
)

In [21]:
# Inspect ten ((english_word, swedish_word), 1) records.
pairs.take(10)

[(('those', 'detta'), 1),
 (('are', 'är'), 1),
 (('my', 'mina'), 1),
 (('points.', 'åsikter.'), 1),
 (('(applause)', '(applåder)'), 1),
 (('i', 'jag'), 1),
 (('agree.', 'instämmer.'), 1),
 (('(applause)', '(applåder)'), 1),
 (('so', 'så'), 1),
 (('many', 'många'), 1)]

In [22]:
# Sum the occurrences of each (english_word, swedish_word) pair and order
# the result by frequency, most common first.
pairCount = (
    pairs
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda entry: entry[1], ascending=False)
)

In [23]:
# Show the 30 most frequent word pairs.
pairCount.take(30)

[(('closed.', 'avslutad.'), 2862),
 (('is', 'är'), 2710),
 (('(applause)', '(applåder)'), 2546),
 (('.', '.'), 2083),
 (('is', 'debatten'), 1325),
 (('the', 'jag'), 1324),
 (('debate', 'förklarar'), 1318),
 (('the', 'debatten'), 1225),
 (('is', 'härmed'), 1215),
 (('debate', 'är'), 1187),
 (('(rule', '(artikel'), 893),
 (('that', 'det'), 852),
 (('written', 'skriftliga'), 847),
 (('statements', 'förklaringar'), 801),
 (('we', 'vi'), 636),
 (('i', 'jag'), 631),
 (('this', 'detta'), 582),
 (('142)', '142)'), 557),
 (('it', 'det'), 515),
 (('applause', 'applåder'), 461),
 (('1.', '1.'), 438),
 (('2.', '2.'), 438),
 (('there', 'det'), 429),
 (('3.', '3.'), 405),
 (('why?', 'varför?'), 372),
 (('-', '-'), 367),
 (('are', 'är'), 364),
 (('this', 'det'), 361),
 (('are', 'finns'), 360),
 (('not', 'inte'), 352)]

In [24]:
# Release the cores for another application by stopping the SparkContext
# that backs spark_session.
spark_context.stop()