In [1]:
from pyspark.sql import SparkSession

# Cluster sizing note: (8 cores, 16gb per machine) x 5 = 40 cores
# NOTE(review): the master configured below is "local[1]", so the executor and
# dynamic-allocation settings presumably have no effect here -- confirm intent.

# New API: build (or reuse) the session through one parenthesised builder chain.
spark_session = (
    SparkSession.builder
    .master("local[1]")
    .appName("Part-A-Hamza_Imran_Saeed")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

# Old API (RDD): the SparkContext that backs the session above.
spark_context = spark_session.sparkContext

In [2]:
## A1.1

# Counting lines
def countLines(myrdd):
    """Return the number of records in ``myrdd``.

    Args:
        myrdd: An RDD (or any object exposing a ``count()`` method).

    Returns:
        int: The number of records; ``0`` when the RDD is empty.
    """
    # count() replaces the former map(-> 1) / persist() / reduce(+) sequence:
    # the persisted intermediate RDD was never reused nor unpersisted, so it
    # only occupied storage memory, and reduce() raises ValueError on an
    # empty RDD whereas count() returns 0.
    return myrdd.count()

# Read the English half of the Europarl sv-en corpus from HDFS (one record
# per line of text) and report how many lines it contains.
en_1 = spark_context.textFile(
    "hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.en"
)
totalLinesEN = countLines(en_1)
print(totalLinesEN)


# wordCounts = lines.map{line => line.split("[\\p{Punct}\\p{Space}]").filter(_ == "ERROR").size}.reduce(_ + _)

1862234


In [3]:
## A1.2
# Read the Swedish half of the Europarl sv-en corpus from HDFS and report
# how many lines it contains.
sv_1 = spark_context.textFile(
    "hdfs://192.168.1.153:9000/europarl/europarl-v7.sv-en.sv"
)
totalLinesSV = countLines(sv_1)
print(totalLinesSV)

1862234


In [4]:
## A1.3
# Report whether both language files have the same number of lines.
print(
    "Is line count same for the two laguages? ",
    totalLinesEN == totalLinesSV,
    sep="",
)

Is line count same for the two laguages? True


In [5]:
## A1.4
# Report how many partitions each raw text RDD was split into.
print("Partitions for English transcripts: ", en_1.getNumPartitions(), sep="")
print("Partitions for Swedish transcripts: ", sv_1.getNumPartitions(), sep="")

Partitions for English transcripts: 2
Partitions for Swedish transcripts: 3


In [6]:
## A2.1
# Preprocessing text function
def preProcess(myrdd):
    """Lower-case one line of text and split it on whitespace.

    Args:
        myrdd: A single line of text (a ``str``). Despite the parameter
            name this is not an RDD; the function is applied per element
            through ``RDD.map``.

    Returns:
        list[str]: The lower-cased whitespace-separated tokens. Punctuation
        stays attached to its token; an empty line yields ``[]``.
    """
    return myrdd.lower().split()

# Tokenise every line of both corpora (lazy transformation; one token list per line).
en_2, sv_2 = en_1.map(preProcess), sv_1.map(preProcess)


In [7]:
## A2.2 Part 1
# Inspect the first 10 tokenised English lines.
en_2.take(10)

[['resumption', 'of', 'the', 'session'],
 ['i',
  'declare',
  'resumed',
  'the',
  'session',
  'of',
  'the',
  'european',
  'parliament',
  'adjourned',
  'on',
  'friday',
  '17',
  'december',
  '1999,',
  'and',
  'i',
  'would',
  'like',
  'once',
  'again',
  'to',
  'wish',
  'you',
  'a',
  'happy',
  'new',
  'year',
  'in',
  'the',
  'hope',
  'that',
  'you',
  'enjoyed',
  'a',
  'pleasant',
  'festive',
  'period.'],
 ['although,',
  'as',
  'you',
  'will',
  'have',
  'seen,',
  'the',
  'dreaded',
  "'millennium",
  "bug'",
  'failed',
  'to',
  'materialise,',
  'still',
  'the',
  'people',
  'in',
  'a',
  'number',
  'of',
  'countries',
  'suffered',
  'a',
  'series',
  'of',
  'natural',
  'disasters',
  'that',
  'truly',
  'were',
  'dreadful.'],
 ['you',
  'have',
  'requested',
  'a',
  'debate',
  'on',
  'this',
  'subject',
  'in',
  'the',
  'course',
  'of',
  'the',
  'next',
  'few',
  'days,',
  'during',
  'this',
  'part-session.'],
 ['in',
  

In [8]:
## A2.2 Part 2
# Inspect the first 10 tokenised Swedish lines.
sv_2.take(10)

[['återupptagande', 'av', 'sessionen'],
 ['jag',
  'förklarar',
  'europaparlamentets',
  'session',
  'återupptagen',
  'efter',
  'avbrottet',
  'den',
  '17',
  'december.',
  'jag',
  'vill',
  'på',
  'nytt',
  'önska',
  'er',
  'ett',
  'gott',
  'nytt',
  'år',
  'och',
  'jag',
  'hoppas',
  'att',
  'ni',
  'haft',
  'en',
  'trevlig',
  'semester.'],
 ['som',
  'ni',
  'kunnat',
  'konstatera',
  'ägde',
  '"den',
  'stora',
  'år',
  '2000-buggen"',
  'aldrig',
  'rum.',
  'däremot',
  'har',
  'invånarna',
  'i',
  'ett',
  'antal',
  'av',
  'våra',
  'medlemsländer',
  'drabbats',
  'av',
  'naturkatastrofer',
  'som',
  'verkligen',
  'varit',
  'förskräckliga.'],
 ['ni',
  'har',
  'begärt',
  'en',
  'debatt',
  'i',
  'ämnet',
  'under',
  'sammanträdesperiodens',
  'kommande',
  'dagar.'],
 ['till',
  'dess',
  'vill',
  'jag',
  'att',
  'vi,',
  'som',
  'ett',
  'antal',
  'kolleger',
  'begärt,',
  'håller',
  'en',
  'tyst',
  'minut',
  'för',
  'offren',
  'f

In [9]:
## A2.3

# Re-count both corpora after tokenisation and confirm the counts still agree.
totalLinesEN2 = countLines(en_2)
print(totalLinesEN2)
totalLinesSV2 = countLines(sv_2)
print(totalLinesSV2)
print(
    "Is line count same for the two laguages after pre processing? ",
    totalLinesEN2 == totalLinesSV2,
    sep="",
)

1862234
1862234
Is line count same for the two laguages after pre processing? True


In [10]:
## A3.1 English

# Ten most frequent English tokens: flatten the token lists, count each
# word, then sort by count in descending order.
(
    en_2
    .flatMap(lambda tokens: tokens)
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda word_count: word_count[1], ascending=False)
    .take(10)
)


[('the', 3498574),
 ('of', 1659884),
 ('to', 1539823),
 ('and', 1288620),
 ('in', 1086089),
 ('that', 797576),
 ('a', 773812),
 ('is', 758087),
 ('for', 534270),
 ('we', 522879)]

In [11]:
## A3.1 Swedish

# Ten most frequent Swedish tokens: flatten the token lists, count each
# word, then sort by count in descending order.
(
    sv_2
    .flatMap(lambda tokens: tokens)
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda word_count: word_count[1], ascending=False)
    .take(10)
)


[('att', 1706309),
 ('och', 1344895),
 ('i', 1050989),
 ('det', 924878),
 ('som', 913302),
 ('för', 908703),
 ('av', 738102),
 ('är', 694389),
 ('en', 620347),
 ('vi', 539808)]

In [12]:
## A4

# Key every tokenised line by its position: zipWithIndex yields
# (tokens, index), which is swapped to (index, tokens) so the two
# languages can later be joined on the line index.
en_3 = (
    en_2
    .zipWithIndex()
    .map(lambda tokens_idx: (tokens_idx[1], tokens_idx[0]))
)
sv_3 = (
    sv_2
    .zipWithIndex()
    .map(lambda tokens_idx: (tokens_idx[1], tokens_idx[0]))
)


In [16]:
# Peek at the first two (line index, tokens) records of the English RDD.
en_3.take(2)

[(0, ['resumption', 'of', 'the', 'session']),
 (1,
  ['i',
   'declare',
   'resumed',
   'the',
   'session',
   'of',
   'the',
   'european',
   'parliament',
   'adjourned',
   'on',
   'friday',
   '17',
   'december',
   '1999,',
   'and',
   'i',
   'would',
   'like',
   'once',
   'again',
   'to',
   'wish',
   'you',
   'a',
   'happy',
   'new',
   'year',
   'in',
   'the',
   'hope',
   'that',
   'you',
   'enjoyed',
   'a',
   'pleasant',
   'festive',
   'period.'])]

In [17]:
# Peek at the first two (line index, tokens) records of the Swedish RDD.
sv_3.take(2)

[(0, ['återupptagande', 'av', 'sessionen']),
 (1,
  ['jag',
   'förklarar',
   'europaparlamentets',
   'session',
   'återupptagen',
   'efter',
   'avbrottet',
   'den',
   '17',
   'december.',
   'jag',
   'vill',
   'på',
   'nytt',
   'önska',
   'er',
   'ett',
   'gott',
   'nytt',
   'år',
   'och',
   'jag',
   'hoppas',
   'att',
   'ni',
   'haft',
   'en',
   'trevlig',
   'semester.'])]

In [18]:
# Join the two indexed RDDs on the line index; each record becomes
# (index, (english_tokens, swedish_tokens)).
joined = en_3.join(sv_3)
# joined.take(2)



In [27]:
# joined.take(10)
# Keep only the pairs where both the English and the Swedish token list
# are non-empty (an empty list is falsy).
filterMissing = joined.filter(
    lambda row: bool(row[1][0]) and bool(row[1][1])
)

In [28]:
# Show three joined records that passed the non-empty filter.
filterMissing.take(3)

[(1228800,
  (['i',
    'welcome',
    'the',
    'high',
    "representative's",
    'announcement',
    'today',
    'on',
    'aid',
    'and',
    'ask',
    'her',
    'to',
    'ensure',
    'that',
    'next',
    "week's",
    'meetings',
    'address',
    'actual',
    'commitments',
    'of',
    'aid',
    'money',
    'to',
    'yemen,',
    'from',
    'all',
    'participants,',
    'at',
    'a',
    'time',
    'when',
    'the',
    "un's",
    'consolidated',
    'appeal',
    'for',
    'the',
    'country',
    'has',
    'generated',
    'less',
    'than',
    '1%',
    'of',
    'the',
    'funds',
    'needed.'],
   ['jag',
    'välkomnar',
    'utrikesrepresentantens',
    'tillkännagivande',
    'om',
    'bistånd',
    'i',
    'dag',
    'och',
    'ber',
    'henne',
    'att',
    'se',
    'till',
    'att',
    'man',
    'vid',
    'nästa',
    'veckas',
    'möten',
    'behandlar',
    'faktiska',
    'åtaganden',
    'från',
    'alla',
    'deltaga

In [34]:
# Keep sentence pairs whose two sides have the same number of tokens and at
# most five of them (equal lengths make a single upper-bound check sufficient).
filterWordsPerSentence = filterMissing.filter(
    lambda row: len(row[1][0]) == len(row[1][1]) and len(row[1][0]) <= 5
)

In [36]:
# Preview ten of the short, equal-length sentence pairs.
filterWordsPerSentence.take(10)

[(180365,
  (['question', 'no', '31', 'by', '(h-0728/01):'],
   ['fråga', 'nr', '31', 'från', '(h-0728/01):'])),
 (770675, (['(applause)'], ['(applåder)'])),
 (983810, (['9.'], ['9.'])),
 (1606450,
  (['and', 'they', 'do', 'just', 'that.'],
   ['och', 'det', 'gör', 'de', 'också!'])),
 (1125,
  (['i', 'agree', 'with', 'this', 'request.'],
   ['jag', 'instämmer', 'i', 'det', 'förslaget.'])),
 (1475735, (['belarus', '(debate)'], ['vitryssland', '(debatt)'])),
 (1432155,
  (['documents', 'received:', 'see', 'minutes'],
   ['inkomna', 'dokument:', 'se', 'protokollet'])),
 (1738770,
  (['that', 'is', 'my', 'experience.'], ['det', 'är', 'min', 'erfarenhet.'])),
 (853605, (['3.'], ['3.'])),
 (1066810, (['4.'], ['4.']))]

In [48]:
# Pair up the words that share a position in each sentence pair and emit
# ((english_word, swedish_word), 1) for every such pair.
pairs = (
    filterWordsPerSentence
    .flatMap(lambda row: zip(row[1][0], row[1][1]))
    .map(lambda word_pair: (word_pair, 1))
)

In [49]:
# Preview ten ((english_word, swedish_word), 1) records.
pairs.take(10)

[(('question', 'fråga'), 1),
 (('no', 'nr'), 1),
 (('31', '31'), 1),
 (('by', 'från'), 1),
 (('(h-0728/01):', '(h-0728/01):'), 1),
 (('(applause)', '(applåder)'), 1),
 (('9.', '9.'), 1),
 (('and', 'och'), 1),
 (('they', 'det'), 1),
 (('do', 'gör'), 1)]

In [53]:
# Sum the occurrences of every (english_word, swedish_word) pair and rank
# the pairs from most to least frequent.
# `pairCount` was used here without being defined in any visible cell, so a
# fresh "Restart & Run All" would fail with NameError. It is rebuilt from
# `pairs`; presumably the missing cell did the same, since the displayed
# result is ordered by descending count.
pairCount = (
    pairs
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda pair_count: pair_count[1], ascending=False)
)
pairCount.take(30)

[(('is', 'är'), 4718),
 (('closed.', 'avslutad.'), 2971),
 (('(applause)', '(applåder)'), 2546),
 (('.', '.'), 2107),
 (('that', 'det'), 1495),
 (('we', 'vi'), 1443),
 (('the', 'jag'), 1336),
 (('is', 'debatten'), 1327),
 (('debate', 'förklarar'), 1319),
 (('the', 'debatten'), 1253),
 (('is', 'härmed'), 1240),
 (('debate', 'är'), 1211),
 (('i', 'jag'), 1179),
 (('this', 'detta'), 1111),
 (('it', 'det'), 986),
 (('written', 'skriftliga'), 893),
 (('(rule', '(artikel'), 893),
 (('statements', 'förklaringar'), 801),
 (('not', 'inte'), 742),
 (('are', 'är'), 693),
 (('this', 'det'), 669),
 (('a', 'en'), 656),
 (('there', 'det'), 644),
 (('question', 'fråga'), 564),
 (('142)', '142)'), 557),
 (('by', 'från'), 547),
 (('no', 'nr'), 537),
 (('that', 'detta'), 512),
 (('have', 'har'), 475),
 (('are', 'finns'), 467)]

In [None]:
# Release the cores for another application!
# Stopping the SparkContext ends this application; run the session cell again
# before re-running any of the cells above.
spark_context.stop()