In [3]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session on the standalone master at 192.168.2.87.
# The configuration switches on dynamic allocation and the shuffle service,
# sets the executor idle timeout to 30s and requests 4 cores per executor.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.87:7077")
    .appName("hadoop_example")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

# SparkContext of the session; the cells below use it for the RDD API.
spark_context = spark_session.sparkContext

In [99]:
# Read the English transcript from HDFS as an RDD of lines and report its size.
linesEn = spark_context.textFile("hdfs://192.168.2.87:9000/europarl/europarl-v7.ro-en.en")
en_line_total = linesEn.count()
print("Lines in english transcript : {}".format(en_line_total))

# Read the Romanian transcript the same way and report its size.
linesRo = spark_context.textFile("hdfs://192.168.2.87:9000/europarl/europarl-v7.ro-en.ro")
ro_line_total = linesRo.count()
print("Lines in romanian transcript : {}".format(ro_line_total))

Lines in english transcript : 399375
Lines in romanian transcript : 399375


In [100]:
# Report the partition count of each transcript RDD and their sum.
en_partitions = linesEn.getNumPartitions()
ro_partitions = linesRo.getNumPartitions()
print("Number of partitions in english transcript : {}".format(en_partitions))
print("Number of partitions in romanian transcript : {}".format(ro_partitions))
print("Total number of partitions of both transcript : {}".format(en_partitions + ro_partitions))


Number of partitions in english transcript : 2
Number of partitions in romanian transcript : 2
Total number of partitions of both transcript : 4


In [111]:
def splitLower(rdd):
    """Lowercase every line of ``rdd`` and split it into words on single spaces.

    Parameters
    ----------
    rdd : any object exposing ``map`` whose elements are strings (one per line).

    Returns
    -------
    Whatever ``rdd.map`` returns; each element is the list of lowercase tokens
    of the corresponding line.

    Note: the split is on the literal ``' '`` character, so consecutive spaces
    produce empty-string tokens and an empty line becomes ``['']`` (not ``[]``).
    """
    def tokenize(line):
        # Lowercase first, then cut on single spaces.
        return line.lower().split(' ')

    return rdd.map(tokenize)
              


In [112]:
# First ten tokenised, lowercased lines of the English transcript.
en_sample = splitLower(linesEn).take(10)
print(en_sample)

# First ten tokenised, lowercased lines of the Romanian transcript.
ro_sample = splitLower(linesRo).take(10)
print(ro_sample)


[['membership', 'of', 'parliament:', 'see', 'minutes'], ['approval', 'of', 'minutes', 'of', 'previous', 'sitting:', 'see', 'minutes'], ['membership', 'of', 'parliament:', 'see', 'minutes'], ['verification', 'of', 'credentials:', 'see', 'minutes'], ['documents', 'received:', 'see', 'minutes'], ['written', 'statements', 'and', 'oral', 'questions', '(tabling):', 'see', 'minutes'], ['petitions:', 'see', 'minutes'], ['texts', 'of', 'agreements', 'forwarded', 'by', 'the', 'council:', 'see', 'minutes'], ['action', 'taken', 'on', "parliament's", 'resolutions:', 'see', 'minutes'], ['agenda', 'for', 'next', 'sitting:', 'see', 'minutes']]
[['componenţa', 'parlamentului:', 'a', 'se', 'vedea', 'procesul-verbal'], ['aprobarea', 'procesului-verbal', 'al', 'şedinţei', 'precedente:', 'a', 'se', 'vedea', 'procesul-verbal'], ['componenţa', 'parlamentului:', 'a', 'se', 'vedea', 'procesul-verbal'], ['verificarea', 'prerogativelor:', 'a', 'se', 'vedea', 'procesul-verbal'], ['depunere', 'de', 'documente:', '

In [113]:
# Line count of the English transcript after it has gone through splitLower().
en_tokenised_total = splitLower(linesEn).count()
print("Lines in english transcript after using splitLower() : {}".format(en_tokenised_total))

# Line count of the Romanian transcript after it has gone through splitLower().
ro_tokenised_total = splitLower(linesRo).count()
print("Lines in romanian transcript after using splitLower() : {}".format(ro_tokenised_total))

Lines in english transcript after using splitLower() : 399375
Lines in romanian transcript after using splitLower() : 399375


In [142]:
from operator import add

def topWords(lines, n=10):
    """Return the ``n`` most frequent tokens of ``lines`` as ``(word, count)`` pairs.

    Parameters
    ----------
    lines : RDD of strings, one transcript line per element.
    n : how many of the most frequent words to return (default 10).

    Returns
    -------
    list of ``(word, count)`` tuples, highest count first.

    The lines are tokenised with ``splitLower``, flattened into single words,
    counted with ``reduceByKey(add)`` and the top ``n`` are picked with
    ``takeOrdered``, which avoids sorting the whole vocabulary just to keep
    a handful of entries.
    """
    return (
        splitLower(lines)
        .flatMap(lambda words: words)           # one element per word
        .map(lambda word: (word, 1))
        .reduceByKey(add)                       # total occurrences per word
        .takeOrdered(n, key=lambda pair: -pair[1])
    )


# The ten most frequently occurring words in the English transcript
print(topWords(linesEn))

# The ten most frequently occurring words in the Romanian transcript
print(topWords(linesRo))
                    

[('the', 740292), ('of', 357378), ('to', 322434), ('and', 288989), ('in', 235625), ('a', 162975), ('that', 159878), ('is', 155300), ('for', 121107), ('we', 108204)]
[('de', 454842), ('în', 342724), ('a', 221238), ('să', 205834), ('şi', 184005), ('pentru', 135231), ('la', 131109), ('care', 129987), ('și', 118273), ('că', 105950)]


In [181]:

# A4.1 / A4.2: number every tokenised line with zipWithIndex() and swap the
# pair so the line number becomes the key: (line_number, words).
Index_En = splitLower(linesEn).zipWithIndex().map(lambda pair: (pair[1], pair[0]))

Index_Ro = splitLower(linesRo).zipWithIndex().map(lambda pair: (pair[1], pair[0]))

# Show the first five keyed lines of each transcript.
print(Index_Ro.take(5))
print(Index_En.take(5))

    


[(0, ['componenţa', 'parlamentului:', 'a', 'se', 'vedea', 'procesul-verbal']), (1, ['aprobarea', 'procesului-verbal', 'al', 'şedinţei', 'precedente:', 'a', 'se', 'vedea', 'procesul-verbal']), (2, ['componenţa', 'parlamentului:', 'a', 'se', 'vedea', 'procesul-verbal']), (3, ['verificarea', 'prerogativelor:', 'a', 'se', 'vedea', 'procesul-verbal']), (4, ['depunere', 'de', 'documente:', 'a', 'se', 'vedea', 'procesul-verbal'])]
[(0, ['membership', 'of', 'parliament:', 'see', 'minutes']), (1, ['approval', 'of', 'minutes', 'of', 'previous', 'sitting:', 'see', 'minutes']), (2, ['membership', 'of', 'parliament:', 'see', 'minutes']), (3, ['verification', 'of', 'credentials:', 'see', 'minutes']), (4, ['documents', 'received:', 'see', 'minutes'])]


In [220]:
# A4.3: join the two keyed transcripts on line number, giving
# (line_number, (romanian_words, english_words)) pairs.
# The keys come from zipWithIndex(), so each line number occurs once per side
# and the join result already has unique keys; no reduceByKey step is needed
# (it would only add a shuffle, and concatenating the value tuples would break
# the (romanian, english) pair shape if it ever ran).
Index_Comb = Index_Ro.join(Index_En)

# Show the pair with the smallest line number.
print(Index_Comb.takeOrdered(1))

[(0, (['componenţa', 'parlamentului:', 'a', 'se', 'vedea', 'procesul-verbal'], ['membership', 'of', 'parliament:', 'see', 'minutes'])), (1, (['aprobarea', 'procesului-verbal', 'al', 'şedinţei', 'precedente:', 'a', 'se', 'vedea', 'procesul-verbal'], ['approval', 'of', 'minutes', 'of', 'previous', 'sitting:', 'see', 'minutes'])), (2, (['componenţa', 'parlamentului:', 'a', 'se', 'vedea', 'procesul-verbal'], ['membership', 'of', 'parliament:', 'see', 'minutes'])), (3, (['verificarea', 'prerogativelor:', 'a', 'se', 'vedea', 'procesul-verbal'], ['verification', 'of', 'credentials:', 'see', 'minutes'])), (4, (['depunere', 'de', 'documente:', 'a', 'se', 'vedea', 'procesul-verbal'], ['documents', 'received:', 'see', 'minutes']))]


In [179]:
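# A4.4 - commented-out filter attempt (not applied): .filter(lambda x : [] not in x)
# Note: split(' ') turns an empty line into [''], never [], so this test would not catch empty lines.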

In [221]:
# Fetch the joined pair with the smallest line number and display it.
first_pair = Index_Comb.takeOrdered(1)
print(first_pair)

[(0, (['componenţa', 'parlamentului:', 'a', 'se', 'vedea', 'procesul-verbal'], ['membership', 'of', 'parliament:', 'see', 'minutes']))]


In [223]:
# A4.3 + A4.4: join the transcripts on line number and keep only the pairs in
# which BOTH sentences contain at least one real word.
#
# Each joined element is (line_number, (romanian_words, english_words)).
# splitLower() splits on ' ', so an empty line becomes [''] and a line made of
# spaces becomes a list of empty strings; any(words) is False for those lists
# (and for []), which is exactly the "empty/missing sentence" case to drop.
# Line numbers come from zipWithIndex() and are unique, so the join needs no
# reduceByKey afterwards.
Index_Comb = (
    Index_Ro.join(Index_En)
    .filter(lambda entry: any(entry[1][0]) and any(entry[1][1]))
)

# Show the surviving pair with the smallest line number.
print(Index_Comb.takeOrdered(1))

[(0, (['componenţa', 'parlamentului:', 'a', 'se', 'vedea', 'procesul-verbal'], ['membership', 'of', 'parliament:', 'see', 'minutes']))]
