In [75]:
from pyspark import SparkConf, SparkContext
import re
import csv
import time
import pandas as pd

In [105]:
# Stop the active SparkContext so that a new one can be created afterwards.
# NOTE(review): `sc` is only assigned further down, in the "Configure Spark" cell,
# so on a fresh kernel this cell presumably fails with NameError unless the
# kernel pre-defines `sc` — TODO confirm and move it to the end of the notebook.
sc.stop()

# Task 1.2

In [2]:
# Spark configuration: local master with 3 worker threads, with explicit
# executor and driver memory settings.
conf = (
    SparkConf()
    .setMaster("local[3]")
    .setAppName("LocalSparkCluster")
    .set("spark.executor.memory", "12g")
    .set("spark.driver.memory", "8g")
)

# The SparkContext used by every RDD cell below.
sc = SparkContext(conf=conf)

**Loading the users' paper libraries**

In [91]:
def _parse_library_line(line):
    """Turn one `user;paper,paper,...` line into `(user, [paper, ...])`."""
    fields = line.split(";")
    return fields[0], fields[1].split(",")


start_time = time.time()

# One record per line of the users' libraries file.
users_papers_rdd = sc.textFile("users_libraries.txt")

# (user_hash, [paper_id, ...]) with the paper ids still as strings.
users_rdd = users_papers_rdd.map(_parse_library_line)

# Remove duplicate paper ids inside each library; cached because later cells reuse it.
split_users_rdd = users_rdd.mapValues(lambda paper_ids: list(set(paper_ids))).cache()

# The sample is the action that actually triggers the computation being timed.
out = split_users_rdd.takeSample(withReplacement=False, num=50, seed=250)
end_time = time.time()
out

[('b87eeeafcb9eefd8b09123f579bc8b02',
  ['10419341',
   '7011442',
   '9708628',
   '2932009',
   '1272533',
   '6499860',
   '4222235',
   '4504728',
   '832173',
   '3006077',
   '463787',
   '3039910',
   '2274657',
   '8924794',
   '3835932',
   '1809040',
   '3438983',
   '822669',
   '3150416',
   '4810395',
   '7355618',
   '9266822',
   '3681665',
   '4565518',
   '3391362',
   '2291418',
   '9474823',
   '9176032',
   '9958805',
   '5031916',
   '3812641',
   '9313047',
   '8798552',
   '7358340',
   '1229798',
   '9008059',
   '1097083',
   '106615',
   '10161003',
   '4170553',
   '9500423',
   '8982212',
   '9428633',
   '6434409',
   '2369865',
   '1959297',
   '7933814',
   '1643685',
   '2311330',
   '4537943',
   '9144429',
   '6016560',
   '4645085',
   '1732275',
   '5699291',
   '6391345',
   '3681053',
   '1455378',
   '1103165',
   '251',
   '3578200',
   '767656',
   '5435806',
   '8668153',
   '539244',
   '3341129',
   '7576911',
   '3271136',
   '1357150',
   '

In [92]:
# Wall-clock duration of the users-library cell (includes its takeSample action).
elapsed_seconds = end_time - start_time
print(f'time taken: {elapsed_seconds} seconds')

time taken: 1.2544267177581787 seconds


**Getting papers words**

In [93]:
start_time = time.time()

# Raw CSV lines of the papers file.
papers_rdd = sc.textFile("papers.csv")

# Parse each line on its own with the csv module so quoted fields are honoured.
parsed_rdd = papers_rdd.map(lambda line: next(csv.reader([line])))

# Keep (first column, last column) = (paper id, abstract).
extract_abstract_rdd = parsed_rdd.map(lambda row: (row[0], row[-1]))

# Split the abstract on the delimiter class and keep only non-empty, purely
# alphabetic tokens; cached because several later cells start from this RDD.
split_papers_rdd = extract_abstract_rdd.mapValues(
    lambda abstract: [
        token
        for token in re.split("[' ()}{,.?-]", abstract)
        if token and token.isalpha()
    ]
).cache()

# The sample is the action that actually triggers the computation being timed.
out = split_papers_rdd.takeSample(withReplacement=False, num=50, seed=250)
end_time = time.time()
out

[('6262360', []),
 ('4255578',
  ['the',
   'information',
   'carrier',
   'of',
   'today',
   's',
   'communications',
   'a',
   'weak',
   'pulse',
   'of',
   'light',
   'is',
   'an',
   'intrinsically',
   'quantum',
   'object',
   'as',
   'a',
   'consequence',
   'complete',
   'information',
   'about',
   'the',
   'pulse',
   'cannot',
   'be',
   'perfectly',
   'recorded',
   'in',
   'a',
   'classical',
   'memory',
   'even',
   'in',
   'principle',
   'in',
   'the',
   'field',
   'of',
   'quantum',
   'information',
   'this',
   'has',
   'led',
   'to',
   'the',
   'long',
   'standing',
   'challenge',
   'of',
   'how',
   'to',
   'achieve',
   'a',
   'high',
   'fidelity',
   'transfer',
   'of',
   'an',
   'independently',
   'prepared',
   'quantum',
   'state',
   'of',
   'light',
   'onto',
   'an',
   'atomic',
   'quantum',
   'here',
   'we',
   'propose',
   'and',
   'experimentally',
   'demonstrate',
   'a',
   'protocol',
   'for',
   's

In [94]:
# Wall-clock duration of the papers-words cell (includes its takeSample action).
elapsed_seconds = end_time - start_time
print(f'time taken: {elapsed_seconds} seconds')

time taken: 11.804882049560547 seconds


# Task 1.3

**Getting stopwords and removing them from papers words**

In [95]:
# Load the stop word list: one entry per line of the file.
with open('stopwords_en.txt', 'r') as stopwords_file:
    lines = stopwords_file.readlines()

# Same entries with the trailing newline character removed.
lines_without_newline = [entry.rstrip('\n') for entry in lines]
    

In [96]:
# Broadcast the stop words as a frozenset so that the `in` / `not in` tests run
# for every token on the executors are hash lookups instead of linear scans of
# a list.
# NOTE: despite its name, `stop_word_rdd` is a Broadcast variable, not an RDD;
# the name is kept because other cells read `stop_word_rdd.value`.
stop_word_rdd = sc.broadcast(frozenset(lines_without_newline))
def stopwords_filter(words_list):
    """Return the entries of `words_list` that are not stop words.

    The stop words are read from the broadcast variable `stop_word_rdd`.
    Order and duplicates of the kept words are preserved; the result is
    always a new list.
    """
    stopwords = stop_word_rdd.value
    kept_words = []
    for candidate in words_list:
        if candidate in stopwords:
            continue
        kept_words.append(candidate)
    return kept_words

**Joining users with the text of their papers, then counting words**

In [97]:
start_time = time.time()

# Remove stop words from each paper's token list; the paper id stays the key.
papres_no_stopwords_rdd = split_papers_rdd.map(
    lambda paper: (paper[0], stopwords_filter(paper[1]))
)

# Keep only the records whose value is a list.
filtered_papres_no_stopwords_rdd = papres_no_stopwords_rdd.filter(
    lambda paper: isinstance(paper[1], list)
)

# Pair every user record with every paper record, keep the pairs where the
# paper id appears in the user's library, and emit (user, paper words).
users_abstracts_rdd = (
    split_users_rdd
    .cartesian(filtered_papres_no_stopwords_rdd)
    .filter(lambda pair: pair[1][0] in pair[0][1])
    .map(lambda pair: (pair[0][0], pair[1][1]))
    .cache()
)

# Concatenate the word lists of all papers belonging to the same user.
users_words_rdd = users_abstracts_rdd.reduceByKey(lambda left, right: left + right)

# Count occurrences per (user, word) and re-key the result by user.
word_counts_rdd = (
    users_words_rdd
    .flatMap(lambda user: [((user[0], word), 1) for word in user[1]])
    .reduceByKey(lambda left, right: left + right)
    .map(lambda counted: (counted[0][0], [counted[0][1], counted[1]]))
)

# For each user keep the 10 entries with the highest count, then drop the counts.
sorted_word_counts_rdd = (
    word_counts_rdd
    .groupByKey()
    .mapValues(lambda counts: sorted(counts, key=lambda entry: entry[1], reverse=True)[:10])
    .mapValues(lambda top_entries: [entry[0] for entry in top_entries])
)

# The sample is the action that actually triggers the computation being timed.
out = sorted_word_counts_rdd.takeSample(withReplacement=False, num=50, seed=250)
end_time = time.time()
out

[('8086a01986fde083a40ce1f2b732224f',
  ['quantum',
   'single',
   'photon',
   'cavity',
   'hole',
   'photons',
   'state',
   'entanglement',
   'atoms',
   'coupling']),
 ('07c74cc7237ec4dfaba6f45ddd2ab73a',
  ['type',
   'haskell',
   'language',
   'types',
   'functional',
   'languages',
   'programming',
   'paper',
   'monads',
   'based']),
 ('68f69281d42236c349e51ffa315cc1e4',
  ['method',
   'topologies',
   'results',
   'neat',
   'evolving',
   'neuroevolution',
   'illustrated',
   'claim',
   'important',
   'computer']),
 ('726115e5c6fc3c7f5dd64d9c408c0471',
  ['games',
   'virtual',
   'worlds',
   'game',
   'design',
   'user',
   'play',
   'online',
   'world',
   'life']),
 ('351285e9f15a3842febf1715ac7fd677',
  ['patients',
   'fontan',
   'survival',
   'heart',
   'term',
   'arrhythmia',
   'years',
   'follow',
   'long',
   'congenital']),
 ('80f2526f6c1c97232f7d6049760d9066',
  ['patients',
   'clinical',
   'treatment',
   'research',
   'criteria',
 

In [98]:
# Wall-clock duration of the cartesian-based cell (includes its takeSample action).
elapsed_seconds = end_time - start_time
print(f'time taken: {elapsed_seconds} seconds')

time taken: 5694.382145404816 seconds


**Alternate implementation**

In [186]:
start_time = time.time()

# Remove stop words from each paper's token list and key the papers by an
# integer paper id so that both sides of the join use the same key type.
papres_no_stopwords_rdd = split_papers_rdd.map(
    lambda paper: (int(paper[0]), stopwords_filter(paper[1]))
)

# Keep only the records whose value is a list.
filtered_papres_no_stopwords_rdd = papres_no_stopwords_rdd.filter(
    lambda paper: isinstance(paper[1], list)
)

# One (paper_id, user) record per paper in each user's library.
# This intermediate has its own name instead of temporarily occupying
# `users_abstracts_rdd`, which holds a differently shaped RDD further down.
paper_user_pairs_rdd = (
    split_users_rdd
    .flatMapValues(lambda paper_ids: paper_ids)
    .map(lambda user_paper: (int(user_paper[1]), user_paper[0]))
)

# (paper_id, (user, paper words)); cached because it is consumed by more than one cell.
# The previous version ran `joint_rdd.takeSample(...)` here and threw the result
# away, which added extra Spark jobs inside the timed region; it was removed.
joint_rdd = paper_user_pairs_rdd.join(filtered_papres_no_stopwords_rdd).cache()

# Drop the paper id: the join value is already the (user, paper words) pair.
users_abstracts_rdd = joint_rdd.values()

# One (user, word) record per word occurrence.
users_words_rdd = users_abstracts_rdd.flatMapValues(lambda words: words)

# Count occurrences per (user, word) and re-key as (user, [word, count]);
# cached because other cells build on it.
users_words_counts = (
    users_words_rdd
    .map(lambda user_word: (user_word, 1))
    .reduceByKey(lambda left, right: left + right)
    .map(lambda counted: (counted[0][0], [counted[0][1], counted[1]]))
    .cache()
)

# For each user keep the 10 entries with the highest count, then drop the counts.
sorted_word_counts_rdd = (
    users_words_counts
    .groupByKey()
    .mapValues(lambda counts: sorted(counts, key=lambda entry: entry[1], reverse=True)[:10])
    .mapValues(lambda top_entries: [entry[0] for entry in top_entries])
)

# The sample is the action that actually triggers the computation being timed.
out = sorted_word_counts_rdd.takeSample(False, 50, 250)
end_time = time.time()
out

[('efe5909b8df510080677c1eb3326cd74',
  ['library',
   'web',
   'libraries',
   'paper',
   'digital',
   'risks',
   'topic',
   'xml',
   'maps',
   'approaches']),
 ('626e6c770b8abb28deb70801a082046f',
  ['document',
   'image',
   'classifier',
   'choice',
   'classification',
   'classes',
   'diverse',
   'performance',
   'features',
   'due']),
 ('fb89af47fc70177c60bf3c0f6d33eca5',
  ['technology',
   'social',
   'organizational',
   'research',
   'concept',
   'paper',
   'model',
   'organization',
   'control',
   'author']),
 ('83bcbc0b8abdc79bd96c052051270cba',
  ['fermi',
   'temperature',
   'energy',
   'resonance',
   'shape',
   'surface',
   'gap',
   'critical',
   'quantum',
   'shows']),
 ('3828f32eafd4e7e098a3824bdbf5a4ae', ['abstract']),
 ('502e6a799d552e73893bddccde18076f',
  ['based',
   'insider',
   'system',
   'detection',
   'systems',
   'attack',
   'models',
   'paper',
   'anomaly',
   'data']),
 ('cadc7039b893c9766735dda2f46bb581',
  ['learning',

In [187]:
# Wall-clock duration of the join-based cell (includes its takeSample action).
elapsed_seconds = end_time - start_time
print(f'time taken: {elapsed_seconds} seconds')

time taken: 1006.0912668704987 seconds


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 53304)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/opt/conda/lib/python3.11/site-packages/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/opt/conda/lib/python3.11/site-packages/pyspark/accumulators.py", line 253, in poll
    if func():
       ^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/pyspark/accumulators.py", line 257, in accum_updates
    num_u

In [182]:
# Same steps as the word-count part of the "Alternate implementation" cell, run on their own.
# Drop the join key and keep (user, paper words).
users_abstracts_rdd = joint_rdd.map(lambda joined: (joined[1][0], joined[1][1]))

# One (user, word) record per word occurrence.
users_words_rdd = users_abstracts_rdd.flatMapValues(lambda words: words)

# Count occurrences per (user, word) and re-key as (user, [word, count]).
users_words_counts = (
    users_words_rdd
    .map(lambda user_word: (user_word, 1))
    .reduceByKey(lambda left, right: left + right)
    .map(lambda counted: (counted[0][0], [counted[0][1], counted[1]]))
    .cache()
)
users_words_counts.takeSample(withReplacement=False, num=50, seed=250)

[('68278f345a966201886e7323412e91c2', ['concert', 1]),
 ('a0978625a886f8dd1c8e3bf0e2d033af', ['closes', 1]),
 ('d1e0de2a945f786f2796c962683de180', ['rarely', 1]),
 ('73f330836c8e6afc06e9ae222b6cef0a', ['rates', 1]),
 ('da2cf87c99168d853a9956de98ff92c6', ['reconfigurations', 1]),
 ('5d943ca74652cace03cc8b4481c47c95', ['tlss', 1]),
 ('d78a37bce0b51f931221e4ccdcc8a6f1', ['working', 2]),
 ('5db4ee3dee6c35e90321811df0490196', ['social', 1]),
 ('9208e273a85b8f60ea552ca766decefd', ['supplemented', 1]),
 ('1507e0abf427bfc9bc3d017d63644092', ['regression', 43]),
 ('9ebb2637a71329a1998ea6f459718dad', ['medium', 4]),
 ('f268e3f8f8f60e4bf0c6d312bce3229a', ['neural', 8]),
 ('4bebe20c069262d64500cee5e13cd49b', ['bsc', 1]),
 ('4c6d35140374098baacf98c1757539a3', ['scientific', 1]),
 ('30fe80352cd59aefbec8fc542f7c6a27', ['focused', 3]),
 ('9e2903b274a180b517ffa9fb305b3f4d', ['annotation', 4]),
 ('44f452b12b9dc82567fecfbac873a9b5', ['comprise', 2]),
 ('56610ddbb9f53f9fb02b783381162139', ['past', 1]),
 (

In [183]:
# Per user: order the [word, count] entries by count (highest first), keep the
# first 10, then keep only the words.
sorted_word_counts_rdd = (
    users_words_counts
    .groupByKey()
    .mapValues(lambda counts: sorted(counts, key=lambda entry: entry[1], reverse=True)[:10])
    .mapValues(lambda top_entries: [entry[0] for entry in top_entries])
)
sorted_word_counts_rdd.takeSample(withReplacement=False, num=50, seed=250)


[('973f4a47102133b7f8d76e56524e680d',
  ['microtubule',
   'assembly',
   'end',
   'gtp',
   'gmpcp',
   'tubulin',
   'induced',
   'glycerol',
   'growing',
   'entropy']),
 ('70564a29db250b6c4a93ade92fc27743',
  ['ligo',
   'detectors',
   'sensitivity',
   'gravitational',
   'wave',
   'advanced',
   'data',
   'initial',
   'mode',
   'run']),
 ('ec1bf390a55832b46c543b14db6b801d',
  ['vehicle',
   'model',
   'mpc',
   'control',
   'approach',
   'dynamic',
   'trajectory',
   'visual',
   'systems',
   'steering']),
 ('4bcafac527c3b63ab0fb41094a388962',
  ['reaction',
   'metal',
   'nmr',
   'catalytic',
   'catalyst',
   'ionic',
   'formation',
   'chemical',
   'developed',
   'bond']),
 ('0fe7814a82880b84b16bb944b99c9d4d',
  ['gender',
   'migration',
   'networks',
   'research',
   'structure',
   'author',
   'attention',
   'argues',
   'conceptualize',
   'inequality']),
 ('90612e151b518da510eb2baa0fdbda9b',
  ['psoriatic',
   'immune',
   'arthritis',
   'klks',
   

In [185]:
# Inspect the top-10 [word, count] entries per user with the counts still attached.
(
    users_words_counts
    .groupByKey()
    .mapValues(lambda counts: sorted(counts, key=lambda entry: entry[1], reverse=True)[:10])
    .takeSample(withReplacement=False, num=50, seed=250)
)

[('4edac041314fd5c3d05a8b8dfca339ab',
  [['code', 7],
   ['system', 5],
   ['arbitrariness', 5],
   ['world', 4],
   ['internal', 4],
   ['properties', 4],
   ['reactive', 3],
   ['model', 3],
   ['requires', 3],
   ['systems', 3]]),
 ('be1037d8b24cf7e057bd8150ae91b868',
  [['breastfeeding', 19],
   ['benefits', 5],
   ['infant', 5],
   ['health', 4],
   ['statement', 3],
   ['academy', 3],
   ['pediatrics', 3],
   ['american', 3],
   ['medical', 3],
   ['infants', 3]]),
 ('e04a5b0e60cf273da934751ceccd58cf',
  [['tv', 58],
   ['user', 26],
   ['mobile', 22],
   ['social', 21],
   ['information', 21],
   ['iptv', 21],
   ['services', 20],
   ['system', 18],
   ['users', 18],
   ['design', 14]]),
 ('e91fce6ffec7ecbbfd580fa1ed1384d1',
  [['data', 24],
   ['database', 14],
   ['mapreduce', 13],
   ['large', 11],
   ['molecular', 9],
   ['systems', 9],
   ['gene', 8],
   ['processing', 8],
   ['ncbi', 7],
   ['biological', 6]]),
 ('fca9bb1e0d0e052e6230ec2ea4437a08',
  [['identity', 30],
   

# Task 1.4

**Number of distinct users and items**

In [74]:
# Number of distinct user keys in the users' libraries RDD.
distinct_user_count = split_users_rdd.keys().distinct().count()
print(f"Number of distinct users is {distinct_user_count}")

Number of distinct users is 4


In [75]:
# Number of distinct paper keys in the tokenised papers RDD.
distinct_paper_count = split_papers_rdd.keys().distinct().count()
print(f"Number of distinct papers is {distinct_paper_count}")

Number of distinct papers is 172079


**Users ratings statistics**

In [61]:
# (user, size of the user's library); cached because the statistics cell runs
# several actions over it.
users_stats = split_users_rdd.mapValues(len).cache()

In [62]:
# Library sizes only, without the user keys.
ratings_per_user = users_stats.values()

print(f"Max number of ratings a user has given is {ratings_per_user.max()}")
print(f"Min number of ratings a user has given is {ratings_per_user.min()}")
print(f"Average number of users ratings is {ratings_per_user.mean()}")
print(f"Standard deviation for ratings of users is {ratings_per_user.stdev()}")

Max number of ratings a user has given is 170
Min number of ratings a user has given is 1
Average number of users ratings is 59.00000000000001
Standard deviation for ratings of users is 65.95832017266662


**Papers ratings statistics**

In [71]:
# (paper_id, number of user libraries that contain the paper); cached because
# the statistics cell runs several actions over it.
papers_stats = (
    split_users_rdd
    .flatMap(lambda user: [(paper_id, 1) for paper_id in user[1]])
    .reduceByKey(lambda left, right: left + right)
    .cache()
)

In [72]:
# Per-paper rating counts only, without the paper keys.
ratings_per_paper = papers_stats.values()

# The labels previously repeated the per-user wording ("ratings a user has
# given") although these numbers are computed per paper.
print(f"Max number of ratings a paper has received is {ratings_per_paper.max()}")
print(f"Min number of ratings a paper has received is {ratings_per_paper.min()}")
print(f"Average number of ratings per paper is {ratings_per_paper.mean()}")
print(f"Standard deviation of ratings per paper is {ratings_per_paper.stdev()}")

Max number of ratings a user has given is 1
Min number of ratings a user has given is 1
Average number of users ratings is 1.0
Standard deviation for ratings of users is 0.0


# Task 1.5

In [167]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col, explode, split, udf, row_number
from pyspark.sql.types import ArrayType, IntegerType, StringType


In [101]:
# Create a SparkSession with similar configuration
spark = SparkSession.builder \
    .appName("LocalSparkCluster") \
    .master("local[3]") \
    .config("spark.executor.memory", "12g") \
    .config("spark.driver.memory", "8g") \
    .getOrCreate()

# Access the SparkContext
sc_df = spark.sparkContext

In [171]:
# DDL schema for papers.csv (the file is read without a header row).
# Fixes: the fourth column was spelled with an Arabic tatweel (U+0640) instead
# of an underscore, and the backslash line continuations placed the source
# indentation inside the schema string.
papers_schema = (
    "`paper_id` INT, `type` STRING, `journal` STRING, `book_title` STRING, "
    "`series` STRING, `publisher` STRING, `pages` INT, `volume` INT, "
    "`number` INT, `year` INT, `month` STRING, `postedat` STRING, "
    "`address` STRING, `title` STRING, `abstract` STRING"
)

# Only the paper id and the abstract are used by the following cells.
papers_df = (
    spark.read
    .csv('papers.csv', header=False, schema=papers_schema)
    .select('paper_id', 'abstract')
)
papers_df.show(5, truncate=True)

+--------+--------------------+
|paper_id|            abstract|
+--------+--------------------+
|   80546|the genetic code ...|
| 5842862|choosing good pro...|
| 1242600|although scientis...|
| 3467077|"many scientists ...|
|  309395|there is increasi...|
+--------+--------------------+
only showing top 5 rows



In [172]:
# Each line of the libraries file is `user_id;paper,paper,...`.
users_library_schema = "`user_id` STRING, `papers` STRING"

# Read with ';' as the separator, then turn the comma-separated paper ids into
# an array of integers.
users_library_df = (
    spark.read
    .csv("users_libraries.txt", schema=users_library_schema, sep=";", header=False)
    .withColumn("papers", split(col("papers"), ',').cast(ArrayType(IntegerType())))
)
users_library_df.show(5, truncate=True)

+--------------------+--------------------+
|             user_id|              papers|
+--------------------+--------------------+
|28d3f81251d94b097...|[3929762, 503574,...|
|d0c9aaa788153daea...|[2080631, 6343346...|
|f05bcffe7951de9e5...|[1158654, 478707,...|
|ca4f1ba4094011d9a...|            [278019]|
|d1d41a15201915503...|[6610569, 6493797...|
+--------------------+--------------------+
only showing top 5 rows



# Task 1.6

In [173]:
def abstract_processing(x):
    """Tokenise one abstract and drop stop words and non-alphabetic tokens.

    Args:
        x: the abstract text, or None.

    Returns:
        None when `x` is None; otherwise the list of tokens, in their original
        order, that are not in `stop_word_rdd.value` and consist only of
        alphabetic characters.
    """
    if x is None:
        return None

    stopwords = stop_word_rdd.value
    return [
        token
        for token in re.split("[' ()}{,.?-]", x)
        if token not in stopwords and token.isalpha()
    ]

# Wrap the tokeniser as a Spark UDF that returns an array of strings.
split_function = udf(abstract_processing, returnType=ArrayType(StringType()))

# Replace the raw abstract text with its token array.
# NOTE(review): `papers_df` is overwritten in place, so re-running this cell
# alone would presumably feed the token array back into the UDF — confirm.
tokenized_abstract = split_function(papers_df["abstract"])
papers_df = papers_df.withColumn("abstract", tokenized_abstract)
papers_df.show(n=5)

+--------+--------------------+
|paper_id|            abstract|
+--------+--------------------+
|   80546|[genetic, code, r...|
| 5842862|[choosing, good, ...|
| 1242600|[scientists, typi...|
| 3467077|[scientists, mana...|
|  309395|[increasing, conc...|
+--------+--------------------+
only showing top 5 rows



In [174]:
# One row per (user, paper): expand the `papers` array into individual rows.
one_paper_per_row = explode(col("papers"))
users_paper_pair = users_library_df.withColumn("papers", one_paper_per_row)
users_paper_pair.show(n=5)

+--------------------+-------+
|             user_id| papers|
+--------------------+-------+
|28d3f81251d94b097...|3929762|
|28d3f81251d94b097...| 503574|
|28d3f81251d94b097...|5819422|
|28d3f81251d94b097...|4238883|
|28d3f81251d94b097...|5788061|
+--------------------+-------+
only showing top 5 rows



In [175]:
# Attach each paper's token array to the (user, paper) rows.
users_abstract = users_paper_pair.join(
    papers_df,
    on=users_paper_pair["papers"] == papers_df["paper_id"],
    how="inner",
)

# One row per (user, word occurrence).
users_abstract_words = users_abstract.select(
    "user_id",
    explode(col("abstract")).alias("abstract_words"),
)

# Number of occurrences of each word per user.
users_words_count = users_abstract_words.groupBy("user_id", "abstract_words").count()

In [176]:
# Window over each user's rows, ordered by descending word count.
windowPartition = (
    Window
    .partitionBy("user_id")
    .orderBy(F.desc("count"))
)

# Position of each word inside its user's window (1 = highest count).
users_words_count = users_words_count.withColumn(
    "ranking",
    row_number().over(windowPartition),
)

In [177]:
# Preview the ranked per-user word counts (`show()` prints the first rows only).
users_words_count.show()

+--------------------+--------------+-----+-------+
|             user_id|abstract_words|count|ranking|
+--------------------+--------------+-----+-------+
|00095808cdc611fb5...|        errors|    5|      1|
|00095808cdc611fb5...|          text|    3|      2|
|00095808cdc611fb5...|   information|    3|      3|
|00095808cdc611fb5...|        impact|    2|      4|
|00095808cdc611fb5...|           web|    2|      5|
|00095808cdc611fb5...|          list|    2|      6|
|00095808cdc611fb5...|    department|    2|      7|
|00095808cdc611fb5...|   recognition|    2|      8|
|00095808cdc611fb5...| automatically|    2|      9|
|00095808cdc611fb5...|         error|    2|     10|
|00095808cdc611fb5...|          site|    2|     11|
|00095808cdc611fb5...|          data|    2|     12|
|00095808cdc611fb5...|     character|    2|     13|
|00095808cdc611fb5...|       problem|    2|     14|
|00095808cdc611fb5...|      analyzed|    1|     15|
|00095808cdc611fb5...|       induced|    1|     16|
|00095808cdc

In [178]:
# Keep only the rows ranked 1..10 for each user and preview them.
top_ten_per_user = users_words_count.where(col("ranking") <= 10)
top_ten_per_user.show()

+--------------------+--------------+-----+-------+
|             user_id|abstract_words|count|ranking|
+--------------------+--------------+-----+-------+
|00095808cdc611fb5...|        errors|    5|      1|
|00095808cdc611fb5...|          text|    3|      2|
|00095808cdc611fb5...|   information|    3|      3|
|00095808cdc611fb5...|        impact|    2|      4|
|00095808cdc611fb5...|           web|    2|      5|
|00095808cdc611fb5...|          list|    2|      6|
|00095808cdc611fb5...|    department|    2|      7|
|00095808cdc611fb5...|   recognition|    2|      8|
|00095808cdc611fb5...| automatically|    2|      9|
|00095808cdc611fb5...|         error|    2|     10|
|000ac87bf9c1623ee...| consciousness|   14|      1|
|000ac87bf9c1623ee...|         place|    2|      2|
|000ac87bf9c1623ee...|       mystery|    2|      3|
|000ac87bf9c1623ee...|       account|    2|      4|
|000ac87bf9c1623ee...|         world|    2|      5|
|000ac87bf9c1623ee...|       problem|    2|      6|
|000ac87bf9c