# **arXiv Metadata Analytics with PySpark RDD: JSON Case Study**

### Udemy Course: Best Hands-on Big Data Practices and Use Cases using PySpark

### Author: Amin Karami (PhD, FHEA)
#### email: amin.karami@ymail.com

In [None]:
########## ONLY in Colab ##########
# Shell escape: install the PySpark package into the notebook runtime.
!pip3 install pyspark
########## ONLY in Colab ##########

In [None]:
########## ONLY in Ubuntu Machine ##########
# Load Spark engine
# !pip3 install -q findspark
# import findspark
# findspark.init()
########## ONLY in Ubuntu Machine ##########

In [None]:
# Colab only: upload a file from the local machine into the runtime's
# working directory.
# NOTE(review): presumably this is the kaggle.json API token that the next
# cell copies into ~/.kaggle — confirm.
from google.colab import files
files.upload()

In [None]:
# Put kaggle.json into ~/.kaggle/ and restrict it to owner read/write
# (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# Install the Kaggle command-line client quietly (-q).
!pip install -q kaggle
# Download the Cornell-University/arxiv dataset and extract arxiv.zip into
# ./dataset.
!kaggle datasets download Cornell-University/arxiv
!unzip arxiv.zip -d ./dataset

In [None]:
# Initializing Spark
from pyspark import SparkConf, SparkContext
# Configure a local Spark run ('local[*]' master) named 'arxiv-JSON'.
conf = SparkConf().setMaster('local[*]').setAppName('arxiv-JSON')
# getOrCreate() returns the already-running context when there is one, so
# re-executing this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
sc

In [None]:
# Read and Load Data to Spark
# Data source: https://www.kaggle.com/Cornell-University/arxiv/version/62
import json
from pyspark import StorageLevel

# Read the snapshot as text, requesting a minimum of 200 partitions, and parse
# every line as one JSON document.
rdd_txt = sc.textFile('./dataset/arxiv-metadata-oai-snapshot.json', 200)
rdd = rdd_txt.map(json.loads)
# Keep the parsed records around because every question below re-reads them.
# A replicated level (*_2) has no second node to copy to under a 'local[*]'
# master, so the plain MEMORY_AND_DISK level is used instead.
rdd.persist(StorageLevel.MEMORY_AND_DISK)

In [None]:
# Show the context's default parallelism next to the number of partitions of
# the parsed RDD:
default_parallelism = sc.defaultParallelism
partition_count = rdd.getNumPartitions()
print(default_parallelism, partition_count)

## Question 1: Count elements

In [None]:
# Number of parsed records in the RDD.
record_total = rdd.count()
record_total

## Question 2: Get the first two records


In [None]:
# Fetch only the first two parsed records.
first_two_records = rdd.take(2)
first_two_records

## Question 3: Get all attributes


In [None]:
# Emit every key of every record, then keep each attribute name only once.
attribute_names = rdd.flatMap(lambda record: record.keys()).distinct()
attribute_names.collect()

## Question 4: Get the name of the licenses

In [None]:
# Project the 'license' field of every record and de-duplicate the values.
license_names = rdd.map(lambda record: record['license']).distinct()
license_names.collect()

## Question 5: Get the shortest and the longest titles

In [None]:
# Project the titles once; both reductions below start from this projection.
titles = rdd.map(lambda record: record['title'])

# Pairwise reduction: the left title survives only when it is strictly longer
# (respectively strictly shorter); on a tie the right one is kept.
longest_title = titles.reduce(
    lambda left, right: left if len(left) > len(right) else right)
shortest_title = titles.reduce(
    lambda left, right: left if len(left) < len(right) else right)

print('longest_title: ' + longest_title)
print('shortest_title: ' + shortest_title)

## Question 6: Find abbreviations with 5 or more letters in the abstract

In [None]:
import re

def get_abbrevations(line):
  """Return the first parenthesised abbreviation of 5+ characters in *line*.

  An abbreviation here is text wrapped in parentheses that starts with an
  ASCII letter and contains no underscore, space, slash, backslash or angle
  bracket, e.g. ``(LSTMs)``.

  Args:
    line: Text to scan.

  Returns:
    The text between the parentheses of the first match, or ``None`` when
    nothing matches.
  """
  # One leading letter plus {4,} further characters gives the required
  # minimum of five; the previous {5,} silently demanded six.
  result = re.search(r"\(([a-zA-Z][^_ /\\<>]{4,})\)", line)
  if result:
    return result.group(1)
  return None

In [None]:
# Count the records whose abstract contains at least one abbreviation.
# None is a singleton, so it is tested by identity rather than with `!=`.
rdd.filter(lambda record: get_abbrevations(record['abstract']) is not None).count()

In [None]:
# Show five records whose abstract contains at least one abbreviation.
# None is a singleton, so it is tested by identity rather than with `!=`.
rdd.filter(lambda record: get_abbrevations(record['abstract']) is not None).take(5)

## Question 7: Get the number of archive records per month ('update_date' attribute)

In [None]:
import datetime

def extract_month(DateIn):
  """Return the month number (1-12) of a 'YYYY-MM-DD' date string.

  Raises ValueError (from strptime) when DateIn does not match that format.
  """
  return datetime.datetime.strptime(DateIn, "%Y-%m-%d").month

In [None]:
# Pair the update month of every record with a count of one, then add the
# ones up per month.
month_ones = rdd.map(lambda record: (extract_month(record['update_date']), 1))
records_per_month = month_ones.reduceByKey(lambda left, right: left + right)
# Order the (month, count) pairs by their count before bringing them to the
# driver.
records_per_month.sortBy(lambda pair: pair[1]).collect()

## Question 8: Get the average number of pages

In [None]:
def get_pages(line):
  """Extract the page count announced in a free-text comment.

  Args:
    line: Comment text such as ``"12 pages, 3 figures"``, or ``None``.

  Returns:
    The integer in front of the first "page"/"pages" mention, or 0 when
    *line* is ``None`` or mentions no page count.
  """
  if line is None:
    return 0
  # Raw string: "\d" inside a normal literal is an invalid escape sequence.
  # "pages?" also accepts the singular form, so "1 page" is no longer dropped.
  result = re.search(r"(\d+) pages?\b", line)
  return int(result.group(1)) if result else 0

In [None]:
# Page count per record; get_pages() yields 0 when no count is stated, and
# those records are left out of the average.
rdd_avg = rdd.map(lambda x: get_pages(x['comments'])).filter(lambda x: x != 0)

# Accumulate (sum, count) in one pass instead of running reduce() and count()
# as two separate jobs over the un-persisted rdd_avg.  Starting from (0, 0)
# also covers the case where no record states a page count.
suma, n = rdd_avg.aggregate(
    (0, 0),
    lambda acc, pages: (acc[0] + pages, acc[1] + 1),
    lambda left, right: (left[0] + right[0], left[1] + right[1]),
)
if n:
  print(suma/n)
else:
  print('No record states a page count')