In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, desc, sum as _sum, trim, when

In [2]:
# Build the Spark session used by every cell below; getOrCreate() hands back
# an existing session when one is already active instead of starting another.
spark = (
    SparkSession.builder
    .appName("ResearchAnalysis")
    .getOrCreate()
)

In [3]:
# Read the raw dataset into a DataFrame. No schema is supplied, so Spark
# infers it from the file contents.
df = spark.read.format("json").load("dblp_v14.json")

In [4]:
# Remove corrupt records.
# With an inferred schema, Spark only adds the `_corrupt_record` column when
# at least one input record failed to parse. Referencing it unconditionally
# raises AnalysisException on a clean file, so filter only when it exists.
if "_corrupt_record" in df.columns:
    # Rows that parsed correctly carry a null in `_corrupt_record`.
    df = df.filter(col("_corrupt_record").isNull())

In [5]:
# Discard any row missing at least one of the fields the analysis needs:
# authors, year, n_citation, or title.
df = df.na.drop(how="any", subset=["authors", "year", "n_citation", "title"])

In [6]:
# Collapse rows that are identical across every column into a single row
df = df.drop_duplicates()

In [7]:
# Coerce 'year' to an integer column.
# NOTE(review): this runs after the null-drop above, so any value that fails
# to cast would surface as a new null here — confirm the input is numeric.
df = df.withColumn("year", df["year"].astype("int"))

In [8]:
# Strip leading/trailing whitespace from the free-text columns
# ('title' first, then 'abstract').
df = (
    df.withColumn("title", trim(col("title")))
      .withColumn("abstract", trim(col("abstract")))
)

In [9]:
# Most cited papers: rank by citation count, highest first, and keep the
# leading rows.
# NOTE(review): the cut-off is 11 rows, not 10 — confirm this is intended.
most_cited_papers = (
    df.select("id", "title", "authors", "n_citation")
    .sort(col("n_citation").desc())
    .limit(11)
)

In [None]:
# Preview the ranking (up to 20 rows, long cell values truncated)
most_cited_papers.show(n=20, truncate=True)

In [10]:
# Persist the ranking as JSON under the "top_papers" output directory.
# The writer's default save mode raises if the path already exists, which
# breaks every re-run of this notebook; overwrite keeps the cell re-runnable.
most_cited_papers.write.mode("overwrite").json("top_papers")