In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, desc, sum as _sum, trim, when

In [2]:
# Initialize Spark Session
spark = SparkSession.builder.appName("ResearchAnalysis").getOrCreate()

In [3]:
# Load JSON data
df = spark.read.json("C:\\Users\\wmizz\\Desktop\\sem6\\CSC4406\\Project\\dblp_v14.json")

In [4]:
# Remove corrupt records
df = df.filter(col("_corrupt_record").isNull())

In [5]:
# Drop rows where authors, year, n_citation, or title is null
df = df.dropna(subset=["authors", "year", "n_citation", "title"])

In [6]:
# Drop duplicate rows
df = df.dropDuplicates()

In [7]:
# Ensure 'year' is an integer
df = df.withColumn("year", col("year").cast("int"))

In [8]:
# Trim whitespace from 'title' and 'abstract'
df = df.withColumn("title", trim(col("title")))
df = df.withColumn("abstract", trim(col("abstract")))

In [14]:
# Research Areas with Highest Impact
# Explode the fos array to analyze each field of study individually
fos_df = df.select(explode("fos").alias("field_of_study"), "n_citation")

In [15]:
# Aggregate citations by field of study
top_research_areas = fos_df.groupBy("field_of_study.name") \
                           .agg(_sum("n_citation").alias("total_citations")) \
                           .orderBy(desc("total_citations")) \
                           .limit(10)

In [16]:
# Display Research Areas with Highest Impact
top_research_areas.show()

+--------------------+---------------+
|                name|total_citations|
+--------------------+---------------+
|    Computer science|       72340451|
|Artificial intell...|       33074426|
|         Mathematics|       19431528|
|    Machine learning|       15006056|
|     Computer vision|       12390401|
| Pattern recognition|       11739926|
|         Data mining|       10174712|
|Mathematical opti...|        9388651|
|    Computer network|        7787029|
|Distributed compu...|        7758253|
+--------------------+---------------+



In [17]:
top_research_areas.write.json("top_research_areas")

In [13]:
# Institutions with Highest Impact
# Explode the authors array to analyze each author individually and extract institutions
institutions_df = df.select(explode("authors").alias("author"), "n_citation")


In [14]:
# Filter out rows with null or empty organization names
institutions_df = institutions_df.filter(trim(col("author.org")) != "")

In [18]:
# Filter out entries labeled as "Corresponding author"
cleaned_institutions_df = institutions_df.filter(col("author.org") != "Corresponding author.")


In [19]:
# Now, proceed with the aggregation and analysis steps
# Aggregate citations by institution
top_institutions = cleaned_institutions_df.groupBy("author.org") \
                                          .agg(_sum("n_citation").alias("total_citations")) \
                                          .orderBy(desc("total_citations")) \
                                          .limit(10)

In [17]:
# Display Institutions with Highest Impact
top_institutions.show()

+--------------------+---------------+
|                 org|total_citations|
+--------------------+---------------+
|  Microsoft Research|        1158633|
|Corresponding aut...|        1122135|
| Stanford University|         837854|
|              Google|         817131|
|Google Brain, Mou...|         730766|
|Carnegie Mellon U...|         691218|
|                IEEE|         676826|
|University of Cal...|         591633|
| stanford university|         551184|
|Université de Mon...|         497980|
+--------------------+---------------+



In [20]:
top_institutions.write.json("top_institutions")

In [21]:
spark.stop()