In [21]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, sum as _sum, when, trim, max as _max


In [2]:
# Get (or lazily create) the Spark session used by every cell below,
# registered under the application name "TopAuthorsByCitation".
spark = (
    SparkSession.builder
    .appName("TopAuthorsByCitation")
    .getOrCreate()
)

In [25]:
# Load the DBLP JSON dump into a DataFrame (no explicit schema is supplied).
# The location can be overridden with the DBLP_JSON_PATH environment variable so
# the notebook is runnable on machines other than the author's; when the
# variable is unset, the original local path is used unchanged.
DBLP_JSON_PATH = os.environ.get(
    "DBLP_JSON_PATH",
    "C:\\Users\\wmizz\\Desktop\\sem6\\CSC4406\\Project\\dblp_v14.json",
)
df = spark.read.json(DBLP_JSON_PATH)

In [26]:
# Clean the raw records and build one row per (paper, author) pair.
#
# Steps:
#   1. Drop rows flagged as unparseable via the `_corrupt_record` column.
#   2. Default missing citation counts to 0.
#   3. Explode the `authors` array and keep only authors with a non-blank id.
#   4. Flatten the author struct into `author_id` / `author_name` columns.

# Schema inference may not produce a `_corrupt_record` column at all (e.g. when
# every line parsed cleanly), in which case accessing it would raise. Only
# filter on it when it is actually present.
if "_corrupt_record" in df.columns:
    clean_df = df.filter(col("_corrupt_record").isNull())
else:
    clean_df = df

# Fill null citation counts with a default value.
clean_df = clean_df.fillna({"n_citation": 0})

exploded_df = (
    clean_df
    # One output row per element of the `authors` array, carrying the paper's
    # citation count along with it.
    .select(explode(col("authors")).alias("author"), col("n_citation"))
    # Discard null author entries, then entries whose id is null or blank.
    .filter(col("author").isNotNull())
    .filter(col("author.id").isNotNull() & (trim(col("author.id")) != ''))
    # Promote the nested fields to top-level columns; a null name is replaced
    # by the placeholder "Unknown".
    .withColumn("author_id", col("author.id"))
    .withColumn(
        "author_name",
        when(col("author.name").isNotNull(), col("author.name")).otherwise("Unknown"),
    )
)

In [27]:
# Total the citation counts per author id.
#
# NOTE(review): `max` over `author_name` keeps the lexicographically greatest
# spelling seen for an id, not the most common one. With noisy source data this
# can surface an odd variant (the displayed result contains
# "yoshua bengio dept"); consider picking the most frequent spelling instead.
author_citations_df = exploded_df.groupBy("author_id").agg(
    _sum("n_citation").alias("total_citations"),
    _max("author_name").alias("author_name"),  # representative name shown next to the id
)

# Keep the 10 authors with the highest totals. `author_id` is used as a
# secondary sort key so that ties on `total_citations` at the cutoff are broken
# deterministically and repeated runs select the same rows.
top_authors_df = (
    author_citations_df
    .orderBy(col("total_citations").desc(), col("author_id").asc())
    .limit(10)
)

In [28]:
# Print the most-cited authors without truncating long ids or names.
# n=20 is the default row count for show(), spelled out here for clarity.
top_authors_df.show(n=20, truncate=False)

+------------------------+---------------+------------------+
|author_id               |total_citations|author_name       |
+------------------------+---------------+------------------+
|53f4ba75dabfaed83977b7db|537922         |yoshua bengio dept|
|53f366a7dabfae4b3499c6fe|399813         |geoffrey e hinton |
|53f46ca8dabfaec09f2584aa|373318         |andrew zisserman  |
|53f431badabfaee02ac9803b|338000         |kaiming he        |
|53f458fcdabfaeecd69f5094|309489         |ilya sutskever    |
|53f43097dabfaedf4353f3bc|304473         |jian sun          |
|5405bdaadabfae92b41fb68a|250006         |Simonyan, Karen   |
|53f4340adabfaec22ba6a6f8|242899         |ross girshick     |
|53f42b4fdabfaedce54a3fe9|241463         |shaoqing ren      |
|53f48919dabfaee4dc8b219c|222377         |yann lecun        |
+------------------------+---------------+------------------+



In [29]:
# Persist the result as JSON under the "top_authors" directory (relative to the
# working directory). The default save mode raises if the target already
# exists, which makes the notebook fail on any re-run; "overwrite" replaces the
# previous output instead so the cell can be executed repeatedly.
top_authors_df.write.mode("overwrite").json("top_authors")