In [0]:
len(dbutils.fs.ls("/Volumes/workspace/default/assignment1/cve/json_files/"))

38727

In [0]:
# Databricks notebook source
# ==============================================================
# 🥇 CVE LAKEHOUSE – GOLD / EXPLORATORY ANALYSIS (FINAL OUTPUT)
# ==============================================================

from pyspark.sql import functions as F

# ---------------------------------------------------------------
# Paths
# ---------------------------------------------------------------
silver_core_path = "/Volumes/workspace/default/assignment1/silver/core"
silver_aff_path  = "/Volumes/workspace/default/assignment1/silver/affected"

# ---------------------------------------------------------------
# 1️⃣ Load Silver Tables
# ---------------------------------------------------------------
df_core = spark.read.format("delta").load(silver_core_path)
df_aff  = spark.read.format("delta").load(silver_aff_path)

print(f"✅ Loaded core: {df_core.count():,} records")
print(f"✅ Loaded affected: {df_aff.count():,} records")

# ---------------------------------------------------------------
# 2️⃣ Temporal Trend Analysis
# ---------------------------------------------------------------
print("\n📆 CVE publications per month (2024)...")
df_trend = (
    df_core
    .withColumn("month", F.date_format("date_published", "yyyy-MM"))
    .groupBy("month")
    .agg(F.count("*").alias("cve_count"))
    .orderBy("month")
)
display(df_trend)

# ---------------------------------------------------------------
# 3️⃣ Severity Distribution
# ---------------------------------------------------------------
print("\n⚠️ CVSS Base Severity Distribution...")
df_sev = (
    df_core
    .groupBy("base_severity")
    .agg(F.count("*").alias("count"))
    .orderBy(F.desc("count"))
)
display(df_sev)

# ---------------------------------------------------------------
# 4️⃣ Top Vendors by CVE Count
# ---------------------------------------------------------------
print("\n🏢 Top 25 Vendors by Vulnerability Count...")
df_vendor = (
    df_aff
    .groupBy("vendor")
    .agg(F.countDistinct("cve_id").alias("unique_cves"))
    .orderBy(F.desc("unique_cves"))
    .limit(25)
)
display(df_vendor)

# ---------------------------------------------------------------
# 5️⃣ Join Example (Core + Affected)
# ---------------------------------------------------------------
print("\n🔗 Joining Core + Affected to enrich details...")
df_joined = (
    df_aff.join(df_core, "cve_id", "left")
          .select("cve_id", "vendor", "product", "base_score", "base_severity", "description_text")
)
display(df_joined.limit(10))

# ---------------------------------------------------------------
# 6️⃣ Optional: Save Gold Aggregations
# ---------------------------------------------------------------
gold_path = "/Volumes/workspace/default/assignment1/gold"
df_joined.write.format("delta").mode("overwrite").save(gold_path)
print(f"✅ Gold layer written to: {gold_path}")

print("\n📸 REQUIRED SCREENSHOTS:")
print("   • df_trend (CVE counts by month)")
print("   • df_sev (severity distribution)")
print("   • df_vendor (top vendors)")
print("   • df_joined (joined preview)")
print("   • dbutils.fs.ls(gold_path) to show _delta_log + parquet files")


✅ Loaded core: 32,924 records
✅ Loaded affected: 61,825 records

📆 CVE publications per month (2024)...


month,cve_count
2024-01,1134
2024-02,1769
2024-03,2616
2024-04,3218
2024-05,3348
2024-06,2707
2024-07,2877
2024-08,2692
2024-09,2408
2024-10,3373



⚠️ CVSS Base Severity Distribution...


base_severity,count
,14768
MEDIUM,9953
HIGH,6001
CRITICAL,1415
LOW,779
NONE,8



🏢 Top 25 Vendors by Vulnerability Count...


vendor,unique_cves
,5466
Linux,2794
Microsoft,1107
Adobe,741
Unknown,610
,581
SourceCodester,557
Google,546
Apple,468
Oracle Corporation,366



🔗 Joining Core + Affected to enrich details...


cve_id,vendor,product,base_score,base_severity,description_text
CVE-2024-0001,Pure Storage,FlashArray,10.0,CRITICAL,A condition exists in FlashArray Purity whereby a local account intended for initial array configuration remains active potentially allowing a malicious actor to gain elevated privileges.
CVE-2024-0002,PureStorage,FlashArray,10.0,CRITICAL,A condition exists in FlashArray Purity whereby an attacker can employ a privileged account allowing remote access to the array.
CVE-2024-0003,PureStorage,FlashArray,9.1,CRITICAL,A condition exists in FlashArray Purity whereby a malicious user could use a remote administrative service to create an account on the array allowing privileged access.
CVE-2024-0004,PureStorage,FlashArray,9.1,CRITICAL,A condition exists in FlashArray Purity whereby an user with array admin role can execute arbitrary commands remotely to escalate privilege on the array.
CVE-2024-0005,PureStorage,FlashArray,9.1,CRITICAL,A condition exists in FlashArray and FlashBlade Purity whereby a malicious user could execute arbitrary commands remotely through a specifically crafted SNMP configuration.
CVE-2024-0005,PureStorage,FlashBlade,9.1,CRITICAL,A condition exists in FlashArray and FlashBlade Purity whereby a malicious user could execute arbitrary commands remotely through a specifically crafted SNMP configuration.
CVE-2024-0006,YugabyteDB,YugabyteDB Anywhere,,,"Information exposure in the logging system in Yugabyte Platform allows local attackers with access to application logs to obtain database user credentials in log files, potentially leading to unauthorized database access."
CVE-2024-0007,Palo Alto Networks,PAN-OS,6.8,MEDIUM,A cross-site scripting (XSS) vulnerability in Palo Alto Networks PAN-OS software enables a malicious authenticated read-write administrator to store a JavaScript payload using the web interface on Panorama appliances. This enables the impersonation of another authenticated administrator.
CVE-2024-0007,Palo Alto Networks,Prisma Access,6.8,MEDIUM,A cross-site scripting (XSS) vulnerability in Palo Alto Networks PAN-OS software enables a malicious authenticated read-write administrator to store a JavaScript payload using the web interface on Panorama appliances. This enables the impersonation of another authenticated administrator.
CVE-2024-0007,Palo Alto Networks,Cloud NGFW,6.8,MEDIUM,A cross-site scripting (XSS) vulnerability in Palo Alto Networks PAN-OS software enables a malicious authenticated read-write administrator to store a JavaScript payload using the web interface on Panorama appliances. This enables the impersonation of another authenticated administrator.


✅ Gold layer written to: /Volumes/workspace/default/assignment1/gold

📸 REQUIRED SCREENSHOTS:
   • df_trend (CVE counts by month)
   • df_sev (severity distribution)
   • df_vendor (top vendors)
   • df_joined (joined preview)
   • dbutils.fs.ls(gold_path) to show _delta_log + parquet files
