In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, coalesce, when, expr

In [2]:
spark = SparkSession.builder \
    .appName("SocialMediaAnalytics") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "minio") \
    .config("spark.hadoop.fs.s3a.secret.key", "minio123") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .getOrCreate()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/26 16:19:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [8]:
# Load YouTube data
youtube_df = spark.read.option("multiline", "true").json("s3a://socialmedia/youtube/*/*.json")

In [9]:
youtube_df.printSchema()

root
 |-- channel_title: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- description: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- platform: string (nullable = true)
 |-- post_id: string (nullable = true)
 |-- published_at: string (nullable = true)
 |-- title: string (nullable = true)
 |-- views: string (nullable = true)



In [11]:
youtube_df_clean = youtube_df.select(
    col("post_id").alias("id"),
    col("title").alias("title"),
    col("channel_title").alias("author"),
    col("likes").cast("int").alias("likes"),
    col("comments").cast("int").alias("comments"),
    col("published_at").alias("created_at")
).withColumn("platform", lit("youtube"))

# Add score column (likes + comments)
youtube_df_clean = youtube_df_clean.withColumn(
    "score", col("likes") + col("comments")
)

In [18]:
youtube_df_clean.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author: string (nullable = true)
 |-- likes: integer (nullable = true)
 |-- comments: integer (nullable = true)
 |-- created_at: string (nullable = true)
 |-- platform: string (nullable = false)
 |-- score: integer (nullable = true)



In [13]:
# load reddit data
reddit_df = spark.read.option("multiline", "true").json("s3a://socialmedia/reddit/*/*.json")

In [14]:
reddit_df.printSchema()

root
 |-- created_utc: string (nullable = true)
 |-- num_comments: long (nullable = true)
 |-- platform: string (nullable = true)
 |-- post_id: string (nullable = true)
 |-- score: long (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)



In [16]:
reddit_df_clean = reddit_df.select(
    col("post_id").alias("id"),
    col("title").alias("title"),
    lit(None).alias("author"),            # author not present in your schema
    col("score").cast("int").alias("likes"),
    col("num_comments").cast("int").alias("comments"),
    col("created_utc").alias("created_at")
).withColumn("platform", lit("reddit"))

# Add score = likes + comments
reddit_df_clean = reddit_df_clean.withColumn(
    "score", col("likes") + col("comments")
)

In [17]:
reddit_df_clean.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author: void (nullable = true)
 |-- likes: integer (nullable = true)
 |-- comments: integer (nullable = true)
 |-- created_at: string (nullable = true)
 |-- platform: string (nullable = false)
 |-- score: integer (nullable = true)



In [20]:
# Union both platforms into one DF
social_df = youtube_df_clean.unionByName(reddit_df_clean)

In [26]:
postgres_url = "jdbc:postgresql://postgres:5432/social_media_analytics"  # use service name "postgres" + DB name
postgres_props = {
    "user": "admin",
    "password": "password",
    "driver": "org.postgresql.Driver"
}

In [27]:
social_df.write.jdbc(
    url=postgres_url,
    table="social_media_posts",
    mode="append",
    properties=postgres_props
)