In [0]:
# Name of the mount that holds the comment data, and the path it is
# reachable at under /mnt.
mount_name = "youtube-news-comments"
mount_point = "/mnt/" + mount_name

In [0]:
# Pick the raw comments file to process: list the comments folder, keep the
# entries whose path ends in ".json", and take the lexicographically greatest
# path.
# NOTE(review): this is only the newest file if the file names sort
# chronologically (e.g. carry a timestamp) — confirm against the ingest job.
comments_dir = mount_point + "/youtube-comments/"
all_items = dbutils.fs.ls(comments_dir)
json_files = [item.path for item in all_items if item.path.endswith(".json")]

# Fail with an explicit message instead of an opaque IndexError when the
# folder contains no JSON files.
if not json_files:
    raise FileNotFoundError(f"No .json files found under {comments_dir}")

# max() selects the same element as sorted(..., reverse=True)[0] without
# sorting the whole list.
new_file = max(json_files)

In [0]:
# Load the selected comments file into a DataFrame (schema is inferred by the
# JSON reader) and print the resulting schema for inspection.
df = spark.read.json(path=new_file)
df.printSchema()

root
 |-- author: string (nullable = true)
 |-- comment_id: string (nullable = true)
 |-- like_count: long (nullable = true)
 |-- published_at: string (nullable = true)
 |-- text: string (nullable = true)
 |-- video_id: string (nullable = true)



In [0]:
# preprocessing
from pyspark.sql.functions import col, lower

# Clean the comments in one chain:
#   1. drop rows that are duplicated across every column,
#   2. keep only rows whose `text` is neither null nor the empty string,
#   3. replace `text` with its lower-cased form.
df = (
    df.dropDuplicates()
    .where(col("text").isNotNull() & (col("text") != ""))
    .withColumn("text", lower(col("text")))
)

In [0]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover

# Build both feature stages up front:
#   tokenizer: `text`  -> `words`
#   remover:   `words` -> `filtered_words` (stop words removed)
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

# Apply them in order; the remover consumes the column the tokenizer adds.
df = remover.transform(tokenizer.transform(df))

In [0]:
# save processed json file
import os

# extract filename from the original path
filename = new_file.split("/")[-1]

# Insert "_processed" before the extension only. str.replace would rewrite
# every ".json" occurrence in the name (e.g. "a.json.bak.json"), not just the
# trailing extension.
stem, extension = os.path.splitext(filename)
processed_filename = f"{stem}_processed{extension}"

# construct the new path for the processed data, next to the source file
processed_path = new_file.rsplit("/", 1)[0] + "/" + processed_filename

# Save the processed DataFrame to the new path. "overwrite" keeps the cell
# re-runnable: with the default save mode the write raises once the output
# path already exists from a previous run.
df.write.mode("overwrite").json(processed_path)
