In [15]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Build (or reuse) a SparkSession that runs on this machine with the
# "local[*]" master, for local ETL experiments.
session_builder = SparkSession.builder.appName("Local-ETL-Test").master("local[*]")

# Tunables applied to the session before it is created.
session_options = {
    # Memory requested for the driver JVM.
    "spark.driver.memory": "2g",
    # Upper bound on bytes packed into one partition when reading files (256 MiB).
    "spark.sql.files.maxPartitionBytes": 256 * 1024 * 1024,
    # Number of partitions used for shuffles (joins / aggregations).
    "spark.sql.shuffle.partitions": "200",
}
for option_key, option_value in session_options.items():
    session_builder = session_builder.config(option_key, option_value)

spark = session_builder.getOrCreate()

# Keep notebook output readable: only ERROR-level Spark logs are emitted.
spark.sparkContext.setLogLevel("ERROR")

In [9]:
# Read every file matched by the glob into one DataFrame. No explicit schema
# is passed, so the columns come from whatever Spark infers from the files.
category_log_glob = "/data/destination/log_search/category/*.jsonl"
df = spark.read.json(category_log_glob)


In [17]:
# Row count per raw `category` value, most frequent first.
# Up to 87 rows are printed and long labels are shown in full (no truncation).
category_counts = df.groupBy("category").count()
(
    category_counts
    .orderBy(F.desc("count"))
    .show(87, truncate=False)
)

+-----------------------------------------+------+
|category                                 |count |
+-----------------------------------------+------+
|Other                                    |313962|
|Drama                                    |57156 |
|Sports                                   |15306 |
|Animation                                |14439 |
|Romance                                  |9569  |
|Music                                    |8669  |
|Reality Show                             |4688  |
|Action                                   |4504  |
|Horror                                   |3528  |
|K Drama                                  |1905  |
|TV Channel                               |1795  |
|Comedy                                   |1422  |
|C Drama                                  |1086  |
|News                                     |1058  |
|Education                                |27    |
|Fantasy                                  |13    |
|Documentary                   

In [18]:
# Collapse the free-form `category` labels into a fixed set of standard labels,
# written to a new `category_std` column.
#
# Rules are evaluated top to bottom and the FIRST matching pattern wins, so the
# order below is significant: the specific "C Drama" / "K Drama" rules must stay
# ahead of the generic "drama" rule. All patterns are case-insensitive ((?i))
# and match anywhere in the label.
#
# Fix: the one-letter drama prefixes are anchored with a word boundary (\b).
# Without it, "(?i)c drama" also matched labels such as "Romantic Drama" or
# "Historic Drama", and "(?i)k drama" matched "Dark Drama", sending them to the
# wrong bucket instead of "Drama".
#
# NOTE(review): labels with no rule (the raw counts include e.g. "Education"
# and "Fantasy") end up as "Other" — confirm that this is intended.
category_rules = [
    (r"(?i)action", "Action"),
    (r"(?i)\bc drama", "C Drama"),
    (r"(?i)\bk drama|kdrama", "K Drama"),
    (r"(?i)drama", "Drama"),
    (r"(?i)romance", "Romance"),
    (r"(?i)comedy", "Comedy"),
    (r"(?i)horror", "Horror"),
    (r"(?i)animation|anime", "Animation"),
    (r"(?i)sports|sport", "Sports"),
    (r"(?i)music", "Music"),
    (r"(?i)reality", "Reality Show"),
    (r"(?i)tv|channel", "TV Channel"),
    (r"(?i)news", "News"),
]

raw_category = F.col("category")

# Build the WHEN ... WHEN ... chain in rule order; F.when starts the chain and
# Column.when appends each further branch.
first_pattern, first_label = category_rules[0]
category_std_expr = F.when(raw_category.rlike(first_pattern), first_label)
for rule_pattern, rule_label in category_rules[1:]:
    category_std_expr = category_std_expr.when(
        raw_category.rlike(rule_pattern), rule_label
    )

# Rows that match none of the rules fall back to "Other".
df = df.withColumn("category_std", category_std_expr.otherwise("Other"))