In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col, when

In [2]:
# Single-machine Spark session used to exercise the ETL steps locally.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Local-ETL-Test")
    .config("spark.driver.memory", "2g")
    # Upper bound, in bytes, packed into one partition when reading files (256 MiB).
    .config("spark.sql.files.maxPartitionBytes", 256 * 1024 * 1024)
    # Partition count used when data is shuffled (joins, aggregations).
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)

# Keep notebook output readable: only ERROR-level log lines are emitted.
spark.sparkContext.setLogLevel("ERROR")

In [3]:
# Load the keyword/category mapping file into a DataFrame.
mapping = spark.read.json(path="/data/destination/log_search/category/mapping.jsonl")

In [4]:
# Row count per raw category, most frequent first; print up to 87 rows
# without truncating long category names.
(
    mapping
    .groupBy("category")
    .count()
    .orderBy(F.col("count").desc())
    .show(87, truncate=False)
)

+-----------------------------------------+------+
|category                                 |count |
+-----------------------------------------+------+
|Other                                    |313962|
|Drama                                    |57156 |
|Sports                                   |15306 |
|Animation                                |14439 |
|Romance                                  |9569  |
|Music                                    |8669  |
|Reality Show                             |4688  |
|Action                                   |4504  |
|Horror                                   |3528  |
|K Drama                                  |1905  |
|TV Channel                               |1795  |
|Comedy                                   |1422  |
|C Drama                                  |1086  |
|News                                     |1058  |
|Education                                |27    |
|Fantasy                                  |13    |
|Documentary                   

In [5]:
def map_category(df, src_col="category_right", dst_col="category"):
    """
    Normalize raw category labels onto the standard category set.

    Every value of ``src_col`` that exactly matches one of the known
    spellings listed below is replaced by its canonical label; any value
    that matches none of them is labelled "Other".

    Args:
        df: Object exposing ``withColumn`` (a Spark DataFrame) that contains
            ``src_col``.
        src_col: Name of the column holding the raw category label.
        dst_col: Name of the column that receives the standardized label.

    Returns:
        The result of ``df.withColumn(dst_col, ...)`` carrying the
        standardized label in ``dst_col``.
    """
    # (canonical label, raw spellings accepted for it), checked top to bottom.
    label_aliases = (
        ("Action", ("Action",)),
        ("Romance", ("Romance",)),
        ("Comedy", ("Comedy",)),
        ("Horror", ("Horror",)),
        ("Animation", ("Animation", ".Animation", "Animation/Drama")),
        ("Drama", ("Drama", "Drama/Other")),
        ("C Drama", ("C Drama",)),
        ("K Drama", ("K Drama", "KDrama")),
        ("Sports", ("Sports", "Sport", "Live Sports", "Sports/Drama")),
        ("Music", ("Music", "Music/Drama")),
        ("Reality Show", ("Reality Show", "Reality", "Dance/Reality", "Cooking/Reality")),
        ("TV Channel", ("TV Channel", "Live")),
        ("News", ("News",)),
    )

    raw_label = col(src_col)

    # Fold the table into one when(...).when(...) expression, preserving order.
    standardized = None
    for canonical, spellings in label_aliases:
        is_match = raw_label.isin(*spellings)
        if standardized is None:
            standardized = when(is_match, canonical)
        else:
            standardized = standardized.when(is_match, canonical)

    # Anything not recognised above falls into the catch-all bucket.
    return df.withColumn(dst_col, standardized.otherwise("Other"))


In [6]:
# Add category_std, the standardized form of the raw "category" column.
mapping = map_category(mapping, src_col="category", dst_col="category_std")


In [9]:
# Row count per standardized category, most frequent first; print up to
# 87 rows without truncating the labels.
(
    mapping
    .groupBy("category_std")
    .count()
    .orderBy(F.col("count").desc())
    .show(87, truncate=False)
)

+------------+------+
|category_std|count |
+------------+------+
|Other       |314066|
|Drama       |57156 |
|Sports      |15309 |
|Animation   |14441 |
|Romance     |9569  |
|Music       |8670  |
|Reality Show|4691  |
|Action      |4504  |
|Horror      |3528  |
|K Drama     |1906  |
|TV Channel  |1796  |
|Comedy      |1422  |
|C Drama     |1086  |
|News        |1058  |
+------------+------+



In [10]:
# Keep only the search keyword and its standardized category; every other
# column (including the raw "category") is dropped from `mapping` here.
mapping = mapping.select(["keyword", "category_std"])

In [11]:
# Preview the first 20 rows; long strings are truncated in the printout.
mapping.show(n=20, truncate=True)

+--------------------+------------+
|             keyword|category_std|
+--------------------+------------+
|thời sự 19 giờ hô...|       Other|
|lop hoc bo tuc da...|       Other|
|trở thành nữ chín...|       Other|
|      bệnh nhân việt|       Other|
|    sơ mi ăn kẹo mắt|       Other|
|phương hữu ngọt c...|       Other|
|        piranha 3d 2|       Other|
|               beati|       Other|
|  Vương miện tội lỗi|       Other|
|         thử thách r|       Other|
| sứ wales vs ukraina|       Other|
|        ngoinhachung|       Other|
|dư tiên sinh xin ...|       Other|
|Chú dược sư tiếng...|       Other|
|xem trận brazil -...|       Other|
|         bác ãi ma|       Other|
|     vua bep souma 1|       Other|
|         20 bất hoặc|       Other|
|           make you |       Other|
|u19 indonesia vs ...|       Other|
+--------------------+------------+
only showing top 20 rows

