In [1]:
# import datetime

# start_time = datetime.datetime.now()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import os

# Build (or reuse) the SparkSession used by every cell below, running in local mode.
_session_builder = SparkSession.builder.appName("Local-ETL-Test").master("local[*]")

# Memory given to the driver process.
_session_builder = _session_builder.config("spark.driver.memory", "2g")
# Maximum number of bytes packed into a single partition when reading files (256 MiB).
_session_builder = _session_builder.config(
    "spark.sql.files.maxPartitionBytes", 256 * 1024 * 1024
)
# Number of partitions used when shuffling data for joins and aggregations.
_session_builder = _session_builder.config("spark.sql.shuffle.partitions", "200")

spark = _session_builder.getOrCreate()

# Keep Spark's own logging down to ERROR-level messages.
spark.sparkContext.setLogLevel("ERROR")

In [3]:
# Identifier of the log batch processed by this run (presumably a yyyymmdd date — confirm).
# Both paths below are derived from it so they cannot drift apart.
_log_date = "20220401"

# Input passed to main(); note the path ends in ".json" despite the "folder" name.
folder_path = f"/data/raw/log_content/{_log_date}.json"
# Output location intended for save_data() (that call is currently commented out in main()).
save_path = f"/data/destination/log_content/{_log_date}/"

In [4]:
# file_list = list_files_sorted(folder_path)
# file_list

In [5]:
def read_data(path):
    """Load JSON data from ``path`` into a Spark DataFrame.

    Uses the module-level ``spark`` session. No schema is supplied, so the
    column layout is whatever Spark's JSON reader derives from the data.

    Args:
        path: Location handed unchanged to ``spark.read.json``.

    Returns:
        The DataFrame produced by the JSON reader.
    """
    return spark.read.json(path)

In [6]:
def select_fields(df):
    """Flatten the nested ``_source`` column into top-level columns.

    Args:
        df: DataFrame that contains a ``_source`` struct column.

    Returns:
        A DataFrame holding only the fields expanded from ``_source``.
    """
    source_fields = "_source.*"  # star-expands every field of the struct
    return df.select(source_fields)

In [7]:
def transform_category(df):
    """Add a ``Type`` column that labels each row by its ``AppName``.

    Branches are evaluated in the order written below; an ``AppName`` that
    matches none of them is labelled ``"Error"``.

    Args:
        df: DataFrame with an ``AppName`` column.

    Returns:
        ``df`` with the additional (or replaced) ``Type`` column.
    """
    app_name = col("AppName")

    # Multi-value groups are named up front so the mapping below reads as a table.
    is_television = app_name.isin("CHANNEL", "DSHD", "KPLUS", "KPlus")
    is_movie = app_name.isin(
        "VOD", "FIMS_RES", "BHD_RES", "VOD_RES", "FIMS", "BHD", "DANET"
    )

    type_label = when(is_television, "Truyền Hình")
    type_label = type_label.when(is_movie, "Phim Truyện")
    type_label = type_label.when(app_name == "RELAX", "Giải Trí")
    type_label = type_label.when(app_name == "CHILD", "Thiếu Nhi")
    type_label = type_label.when(app_name == "SPORT", "Thể Thao")
    type_label = type_label.otherwise("Error")

    return df.withColumn("Type", type_label)

In [8]:
def calculate_statistics(df):
    """Pivot summed ``TotalDuration`` into one column per ``Type`` for each ``Contract``.

    The data is first reduced to one row per (``Contract``, ``Type``) pair and
    then pivoted so each distinct ``Type`` value becomes its own column.
    Missing combinations are filled with 0.

    Args:
        df: DataFrame with ``Contract``, ``Type`` and ``TotalDuration`` columns.

    Returns:
        A DataFrame with one row per ``Contract`` and one column per ``Type``
        value found in the data.
    """
    per_contract_and_type = (
        df.groupBy("Contract", "Type")
          # GroupedData.sum is used instead of the bare name ``sum`` so this
          # function does not depend on ``from pyspark.sql.functions import *``
          # having rebound Python's built-in ``sum``.
          .sum("TotalDuration")
          # GroupedData.sum names its output "sum(<column>)"; restore the name.
          .withColumnRenamed("sum(TotalDuration)", "TotalDuration")
    )
    return (
        per_contract_and_type
        .groupBy("Contract")
        .pivot("Type")
        .sum("TotalDuration")
        .na.fill(0)
    )

In [9]:
def finalize_result(statistics, total_devices):
    """Inner-join ``statistics`` with ``total_devices`` on the ``Contract`` column.

    Args:
        statistics: Left-hand DataFrame; must contain ``Contract``.
        total_devices: Right-hand DataFrame; must contain ``Contract``.

    Returns:
        The joined DataFrame, limited to contracts present in both inputs.
    """
    return statistics.join(total_devices, on='Contract', how='inner')

In [10]:
def save_data(result, path):
    """
    Write ``result`` as CSV with a single output file, overwriting existing data.

    Args:
        result: DataFrame to persist.
        path: Destination handed to the CSV writer.
    """
    # One partition means the writer emits one part file.
    single_partition = result.repartition(1)
    csv_writer = single_partition.write.mode("overwrite").option("header", "true")
    csv_writer.csv(path)
    print(f"Data saved to {path}")

In [13]:
def main(path):
    """Run the pipeline stages on ``path``, printing a preview after each one.

    Stages executed: read -> select fields -> add ``Type`` -> per-contract
    statistics. The device-count, final-join and save stages are kept below
    as commented-out code and do not run.

    Args:
        path: Input location forwarded to ``read_data``.

    Returns:
        None. Progress banners and DataFrame previews are printed.
    """
    print('-------------Reading data from path--------------')
    raw_df = read_data(path)
    raw_df.show()

    print('-------------Selecting fields--------------')
    source_df = select_fields(raw_df)
    source_df.show()

    # Disabled stage: device counts.
    # print('-------------Calculating Devices --------------')
    # total_devices = calculate_devices(source_df)
    # total_devices.show()

    print('-------------Transforming Category --------------')
    categorized_df = transform_category(source_df)
    categorized_df.show()

    print('-------------Calculating Statistics --------------')
    statistics = calculate_statistics(categorized_df)
    statistics.show()

    # Disabled stages: final join and save.
    # print('-------------Finalizing result --------------')
    # result = finalize_result(statistics, total_devices)
    # result.show()
    # print('-------------Saving Results --------------')
    # save_data(statistics, save_path)

    print('Task finished')
    return None


In [15]:
# Run the pipeline on the configured input path; main() prints a preview after each stage.
main(folder_path)

-------------Reading data from path--------------
+--------------------+-------+------+--------------------+-----+
|                 _id| _index|_score|             _source|_type|
+--------------------+-------+------+--------------------+-----+
|AX_momhia1FFivsGrn9o|history|     0|{KPLUS, HNH579912...|kplus|
|AX_momhca1FFivsGrnvg|history|     0|{KPLUS, HUFD40665...|kplus|
|AX_momhaa1FFivsGrnny|history|     0|{KPLUS, HNH572635...|kplus|
|AX_momhca1FFivsGrnvv|history|     0|{KPLUS, HND141717...|kplus|
|AX_momhia1FFivsGrn98|history|     0|{KPLUS, HNH743103...|kplus|
|AX_momg9a1FFivsGrnkS|history|     0|{KPLUS, HNH893773...|kplus|
|AX_momhca1FFivsGrnwA|history|     0|{KPLUS, HND083642...|kplus|
|AX_momhfa1FFivsGrn2u|history|     0|{KPLUS, DNFD74404...|kplus|
|AX_momhca1FFivsGrnwP|history|     0|{KPLUS, DTFD21200...|kplus|
|AX_momhca1FFivsGrnwU|history|     0|{KPLUS, LDFD05747...|kplus|
|AX_momhfa1FFivsGrn24|history|     0|{KPLUS, HNH063566...|kplus|
|AX_momhia1FFivsGrn-W|history|     0|{KP