In [None]:
# import datetime

# start_time = datetime.datetime.now()

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Standard-library imports stay after the wildcard import so it cannot shadow these names.
import datetime
import os

# Build (or reuse) the SparkSession that read_data() uses to load the input.
spark = (
    SparkSession.builder
    .appName("Local-ETL-Test")
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .config("spark.sql.files.maxPartitionBytes", 256 * 1024 * 1024) # 256 MiB, expressed in bytes
    .config("spark.sql.shuffle.partitions", "200") # number of partitions used for shuffle operations
    .getOrCreate()
)
# Set the Spark log level to ERROR to keep cell output readable.
spark.sparkContext.setLogLevel("ERROR")

In [2]:
# Input location. NOTE: despite its name, folder_path points at a single JSON file, not a folder.
folder_path = "/data/raw/log_content/20220401.json" 
# Output location that main() passes to save_data.
save_path = "/data/destination/log_content/20220401/"

In [None]:
# def list_files_sorted(path):
#     """
#     Return the list of full file paths, ordered by file name.
#     """
#     files = [
#         os.path.join(path, f)
#         for f in os.listdir(path)
#         if os.path.isfile(os.path.join(path, f))
#     ]
#     return sorted(files)

In [None]:
# file_list = list_files_sorted(folder_path)
# file_list

['/data/raw/log_content/20220401.json',
 '/data/raw/log_content/20220402.json',
 '/data/raw/log_content/20220403.json',
 '/data/raw/log_content/20220404.json',
 '/data/raw/log_content/20220405.json',
 '/data/raw/log_content/20220406.json',
 '/data/raw/log_content/20220407.json',
 '/data/raw/log_content/20220408.json',
 '/data/raw/log_content/20220409.json',
 '/data/raw/log_content/20220410.json',
 '/data/raw/log_content/20220411.json',
 '/data/raw/log_content/20220412.json',
 '/data/raw/log_content/20220413.json',
 '/data/raw/log_content/20220414.json',
 '/data/raw/log_content/20220415.json',
 '/data/raw/log_content/20220416.json',
 '/data/raw/log_content/20220417.json',
 '/data/raw/log_content/20220418.json',
 '/data/raw/log_content/20220419.json',
 '/data/raw/log_content/20220420.json',
 '/data/raw/log_content/20220421.json',
 '/data/raw/log_content/20220422.json',
 '/data/raw/log_content/20220423.json',
 '/data/raw/log_content/20220424.json',
 '/data/raw/log_content/20220425.json',


In [11]:
def read_data(path):
    """Load the JSON data at ``path`` into a Spark DataFrame.

    Args:
        path: Location handed unchanged to ``spark.read.json``.

    Returns:
        The DataFrame produced by ``spark.read.json(path)``.
    """
    return spark.read.json(path)

In [12]:
# Load the raw JSON and expand the nested `_source` struct into top-level columns.
df = read_data(folder_path).select("_source.*")
df.show(5)

+-------+---------+------------+-------------+
|AppName| Contract|         Mac|TotalDuration|
+-------+---------+------------+-------------+
|  KPLUS|HNH579912|0C96E62FC55C|          254|
|  KPLUS|HUFD40665|CCEDDC333614|         1457|
|  KPLUS|HNH572635|B068E6A1C5F6|         2318|
|  KPLUS|HND141717|08674EE8D2C2|         1452|
|  KPLUS|HNH743103|402343C25D7D|          251|
+-------+---------+------------+-------------+
only showing top 5 rows



In [25]:
# Exploratory preview: total duration per contract, one column per "Type" value.
# NOTE(review): this needs a "Type" column on `df`. In the visible code that column is
# only added by transform_category, so run this cell after `df` has been transformed.
# `sum` here is the Spark aggregate from the wildcard import, not the Python builtin.
(
    df.groupBy("Contract", "Type")
    .agg(sum("TotalDuration").alias("Total_Duration"))
    .groupBy("Contract")
    .pivot("Type")
    .sum("Total_Duration")
    .na.fill(0)
    .show()
)

+---------+--------+-----------+---------+--------+-----------+
| Contract|Giải Trí|Phim Truyện|Thiếu Nhi|Thể Thao|Truyền Hình|
+---------+--------+-----------+---------+--------+-----------+
|HTFD11598|       0|       2884|        0|       0|        707|
|HPFD48556|      69|          0|        0|       0|      92976|
|NBFD10014|       0|          0|        0|       0|      84628|
|HNH619088|       0|       8456|      234|       0|      65210|
|HNH036174|       0|          0|        0|       0|       6049|
|DNH067877|       0|          0|        0|       0|       5760|
|HDFD42710|       0|          0|        0|       0|      12096|
|SGH674576|       0|       1535|        0|       0|       9910|
|SGH446342|       0|          0|        0|       0|      11428|
|DLFD14250|       0|       4436|        0|       0|      12374|
|HNH720916|       0|        538|        0|       0|      12535|
|DNH055515|       0|       6755|        0|       0|       3729|
|TND026221|       0|      18335|        

In [None]:
def select_fields(df):
    """Expand the ``_source`` column of ``df`` into its individual fields.

    Args:
        df: DataFrame-like object exposing ``select``.

    Returns:
        Whatever ``df.select("_source.*")`` returns.
    """
    return df.select("_source.*")

In [8]:
def transform_category(df):
    """Add a ``Type`` column that classifies each row by its ``AppName``.

    Rules are evaluated in order and the first match wins:
      * CHANNEL, DSHD, KPLUS, KPlus                         -> "Truyền Hình" (television)
      * VOD, FIMS_RES, BHD_RES, VOD_RES, FIMS, BHD, DANET   -> "Phim Truyện" (movies)
      * RELAX                                               -> "Giải Trí" (entertainment)
      * CHILD                                               -> "Thiếu Nhi" (children)
      * SPORT                                               -> "Thể Thao" (sports)
    Any other value is labelled "Error".

    Args:
        df: DataFrame containing an ``AppName`` column.

    Returns:
        ``df`` with the additional (or replaced) ``Type`` column.
    """
    app_name = col("AppName")
    # Ordered (condition, label) pairs; order matters because the first match wins.
    rules = [
        (app_name.isin("CHANNEL", "DSHD", "KPLUS", "KPlus"), "Truyền Hình"),
        (
            app_name.isin("VOD", "FIMS_RES", "BHD_RES", "VOD_RES", "FIMS", "BHD", "DANET"),
            "Phim Truyện",
        ),
        (app_name == "RELAX", "Giải Trí"),
        (app_name == "CHILD", "Thiếu Nhi"),
        (app_name == "SPORT", "Thể Thao"),
    ]

    first_condition, first_label = rules[0]
    category = when(first_condition, first_label)
    for condition, label in rules[1:]:
        category = category.when(condition, label)

    return df.withColumn("Type", category.otherwise("Error"))

In [9]:
def calculate_statistics(df):
    """Pivot total duration into one row per contract and one column per ``Type``.

    Args:
        df: DataFrame with ``Contract``, ``Type`` and ``TotalDuration`` columns.

    Returns:
        DataFrame keyed by ``Contract`` with one column per distinct ``Type``
        value holding the summed ``TotalDuration``; nulls are filled with 0.
    """
    # `sum` is the Spark aggregate brought in by the wildcard import, not the builtin.
    duration_per_type = df.groupBy("Contract", "Type").agg(
        sum("TotalDuration").alias("TotalDuration")
    )
    pivoted = duration_per_type.groupBy("Contract").pivot("Type").sum("TotalDuration")
    return pivoted.na.fill(0)

In [10]:
def finalize_result(statistics, total_devices):
    """Combine the per-contract statistics with the per-contract device data.

    Args:
        statistics: DataFrame-like object exposing ``join``.
        total_devices: Object joined onto ``statistics``.

    Returns:
        The inner join of ``statistics`` and ``total_devices`` on ``Contract``.
    """
    return statistics.join(total_devices, 'Contract', 'inner')

In [None]:
def main(path):
    """Run the ETL steps for ``path``, showing each intermediate result.

    Steps: read the raw JSON, expand ``_source``, compute the device data,
    add the ``Type`` category, pivot durations per contract, join both
    results and save them to the module-level ``save_path``.

    NOTE(review): ``calculate_devices`` and ``save_data`` are not defined in the
    visible part of this notebook; they must exist in the kernel before this runs.

    Args:
        path: Input location handed to ``read_data``.

    Returns:
        None.
    """
    print('-------------Reading data from path--------------')
    # Read the input here. `df` is assigned below, which makes it local to this
    # function, so reading it before any assignment raised UnboundLocalError.
    df = read_data(path)
    df.show()
    print('-------------Selecting fields--------------')
    df = select_fields(df)
    df.show()
    print('-------------Calculating Devices --------------')
    total_devices = calculate_devices(df)
    total_devices.show()
    print('-------------Transforming Category --------------')
    df = transform_category(df)
    df.show()
    print('-------------Calculating Statistics --------------')
    statistics = calculate_statistics(df)
    statistics.show()
    print('-------------Finalizing result --------------')
    result = finalize_result(statistics, total_devices)
    result.show()
    print('-------------Saving Results --------------')
    save_data(result, save_path)
    print('Task finished')
	
# Time the pipeline run. start_time is taken here because the only other
# assignment in the notebook is commented out, which left this cell failing
# with NameError on a fresh kernel.
start_time = datetime.datetime.now()
main(folder_path)
endtime = datetime.datetime.now()
timedelta = endtime - start_time
print(timedelta.total_seconds())

-------------Reading data from path--------------
+--------------------+-------+------+--------------------+-----+
|                 _id| _index|_score|             _source|_type|
+--------------------+-------+------+--------------------+-----+
|AX_momhia1FFivsGrn9o|history|     0|{KPLUS, HNH579912...|kplus|
|AX_momhca1FFivsGrnvg|history|     0|{KPLUS, HUFD40665...|kplus|
|AX_momhaa1FFivsGrnny|history|     0|{KPLUS, HNH572635...|kplus|
|AX_momhca1FFivsGrnvv|history|     0|{KPLUS, HND141717...|kplus|
|AX_momhia1FFivsGrn98|history|     0|{KPLUS, HNH743103...|kplus|
|AX_momg9a1FFivsGrnkS|history|     0|{KPLUS, HNH893773...|kplus|
|AX_momhca1FFivsGrnwA|history|     0|{KPLUS, HND083642...|kplus|
|AX_momhfa1FFivsGrn2u|history|     0|{KPLUS, DNFD74404...|kplus|
|AX_momhca1FFivsGrnwP|history|     0|{KPLUS, DTFD21200...|kplus|
|AX_momhca1FFivsGrnwU|history|     0|{KPLUS, LDFD05747...|kplus|
|AX_momhfa1FFivsGrn24|history|     0|{KPLUS, HNH063566...|kplus|
|AX_momhia1FFivsGrn-W|history|     0|{KP