# 1. Load dữ liệu vào DataFrame

In [152]:
import findspark

findspark.init()

In [153]:
import pandas as pd
import pyspark
import ast
import json

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, sum, udf, to_date, concat, lit, regexp_replace
from pyspark.sql.types import StringType, StructType, StructField
from pyspark.sql import functions as F

import warnings

warnings.filterwarnings('ignore')

In [154]:
# Tạo Cluster
spark = SparkSession.builder \
    .appName("LogData") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .config("spark.executor.memory", "15g") \
    .config("spark.executor.cores", "8") \
    .config("spark.driver.memory", "15g") \
    .config("spark.driver.cores", "8") \
    .getOrCreate()

In [155]:
sqlContext = SQLContext(spark)

In [156]:
# Đọc 1 File dữ liệu trước
data = spark.read.text("G:\\DE6\\Class 2 - Big Data Coding\\DataSampleTest\\logt21.txt")

data.show()

+--------------------+
|               value|
+--------------------+
|{u'ItemId': u'100...|
|{u'Firmware': u'2...|
|{u'ItemId': u'100...|
|{u'Firmware': u'2...|
|{u'ItemId': u'100...|
|{u'Firmware': u'2...|
|{u'ItemId': u'100...|
|{u'ItemId': u'100...|
|{u'Firmware': u'2...|
|{u'ItemId': u'100...|
|{u'Firmware': u'2...|
|{u'Firmware': u'2...|
|{u'ItemId': u'100...|
|{u'ItemId': u'100...|
|{u'ItemId': u'158...|
|{u'ItemId': u'52'...|
|{u'Firmware': u'2...|
|{u'ItemId': u'125...|
|{u'ItemId': u'100...|
|{u'Firmware': u'2...|
+--------------------+
only showing top 20 rows



In [157]:
type(data)

pyspark.sql.dataframe.DataFrame

In [158]:
data.show()

+--------------------+
|               value|
+--------------------+
|{u'ItemId': u'100...|
|{u'Firmware': u'2...|
|{u'ItemId': u'100...|
|{u'Firmware': u'2...|
|{u'ItemId': u'100...|
|{u'Firmware': u'2...|
|{u'ItemId': u'100...|
|{u'ItemId': u'100...|
|{u'Firmware': u'2...|
|{u'ItemId': u'100...|
|{u'Firmware': u'2...|
|{u'Firmware': u'2...|
|{u'ItemId': u'100...|
|{u'ItemId': u'100...|
|{u'ItemId': u'158...|
|{u'ItemId': u'52'...|
|{u'Firmware': u'2...|
|{u'ItemId': u'125...|
|{u'ItemId': u'100...|
|{u'Firmware': u'2...|
+--------------------+
only showing top 20 rows



In [159]:
data.value[0]

Column<'value[0]'>

In [160]:
# Function để chuyển dữ liệu từ dạng String về thành dạng Dictionary
def convert_to_dict(value):
    fixed_string = value.replace("u'", "'").replace("'", '"').replace("\\", "\\\\")

    print(fixed_string)

    # return json.loads(fixed_string)
    return


def parse_data(file_path):
    import ast

    data = spark.read.text(file_path)

    lst_temp = list()

    rows = data.collect()

    k = 0

    rows_final = list()

    for i in range(len(rows)):
        # Nếu ở dòng chẵn thì gợp dữ liệu với các dòng ngay bên dưới (vì 2 dòng tạo thành dữ liệu của 1 cột)
        if i % 2 == 0:
            row_1_value = ast.literal_eval(rows[i].value)
            # row_2_value = ast.literal_eval(rows[i+1].value)

            # merged_dict = {**row_1_value, **row_2_value}

            rows_final.append(row_1_value)

    df_final = spark.createDataFrame(rows_final)

    df_final = df_final.select('MAC', 'SessionMainMenu', 'AppName', 'LogId', 'Event', 'ItemId', 'RealTimePlaying')

    # Trả về dữ liệu là list 1 list sau khi gộp hết các Dictionary về
    return df_final


def create_dataframe(file_lists):
    import glob

    file_lists = glob.glob('DataSampleTest/log*.txt')

    k = 0

    for file_path in file_lists:
        if k == 0:
            df_final = parse_data(file_path)
        else:
            df_final = df_final.union(parse_data(file_path))

        k += 1

    return df_final


def load_data(folder_path):
    import glob

    file_lists = glob.glob(f'{folder_path}/log*.txt')

    df_final = create_dataframe(file_lists)

    return df_final

In [161]:
# Load dữ liệu vào df_final
logs_data = load_data('DataSampleTest')

logs_data.show()

+------------+--------------------+-------+-----+---------------+---------+---------------+
|         MAC|     SessionMainMenu|AppName|LogId|          Event|   ItemId|RealTimePlaying|
+------------+--------------------+-------+-----+---------------+---------+---------------+
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   52|        StopVOD|100052388|          570.3|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   55|        NextVOD|100052388|           NULL|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   54|        PlayVOD|100052388|           NULL|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   55|        NextVOD|100052388|           NULL|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   IPTV|   40|      EnterIPTV|     NULL|           NULL|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   18|   ChangeModule|     NULL|           NULL|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   51|       StartVOD|100052388|           NULL|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   IPTV|   42|    StopChannel|      158|     

In [162]:
logs_data.printSchema()

root
 |-- MAC: string (nullable = true)
 |-- SessionMainMenu: string (nullable = true)
 |-- AppName: string (nullable = true)
 |-- LogId: string (nullable = true)
 |-- Event: string (nullable = true)
 |-- ItemId: string (nullable = true)
 |-- RealTimePlaying: string (nullable = true)



In [163]:
logs_data = logs_data.withColumn('RealTimePlaying', when(col('RealTimePlaying').isNull(), 0).otherwise(col('RealTimePlaying')))

logs_data.show()

+------------+--------------------+-------+-----+---------------+---------+---------------+
|         MAC|     SessionMainMenu|AppName|LogId|          Event|   ItemId|RealTimePlaying|
+------------+--------------------+-------+-----+---------------+---------+---------------+
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   52|        StopVOD|100052388|          570.3|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   55|        NextVOD|100052388|              0|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   54|        PlayVOD|100052388|              0|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   55|        NextVOD|100052388|              0|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   IPTV|   40|      EnterIPTV|     NULL|              0|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   18|   ChangeModule|     NULL|              0|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   51|       StartVOD|100052388|              0|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   IPTV|   42|    StopChannel|      158|     

In [164]:
logs_data.printSchema()

root
 |-- MAC: string (nullable = true)
 |-- SessionMainMenu: string (nullable = true)
 |-- AppName: string (nullable = true)
 |-- LogId: string (nullable = true)
 |-- Event: string (nullable = true)
 |-- ItemId: string (nullable = true)
 |-- RealTimePlaying: string (nullable = true)



In [165]:
# Chuyển đổi dữ liệu cột RealTimePlaying về dạng Float
logs_data = logs_data.withColumn('RealTimePlaying', logs_data['RealTimePlaying'].cast('float'))

logs_data.printSchema()

root
 |-- MAC: string (nullable = true)
 |-- SessionMainMenu: string (nullable = true)
 |-- AppName: string (nullable = true)
 |-- LogId: string (nullable = true)
 |-- Event: string (nullable = true)
 |-- ItemId: string (nullable = true)
 |-- RealTimePlaying: float (nullable = true)



In [166]:
logs_data.show()

+------------+--------------------+-------+-----+---------------+---------+---------------+
|         MAC|     SessionMainMenu|AppName|LogId|          Event|   ItemId|RealTimePlaying|
+------------+--------------------+-------+-----+---------------+---------+---------------+
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   52|        StopVOD|100052388|          570.3|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   55|        NextVOD|100052388|            0.0|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   54|        PlayVOD|100052388|            0.0|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   55|        NextVOD|100052388|            0.0|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   IPTV|   40|      EnterIPTV|     NULL|            0.0|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   18|   ChangeModule|     NULL|            0.0|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   51|       StartVOD|100052388|            0.0|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   IPTV|   42|    StopChannel|      158|     

In [167]:
# Loại bỏ các logs_data có RealTimePlaying = 0
logs_data = logs_data[logs_data['RealTimePlaying'] != 0]

logs_data.show()

+------------+--------------------+-------+-----+-----------+---------+---------------+
|         MAC|     SessionMainMenu|AppName|LogId|      Event|   ItemId|RealTimePlaying|
+------------+--------------------+-------+-----+-----------+---------+---------------+
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   52|    StopVOD|100052388|          570.3|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   IPTV|   42|StopChannel|      158|          6.657|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   52|    StopVOD|100052388|         1158.6|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   IPTV|   42|StopChannel|       52|          9.468|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   IPTV|   42|StopChannel|       52|          7.536|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   52|    StopVOD|100053413|         3480.3|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   IPTV|   42|StopChannel|      158|         10.415|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   52|    StopVOD|100000148|         3903.0|
|B046FCAC0DC1|B046FCAC0DC1:2016.

In [168]:
logs_data.printSchema()

root
 |-- MAC: string (nullable = true)
 |-- SessionMainMenu: string (nullable = true)
 |-- AppName: string (nullable = true)
 |-- LogId: string (nullable = true)
 |-- Event: string (nullable = true)
 |-- ItemId: string (nullable = true)
 |-- RealTimePlaying: float (nullable = true)



In [169]:
logs_data_mac_groupby = logs_data.groupBy('MAC', 'AppName').agg(F.sum('RealTimePlaying').alias('RealTimePlaying')).orderBy('RealTimePlaying', ascending=False)

logs_data_mac_groupby.show()

+------------+-------+---------------+
|         MAC|AppName|RealTimePlaying|
+------------+-------+---------------+
|B046FCB4F6FC|   IPTV|            NaN|
|B046FCAD1A72|   IPTV|            NaN|
|B046FCB5165A|   IPTV|            NaN|
|B046FCB3C270|   IPTV|            NaN|
|B046FCB3C0F6|   IPTV|            NaN|
|B046FCB58A47|   IPTV|            NaN|
|B046FCB710AD|   IPTV|            NaN|
|B046FCAB6D1D|   IPTV|            NaN|
|B046FCB5034E|   IPTV|            NaN|
|B046FCB520D4|   IPTV|            NaN|
|B046FCACA75C|   IPTV|            NaN|
|B046FCB25966|   IPTV|            NaN|
|B046FCAA0FD3|   IPTV|            NaN|
|B046FCAA0535|   IPTV|            NaN|
|B046FCAB0320|   IPTV|            NaN|
|B046FCB7B522|   IPTV|            NaN|
|B046FCB7E79F|   IPTV|            NaN|
|B046FCB1AF25|   IPTV|            NaN|
|B046FCA981B6|   IPTV|            NaN|
|B046FCACC3ED|   IPTV|            NaN|
+------------+-------+---------------+
only showing top 20 rows



In [170]:
logs_data_mac_groupby = logs_data_mac_groupby.na.drop(subset=['RealTimePlaying'])

logs_data_mac_groupby.show()

+------------+-------+--------------------+
|         MAC|AppName|     RealTimePlaying|
+------------+-------+--------------------+
|B046FCAE382D|   IPTV| 1.755163481421441E7|
|B046FCAD1EEA|   IPTV|1.5634642986617684E7|
|B046FCA7BCB5|   IPTV|  1897640.3560125828|
|B046FCB28BCA|   IPTV|  1345581.9630002975|
|B046FCAEDB5E|   IPTV|  1334936.4611959457|
|B046FCAEDDC2|   IPTV|  1187601.9100208282|
|B046FCAA163A|   IPTV|   936302.3393008709|
|B046FCB2CB2B|   IPTV|   935855.6047201157|
|B046FCAA0F1B|   IPTV|   910310.9798007011|
|B046FCB70DFA|   IPTV|   882628.9629154205|
|B046FCB32AF5|   IPTV|   876269.5766096115|
|B046FCA6F096|   IPTV|   868668.9254112244|
|B046FCB5E825|   IPTV|   805443.6820454001|
|B046FCA988C7|   IPTV|   702807.7915058136|
|B046FCAD8739|   IPTV|       654868.953125|
|B046FCB710C6|   IPTV|   571637.4484194517|
|B046FCAD1490|   IPTV|         562320.0625|
|B046FCB84727|   IPTV|   555399.1455761194|
|B046FCA86474|   IPTV|   551702.6164684296|
|B046FCAD399C|   IPTV|   531748.

In [171]:
# Đọc dữ liệu file user_info.txt
user_data = spark.read.csv('DataSampleTest\\user_info.txt', sep='\t', header=True)

user_data.show()

+----------------+---------+
|             MAC|# of days|
+----------------+---------+
|FBOXB046FCB79E0B|       20|
|FBOXB046FCB3528B|      181|
|FBOXB046FCAAFB73|      426|
|FBOXB046FCAAFB72|      426|
|FBOXB046FCAA2085|      429|
|FBOXB046FCAA0669|      380|
|FBOXB046FCB343BF|      376|
|FBOXB046FCAC0CFB|      376|
|FBOXB046FCABED45|      378|
|FBOXB046FCAD80FC|      305|
|FBOXB046FCB1E3FE|      255|
|FBOXB046FCB27666|      210|
|FBOXB046FCB42341|      142|
|FBOXB046FCB6D6B2|       46|
|FBOXB046FCB6D4BC|       46|
|FBOXB046FCB6D4B6|       46|
|FBOXB046FCA6A3F4|      583|
|FBOXB046FCA86BD5|      493|
|FBOXB046FCABE3BC|      425|
|FBOXB046FCAC125F|      374|
+----------------+---------+
only showing top 20 rows



In [172]:
user_data.printSchema()

root
 |-- MAC: string (nullable = true)
 |-- # of days: string (nullable = true)



In [173]:
# Chuyển dữ liệu cột # of days sang dạng integer
user_data = user_data.withColumn('# of days', user_data['# of days'].cast('int'))

user_data.show()

+----------------+---------+
|             MAC|# of days|
+----------------+---------+
|FBOXB046FCB79E0B|       20|
|FBOXB046FCB3528B|      181|
|FBOXB046FCAAFB73|      426|
|FBOXB046FCAAFB72|      426|
|FBOXB046FCAA2085|      429|
|FBOXB046FCAA0669|      380|
|FBOXB046FCB343BF|      376|
|FBOXB046FCAC0CFB|      376|
|FBOXB046FCABED45|      378|
|FBOXB046FCAD80FC|      305|
|FBOXB046FCB1E3FE|      255|
|FBOXB046FCB27666|      210|
|FBOXB046FCB42341|      142|
|FBOXB046FCB6D6B2|       46|
|FBOXB046FCB6D4BC|       46|
|FBOXB046FCB6D4B6|       46|
|FBOXB046FCA6A3F4|      583|
|FBOXB046FCA86BD5|      493|
|FBOXB046FCABE3BC|      425|
|FBOXB046FCAC125F|      374|
+----------------+---------+
only showing top 20 rows



In [174]:
user_data.printSchema()

root
 |-- MAC: string (nullable = true)
 |-- # of days: integer (nullable = true)



In [175]:
user_data = user_data.withColumn('MAC', regexp_replace('MAC', 'FBOX', ''))

user_data.show()

+------------+---------+
|         MAC|# of days|
+------------+---------+
|B046FCB79E0B|       20|
|B046FCB3528B|      181|
|B046FCAAFB73|      426|
|B046FCAAFB72|      426|
|B046FCAA2085|      429|
|B046FCAA0669|      380|
|B046FCB343BF|      376|
|B046FCAC0CFB|      376|
|B046FCABED45|      378|
|B046FCAD80FC|      305|
|B046FCB1E3FE|      255|
|B046FCB27666|      210|
|B046FCB42341|      142|
|B046FCB6D6B2|       46|
|B046FCB6D4BC|       46|
|B046FCB6D4B6|       46|
|B046FCA6A3F4|      583|
|B046FCA86BD5|      493|
|B046FCABE3BC|      425|
|B046FCAC125F|      374|
+------------+---------+
only showing top 20 rows



In [177]:
from pyspark.sql import functions as F

# Assuming 'user_data' is your DataFrame and 'MAC' is the column you want to sum values for
user_data_groupby = user_data.groupBy('MAC').agg(F.sum('# of days').alias('Total Days')).orderBy('Total Days', ascending=False)

# Show the value sums
user_data_groupby.show()

+------------+----------+
|         MAC|Total Days|
+------------+----------+
|001D20ED4ACA|      1983|
|001c55007d93|      1166|
|001C55007967|      1056|
|001C55007B29|      1056|
|001C550081A7|      1054|
|001C55007A8F|      1049|
|001C550080DA|      1046|
|001C550081E5|      1042|
|001C55007D35|      1037|
|001C550083F1|      1029|
|001C55007CF7|      1018|
|001C55007BFF|      1014|
|001C55007BD7|      1007|
|001C550083FE|      1006|
|001C550083BE|      1002|
|001C5500855C|       989|
|001C5500802D|       988|
|001C55007C49|       986|
|001C550086F7|       985|
|B046FCA7C5B3|       976|
+------------+----------+
only showing top 20 rows



In [178]:
user_data_groupby.show()

+------------+----------+
|         MAC|Total Days|
+------------+----------+
|001D20ED4ACA|      1983|
|001c55007d93|      1166|
|001C55007967|      1056|
|001C55007B29|      1056|
|001C550081A7|      1054|
|001C55007A8F|      1049|
|001C550080DA|      1046|
|001C550081E5|      1042|
|001C55007D35|      1037|
|001C550083F1|      1029|
|001C55007CF7|      1018|
|001C55007BFF|      1014|
|001C55007BD7|      1007|
|001C550083FE|      1006|
|001C550083BE|      1002|
|001C5500855C|       989|
|001C5500802D|       988|
|001C55007C49|       986|
|001C550086F7|       985|
|B046FCA7C5B3|       976|
+------------+----------+
only showing top 20 rows



In [179]:
logs_data.show()

+------------+--------------------+-------+-----+-----------+---------+---------------+
|         MAC|     SessionMainMenu|AppName|LogId|      Event|   ItemId|RealTimePlaying|
+------------+--------------------+-------+-----+-----------+---------+---------------+
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   52|    StopVOD|100052388|          570.3|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   IPTV|   42|StopChannel|      158|          6.657|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   52|    StopVOD|100052388|         1158.6|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   IPTV|   42|StopChannel|       52|          9.468|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   IPTV|   42|StopChannel|       52|          7.536|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   52|    StopVOD|100053413|         3480.3|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   IPTV|   42|StopChannel|      158|         10.415|
|B046FCAC0DC1|B046FCAC0DC1:2016...|    VOD|   52|    StopVOD|100000148|         3903.0|
|B046FCAC0DC1|B046FCAC0DC1:2016.