In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

# Tạo SparkSession
spark = SparkSession.builder \
    .appName("Video Analysis") \
    .getOrCreate()


# Định nghĩa schema mới
video_schema = StructType([
    StructField("Video ID", StringType(), True),
    StructField("Title", StringType(), True),
    StructField("Category_ID", IntegerType(), True),
    StructField("Published At", TimestampType(), True),
    StructField("Channel Title", StringType(), True),
    StructField("View Count", IntegerType(), True),
    StructField("Like Count", IntegerType(), True),
    StructField("Comment Count", IntegerType(), True),
])

# Đọc file CSV với schema mới
file_path = "video.csv"
df = spark.read.option("header", "true").schema(video_schema).csv(file_path)

# Kiểm tra schema và hiển thị dữ liệu
df.printSchema()
df.show()
# Kiểm tra số hàng và cột
print(f"Số hàng: {df.count()}, Số cột: {len(df.columns)}")

root
 |-- Video ID: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Category_ID: integer (nullable = true)
 |-- Published At: timestamp (nullable = true)
 |-- Channel Title: string (nullable = true)
 |-- View Count: integer (nullable = true)
 |-- Like Count: integer (nullable = true)
 |-- Comment Count: integer (nullable = true)

+-----------+--------------------+-----------+-------------------+----------------+----------+----------+-------------+
|   Video ID|               Title|Category_ID|       Published At|   Channel Title|View Count|Like Count|Comment Count|
+-----------+--------------------+-----------+-------------------+----------------+----------+----------+-------------+
|N-gpD9QqTK0|Nấu đám tiệc kiểu...|         19|2024-12-01 17:44:37|Khoai Lang Thang|   1520244|     37496|         1969|
|M_WD9Dxayk8|Đến nhà người lạ ...|         19|2024-11-17 18:57:32|Khoai Lang Thang|   1602732|     28690|         1378|
|5AJd2FJUVkc|Ăn 10 món lạ ở kh...|         19|202

In [17]:
# Kiểm tra số lượng giá trị null mỗi cột


# Loại bỏ giá trị null (nếu cần)
df_clean = df.dropna()


In [18]:
df_clean.printSchema()

root
 |-- Video ID: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Category_ID: integer (nullable = true)
 |-- Published At: timestamp (nullable = true)
 |-- Channel Title: string (nullable = true)
 |-- View Count: integer (nullable = true)
 |-- Like Count: integer (nullable = true)
 |-- Comment Count: integer (nullable = true)



In [19]:
df_clean.show()
print(f"Số hàng: {df_clean.count()}, Số cột: {len(df_clean.columns)}")

+-----------+--------------------+-----------+-------------------+----------------+----------+----------+-------------+
|   Video ID|               Title|Category_ID|       Published At|   Channel Title|View Count|Like Count|Comment Count|
+-----------+--------------------+-----------+-------------------+----------------+----------+----------+-------------+
|N-gpD9QqTK0|Nấu đám tiệc kiểu...|         19|2024-12-01 17:44:37|Khoai Lang Thang|   1520244|     37496|         1969|
|M_WD9Dxayk8|Đến nhà người lạ ...|         19|2024-11-17 18:57:32|Khoai Lang Thang|   1602732|     28690|         1378|
|5AJd2FJUVkc|Ăn 10 món lạ ở kh...|         19|2024-11-05 20:27:52|Khoai Lang Thang|   2130532|     29506|         1236|
|92-IbWKp_3k|Gặp nạn ở CHỢ TRỜ...|         19|2024-10-27 18:52:00|Khoai Lang Thang|   1779109|     24698|         1794|
|MLBhMV8k6e0|Du lịch ẩm thực C...|         19|2024-10-16 21:05:48|Khoai Lang Thang|   2031031|     28867|         1248|
|cJD4fc5l3fM|Châu Phi p8: HÀNG...|      

In [20]:
# Mô tả thống kê
df_clean.describe().show()

# Thống kê lượt xem (views)


+-------+-----------+--------------------+------------------+----------------+------------------+------------------+------------------+
|summary|   Video ID|               Title|       Category_ID|   Channel Title|        View Count|        Like Count|     Comment Count|
+-------+-----------+--------------------+------------------+----------------+------------------+------------------+------------------+
|  count|        284|                 284|               284|             284|               284|               284|               284|
|   mean|       NULL|                NULL|19.433098591549296|            NULL|2267935.3767605633|26922.975352112677|1581.9788732394366|
| stddev|       NULL|                NULL|2.2633419685435197|            NULL|1518302.5854609765| 16322.17740844484|1292.2597402616855|
|    min|-VfFP4zpkZo|"""Tết Kate"" ở V...|                10|Khoai Lang Thang|             59105|               960|                66|
|    max|zs9xbXCOT6Y|🇹🇭Đi về phương ...|        

- Tạo bảng để sử dụng spark sql

In [21]:
from pyspark.sql import SparkSession
df_clean.createOrReplaceTempView("video")

1.2. Tính tổng lượt xem, tổng số lượng video theo mỗi category

1.2.1 TỔng lượt xem của mỗi category

- Sử dụng spark dataframe

In [22]:
from pyspark.sql.functions import col, sum

# Chuyển cột "View Count" sang kiểu Integer
df_clean = df_clean.withColumn("View Count", col("View Count").cast("int"))

# Thực hiện nhóm và tính tổng số video theo category
df_category=df_clean.groupBy("Category_ID") \
                    .agg(sum("View Count").alias("Total Views")) \
                    .orderBy(col("Total Views").desc()) 
df_category.show()


+-----------+-----------+
|Category_ID|Total Views|
+-----------+-----------+
|         19|  590550047|
|         24|   36836146|
|         10|   11178060|
|         22|    5077519|
|         15|     451875|
+-----------+-----------+



- Sử dụng spark sql

In [23]:
from pyspark.sql.functions import col, sum

spark.sql("""select 
                Category_ID, 
                sum(`View Count`) as Total_view
            from 
                Video 
            group by 
                Category_ID
            order by 
                Total_view""").show()


+-----------+----------+
|Category_ID|Total_view|
+-----------+----------+
|         15|    451875|
|         22|   5077519|
|         10|  11178060|
|         24|  36836146|
|         19| 590550047|
+-----------+----------+



1.2.2. Số lượng video của mỗi category

- Sử dụng spark dataframe

In [24]:
from pyspark.sql.functions import count

# Nhóm dữ liệu theo Category_ID và sau đó đếm số lần xuất hiện của Video ID
df_category_video=df_clean.groupBy("Category_ID") \
    .agg(count("Video ID").alias("Video Count")) \
    .orderBy(col("Video Count").desc()) 
df_category_video.show()


+-----------+-----------+
|Category_ID|Video Count|
+-----------+-----------+
|         19|        236|
|         24|         35|
|         10|          7|
|         22|          5|
|         15|          1|
+-----------+-----------+



- Sử dụng spark sql

In [48]:
from pyspark.sql.functions import count

spark.sql("""select 
                Category_ID, 
                count(`Video ID`) as Video_count
            from 
                Video 
            group by 
                Category_ID
            order by 
                Video_count desc""").show()


+-----------+-----------+
|Category_ID|Video_count|
+-----------+-----------+
|         19|        236|
|         24|         35|
|         10|          7|
|         22|          5|
|         15|          1|
+-----------+-----------+



1.4. Số lượng video đăng tải mỗi năm

- SỬ dụng dataframe

In [32]:
from pyspark.sql.functions import year, month

# Thêm cột năm, tháng nếu có cột ngày phát hành
df_clean = df_clean.withColumn("year", year(col("Published At"))) \
                   .withColumn("month", month(col("Published At")))

# Số lượng video theo năm
df_count_video_year=df_clean.groupBy("year").count().orderBy("year")
df_count_video_year.show()


+----+-----+
|year|count|
+----+-----+
|2017|   48|
|2018|   58|
|2019|   50|
|2020|   28|
|2021|   16|
|2022|   26|
|2023|   35|
|2024|   23|
+----+-----+



- Sử dụng spark sql

In [33]:
from pyspark.sql.functions import year, month


spark.sql("""
            SELECT 
                YEAR(`Published At`) as Year, 
                SUM(`View Count`) as Total_view
            FROM 
                Video 
            GROUP BY 
                YEAR(`Published At`)
            ORDER BY 
                YEAR(`Published At`) DESC 
        """).show()


+----+----------+
|Year|Total_view|
+----+----------+
|2024|  64707611|
|2023|  97253424|
|2022|  75388408|
|2021|  38998764|
|2020|  68516080|
|2019| 107469646|
|2018| 137904607|
|2017|  53855107|
+----+----------+



1.6 Tỉ lệ tương tác theo video

- Sử dụng spark dataframe

In [36]:
df_tuong_tac = df_clean.selectExpr("`Video ID`",\
                                   "Title",\
                                   "(`Like Count` + `Comment Count`) / `View Count` as Ti_le_tuong_tac")\
                        .orderBy("Ti_le_tuong_tac", ascending=False)

df_tuong_tac.show()


+-----------+--------------------+--------------------+
|   Video ID|               Title|     Ti_le_tuong_tac|
+-----------+--------------------+--------------------+
|XdbjuI86zhM|Hành trình xây 30...| 0.07914200821377348|
|a0RLS40Qh-A|Việt Nam chuyện c...| 0.07401455549595638|
|36ms9Si7f8I|XIN LỖI - KHOAI |...| 0.05627321892772128|
|q5EPifZQK8I|BALI DU KÝ |Du lị...| 0.05039767513000918|
|xXTcKSxGClQ|Khoai đi làm nhân...| 0.04872383460159695|
|TrbtMqHSmcc|800 NGÀY ĐI KHẮP ...| 0.04163283890996895|
|wjDs5zwRlMA|(Official Music V...| 0.03859600280123764|
|kL-XgVsV8EI|Chiang Mai - Khoa...| 0.03678875611256841|
|zMn9mithu84|Biển hoa khổng lồ...| 0.03601119164946227|
|CnlXly4_b9Y|Kỷ niệm 1000 subs...|0.032958957796567115|
|quZy6AhGmEc|KHOAI LANG THANG ...|0.032448847884032295|
|OWlik0yAwoc|Cuộc sống Bản Phù...| 0.03187260153091588|
|JYULx2AItoI|(Lofi Ver.) 'em v...| 0.03140695915279879|
|SYcnSZfXwWw|Em và Những Ng...|0.030660321797922212|
|RvMcyBS8SgU|Nấu Buffet cho em...|0.030424938025

- Sử dụng spark sql

In [37]:
spark.sql("""
            SELECT 
                `Video ID`, Title,
                (`Like Count` + `Comment Count`) / `View Count` as Ti_le_tuong_tac
            FROM 
                Video 
            ORDER BY 
                Ti_le_tuong_tac DESC 
        """).show()

+-----------+--------------------+--------------------+
|   Video ID|               Title|     Ti_le_tuong_tac|
+-----------+--------------------+--------------------+
|XdbjuI86zhM|Hành trình xây 30...| 0.07914200821377348|
|a0RLS40Qh-A|Việt Nam chuyện c...| 0.07401455549595638|
|36ms9Si7f8I|XIN LỖI - KHOAI |...| 0.05627321892772128|
|q5EPifZQK8I|BALI DU KÝ |Du lị...| 0.05039767513000918|
|xXTcKSxGClQ|Khoai đi làm nhân...| 0.04872383460159695|
|TrbtMqHSmcc|800 NGÀY ĐI KHẮP ...| 0.04163283890996895|
|wjDs5zwRlMA|(Official Music V...| 0.03859600280123764|
|kL-XgVsV8EI|Chiang Mai - Khoa...| 0.03678875611256841|
|zMn9mithu84|Biển hoa khổng lồ...| 0.03601119164946227|
|CnlXly4_b9Y|Kỷ niệm 1000 subs...|0.032958957796567115|
|quZy6AhGmEc|KHOAI LANG THANG ...|0.032448847884032295|
|OWlik0yAwoc|Cuộc sống Bản Phù...| 0.03187260153091588|
|JYULx2AItoI|(Lofi Ver.) 'em v...| 0.03140695915279879|
|SYcnSZfXwWw|Em và Những Ng...|0.030660321797922212|
|RvMcyBS8SgU|Nấu Buffet cho em...|0.030424938025

1.8. Lượt View, Like, Comment trung bình cho mỗi video

- Sử dụng spark dataframe

In [40]:
from pyspark.sql.functions import expr

View_tb = df_clean.agg( 
                            (expr("COUNT(`Video ID`)")).alias("Total_Video"),
                            (expr("SUM(`View Count`)")).alias("Total_View"),
                            (expr("SUM(`View Count`)") / expr("COUNT(`Video ID`)")).alias("Average_Views_Per_Video"),
                            (expr("SUM(`Like Count`)")).alias("Total_Like"),
                            (expr("SUM(`Like Count`)") / expr("COUNT(`Video ID`)")).alias("Average_Likes_Per_Video"),
                            (expr("SUM(`Comment Count`)")).alias("Total_Comment"),
                            (expr("SUM(`Comment Count`)") / expr("COUNT(`Video ID`)")).alias("Average_Comment_Per_Video")
)

View_tb.show()


+-----------+----------+-----------------------+----------+-----------------------+-------------+-------------------------+
|Total_Video|Total_View|Average_Views_Per_Video|Total_Like|Average_Likes_Per_Video|Total_Comment|Average_Comment_Per_Video|
+-----------+----------+-----------------------+----------+-----------------------+-------------+-------------------------+
|        284| 644093647|     2267935.3767605633|   7646125|     26922.975352112677|       449282|       1581.9788732394366|
+-----------+----------+-----------------------+----------+-----------------------+-------------+-------------------------+



- Sử dụng spark sql

In [41]:
from pyspark.sql.functions import expr

spark.sql("""
    SELECT 
        COUNT(`Video ID`) AS Total_Video,
        SUM(`View Count`) AS Total_View,
        SUM(`View Count`) / COUNT(`Video ID`) AS Average_Views_Per_Video,
        SUM(`Like Count`) AS Total_Like,
        SUM(`Like Count`) / COUNT(`Video ID`) AS Average_Likes_Per_Video,
        SUM(`Comment Count`) AS Total_Comment,
        SUM(`Comment Count`) / COUNT(`Video ID`) AS Average_Comment_Per_Video
    FROM video
""").show()


+-----------+----------+-----------------------+----------+-----------------------+-------------+-------------------------+
|Total_Video|Total_View|Average_Views_Per_Video|Total_Like|Average_Likes_Per_Video|Total_Comment|Average_Comment_Per_Video|
+-----------+----------+-----------------------+----------+-----------------------+-------------+-------------------------+
|        284| 644093647|     2267935.3767605633|   7646125|     26922.975352112677|       449282|       1581.9788732394366|
+-----------+----------+-----------------------+----------+-----------------------+-------------+-------------------------+



1.10. Tính view trung bình, số lượng like trung bình, số lượng comment trung bình cho mỗi video theo từng năm/tháng

- Sử dụng spark dataframe

In [44]:
from pyspark.sql.functions import expr

View_Like_Comment_tb_year_month = df_clean.groupby("Year","month")\
                        .agg( 
                            (expr("COUNT(`Video ID`)")).alias("Total_Video"),
                            (expr("SUM(`View Count`)")).alias("Total_View"),
                            (expr("SUM(`View Count`)") / expr("COUNT(`Video ID`)")).alias("Average_Views_Per_Video"),
                            (expr("SUM(`Like Count`)")).alias("Total_Like"),
                            (expr("SUM(`Like Count`)") / expr("COUNT(`Video ID`)")).alias("Average_Likes_Per_Video"),
                            (expr("SUM(`Comment Count`)")).alias("Total_Comment"),
                            (expr("SUM(`Comment Count`)") / expr("COUNT(`Video ID`)")).alias("Average_Comment_Per_Video"))\
                        .orderBy("year","month")

View_Like_Comment_tb_year_month.show()


+----+-----+-----------+----------+-----------------------+----------+-----------------------+-------------+-------------------------+
|Year|month|Total_Video|Total_View|Average_Views_Per_Video|Total_Like|Average_Likes_Per_Video|Total_Comment|Average_Comment_Per_Video|
+----+-----+-----------+----------+-----------------------+----------+-----------------------+-------------+-------------------------+
|2017|    2|          2|   1224273|               612136.5|     13479|                 6739.5|         1217|                    608.5|
|2017|    3|          5|   4008293|               801658.6|     40684|                 8136.8|         2694|                    538.8|
|2017|    4|          5|   2757327|               551465.4|     29915|                 5983.0|         2252|                    450.4|
|2017|    5|          3|   1821947|      607315.6666666666|     18036|                 6012.0|         1047|                    349.0|
|2017|    6|          8|   5491449|             686431.

- Sử dụng spark sql

In [45]:
from pyspark.sql.functions import expr

spark.sql("""
    SELECT 
        YEAR(`Published At`) as Year,
        MONTH(`Published At`) as Month,
        COUNT(`Video ID`) AS Total_Video,
        SUM(`View Count`) AS Total_View,
        SUM(`View Count`) / COUNT(`Video ID`) AS Average_Views_Per_Video,
        SUM(`Like Count`) AS Total_Like,
        SUM(`Like Count`) / COUNT(`Video ID`) AS Average_Likes_Per_Video,
        SUM(`Comment Count`) AS Total_Comment,
        SUM(`Comment Count`) / COUNT(`Video ID`) AS Average_Comment_Per_Video
    FROM video
    GROUP BY Year, Month
    ORDER BY Year, Month
""").show()


+----+-----+-----------+----------+-----------------------+----------+-----------------------+-------------+-------------------------+
|Year|Month|Total_Video|Total_View|Average_Views_Per_Video|Total_Like|Average_Likes_Per_Video|Total_Comment|Average_Comment_Per_Video|
+----+-----+-----------+----------+-----------------------+----------+-----------------------+-------------+-------------------------+
|2017|    2|          2|   1224273|               612136.5|     13479|                 6739.5|         1217|                    608.5|
|2017|    3|          5|   4008293|               801658.6|     40684|                 8136.8|         2694|                    538.8|
|2017|    4|          5|   2757327|               551465.4|     29915|                 5983.0|         2252|                    450.4|
|2017|    5|          3|   1821947|      607315.6666666666|     18036|                 6012.0|         1047|                    349.0|
|2017|    6|          8|   5491449|             686431.