In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

# Tạo SparkSession
spark = SparkSession.builder \
    .appName("Video Analysis") \
    .getOrCreate()


# Định nghĩa schema mới
video_schema = StructType([
    StructField("Video ID", StringType(), True),
    StructField("Title", StringType(), True),
    StructField("Category_ID", IntegerType(), True),
    StructField("Published At", TimestampType(), True),
    StructField("Channel Title", StringType(), True),
    StructField("View Count", IntegerType(), True),
    StructField("Like Count", IntegerType(), True),
    StructField("Comment Count", IntegerType(), True),
])

# Đọc file CSV với schema mới
file_path = "video.csv"
df = spark.read.option("header", "true").schema(video_schema).csv(file_path)

# Kiểm tra schema và hiển thị dữ liệu
df.printSchema()
df.show()
# Kiểm tra số hàng và cột
print(f"Số hàng: {df.count()}, Số cột: {len(df.columns)}")

root
 |-- Video ID: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Category_ID: integer (nullable = true)
 |-- Published At: timestamp (nullable = true)
 |-- Channel Title: string (nullable = true)
 |-- View Count: integer (nullable = true)
 |-- Like Count: integer (nullable = true)
 |-- Comment Count: integer (nullable = true)

+-----------+--------------------+-----------+-------------------+----------------+----------+----------+-------------+
|   Video ID|               Title|Category_ID|       Published At|   Channel Title|View Count|Like Count|Comment Count|
+-----------+--------------------+-----------+-------------------+----------------+----------+----------+-------------+
|N-gpD9QqTK0|Nấu đám tiệc kiểu...|         19|2024-12-01 17:44:37|Khoai Lang Thang|   1520244|     37496|         1969|
|M_WD9Dxayk8|Đến nhà người lạ ...|         19|2024-11-17 18:57:32|Khoai Lang Thang|   1602732|     28690|         1378|
|5AJd2FJUVkc|Ăn 10 món lạ ở kh...|         19|202

In [None]:
# Kiểm tra số lượng giá trị null mỗi cột


# Loại bỏ giá trị null (nếu cần)
df_clean = df.dropna()


In [5]:
df_clean.printSchema()

root
 |-- Video ID: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Category_ID: integer (nullable = true)
 |-- Published At: timestamp (nullable = true)
 |-- Channel Title: string (nullable = true)
 |-- View Count: integer (nullable = true)
 |-- Like Count: integer (nullable = true)
 |-- Comment Count: integer (nullable = true)



: 

: 

In [5]:
df_clean.show()
print(f"Số hàng: {df_clean.count()}, Số cột: {len(df_clean.columns)}")

+-----------+--------------------+-----------+-------------------+----------------+----------+----------+-------------+
|   Video ID|               Title|Category_ID|       Published At|   Channel Title|View Count|Like Count|Comment Count|
+-----------+--------------------+-----------+-------------------+----------------+----------+----------+-------------+
|N-gpD9QqTK0|Nấu đám tiệc kiểu...|         19|2024-12-01 17:44:37|Khoai Lang Thang|   1520244|     37496|         1969|
|M_WD9Dxayk8|Đến nhà người lạ ...|         19|2024-11-17 18:57:32|Khoai Lang Thang|   1602732|     28690|         1378|
|5AJd2FJUVkc|Ăn 10 món lạ ở kh...|         19|2024-11-05 20:27:52|Khoai Lang Thang|   2130532|     29506|         1236|
|92-IbWKp_3k|Gặp nạn ở CHỢ TRỜ...|         19|2024-10-27 18:52:00|Khoai Lang Thang|   1779109|     24698|         1794|
|MLBhMV8k6e0|Du lịch ẩm thực C...|         19|2024-10-16 21:05:48|Khoai Lang Thang|   2031031|     28867|         1248|
|cJD4fc5l3fM|Châu Phi p8: HÀNG...|      

In [6]:
# Mô tả thống kê
df_clean.describe().show()

# Thống kê lượt xem (views)


+-------+-----------+--------------------+------------------+----------------+------------------+------------------+------------------+
|summary|   Video ID|               Title|       Category_ID|   Channel Title|        View Count|        Like Count|     Comment Count|
+-------+-----------+--------------------+------------------+----------------+------------------+------------------+------------------+
|  count|        284|                 284|               284|             284|               284|               284|               284|
|   mean|       NULL|                NULL|19.433098591549296|            NULL|2267935.3767605633|26922.975352112677|1581.9788732394366|
| stddev|       NULL|                NULL|2.2633419685435197|            NULL|1518302.5854609765| 16322.17740844484|1292.2597402616855|
|    min|-VfFP4zpkZo|"""Tết Kate"" ở V...|                10|Khoai Lang Thang|             59105|               960|                66|
|    max|zs9xbXCOT6Y|🇹🇭Đi về phương ...|        

In [7]:
from pyspark.sql import SparkSession
df_clean.createOrReplaceTempView("video")

In [8]:
spark.sql("select * from video").show()

+-----------+--------------------+-----------+-------------------+----------------+----------+----------+-------------+
|   Video ID|               Title|Category_ID|       Published At|   Channel Title|View Count|Like Count|Comment Count|
+-----------+--------------------+-----------+-------------------+----------------+----------+----------+-------------+
|N-gpD9QqTK0|Nấu đám tiệc kiểu...|         19|2024-12-01 17:44:37|Khoai Lang Thang|   1520244|     37496|         1969|
|M_WD9Dxayk8|Đến nhà người lạ ...|         19|2024-11-17 18:57:32|Khoai Lang Thang|   1602732|     28690|         1378|
|5AJd2FJUVkc|Ăn 10 món lạ ở kh...|         19|2024-11-05 20:27:52|Khoai Lang Thang|   2130532|     29506|         1236|
|92-IbWKp_3k|Gặp nạn ở CHỢ TRỜ...|         19|2024-10-27 18:52:00|Khoai Lang Thang|   1779109|     24698|         1794|
|MLBhMV8k6e0|Du lịch ẩm thực C...|         19|2024-10-16 21:05:48|Khoai Lang Thang|   2031031|     28867|         1248|
|cJD4fc5l3fM|Châu Phi p8: HÀNG...|      

- TỔng lượt xem của mỗi category

In [9]:
from pyspark.sql.functions import col, sum

spark.sql("""select 
                Category_ID, 
                sum(`View Count`) as Total_view
            from 
                Video 
            group by 
                Category_ID
            order by 
                Total_view""").show()


+-----------+----------+
|Category_ID|Total_view|
+-----------+----------+
|         15|    451875|
|         22|   5077519|
|         10|  11178060|
|         24|  36836146|
|         19| 590550047|
+-----------+----------+



- Số lượng video của mỗi category

In [10]:
from pyspark.sql.functions import count

spark.sql("""select 
                Category_ID, 
                count(`Video ID`) as Video_count
            from 
                Video 
            group by 
                Category_ID
            order by 
                Video_count""").show()


+-----------+-----------+
|Category_ID|Video_count|
+-----------+-----------+
|         15|          1|
|         22|          5|
|         10|          7|
|         24|         35|
|         19|        236|
+-----------+-----------+



- 10 video có lượt xem cao nhất

In [11]:
spark.sql("""
            SELECT 
                `Video ID`, 
                `Title`,
                `View Count`
            FROM 
                Video 
            ORDER BY 
                `View Count` DESC 
            LIMIT 10
        """).show()


+-----------+--------------------+----------+
|   Video ID|               Title|View Count|
+-----------+--------------------+----------+
|v32lFxUUV5o|Đám Cưới Miệt Vườ...|   8107261|
|5QCsAh9Eays|ĂN NGON & RẺ KHÔN...|   7891063|
|3i3cDuGayqg|🇰🇷Chợ Gwangjang...|   7680470|
|O8nVnHlmhCM|CHỢ CHÂU ĐỐC • TH...|   7403760|
|hK-s3LsZcFc|🇹🇭Chợ đêm biên ...|   6804373|
|TyzP2yUATuM|Đám giỗ bên cồn t...|   6516807|
|a-2iTFdtnvk|Cuộc sống bản Phù...|   6045537|
|7txU8f9Dl5c|ĂN BUFFET 5 SAO Ở...|   5891310|
|p_u50y858kY|Nấu ĐÁM GIỖ từ ba...|   5738110|
|zc3A_CxSyc4|Nấu đám miệt vườn...|   5575163|
+-----------+--------------------+----------+



In [12]:
spark.sql("""
            SELECT 
                `Video ID`, 
                `Title`,
                `Like Count`
            FROM 
                Video 
            ORDER BY 
                `Like Count` DESC 
            LIMIT 10
        """).show()


+-----------+--------------------+----------+
|   Video ID|               Title|Like Count|
+-----------+--------------------+----------+
|TrbtMqHSmcc|800 NGÀY ĐI KHẮP ...|    137707|
|wjDs5zwRlMA|(Official Music V...|     93884|
|36ms9Si7f8I|XIN LỖI - KHOAI |...|     92510|
|T02aU8qpshQ|10 năm & BÍ MẬT C...|     88221|
|v32lFxUUV5o|Đám Cưới Miệt Vườ...|     78135|
|a0RLS40Qh-A|Việt Nam chuyện c...|     63578|
|MZxiqm-JkfY|Bánh đúc thủ công...|     62743|
|a-2iTFdtnvk|Cuộc sống bản Phù...|     61219|
|3i3cDuGayqg|🇰🇷Chợ Gwangjang...|     57065|
|TyzP2yUATuM|Đám giỗ bên cồn t...|     55756|
+-----------+--------------------+----------+



In [13]:
spark.sql("""
            SELECT 
                `Video ID`, 
                `Title`,
                `Comment Count`
            FROM 
                Video 
            ORDER BY 
                `Comment Count` DESC 
            LIMIT 10
        """).show()


+-----------+--------------------+-------------+
|   Video ID|               Title|Comment Count|
+-----------+--------------------+-------------+
|TrbtMqHSmcc|800 NGÀY ĐI KHẮP ...|        13160|
|36ms9Si7f8I|XIN LỖI - KHOAI |...|         9190|
|v32lFxUUV5o|Đám Cưới Miệt Vườ...|         6202|
|TyzP2yUATuM|Đám giỗ bên cồn t...|         5589|
|4V4KjfmH5vw|ĐẶC SẢN QUÝ HIẾM ...|         5537|
|wjDs5zwRlMA|(Official Music V...|         4823|
|T02aU8qpshQ|10 năm & BÍ MẬT C...|         4708|
|a0RLS40Qh-A|Việt Nam chuyện c...|         4449|
|5QCsAh9Eays|ĂN NGON & RẺ KHÔN...|         4431|
|a-2iTFdtnvk|Cuộc sống bản Phù...|         4366|
+-----------+--------------------+-------------+



- Số lượng video mỗi năm

In [14]:
from pyspark.sql.functions import year, month


spark.sql("""
            SELECT 
                YEAR(`Published At`) as Year, 
                SUM(`View Count`) as Total_view
            FROM 
                Video 
            GROUP BY 
                YEAR(`Published At`)
            ORDER BY 
                YEAR(`Published At`) DESC 
        """).show()


+----+----------+
|Year|Total_view|
+----+----------+
|2024|  64707611|
|2023|  97253424|
|2022|  75388408|
|2021|  38998764|
|2020|  68516080|
|2019| 107469646|
|2018| 137904607|
|2017|  53855107|
+----+----------+



- Tỉ lệ tương tác theo video

In [15]:
spark.sql("""
            SELECT 
                `Video ID`, 
                (`Like Count` + `Comment Count`) / `View Count` as Ti_le_tuong_tac
            FROM 
                Video 
            ORDER BY 
                Ti_le_tuong_tac DESC 
        """).show()

+-----------+--------------------+
|   Video ID|     Ti_le_tuong_tac|
+-----------+--------------------+
|XdbjuI86zhM| 0.07914200821377348|
|a0RLS40Qh-A| 0.07401455549595638|
|36ms9Si7f8I| 0.05627321892772128|
|q5EPifZQK8I| 0.05039767513000918|
|xXTcKSxGClQ| 0.04872383460159695|
|TrbtMqHSmcc| 0.04163283890996895|
|wjDs5zwRlMA| 0.03859600280123764|
|kL-XgVsV8EI| 0.03678875611256841|
|zMn9mithu84| 0.03601119164946227|
|CnlXly4_b9Y|0.032958957796567115|
|quZy6AhGmEc|0.032448847884032295|
|OWlik0yAwoc| 0.03187260153091588|
|JYULx2AItoI| 0.03140695915279879|
|SYcnSZfXwWw|0.030660321797922212|
|RvMcyBS8SgU|0.030424938025804687|
|5ta7YnZ1K_o|0.030185685932506125|
|V_2_CPiTDR4|  0.0295407755862526|
|T02aU8qpshQ| 0.02926931746921145|
|SLpU2LGv54g|0.029211199642519933|
|wX-TrSDnAys| 0.02893649585103135|
+-----------+--------------------+
only showing top 20 rows



- Tỉ lệ tương tác theo category

In [16]:
# Dùng groupBy với selectExpr gián tiếp

spark.sql("""SELECT 
                Category_ID,
                (SUM(`Like Count`)+ SUM(`Comment Count`))/SUM(`View Count`) as Ti_le_tuong_tac
            FROM 
                Video
            GROUP BY 
                Category_ID
            ORDER BY
                Ti_le_tuong_tac DESC
            """).show()


+-----------+--------------------+
|Category_ID|     Ti_le_tuong_tac|
+-----------+--------------------+
|         22| 0.03472955984999761|
|         10| 0.03354419282057888|
|         15|  0.0142716459197787|
|         19|0.012093535571253625|
|         24|0.010745450949184531|
+-----------+--------------------+



- View trung bình cho mỗi video

In [17]:
from pyspark.sql.functions import expr

spark.sql("""
    SELECT 
        COUNT(`Video ID`) AS Total_Video,
        SUM(`View Count`) AS Total_View,
        SUM(`View Count`) / COUNT(`Video ID`) AS Average_Views_Per_Video,
        SUM(`Like Count`) AS Total_Like,
        SUM(`Like Count`) / COUNT(`Video ID`) AS Average_Likes_Per_Video,
        SUM(`Comment Count`) AS Total_Comment,
        SUM(`Comment Count`) / COUNT(`Video ID`) AS Average_Comment_Per_Video
    FROM video
""").show()


+-----------+----------+-----------------------+----------+-----------------------+-------------+-------------------------+
|Total_Video|Total_View|Average_Views_Per_Video|Total_Like|Average_Likes_Per_Video|Total_Comment|Average_Comment_Per_Video|
+-----------+----------+-----------------------+----------+-----------------------+-------------+-------------------------+
|        284| 644093647|     2267935.3767605633|   7646125|     26922.975352112677|       449282|       1581.9788732394366|
+-----------+----------+-----------------------+----------+-----------------------+-------------+-------------------------+



In [18]:
from pyspark.sql.functions import expr

spark.sql("""
    SELECT 
        YEAR(`Published At`) as Year,
        COUNT(`Video ID`) AS Total_Video,
        SUM(`View Count`) AS Total_View,
        SUM(`View Count`) / COUNT(`Video ID`) AS Average_Views_Per_Video,
        SUM(`Like Count`) AS Total_Like,
        SUM(`Like Count`) / COUNT(`Video ID`) AS Average_Likes_Per_Video,
        SUM(`Comment Count`) AS Total_Comment,
        SUM(`Comment Count`) / COUNT(`Video ID`) AS Average_Comment_Per_Video
    FROM video
    GROUP BY Year
    ORDER BY Year
""").show()


+----+-----------+----------+-----------------------+----------+-----------------------+-------------+-------------------------+
|Year|Total_Video|Total_View|Average_Views_Per_Video|Total_Like|Average_Likes_Per_Video|Total_Comment|Average_Comment_Per_Video|
+----+-----------+----------+-----------------------+----------+-----------------------+-------------+-------------------------+
|2017|         48|  53855107|     1121981.3958333333|    566504|     11802.166666666666|        45151|        940.6458333333334|
|2018|         58| 137904607|     2377665.6379310344|   1335530|     23026.379310344826|        94382|       1627.2758620689656|
|2019|         50| 107469646|             2149392.92|   1610016|               32200.32|       112736|                  2254.72|
|2020|         28|  68516080|     2447002.8571428573|    952112|                34004.0|        59969|                  2141.75|
|2021|         16|  38998764|             2437422.75|    513688|                32105.5|        3

In [19]:
from pyspark.sql.functions import expr

spark.sql("""
    SELECT 
        YEAR(`Published At`) as Year,
        MONTH(`Published At`) as Month,
        COUNT(`Video ID`) AS Total_Video,
        SUM(`View Count`) AS Total_View,
        SUM(`View Count`) / COUNT(`Video ID`) AS Average_Views_Per_Video,
        SUM(`Like Count`) AS Total_Like,
        SUM(`Like Count`) / COUNT(`Video ID`) AS Average_Likes_Per_Video,
        SUM(`Comment Count`) AS Total_Comment,
        SUM(`Comment Count`) / COUNT(`Video ID`) AS Average_Comment_Per_Video
    FROM video
    GROUP BY Year, Month
    ORDER BY Year, Month
""").show()


+----+-----+-----------+----------+-----------------------+----------+-----------------------+-------------+-------------------------+
|Year|Month|Total_Video|Total_View|Average_Views_Per_Video|Total_Like|Average_Likes_Per_Video|Total_Comment|Average_Comment_Per_Video|
+----+-----+-----------+----------+-----------------------+----------+-----------------------+-------------+-------------------------+
|2017|    2|          2|   1224273|               612136.5|     13479|                 6739.5|         1217|                    608.5|
|2017|    3|          5|   4008293|               801658.6|     40684|                 8136.8|         2694|                    538.8|
|2017|    4|          5|   2757327|               551465.4|     29915|                 5983.0|         2252|                    450.4|
|2017|    5|          3|   1821947|      607315.6666666666|     18036|                 6012.0|         1047|                    349.0|
|2017|    6|          8|   5491449|             686431.

- Top 5 video được xem nhiều nhất mỗi năm

In [20]:
spark.sql("""SELECT  *
             FROM 
                 (SELECT
                        YEAR(`Published At`) as Year,
                        `Video ID`,
                        Title,
                        `View Count`,
                        ROW_NUMBER() OVER (PARTITION BY YEAR(`Published At`) ORDER BY `View Count` DESC) AS rank
                    FROM 
                        video) as rank_video
             WHERE rank <=3
             ORDER BY Year""").show()




+----+-----------+--------------------+----------+----+
|Year|   Video ID|               Title|View Count|rank|
+----+-----------+--------------------+----------+----+
|2017|7txU8f9Dl5c|ĂN BUFFET 5 SAO Ở...|   5891310|   1|
|2017|4V4KjfmH5vw|ĐẶC SẢN QUÝ HIẾM ...|   4210885|   2|
|2017|78uRq9DW6Eo|18 món ăn Thái La...|   3171014|   3|
|2018|5QCsAh9Eays|ĂN NGON & RẺ KHÔN...|   7891063|   1|
|2018|O8nVnHlmhCM|CHỢ CHÂU ĐỐC • TH...|   7403760|   2|
|2018|S07VinbxN3g|CUA HOÀNG ĐẾ, BÀO...|   5235118|   3|
|2019|v32lFxUUV5o|Đám Cưới Miệt Vườ...|   8107261|   1|
|2019|TyzP2yUATuM|Đám giỗ bên cồn t...|   6516807|   2|
|2019|a-2iTFdtnvk|Cuộc sống bản Phù...|   6045537|   3|
|2020|L4HWqGQxIj4|Món ngon sông nướ...|   5281038|   1|
|2020|rgtYBNml5ts|24h sống trên bè ...|   4790384|   2|
|2020|r3UJ3Uc9VqM|Nấu ăn cùng n...|   4709265|   3|
|2021|MZxiqm-JkfY|Bánh đúc thủ công...|   4924649|   1|
|2021|ENSLITZnc6w|Cuộc sống Năm Căn...|   4244863|   2|
|2021|6S3pp7oLmog|Du lịch Nghệ An: ...|   405779