In [1]:
from pathlib import Path
import dask
from dask.distributed import Client
import dask.dataframe as dd
import json
import logging

TRENDING_VIDEO_DATA = "data/youtube-trending-video-dataset"
FOREIGN_LANGUAGES = ["BR", "DE","FR", "IN", "JP", "KR", "MX", "RU"]

def preprocess(data_path: Path, category_file: Path):
    with category_file.open() as f:
        cat_items = json.load(f)["items"]
    cat_dic = {int(c["id"]): c["snippet"]["title"] for c in cat_items}

    ddf = dd.read_csv(data_path.as_posix())
    ddf["category_name"] = ddf["categoryId"].map(cat_dic)
    ddf["description"] = ddf["description"].fillna("")

    ddf.dask.visualize(filename=f"{data_path.stem}_no_translation.svg")
    processed_file_name =  f"{data_path.stem}_processed.csv"
    ddf.to_csv(data_path.parent.joinpath(processed_file_name).as_posix(), single_file=True, index=False)


In [None]:
from google.colab import drive
drive.mount('/content/drive')
root_path = "/content/drive/MyDrive/NUS/CS5344/"
for country in ["CA", "GB", "US"]:
    data_path = Path(root_path + country + "_youtube_trending_data.csv")
    category_file = Path(root_path + country + "_category_id.json")
    preprocess(data_path=data_path, category_file=category_file)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load your data into a Pandas DataFrame
# Replace 'your_data.csv' with the actual filename or path to your data
df = pd.read_csv("/content/drive/MyDrive/NUS/CS5344/CA_youtube_trending_data_processed.csv", lineterminator='\n')

df.head()

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,description,category_name
0,KX06ksuS6Xo,Diljit Dosanjh: CLASH (Official) Music Video |...,2020-08-11T07:30:02Z,UCZRdNleCgW-BGUJf-bbjzQg,Diljit Dosanjh,10,2020-08-12T00:00:00Z,clash diljit dosanjh|diljit dosanjh|diljit dos...,9140911,296541,6180,30059,https://i.ytimg.com/vi/KX06ksuS6Xo/default.jpg,False,False,CLASH official music video performed by DILJIT...,Music
1,J78aPJ3VyNs,I left youtube for a month and THIS is what ha...,2020-08-11T16:34:06Z,UCYzPXprvl5Y-Sf0g4vX-m6g,jacksepticeye,24,2020-08-12T00:00:00Z,jacksepticeye|funny|funny meme|memes|jacksepti...,2038853,353797,2628,40222,https://i.ytimg.com/vi/J78aPJ3VyNs/default.jpg,False,False,I left youtube for a month and this is what ha...,Entertainment
2,M9Pmf9AB4Mo,Apex Legends | Stories from the Outlands – “Th...,2020-08-11T17:00:10Z,UC0ZV6M2THA81QT9hrVWJG3A,Apex Legends,20,2020-08-12T00:00:00Z,Apex Legends|Apex Legends characters|new Apex ...,2381688,146740,2794,16549,https://i.ytimg.com/vi/M9Pmf9AB4Mo/default.jpg,False,False,"While running her own modding shop, Ramya Pare...",Gaming
3,3C66w5Z0ixs,I ASKED HER TO BE MY GIRLFRIEND...,2020-08-11T19:20:14Z,UCvtRTOMP2TqYqu51xNrqAzg,Brawadis,22,2020-08-12T00:00:00Z,brawadis|prank|basketball|skits|ghost|funny vi...,1514614,156914,5857,35331,https://i.ytimg.com/vi/3C66w5Z0ixs/default.jpg,False,False,SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib...,People & Blogs
4,VIUo6yapDbc,Ultimate DIY Home Movie Theater for The LaBran...,2020-08-11T15:10:05Z,UCDVPcEbVLQgLZX0Rt6jo34A,Mr. Kate,26,2020-08-12T00:00:00Z,The LaBrant Family|DIY|Interior Design|Makeove...,1123889,45803,964,2198,https://i.ytimg.com/vi/VIUo6yapDbc/default.jpg,False,False,Transforming The LaBrant Family's empty white ...,Howto & Style



if I have csv file which is the table contain column video_id, views, categoryId, catergory_name, title, trending_date, description. I want to find the top 10 frequent words in description. however, the video_id is not unique as the data may update when the date change. I want the data before 01, November 2023 for calculation. that means, if the video_id is repeated, used the latest date before 01, November 2023 in column "trending_date". please use pyspark and tfidf.


In [None]:
!pip install pyspark
!pip install dask_ml

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

root_path = "/content/drive/MyDrive/NUS/CS5344/"
for country in ["CA", "GB", "US"]:
    data_path = root_path + country + "_youtube_trending_data_processed.csv"
    vars()[country] = pd.read_csv(data_path, lineterminator='\n')

df = pd.concat([CA, GB, US], axis=0)

# Handle missing values in the 'description' column
df['description'] = df['description'].fillna('')

# Filter data before 01, November 2023
df['trending_date'] = pd.to_datetime(df['trending_date'])
df = df[df['trending_date'] < "2023-11-01"]

# Get the latest trending date for each video_id
df['latest_trending_date'] = df.groupby('video_id')['trending_date'].transform('max')
df = df[df['trending_date'] == df['latest_trending_date']]

def tokenize_and_remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words and not any(stop_word in word.lower() for stop_word in ["www", "http", "https",'video', 'youtube', 'facebook', 'new', 'follow', 'like', 'watch', 'subscribe', 'channel'])]
    return filtered_tokens

# Tokenize and remove stop words using scikit-learn TfidfVectorizer
new = pd.concat([df['description'], df['title']], axis=0)
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=tokenize_and_remove_stopwords, max_features=10)
X = vectorizer.fit_transform(new)

# Get top 10 words based on TF-IDF
top_words = vectorizer.get_feature_names_out()

# Display the result
print("Top 10 words based on TF-IDF:")
print(top_words)



Top 10 words based on TF-IDF:
['game' 'highlights' 'instagram' 'live' 'minecraft' 'music' 'official'
 'sports' 'twitter' 'world']


In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import DateType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Initialize Spark session
spark = SparkSession.builder.appName("TopFrequentWords").getOrCreate()

root_path = "/content/drive/MyDrive/NUS/CS5344/"
for country in ["CA", "GB", "US"]:
    data_path = root_path + country + "_youtube_trending_data_processed.csv"
    vars()[country] = pd.read_csv(data_path, lineterminator='\n')

df = pd.concat([CA, GB, US], axis=0)

df = spark.createDataFrame(df)

# Convert "trending_date" to DateType
df = df.withColumn("trending_date", col("trending_date").cast(DateType()))

# Filter data before 01, November 2023
df = df.filter(col("trending_date") < "2023-11-01")

# Group by video_id and select the latest trending_date
latest_dates = df.groupBy("video_id").agg({"trending_date": "max"}).withColumnRenamed("max(trending_date)", "latest_trending_date")

# Join with the original DataFrame to get the corresponding rows
df = df.join(latest_dates, (df["video_id"] == latest_dates["video_id"]) & (df["trending_date"] == latest_dates["latest_trending_date"]), "inner").drop(latest_dates["video_id"]).drop(latest_dates["latest_trending_date"])

# Tokenize the description column
tokenizer = Tokenizer(inputCol="description", outputCol="words")
df = tokenizer.transform(df)

# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
df = remover.transform(df)

# Calculate TF (Term Frequency)
hashingTF = HashingTF(inputCol="filtered_words", outputCol="rawFeatures", numFeatures=10000)
df = hashingTF.transform(df)

# Calculate IDF (Inverse Document Frequency)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(df)
df = idfModel.transform(df)

# Select only the necessary columns
df = df.select("video_id", "description", "features")

# Show the DataFrame
df.show(truncate=False)

# Extract top 10 words based on TF-IDF
top_words = df.select("video_id", "description", "features").rdd.flatMap(lambda x: [(x[0], w) for w in x[1]])
top_words = top_words.toDF(["video_id", "word"]).groupBy("word").count().sort("count", ascending=False).limit(30)

# Show the top 10 words
top_words.show(truncate=False)

# Stop the Spark session
spark.stop()

If I have csv file which is the table contain column video_id, views, categoryId, catergory_name, title, trending_date, view_count, description. The video_id is not unique as the data may update when the date change. I want the data before 01, November 2023 for calculation. that means, if the video_id is repeated, used the latest date before 01, November 2023 in column "trending_date".


Group dataset a by view_count in quartile, find and compare top 30 frequent words in description

please use pyspark and tfidf.

In [None]:
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [9]:

# Load your CSV file into a DataFrame
root_path = "/content/drive/MyDrive/NUS/CS5344/"
for country in ["CA", "GB", "US"]:
    data_path = root_path + country + "_youtube_trending_data_processed.csv"
    vars()[country] = pd.read_csv(data_path, lineterminator='\n')

df = pd.concat([CA, GB, US], axis=0)

# Handle missing values in the 'description' column
df['description'] = df['description'].fillna('')

# Filter data before 01, November 2023
df['trending_date'] = pd.to_datetime(df['trending_date'])
df = df[df['trending_date'] < '2023-11-01']

# Get the latest date for each video_id
df = df.sort_values('trending_date', ascending=False).drop_duplicates('video_id')

# Group by view_count quartiles
df['view_count_quartile'] = pd.qcut(df['view_count'], q=4, labels=False)

# Tokenize and remove stopwords
stop_words = set(stopwords.words('english'))

def tokenize_and_remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words and not any(stop_word in word.lower() for stop_word in ["www", "http", "https",'video', 'youtube', 'facebook', 'new', 'follow', 'like', 'watch', 'subscribe', 'channel'])]
    return filtered_tokens


df['filtered_words'] = df['description'].apply(tokenize_and_remove_stopwords)

# Calculate TF-IDF for each quartile
for quartile in range(4):
    quartile_df = df[df['view_count_quartile'] == quartile]

    corpus = quartile_df['description'].tolist()
    vectorizer = TfidfVectorizer(stop_words='english', tokenizer=tokenize_and_remove_stopwords)
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # Get feature names and sort by IDF values
    feature_names = vectorizer.get_feature_names_out()
    idf_values = vectorizer.idf_
    sorted_features = [feature for _, feature in sorted(zip(idf_values, feature_names))]

    # Display the top 30 frequent words for each quartile
    top_30_words = sorted_features[:10]
    print(f"Top 10 frequent words for Quartile {quartile + 1}: {top_30_words}")





Top 10 frequent words for Quartile 1: ['instagram', 'twitter', 'live', 'world', 'content', 'website', 'highlights', 'best', 'official', 'latest']
Top 10 frequent words for Quartile 2: ['instagram', 'twitter', 'music', 'live', 'merch', 'free', 'official', 'make', 'world', 'content']
Top 10 frequent words for Quartile 3: ['instagram', 'twitter', 'music', 'live', 'official', 'merch', 'check', 'tiktok', 'free', 'world']
Top 10 frequent words for Quartile 4: ['instagram', 'twitter', 'music', 'official', 'merch', 'free', 'tiktok', 'content', 'world', 'make']


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, rank, desc
from pyspark.sql.window import Window

# Create a Spark session
spark = SparkSession.builder.appName("VideoAnalysis").getOrCreate()

root_path = "/content/drive/MyDrive/NUS/CS5344/"
for country in ["CA", "GB", "US"]:
    data_path = root_path + country + "_youtube_trending_data_processed.csv"
    vars()[country] = pd.read_csv(data_path, lineterminator='\n')

df = pd.concat([CA, GB, US], axis=0)

df = spark.createDataFrame(df)

# Convert "trending_date" to DateType
df = df.withColumn("trending_date", col("trending_date").cast(DateType()))

# Filter data before 01, November 2023
df = df.filter(col("trending_date") < "2023-11-01")

# Use window function to get the latest date for each video_id
window_spec = Window.partitionBy("video_id").orderBy(desc("trending_date"))
df = df.withColumn("rank", rank().over(window_spec)).filter(col("rank") == 1)

# Tokenize the description
tokenizer = Tokenizer(inputCol="description", outputCol="words")
df = tokenizer.transform(df)

# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
df = remover.transform(df)

# Apply TF-IDF
hashingTF = HashingTF(inputCol="filtered_words", outputCol="rawFeatures", numFeatures=20)
idf = IDF(inputCol="rawFeatures", outputCol="features")
pipeline = Pipeline(stages=[hashingTF, idf])

# Fit and transform the pipeline
model = pipeline.fit(df)
result = model.transform(df)

# Get the top 30 frequent words
top_words = result.select("filtered_words").rdd.flatMap(lambda x: x['filtered_words']).countByValue()
top_30_words = sorted(top_words.items(), key=lambda x: x[1], reverse=True)[:30]

# Display the top 30 frequent words
for word, count in top_30_words:
    print(f"{word}: {count}")
