# Set Up Environment

## Import Libraries

In [2]:
import pandas as pd

## Define Functions

# Import Data

In [4]:
# Read the labelled video table from a CSV file in the current working directory
labelled_videos_csv = 'videos_with_labelling_df.csv'
videos_with_labelling_df = pd.read_csv(labelled_videos_csv)

In [5]:
# Preview the first five rows of the loaded frame
videos_with_labelling_df.head(n=5)

Unnamed: 0,channel_id,video_id,video_title,description,tags,published,view_count,like_count,favourite_count,comment_count,duration,definition,caption,category_id,prompt,classification
0,UC8butISFwT-Wl7EV0hUK0BQ,9He4UBLyk8Y,Front End Developer Roadmap 2024,Learn what technologies you should learn first...,,2023-10-19 14:18:42.000000,507722.0,17091.0,0,493.0,729,hd,False,27,Front End Developer Roadmap 2024,Career
1,UC8butISFwT-Wl7EV0hUK0BQ,ypNKKYUJE5o,JavaScript Security Vulnerabilities Tutorial ...,Learn about 10 security vulnerabilities every ...,,2023-05-16 14:37:07.000000,62016.0,2625.0,0,71.0,1505,hd,True,27,JavaScript Security Vulnerabilities Tutorial –...,Tutorial
2,UC8butISFwT-Wl7EV0hUK0BQ,D6Xj_W4leu8,Use ChatGPT to Build a RegEx Generator – OpenA...,Learn how to build a dashboard that generates ...,,2023-03-30 13:32:31.000000,102762.0,2133.0,0,82.0,1792,hd,True,27,Use ChatGPT to Build a RegEx Generator – OpenA...,Tutorial
3,UC8butISFwT-Wl7EV0hUK0BQ,xZbU6bCZFYo,freeCodeCamp.org Curriculum Expansion: Math + ...,Support our campaign here: https://www.freecod...,,2021-02-02 19:00:57.000000,87027.0,3478.0,0,197.0,1677,hd,True,27,freeCodeCamp.org Curriculum Expansion: Math + ...,News
4,UC8butISFwT-Wl7EV0hUK0BQ,flpmSXVTqBI,Java Testing - JUnit 5 Crash Course,JUnit 5 is one of the most popular frameworks ...,,2021-01-12 15:59:45.000000,309188.0,5393.0,0,97.0,1565,hd,False,27,Java Testing - JUnit 5 Crash Course,Tutorial


# Data Analysis and Feature Exploration

In [7]:
# Keep only the videos that carry a classification label; rows where it is missing are removed.
# The surviving rows keep their original index labels (the index is not reset here).
videos_with_labelling_df = videos_with_labelling_df.dropna(subset=['classification'])

In [9]:
# Preview the first five rows that remain after removing unlabelled videos
videos_with_labelling_df.head(n=5)

Unnamed: 0,channel_id,video_id,video_title,description,tags,published,view_count,like_count,favourite_count,comment_count,duration,definition,caption,category_id,prompt,classification
0,UC8butISFwT-Wl7EV0hUK0BQ,9He4UBLyk8Y,Front End Developer Roadmap 2024,Learn what technologies you should learn first...,,2023-10-19 14:18:42.000000,507722.0,17091.0,0,493.0,729,hd,False,27,Front End Developer Roadmap 2024,Career
1,UC8butISFwT-Wl7EV0hUK0BQ,ypNKKYUJE5o,JavaScript Security Vulnerabilities Tutorial ...,Learn about 10 security vulnerabilities every ...,,2023-05-16 14:37:07.000000,62016.0,2625.0,0,71.0,1505,hd,True,27,JavaScript Security Vulnerabilities Tutorial –...,Tutorial
2,UC8butISFwT-Wl7EV0hUK0BQ,D6Xj_W4leu8,Use ChatGPT to Build a RegEx Generator – OpenA...,Learn how to build a dashboard that generates ...,,2023-03-30 13:32:31.000000,102762.0,2133.0,0,82.0,1792,hd,True,27,Use ChatGPT to Build a RegEx Generator – OpenA...,Tutorial
3,UC8butISFwT-Wl7EV0hUK0BQ,xZbU6bCZFYo,freeCodeCamp.org Curriculum Expansion: Math + ...,Support our campaign here: https://www.freecod...,,2021-02-02 19:00:57.000000,87027.0,3478.0,0,197.0,1677,hd,True,27,freeCodeCamp.org Curriculum Expansion: Math + ...,News
4,UC8butISFwT-Wl7EV0hUK0BQ,flpmSXVTqBI,Java Testing - JUnit 5 Crash Course,JUnit 5 is one of the most popular frameworks ...,,2021-01-12 15:59:45.000000,309188.0,5393.0,0,97.0,1565,hd,False,27,Java Testing - JUnit 5 Crash Course,Tutorial


In [10]:
# Tally how many videos fall under each classification label, most frequent first
classification_labels = videos_with_labelling_df['classification']
classification_labels.value_counts()

classification
Tutorial     1629
Career        458
Project       225
Tips          223
Challenge     123
Review        118
News          108
Interview     105
Lecture         7
Debate          4
Name: count, dtype: int64

## TF-IDF vectorization

TF-IDF stands for Term Frequency-Inverse Document Frequency. It is a numerical statistic used in information retrieval and text mining to evaluate the importance of a word in a document relative to a collection of documents (corpus). TF-IDF is commonly used for text feature extraction in machine learning and natural language processing tasks.

Here's a breakdown of TF-IDF:

1. **Term Frequency (TF)**: It measures how frequently a term (word) occurs in a document. It is calculated as the ratio of the number of times a term appears in a document to the total number of terms in the document. The idea is that words that occur more frequently within a document are more important for describing the content of that document.

   $$ \text{TF}(t, d) = \frac{\text{Number of times term } t \text{ appears in document } d}{\text{Total number of terms in document } d} $$

2. **Inverse Document Frequency (IDF)**: It measures the importance of a term across a collection of documents (corpus). It is calculated as the logarithm of the ratio of the total number of documents to the number of documents containing the term. The IDF value decreases as the term appears in more documents, indicating that common terms are less informative than rare terms.

   $$ \text{IDF}(t, D) = \log\left(\frac{\text{Total number of documents in corpus } |D|}{\text{Number of documents containing term } t}\right) $$

3. **TF-IDF**: It combines the TF and IDF values to calculate a weighted score for each term in a document. The TF-IDF score increases with the frequency of the term in the document (higher TF) and decreases with the number of documents in the corpus that contain the term (lower IDF).

   $$ \text{TF-IDF}(t, d, D) = \text{TF}(t, d) \times \text{IDF}(t, D) $$

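In essence, TF-IDF identifies words that are frequent in a specific document but rare across the rest of the collection, so they are the words that best distinguish that document from the others. It's commonly used for tasks like document classification, information retrieval, and text mining.

Note that scikit-learn's `TfidfVectorizer`, used below, implements a variant of these formulas: by default it uses raw term counts for TF, a smoothed IDF, $\ln\left(\frac{1 + |D|}{1 + \text{df}(t)}\right) + 1$ (where $\text{df}(t)$ is the number of documents containing term $t$), and L2-normalizes each document vector, so the resulting scores will not match the textbook formulas exactly.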


In [32]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# The titles are the text being vectorised, so rows with a missing title are removed first
videos_with_labelling_df = videos_with_labelling_df[videos_with_labelling_df['video_title'].notna()]

# Vectoriser with scikit-learn's default settings.
# NOTE(review): the defaults do not remove stop words, so very common words
# (e.g. "in", "to", "for") stay in the vocabulary — consider stop_words='english'
# if they should not count as "important" terms.
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary from the titles and compute the document-term matrix in one call
video_titles = videos_with_labelling_df['video_title']
tfidf_matrix = tfidf_vectorizer.fit_transform(video_titles)

# Expand the matrix into a dense frame: one row per title, one column per vocabulary term.
# No index is passed, so tfidf_df gets a fresh 0..n-1 index rather than the row labels
# of videos_with_labelling_df; pair the two frames by position, not by index.
vocabulary_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=vocabulary_terms)

# Preview the first five rows of the TF-IDF frame
tfidf_df.head(n=5)

Unnamed: 0,000,01,02,026,03,04,05,06,07,08,...,zero,zhou,zip,zod,zone,zuckerberg,करत,ᵐᵒˢᵗˡʸ,𝐂𝐎𝐃𝐄,𝐓𝐇𝐎𝐍
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Column-wise totals: one aggregate TF-IDF score per vocabulary term over all titles
tfidf_sum = tfidf_df.sum(axis=0)

# Rank the terms from the highest to the lowest aggregate score
most_important_words = tfidf_sum.sort_values(ascending=False)

# Show the 20 top-ranked terms
most_important_words.iloc[:20]


in            113.614793
data          113.386482
to            105.033789
python        103.521670
tutorial       88.412563
how            79.583534
for            72.793060
and            70.296413
the            63.687306
javascript     61.449447
with           58.369572
science        54.250774
is             53.561014
learn          48.970938
what           47.112559
of             46.376878
learning       44.818593
analyst        41.672244
you            41.642363
minutes        40.040516
dtype: float64

In [37]:
# Sum the TF-IDF scores of every term within each classification.
# tfidf_df was built from a bare array, so it carries a fresh 0..n-1 index, while
# videos_with_labelling_df keeps its original row labels after rows were filtered out.
# Grouping by the Series itself would align the labels on index and could attach the
# wrong classification to a row (or silently drop it). Passing the raw values instead
# pairs each TF-IDF row with its label by position.
classification_labels = videos_with_labelling_df['classification'].to_numpy()
assert len(classification_labels) == len(tfidf_df), "labels and TF-IDF rows must pair one-to-one"
tfidf_sum_by_category = tfidf_df.groupby(classification_labels).sum()

# Display the top 10 most important words for each category
for category, scores in tfidf_sum_by_category.iterrows():
    print(f"Classification: {category}")
    top_10_words = scores.nlargest(10)  # The 10 terms with the highest summed TF-IDF score in this category
    print(top_10_words)
    print()


Classification: Career
data         44.813518
to           31.295703
analyst      29.717354
how          24.190321
science      20.192632
become       17.782906
in           17.721425
scientist    16.833936
job          16.104703
the          16.064669
Name: Career, dtype: float64

Classification: Challenge
you           7.158587
this          5.098004
the           4.509462
will          4.088448
programmer    4.083853
are           3.933996
daily         3.781269
tried         3.702819
with          3.128325
css           3.027359
Name: Challenge, dtype: float64

Classification: Debate
neuralseek    0.573375
technology    0.565979
is            0.542158
vs            0.516848
underrated    0.506866
skill         0.488427
the           0.463013
most          0.408131
business      0.404929
threat        0.402841
Name: Debate, dtype: float64

Classification: Interview
interview                 5.755735
on                        5.090825
questions                 3.727423
prof          