# Set Up Environment

## Import Libraries

In [5]:
import pandas as pd
import os
from googleapiclient.discovery import build
import isodate
from sqlalchemy import create_engine,text
import plotly.express as px

In [83]:
db_string = 'sqlite:///../db/youtube.db'

# Create a engine
engine = create_engine(db_string)
# Create connection
conn = engine.connect()

## Define Functions

In [10]:
import pandas as pd

def filter_videos_by_duration(df, min_duration=60, max_duration=1800):
    """
    Filter videos DataFrame by duration within a specified range.
    
    Parameters:
        df (DataFrame): The DataFrame containing video data.
        min_duration (int): Minimum duration in seconds. Default is 60.
        max_duration (int): Maximum duration in seconds. Default is 1800.
        
    Returns:
        DataFrame: Filtered DataFrame containing videos with duration within the specified range.
    """
    return df[(df['duration'] >= min_duration) & (df['duration'] <= max_duration)]

# Import Data

In [11]:
# Import channels data
query = text('SELECT * FROM channel')
channels_df = pd.read_sql_query(query, conn)
channels_df.head()

Unnamed: 0,channel_id,channel_name,description,subscriber_count,view_count,video_count,playlist_id,start_date,country
0,UC8butISFwT-Wl7EV0hUK0BQ,freeCodeCamp.org,Learn to code for free.,9040000,661570512,1585,UU8butISFwT-Wl7EV0hUK0BQ,2014-12-16,US
1,UCEBpSZhI1X8WaP-kY_2LLcg,365 Data Science,At 365 Data Science we make #DataScience acces...,307000,13782817,224,UUEBpSZhI1X8WaP-kY_2LLcg,2017-08-07,BG
2,UCHXa4OpASJEwrHrLeIzw7Yg,Nicholas Renotte,"Sup!\n\nWelcome to the channel. So, if you're ...",235000,15263747,304,UUHXa4OpASJEwrHrLeIzw7Yg,2019-01-26,AU
3,UCDybamfye5An6p-j1t2YMsg,Mo Chen,"👋 Hey there, my name is Mo Chen and I work as ...",82100,2658864,88,UUDybamfye5An6p-j1t2YMsg,2022-12-25,GB
4,UCkRFwipiIqBTakN-mkZ-GcQ,Ayush Singh,,70200,2075280,24,UUkRFwipiIqBTakN-mkZ-GcQ,2022-06-25,IN


In [12]:
# Import videos data
query = text('SELECT * FROM video')
videos_df = pd.read_sql_query(query, conn)
videos_df.head()

Unnamed: 0,channel_id,video_id,video_title,description,tags,published,view_count,like_count,favourite_count,comment_count,duration,definition,caption,category_id
0,UC8butISFwT-Wl7EV0hUK0BQ,YdWkUdMxMvM,Career Change to Code - The Complete Guide,This course is for those considering transitio...,,2024-02-07 15:49:07.000000,3252.0,374.0,0,27.0,12191,hd,False,27
1,UC8butISFwT-Wl7EV0hUK0BQ,5rNk7m_zlAg,Spring Boot & Spring Data JPA – Complete Course,Learn how to use Spring Boot and Spring Data J...,,2024-02-06 15:25:40.000000,24118.0,1434.0,0,223.0,45737,hd,False,27
2,UC8butISFwT-Wl7EV0hUK0BQ,5ZdHfJVAY-s,Build 25 React Projects – Tutorial,Master React by building 25 different projects...,,2024-02-05 15:30:28.000000,50388.0,2988.0,0,103.0,34614,hd,False,27
3,UC8butISFwT-Wl7EV0hUK0BQ,OwjKN9_NqPI,Oh My Zsh Creator Robby Russell – freeCodeCamp...,"In this week's episode of the podcast, freeCod...",,2024-02-02 15:26:29.000000,14435.0,284.0,0,22.0,7673,hd,False,27
4,UC8butISFwT-Wl7EV0hUK0BQ,e2nkq3h1P68,Learn Accessibility - Full a11y Tutorial,Learn how to write accessible HTML by solving ...,,2024-02-01 15:38:37.000000,23669.0,968.0,0,17.0,5586,hd,False,27


# Data Pre-Processing
## Remove shorts and longs

In [13]:
filtered_videos_df = filter_videos_by_duration(videos_df)

# Inspect Data

In [14]:
filtered_videos_df.head()

Unnamed: 0,channel_id,video_id,video_title,description,tags,published,view_count,like_count,favourite_count,comment_count,duration,definition,caption,category_id
57,UC8butISFwT-Wl7EV0hUK0BQ,9He4UBLyk8Y,Front End Developer Roadmap 2024,Learn what technologies you should learn first...,,2023-10-19 14:18:42.000000,507722.0,17091.0,0,493.0,729,hd,False,27
134,UC8butISFwT-Wl7EV0hUK0BQ,ypNKKYUJE5o,JavaScript Security Vulnerabilities Tutorial ...,Learn about 10 security vulnerabilities every ...,,2023-05-16 14:37:07.000000,62016.0,2625.0,0,71.0,1505,hd,True,27
158,UC8butISFwT-Wl7EV0hUK0BQ,D6Xj_W4leu8,Use ChatGPT to Build a RegEx Generator – OpenA...,Learn how to build a dashboard that generates ...,,2023-03-30 13:32:31.000000,102762.0,2133.0,0,82.0,1792,hd,True,27
489,UC8butISFwT-Wl7EV0hUK0BQ,xZbU6bCZFYo,freeCodeCamp.org Curriculum Expansion: Math + ...,Support our campaign here: https://www.freecod...,,2021-02-02 19:00:57.000000,87027.0,3478.0,0,197.0,1677,hd,True,27
496,UC8butISFwT-Wl7EV0hUK0BQ,flpmSXVTqBI,Java Testing - JUnit 5 Crash Course,JUnit 5 is one of the most popular frameworks ...,,2021-01-12 15:59:45.000000,309188.0,5393.0,0,97.0,1565,hd,False,27


In [15]:
filtered_videos_df['category_id'].value_counts()

category_id
27    19873
28     6952
26     1758
22     1637
24      497
1       185
17       74
20       48
19       18
23       12
25       11
2         8
15        3
10        2
Name: count, dtype: int64

# Labelling dataset

In [51]:
prompt_str = """You are a classifier, and I want you to classify each of the following data science youtube video titles based on the type of video. The options for video type are: Tutorial
Project
Interview
Review
News
Tips
Career
Challenge

The output should be in a table format with the columns being the index, the prompt and the classification for that prompt.

The video titles are as below:

"""

In [59]:
videos_for_labelling_df = filtered_videos_df.copy().reset_index()
videos_for_labelling_df.to_csv("videos_for_labelling_df.csv")

In [79]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

#print(videos_for_labelling_df['video_title'].head(300).to_string(index=False, header=False))
print((prompt_str + videos_for_labelling_df['video_title'].iloc[2900:3000].to_string(header=False)).replace('|', ''))

You are a classifier, and I want you to classify each of the following data science youtube video titles based on the type of video. The options for video type are: Tutorial
Project
Interview
Review
News
Tips
Career
Challenge

The output should be in a table format with the columns being the index, the prompt and the classification for that prompt.

The video titles are as below:

2900                                                     Riddhi Jain Pitliya on virtual agents and memes #ai
2901                             Mahault Albarracin on intelligence #artificialintelligence #activeinference
2902                                                            Prof. KARL FRISTON on upcoming WOLFRAM show!
2903                                                      Prof. Karl Friston on Prof. Andy Clark's new book!
2904                         Professor Noam Chomsky on Wittgenstein's private language argument #linguistics
2905           Luciano Floridi on the ramifications of working in AI #m

In [80]:
# Load the classified_titles.csv into a DataFrame
classified_df = pd.read_csv('classified_titles.csv')

# Assuming videos_for_labelling_df is already defined as a DataFrame

# Left join the two DataFrames on the index
videos_with_labelling_df = videos_for_labelling_df.merge(classified_df, how='left', left_index=True, right_index=True)

In [82]:
videos_with_labelling_df.head()

Unnamed: 0,index,channel_id,video_id,video_title,description,tags,published,view_count,like_count,favourite_count,comment_count,duration,definition,caption,category_id,Index,Prompt,Classification
0,57,UC8butISFwT-Wl7EV0hUK0BQ,9He4UBLyk8Y,Front End Developer Roadmap 2024,Learn what technologies you should learn first to become a front end web developer.\nWatch the full learning front end learning path: https://www.youtube.com/playlist?list=PLWKjhJtqVAbmMuZ3saqRIBimAKIMYkt0E\n\n✏️ This video was developed by @beau \n\n🎉 Thanks to our Champion and Sponsor supporters:\n👾 davthecoder\n👾 jedi-or-sith\n👾 南宮千影\n👾 Agustín Kussrow\n👾 Nattira Maneerat\n👾 Heather Wcislo\n👾 Serhiy Kalinets\n👾 Justin Hual\n👾 Otis Morgan \n👾 Oscar Rahnama\n\n--\n\nLearn to code for free and get a developer job: https://www.freecodecamp.org\n\nRead hundreds of articles on programming: https://freecodecamp.org/news,,2023-10-19 14:18:42.000000,507722.0,17091.0,0,493.0,729,hd,False,27,0.0,Front End Developer Roadmap 2024,Career
1,134,UC8butISFwT-Wl7EV0hUK0BQ,ypNKKYUJE5o,JavaScript Security Vulnerabilities Tutorial – With Code Examples,Learn about 10 security vulnerabilities every JavaScript developer should know. First try to find the vulnerabilities in the different code examples. Then learn how to fix the issues.\n\n✏️ Brandon from Semgrep developed this course.\n\n🔗 Learn more about Semgrep: https://go.semgrep.dev/scan-in-1-minute-for-free\n🏗 Semgrep provided a grant to make this course possible.\n\n\n\n🎉 Thanks to our Champion and Sponsor supporters:\n👾 davthecoder\n👾 jedi-or-sith\n👾 南宮千影\n👾 Agustín Kussrow\n👾 Nattira Maneerat\n👾 Heather Wcislo\n👾 Serhiy Kalinets\n👾 Justin Hual\n👾 Otis Morgan\n\n--\n\nLearn to code for free and get a developer job: https://www.freecodecamp.org\n\nRead hundreds of articles on programming: https://freecodecamp.org/news,,2023-05-16 14:37:07.000000,62016.0,2625.0,0,71.0,1505,hd,True,27,1.0,JavaScript Security Vulnerabilities Tutorial – With Code Examples,Tutorial
2,158,UC8butISFwT-Wl7EV0hUK0BQ,D6Xj_W4leu8,Use ChatGPT to Build a RegEx Generator – OpenAI API Low Code Course,"Learn how to build a dashboard that generates RegEx using the OpenAI API and a low code tool called Retool. The dashboard allows users to input a string, write a description of the desired RegEx, and generate the code to transform the string accordingly. \n\n💻 You can find the iframe code here: https://github.com/kubowania/regex-openai\n\n✏️ This course was created by @AniaKubow .\n\n🏗 Retool provided a grant to make this course possible. Learn more about Retool: https://retool.com\n\n⭐️ Contents ⭐️\n(0:00) Introduction\n(2:36) Writing our own regex code\n(5:06) Creating the header element\n(8:11) Creating the text area\n(11:35) How to create a custom code editor\n(16:32) Creating the new button\n(19:08) Setting up the sidebar\n(20:58) Configuring the open api api\n(22:51) The body of the example example\n(24:58) Setting up the code\n\n🎉 Thanks to our Champion and Sponsor supporters:\n👾 Nattira Maneerat\n👾 Heather Wcislo\n👾 Serhiy Kalinets\n👾 Erdeniz Unvan\n👾 Justin Hual\n👾 Agustín Kussrow\n👾 Otis Morgan\n\n--\n\nLearn to code for free and get a developer job: https://www.freecodecamp.org\n\nRead hundreds of articles on programming: https://freecodecamp.org/news",,2023-03-30 13:32:31.000000,102762.0,2133.0,0,82.0,1792,hd,True,27,2.0,Use ChatGPT to Build a RegEx Generator – OpenAI API Low Code Course,Tutorial
3,489,UC8butISFwT-Wl7EV0hUK0BQ,xZbU6bCZFYo,freeCodeCamp.org Curriculum Expansion: Math + Machine Learning + Data Science,"Support our campaign here: https://www.freecodecamp.org/news/p/4476d664-eb83-47c9-8328-903a78865c8f#the-2021-data-science-curriculum-pledge-drive\n\nView the Curriculum Design Sheets (these are very much a work in progress): https://docs.google.com/spreadsheets/d/1KXEJvgnAziiePMp2Ovf_ssDqCFFWYuycXWBNp91k3qM/edit?usp=sharing\n\nIf you're a math / CS professor or Data Science practitioner, we'd appreciate your feedback / ideas for this curriculum: https://docs.google.com/forms/d/e/1FAIpQLScLiGgmYh2DxdTmPxS141J5knuOtcYN9ulOtBwsRbkuVwXRRQ/viewform?usp=sf_link\n\nFollow Quincy on Twitter: https://twitter.com/ossia\n\nFollow Eric on Twitter: https://twitter.com/erictleung\n\nFollow Tom on Twitter: https://twitter.com/moTness\n\nFollow Darrell Silver (who's 100%-matching all donations) on Twitter: https://twitter.com/darrellsilver",,2021-02-02 19:00:57.000000,87027.0,3478.0,0,197.0,1677,hd,True,27,3.0,freeCodeCamp.org Curriculum Expansion: Math + Machine Learning + Data Science,News
4,496,UC8butISFwT-Wl7EV0hUK0BQ,flpmSXVTqBI,Java Testing - JUnit 5 Crash Course,"JUnit 5 is one of the most popular frameworks for testing Java applications. In this crash course, you will learn about Junit 5 and how to use it to write unit tests for your Java programs.\n\n✏️ Course created by Programming Techie. Check out their channel: https://www.youtube.com/channel/UCD20RZV_WHQImisCW2QZwDw\n\n💻 Source Code for Starter Project: https://github.com/SaiUpadhyayula/contact-manager-starter\n💻 Source Code for Completed Project: https://github.com/SaiUpadhyayula/contact-manager\n\n🔗 Written Tutorial: https://programmingtechie.com/2020/12/26/junit-5-complete-tutorial/\n🔗 Maven Tutorial: https://www.youtube.com/watch?v=JhSBS2OpGdU\n\n⭐️ Course Contents ⭐️\n⌨️ (00:00) Introduction\n⌨️ (00:28) What is JUnit?\n⌨️ (01:09) JUnit Architecture\n⌨️ (02:19) First JUnit Test\n⌨️ (08:26) Testing Exceptions using assertThrows()\n⌨️ (10:10) Understanding JUnit Test Lifecycle\n⌨️ (13:42) Conditional Executions\n⌨️ (15:11) Assumptions\n⌨️ (16:58) Repeated Tests\n⌨️ (18:48) Parameterized Tests\n⌨️ (23:47) Nested Tests\n⌨️ (25:22) Disabled Tests\n\n--\n\nLearn to code for free and get a developer job: https://www.freecodecamp.org\n\nRead hundreds of articles on programming: https://freecodecamp.org/news",,2021-01-12 15:59:45.000000,309188.0,5393.0,0,97.0,1565,hd,False,27,4.0,Java Testing - JUnit 5 Crash Course,Tutorial


In [86]:
db_string = 'sqlite:///../db/youtube.db'

# Create a engine
engine = create_engine(db_string)

videos_with_labelling_df.to_sql(name='videos_with_labelling', con=engine, if_exists='replace', index=False)

OperationalError: (sqlite3.OperationalError) duplicate column name: Index
[SQL: 
CREATE TABLE videos_with_labelling (
	"index" BIGINT, 
	channel_id TEXT, 
	video_id TEXT, 
	video_title TEXT, 
	description TEXT, 
	tags TEXT, 
	published TEXT, 
	view_count FLOAT, 
	like_count FLOAT, 
	favourite_count BIGINT, 
	comment_count FLOAT, 
	duration BIGINT, 
	definition TEXT, 
	caption TEXT, 
	category_id BIGINT, 
	"Index" FLOAT, 
	"Prompt" TEXT, 
	"Classification" TEXT
)

]
(Background on this error at: https://sqlalche.me/e/20/e3q8)