# Set Up Environment

## Import Libraries

In [1]:
from sqlalchemy import create_engine,text

## Define Functions

In [2]:
import pandas as pd

def filter_videos_by_duration(df, min_duration=60, max_duration=1800):
    """
    Keep only the rows whose 'duration' lies in [min_duration, max_duration].

    Both bounds are inclusive. The input DataFrame is not modified; rows keep
    their original index labels.

    Parameters:
        df (DataFrame): Video data with a 'duration' column (in seconds).
        min_duration (int): Shortest duration to keep, in seconds. Default is 60.
        max_duration (int): Longest duration to keep, in seconds. Default is 1800.

    Returns:
        DataFrame: The subset of `df` whose duration is within the range.
    """
    # Series.between is inclusive on both ends by default.
    within_range = df['duration'].between(min_duration, max_duration)
    return df[within_range]

# Import Data

# Data Pre-Processing
## Remove shorts and longs

In [3]:
# Load the videos that are to be labelled from a local CSV file.
# NOTE(review): filter_videos_by_duration is defined above but is not applied
# in this cell — presumably the CSV was already duration-filtered; confirm.
videos_for_labelling_df = pd.read_csv("videos_for_labelling_df.csv")

# Labelling training dataset with ChatGPT

## Write a prompt to get 100 labels at a time

In [4]:
import pyperclip

# Position of the first title in this batch and the number of titles per batch.
# Only BATCH_START needs changing (e.g. 0, 100, 200, ...) to build the prompt
# for another batch.
BATCH_START = 2900
BATCH_SIZE = 100

# Write prompt string
prompt_str = """You are a classifier, and I want you to classify each of the following data science youtube video titles based on the type of video. The options for video type are: Tutorial
Project
Interview
Review
News
Tips
Career
Challenge

The output should be in a table format with the columns being the index, the prompt and the classification for that prompt.

The video titles are as below:

"""

# Select one batch of titles by position. to_string() renders one title per
# line prefixed with its DataFrame index, which is the "index" column the
# prompt asks ChatGPT to echo back.
batch_titles = videos_for_labelling_df['video_title'].iloc[BATCH_START:BATCH_START + BATCH_SIZE]

# Concatenate into the prompt and remove "|", as this confuses ChatGPT
# (presumably because the requested table output uses "|" as its column
# delimiter).
prompt = (prompt_str + batch_titles.to_string(header=False)).replace('|', '')

# Copy to clipboard so the prompt can be pasted into ChatGPT by hand
pyperclip.copy(prompt)

## Manually paste ChatGPT output into CSV
- This stage was done manually; the resulting CSV is read in below.

In [9]:
import pandas as pd

# Read the classifications that were pasted by hand from ChatGPT
classified_titles_df = pd.read_csv('classified_titles.csv')

# Columns to keep in the labelled dataset, in output order
selected_columns = [
    'channel_id', 'video_id', 'video_title', 'description', 'tags', 'published',
    'view_count', 'like_count', 'favourite_count', 'comment_count', 'duration',
    'definition', 'caption', 'category_id', 'Prompt', 'Classification',
]

# Attach the labels row-by-row on the index (left join keeps every video),
# keep only the selected columns, then lower-case the two label column names.
videos_with_labelling_df = (
    videos_for_labelling_df
    .merge(classified_titles_df, how='left', left_index=True, right_index=True)
    .loc[:, selected_columns]
    .rename(columns={'Prompt': 'prompt', 'Classification': 'classification'})
)

# Persist the labelled dataset without writing the index column
videos_with_labelling_df.to_csv('videos_with_labelling_df.csv', index=False)

In [10]:
# Preview the first rows of the labelled dataset to eyeball the attached labels.
videos_with_labelling_df.head()

Unnamed: 0,channel_id,video_id,video_title,description,tags,published,view_count,like_count,favourite_count,comment_count,duration,definition,caption,category_id,prompt,classification
0,UC8butISFwT-Wl7EV0hUK0BQ,9He4UBLyk8Y,Front End Developer Roadmap 2024,Learn what technologies you should learn first...,,2023-10-19 14:18:42.000000,507722.0,17091.0,0,493.0,729,hd,False,27,Front End Developer Roadmap 2024,Career
1,UC8butISFwT-Wl7EV0hUK0BQ,ypNKKYUJE5o,JavaScript Security Vulnerabilities Tutorial ...,Learn about 10 security vulnerabilities every ...,,2023-05-16 14:37:07.000000,62016.0,2625.0,0,71.0,1505,hd,True,27,JavaScript Security Vulnerabilities Tutorial –...,Tutorial
2,UC8butISFwT-Wl7EV0hUK0BQ,D6Xj_W4leu8,Use ChatGPT to Build a RegEx Generator – OpenA...,Learn how to build a dashboard that generates ...,,2023-03-30 13:32:31.000000,102762.0,2133.0,0,82.0,1792,hd,True,27,Use ChatGPT to Build a RegEx Generator – OpenA...,Tutorial
3,UC8butISFwT-Wl7EV0hUK0BQ,xZbU6bCZFYo,freeCodeCamp.org Curriculum Expansion: Math + ...,Support our campaign here: https://www.freecod...,,2021-02-02 19:00:57.000000,87027.0,3478.0,0,197.0,1677,hd,True,27,freeCodeCamp.org Curriculum Expansion: Math + ...,News
4,UC8butISFwT-Wl7EV0hUK0BQ,flpmSXVTqBI,Java Testing - JUnit 5 Crash Course,JUnit 5 is one of the most popular frameworks ...,,2021-01-12 15:59:45.000000,309188.0,5393.0,0,97.0,1565,hd,False,27,Java Testing - JUnit 5 Crash Course,Tutorial


# Check labels using Label Studio
https://towardsdatascience.com/bootstrapping-labels-with-gpt-4-8dc85ab5026d