In [261]:
import pandas as pd
from sqlalchemy import create_engine,text
import os
import json
import vertexai
from vertexai.generative_models import GenerativeModel
import time

### Vertex config

In [194]:
# Project id handed to vertexai.init; read from the environment (None when the variable is unset)
VERTEX_PROJECT_ID = os.environ.get("VERTEX_PROJECT_ID")
# Region handed to vertexai.init
vertex_region = "us-west4"

### Get all video data from database

In [269]:
# SQLAlchemy URL of the SQLite database file (relative path, resolved from the working directory)
db_string = 'sqlite:///../db/youtube.db'
# Create an engine for that database
engine = create_engine(db_string)
# Open a connection; it is reused by the query cell below
conn = engine.connect()

In [282]:
# Load every row and column of the `video` table into a DataFrame
query = text("SELECT * FROM video")
video_df = pd.read_sql_query(sql=query, con=conn)

### Remove shorts and very long format videos

In [271]:
def filter_videos_by_duration(df, min_duration=60, max_duration=1800):
    """
    Filter videos DataFrame by duration within a specified range.

    Both bounds are inclusive. Rows whose duration is missing (NaN) are dropped,
    because NaN fails both comparisons.

    Parameters:
        df (DataFrame): The DataFrame containing video data. Must have a 'duration' column.
        min_duration (int): Minimum duration in seconds. Default is 60.
        max_duration (int): Maximum duration in seconds. Default is 1800.

    Returns:
        DataFrame: A new, independent DataFrame containing the videos with duration
        within the specified range. The input DataFrame is not modified.
    """
    in_range = df['duration'].between(min_duration, max_duration)
    # Return an explicit copy so callers can add or overwrite columns on the result
    # without pandas raising SettingWithCopyWarning about a slice of `df`.
    return df[in_range].copy()

In [283]:
# Keep only videos within the default duration bounds of filter_videos_by_duration (60 to 1800 seconds)
videos_for_labelling_df = filter_videos_by_duration(video_df)

### Create training dataset

In [284]:
# Create training dataset
# Number of videos to label; defines the size of the training dataset
TRAINING_SET_SIZE = 200
# .copy() yields an independent frame, so the label columns written to it later
# go to this frame directly rather than to a slice of the filtered DataFrame
videos_for_labelling_df = videos_for_labelling_df.head(TRAINING_SET_SIZE).copy()

### Create Gemini prompt

In [274]:
# Show full cell contents when displaying DataFrames (no column-width truncation)
pd.options.display.max_colwidth = None

In [275]:
# Write prompt string
# The model's reply is parsed with json.loads, so the prompt asks for strict JSON and
# the embedded example must itself be valid JSON (no trailing comma after the last item).
# The backslash is written as '\\' because '\' followed by a quote inside a normal
# string literal is an escape sequence and would drop the backslash from the prompt.
prompt_str = """You are a classifier, and I want you to classify each of the following data science youtube video titles based on three overarching categories, video_type, video_topic, technologies

"video_type": the options for video type are: 

"Tutorial",
"Project",
"News",
"Tips",
"Challenge",
"Career Advice",
"Podcast/Interview"

"video_topic": the options for video topic are: 

"Statistics and Probability",
"Machine Learning / AI",
"Data Wrangling",
"Data Visualization",
"Data Mining",
"Software Engineering",
"Ethics and Privacy",
"Cloud Computing",
"Resume Building",
"Job Search Strategies",
"Interview Techniques",
"Career Development Paths",
"Balancing Work and Life",
"Business Acumen",

"technologies": also pull out any software/coding_language/packages as a list called 'technologies'. This should not include anything other than software / coding_language / packages though

From now on, return the output as a single valid JSON object (double-quoted keys and strings). Your entire response should be a single line. Do not wrap strings with '\\' There should be items within this for each video, for the classifications.

The response should look like (for example):

{
  "videos": [
    {
      "video_id": "YdWkUdMxMvM",
      "video_title": "Career Change to Code - The Complete Guide",
      "video_info": {
        "video_type": "Career Advice",
        "video_topic": "Software Engineering",
        "technologies": []
      }
    },
    {
      "video_id": "5rNk7m_zlAg",
      "video_title": "Spring Boot & Spring Data JPA – Complete Course",
      "video_info": {
        "video_type": "Tutorial",
        "video_topic": "Software Engineering",
        "technologies": ["Spring", "Spring Boot"]
      }
    }
  ]
}

There should be no trailing comma after the last item in the "videos" list!

The video ids and titles are below:

"""

### Function to generate response from Gemini

In [276]:
def generate_response(project_id: str, location: str, prompt: str, model_name: str = "gemini-1.0-pro") -> str:
    """
    Send a single text prompt to a Gemini model on Vertex AI and return its text reply.

    Parameters:
        project_id (str): Project passed to vertexai.init.
        location (str): Region passed to vertexai.init.
        prompt (str): Prompt text sent to the model.
        model_name (str): Name of the model to load. Default is "gemini-1.0-pro".

    Returns:
        str: The `text` attribute of the model's response.
    """
    # Initialize Vertex AI for this project/region
    vertexai.init(project=project_id, location=location)
    # Load the requested model
    model = GenerativeModel(model_name)
    # Query the model; the prompt is sent as a single-item content list
    response = model.generate_content([prompt])
    # NOTE(review): accessing .text presumably raises when the response has no usable
    # candidate (e.g. blocked output) - confirm against the Vertex AI SDK docs
    return response.text

### Generate responses

In [287]:
# Label the training set chunk by chunk: build one prompt per chunk, ask the model to
# classify the titles, and write the returned labels back onto the DataFrame.

# Work on an independent copy so the .loc assignments below write to this frame
# without pandas flagging it as a slice of an earlier DataFrame
videos_for_labelling_df = videos_for_labelling_df.copy()

# Get total number of videos
total_videos = len(videos_for_labelling_df)

# Define chunk size (number of videos sent to the model per request)
chunk_size = 50

# Pause between consecutive requests, in seconds
delay_seconds = 10

# Raw response text of every chunk, kept so a malformed reply can be inspected afterwards
responses = []

# Running count of rows sent to the model so far
chunk_count = 0

# Loop through the DataFrame in chunks of `chunk_size` rows
for chunk_start in range(0, total_videos, chunk_size):
    # Get chunk of videos
    videos_chunk = videos_for_labelling_df.iloc[chunk_start:chunk_start + chunk_size]

    # Build the prompt: shared instructions followed by one line per video
    prompt = prompt_str + ''.join(
        f'\nvideo_id : {video_id}, video_title: {video_title}'
        for video_id, video_title in zip(videos_chunk['video_id'], videos_chunk['video_title'])
    )

    # Generate response for the chunk
    response_text = generate_response(VERTEX_PROJECT_ID, vertex_region, prompt)
    responses.append(response_text)

    # json.loads raises JSONDecodeError if the reply is not valid JSON; the raw text
    # is already stored in `responses` for debugging
    video_info_dict = json.loads(response_text)

    print("Got response, starting processing")

    # Map video_type, video_topic, and technologies to the existing df
    for video in video_info_dict['videos']:
        video_info = video['video_info']

        # The model may omit the list, return null, or return a bare string
        technology_items = video_info.get('technologies') or []
        if isinstance(technology_items, str):
            technology_items = [technology_items]
        technologies = ', '.join(map(str, technology_items))

        # Compute the row mask once and reuse it for all three columns
        row_mask = videos_for_labelling_df['video_id'] == video['video_id']
        videos_for_labelling_df.loc[row_mask, 'video_type'] = video_info['video_type']
        videos_for_labelling_df.loc[row_mask, 'video_topic'] = video_info['video_topic']
        videos_for_labelling_df.loc[row_mask, 'technologies'] = technologies

    # Count the rows actually in this chunk so a short final chunk is not over-counted
    chunk_count += len(videos_chunk)

    print(f'Processed {chunk_count} rows of {total_videos}...')

    # Add a delay before processing the next chunk; nothing to wait for after the last one
    if chunk_count < total_videos:
        time.sleep(delay_seconds)

print('Completed processing')

Got response, starting processing
Processed 50 rows of 200...
Got response, starting processing
Processed 100 rows of 200...
Got response, starting processing
Processed 150 rows of 200...
Got response, starting processing
Processed 200 rows of 200...
Completed processing


In [288]:
# Preview the first 10 rows, including the video_type / video_topic / technologies columns added above
videos_for_labelling_df.head(10)

Unnamed: 0,channel_id,video_id,video_title,description,tags,published,view_count,like_count,favourite_count,comment_count,duration,definition,caption,category_id,video_type,video_topic,technologies
57,UC8butISFwT-Wl7EV0hUK0BQ,9He4UBLyk8Y,Front End Developer Roadmap 2024,Learn what technologies you should learn first to become a front end web developer.\nWatch the full learning front end learning path: https://www.youtube.com/playlist?list=PLWKjhJtqVAbmMuZ3saqRIBimAKIMYkt0E\n\n‚úèÔ∏è This video was developed by @beau \n\nüéâ Thanks to our Champion and Sponsor supporters:\nüëæ davthecoder\nüëæ jedi-or-sith\nüëæ ÂçóÂÆÆÂçÉÂΩ±\nüëæ Agust√≠n Kussrow\nüëæ Nattira Maneerat\nüëæ Heather Wcislo\nüëæ Serhiy Kalinets\nüëæ Justin Hual\nüëæ Otis Morgan \nüëæ Oscar Rahnama\n\n--\n\nLearn to code for free and get a developer job: https://www.freecodecamp.org\n\nRead hundreds of articles on programming: https://freecodecamp.org/news,,2023-10-19 14:18:42.000000,507722.0,17091.0,0,493.0,729,hd,False,27,Tips,Software Engineering,
134,UC8butISFwT-Wl7EV0hUK0BQ,ypNKKYUJE5o,JavaScript Security Vulnerabilities Tutorial ‚Äì With Code Examples,Learn about 10 security vulnerabilities every JavaScript developer should know. First try to find the vulnerabilities in the different code examples. Then learn how to fix the issues.\n\n‚úèÔ∏è Brandon from Semgrep developed this course.\n\nüîó Learn more about Semgrep: https://go.semgrep.dev/scan-in-1-minute-for-free\nüèó Semgrep provided a grant to make this course possible.\n\n\n\nüéâ Thanks to our Champion and Sponsor supporters:\nüëæ davthecoder\nüëæ jedi-or-sith\nüëæ ÂçóÂÆÆÂçÉÂΩ±\nüëæ Agust√≠n Kussrow\nüëæ Nattira Maneerat\nüëæ Heather Wcislo\nüëæ Serhiy Kalinets\nüëæ Justin Hual\nüëæ Otis Morgan\n\n--\n\nLearn to code for free and get a developer job: https://www.freecodecamp.org\n\nRead hundreds of articles on programming: https://freecodecamp.org/news,,2023-05-16 14:37:07.000000,62016.0,2625.0,0,71.0,1505,hd,True,27,Tutorial,Software Engineering,JavaScript
158,UC8butISFwT-Wl7EV0hUK0BQ,D6Xj_W4leu8,Use ChatGPT to Build a RegEx Generator ‚Äì OpenAI API Low Code Course,"Learn how to build a dashboard that generates RegEx using the OpenAI API and a low code tool called Retool. The dashboard allows users to input a string, write a description of the desired RegEx, and generate the code to transform the string accordingly. \n\nüíª You can find the iframe code here: https://github.com/kubowania/regex-openai\n\n‚úèÔ∏è This course was created by @AniaKubow .\n\nüèó Retool provided a grant to make this course possible. Learn more about Retool: https://retool.com\n\n‚≠êÔ∏è Contents ‚≠êÔ∏è\n(0:00) Introduction\n(2:36) Writing our own regex code\n(5:06) Creating the header element\n(8:11) Creating the text area\n(11:35) How to create a custom code editor\n(16:32) Creating the new button\n(19:08) Setting up the sidebar\n(20:58) Configuring the open api api\n(22:51) The body of the example example\n(24:58) Setting up the code\n\nüéâ Thanks to our Champion and Sponsor supporters:\nüëæ Nattira Maneerat\nüëæ Heather Wcislo\nüëæ Serhiy Kalinets\nüëæ Erdeniz Unvan\nüëæ Justin Hual\nüëæ Agust√≠n Kussrow\nüëæ Otis Morgan\n\n--\n\nLearn to code for free and get a developer job: https://www.freecodecamp.org\n\nRead hundreds of articles on programming: https://freecodecamp.org/news",,2023-03-30 13:32:31.000000,102762.0,2133.0,0,82.0,1792,hd,True,27,Tutorial,Software Engineering,
489,UC8butISFwT-Wl7EV0hUK0BQ,xZbU6bCZFYo,freeCodeCamp.org Curriculum Expansion: Math + Machine Learning + Data Science,"Support our campaign here: https://www.freecodecamp.org/news/p/4476d664-eb83-47c9-8328-903a78865c8f#the-2021-data-science-curriculum-pledge-drive\n\nView the Curriculum Design Sheets (these are very much a work in progress): https://docs.google.com/spreadsheets/d/1KXEJvgnAziiePMp2Ovf_ssDqCFFWYuycXWBNp91k3qM/edit?usp=sharing\n\nIf you're a math / CS professor or Data Science practitioner, we'd appreciate your feedback / ideas for this curriculum: https://docs.google.com/forms/d/e/1FAIpQLScLiGgmYh2DxdTmPxS141J5knuOtcYN9ulOtBwsRbkuVwXRRQ/viewform?usp=sf_link\n\nFollow Quincy on Twitter: https://twitter.com/ossia\n\nFollow Eric on Twitter: https://twitter.com/erictleung\n\nFollow Tom on Twitter: https://twitter.com/moTness\n\nFollow Darrell Silver (who's 100%-matching all donations) on Twitter: https://twitter.com/darrellsilver",,2021-02-02 19:00:57.000000,87027.0,3478.0,0,197.0,1677,hd,True,27,News,Machine Learning / AI,
496,UC8butISFwT-Wl7EV0hUK0BQ,flpmSXVTqBI,Java Testing - JUnit 5 Crash Course,"JUnit 5 is one of the most popular frameworks for testing Java applications. In this crash course, you will learn about Junit 5 and how to use it to write unit tests for your Java programs.\n\n‚úèÔ∏è Course created by Programming Techie. Check out their channel: https://www.youtube.com/channel/UCD20RZV_WHQImisCW2QZwDw\n\nüíª Source Code for Starter Project: https://github.com/SaiUpadhyayula/contact-manager-starter\nüíª Source Code for Completed Project: https://github.com/SaiUpadhyayula/contact-manager\n\nüîó Written Tutorial: https://programmingtechie.com/2020/12/26/junit-5-complete-tutorial/\nüîó Maven Tutorial: https://www.youtube.com/watch?v=JhSBS2OpGdU\n\n‚≠êÔ∏è Course Contents ‚≠êÔ∏è\n‚å®Ô∏è (00:00) Introduction\n‚å®Ô∏è (00:28) What is JUnit?\n‚å®Ô∏è (01:09) JUnit Architecture\n‚å®Ô∏è (02:19) First JUnit Test\n‚å®Ô∏è (08:26) Testing Exceptions using assertThrows()\n‚å®Ô∏è (10:10) Understanding JUnit Test Lifecycle\n‚å®Ô∏è (13:42) Conditional Executions\n‚å®Ô∏è (15:11) Assumptions\n‚å®Ô∏è (16:58) Repeated Tests\n‚å®Ô∏è (18:48) Parameterized Tests\n‚å®Ô∏è (23:47) Nested Tests\n‚å®Ô∏è (25:22) Disabled Tests\n\n--\n\nLearn to code for free and get a developer job: https://www.freecodecamp.org\n\nRead hundreds of articles on programming: https://freecodecamp.org/news",,2021-01-12 15:59:45.000000,309188.0,5393.0,0,97.0,1565,hd,False,27,Tutorial,Software Engineering,"Java, JUnit 5"
572,UC8butISFwT-Wl7EV0hUK0BQ,pN92rnO_n5U,How to Design a Website ‚Äì A UX Wireframe Tutorial,"Learn how to use wireframing to help design a website. A good wireframe can provide the vision for the entire layout and functionality of a website. It can serve as the first stage of a design, give an idea of the overall structure pages will take, and how navigation will flow. \n\nüé• Video from Adrian Twarog. Check out his YouTube channel: https://www.youtube.com/channel/UCvM5YYWwfLwpcQgbRr68JLQ\n\n--\n\nLearn to code for free and get a developer job: https://www.freecodecamp.org\n\nRead hundreds of articles on programming: https://freecodecamp.org/news",,2020-09-03 16:29:24.000000,286924.0,9913.0,0,249.0,1800,hd,True,27,Tutorial,Software Engineering,
787,UC8butISFwT-Wl7EV0hUK0BQ,TPMlZxRRaBQ,Tableau for Data Science and Data Visualization - Crash Course Tutorial,"Learn to use Tableau to produce high quality, interactive data visualizations!\n\nTableau can help you see and understand your data. Connect to almost any database, drag and drop to create visualizations, and share with a click.\n\nüîóTableau Public: https://public.tableau.com/en-us/s/\n\nüîóKaggle dataset: https://www.kaggle.com/c/titanic/data\n\nFind more data science information: https://www.velocityanalytics.io/\n\nTutorial from Velocity Consulting. Check out their YouTube channel: https://www.youtube.com/channel/UCjWUocSV-slQnC64nQ1vVhQ\n\n--\n\nLearn to code for free and get a developer job: https://www.freecodecamp.org\n\nRead hundreds of articles on programming: https://medium.freecodecamp.org","tableau training for beginners, tableau tutorial, tableau dashboard, business intelligence tools, tableau training, data visualization tools, tableau desktop, tableau excel, tableau certification, tableau online, what is tableau, data vizualisation software, tableau reporting, business intelligence and analytics, tableau dashboard example, tableau example, tableau vizualisation, tableau course",2019-01-29 17:38:12.000000,814773.0,12112.0,0,215.0,1722,hd,False,27,Tutorial,Data Visualization,Tableau
798,UC8butISFwT-Wl7EV0hUK0BQ,T6e1GnlW8-g,React Native Web Setup (P7D1) - Live Coding with Jesse,Project 7 Day 1: Today we will start a project with React Native Web\n\nSee a professional software engineer at work. Unscripted. Mistakes included.\n\nReact: https://facebook.github.io/react/\n\nGitter: https://gitter.im/LiveCodingwithJesseFreeCodeCamp/\n\nTwitter: https://twitter.com/JesseRWeigel\nYoutube: https://www.youtube.com/c/JesseWeigel29 \nInstagram: https://www.instagram.com/jesse.weigel/\n\nCode Editor: Visual Studio Code\nVS Code Theme: Seti\n\n-\n\nLearn to code for free and get a developer job: https://www.freecodecamp.org\n\nRead hundreds of articles on programming: https://medium.freecodecamp.org,"javascript, react, react native, coding",2019-01-18 14:49:15.000000,46272.0,394.0,0,23.0,1447,hd,False,27,Project,Software Engineering,React Native
799,UC8butISFwT-Wl7EV0hUK0BQ,TbMKwl11itQ,Create a Keylogger with Python - Tutorial,Learn how to create a keylogger for windows using Python. Get user keypresses and store them in a text file. This keyboard logging tutorial uses the pynput module.\n\nTutorial from Tech With Tim.\nCheck out Tim's channel: https://www.youtube.com/channel/UC4JX40jDee_tINbkjycV4Sg\n\n--\n\nLearn to code for free and get a developer job: https://www.freecodecamp.org\n\nRead hundreds of articles on programming: https://medium.freecodecamp.org,"keylogger in python, keylogger tutorial, how to make a simple python keylogger, python keylogger for windows, python keylogger, python keylogger tutorial, how to make a python keylogger, pynput, keyboard logger tutorial, how to make a keyboard logger, tech with tim, tech with tim python",2019-01-17 14:46:47.000000,393596.0,8793.0,0,536.0,698,hd,False,27,Tutorial,Software Engineering,Python
806,UC8butISFwT-Wl7EV0hUK0BQ,msrnbh66OhM,Web Developer Portfolio Review and Tips - Nisar,Learn tips to improve your portfolio while watching Benjamin Spak review the portfolio and social media presence for a web developer named Nisar.\n\nüé• Benjamin's YouTube channel: https://www.youtube.com/channel/UCQUNME_uamXEW1c58iEADWw\n\n‚ö†Ô∏è Want Benjamin to Review Your Resume or Portfolio?\nhttps://goo.gl/forms/6NM27sIuXxHAoE4y1\n\nConnect With Benjamin üí¨\nüëä Discord: http://spak.co\nüê¶ Twitter: https://twitter.com/benjaminspak\n‚õìÔ∏è LinkedIn: https://www.linkedin.com/in/benjaminspak/\n\n--\n\nLearn to code for free and get a developer job: https://www.freecodecamp.org\n\nRead hundreds of articles on programming: https://medium.freecodecamp.org,"web developer portfolio, resume review, portfolio review, social media review, web developer resume, developer resume review, front end web developer",2019-01-03 17:24:59.000000,15508.0,358.0,0,33.0,1017,hd,False,27,Tips,Software Engineering,


In [289]:
# DataFrame.to_csv does not create missing directories, so make sure the target folder exists first
output_dir = './training_dataset'
os.makedirs(output_dir, exist_ok=True)
# The DataFrame index is written as the first column (pandas default)
videos_for_labelling_df.to_csv(f'{output_dir}/labelled_video_dataset.csv')