In [3]:
import pandas as pd
from sqlalchemy import create_engine,text
import os
import json
import vertexai
from vertexai.generative_models import GenerativeModel, Part
import time

### Vertex config

In [4]:
# Vertex AI settings: the project id is taken from the environment
# (None when the variable is unset); the region is fixed.
VERTEX_PROJECT_ID = os.environ.get("VERTEX_PROJECT_ID")
vertex_region = "us-west4"

### Get all video data from database

In [5]:
# SQLAlchemy URL of the SQLite database file (relative path)
db_string = "sqlite:///../db/youtube.db"
# Build an engine for that URL, then open the connection used by later cells
engine = create_engine(db_string)
conn = engine.connect()

In [6]:
# Read every row and column of the `video` table into a DataFrame
query = text("SELECT * FROM video")
video_df = pd.read_sql_query(sql=query, con=conn)

### Remove shorts and very long format videos

In [7]:
def filter_videos_by_duration(df, min_duration=60, max_duration=1800):
    """
    Keep only the rows whose 'duration' lies in [min_duration, max_duration].

    Both bounds are inclusive. Rows with a missing duration are dropped,
    because a missing value never satisfies the range check.

    Parameters:
        df (DataFrame): Video data with a numeric 'duration' column.
        min_duration (int): Lower bound, in the same unit as 'duration'. Default is 60.
        max_duration (int): Upper bound, in the same unit as 'duration'. Default is 1800.

    Returns:
        DataFrame: The matching rows of `df`, with their original index labels.
    """
    # Series.between is inclusive on both ends by default
    within_range = df['duration'].between(min_duration, max_duration)
    return df[within_range]

In [8]:
# Apply the duration filter with its default bounds
filtered_video_df = filter_videos_by_duration(df=video_df)

### Create training dataset

In [9]:
# Draw a reproducible random sample of videos to send for labelling.
# The requested size is capped at the number of available rows because
# DataFrame.sample raises ValueError when n exceeds the population
# (sampling is without replacement by default).
LABELLING_SAMPLE_SIZE = 11_000
LABELLING_SAMPLE_SEED = 37
videos_for_labelling_df = (
    filtered_video_df
    .sample(
        n=min(LABELLING_SAMPLE_SIZE, len(filtered_video_df)),
        random_state=LABELLING_SAMPLE_SEED,
    )
    .reset_index(drop=True)
)

### Create Gemini prompt

In [10]:
# Show full cell contents (no column-width truncation) when displaying DataFrames
pd.options.display.max_colwidth = None

In [11]:
# Instruction block sent ahead of every batch of titles. The per-video
# "video_id / video_title" lines are appended to the end of this string.
# Fixes relative to the earlier wording:
#   * the "Podcast/Interview" option now has its closing quote, so the model
#     sees a well-formed label like all the others;
#   * the third category is called "technologies" in the opening sentence,
#     matching the rest of the prompt and the example JSON.
prompt_str = """You are a classifier, and I want you to classify each of the following data science youtube video titles based on three overarching categories, video_type, video_topic, technologies

"video_type": The only options for video type are: 

"Tutorial",
"Project",
"News",
"Tips",
"Challenge",
"Career Advice",
"Podcast/Interview"

"video_topic": The only options for video topic are: 

"Statistics and Probability",
"Machine Learning / AI",
"Data Wrangling",
"Data Visualization",
"Data Mining",
"Software Engineering",
"Ethics and Privacy",
"Cloud Computing",
"Resume Building",
"Job Search Strategies",
"Interview Techniques",
"Career Development Paths",
"Balancing Work and Life",
"Business Acumen",

For video_type and video_topic, you may only use the options provided and no others. If none of these options are appropriate, return an empty string.

"technologies": also pull out any software/coding_language/packages as a list called 'technologies'. This should not include anything other than software / coding_language / packages though.

From now on, return the output in JSON format. Do not include the JSON identifier at the start of the output. Only output pure JSON.

The response should look like (for example):

{
  "videos": [
    {
      "video_id": "YdWkUdMxMvM",
      "video_title": "Career Change to Code - The Complete Guide",
      "video_info": {
        "video_type": "Career Advice",
        "video_topic": "Software Engineering",
        "technologies": []
      }
    },
    {
      "video_id": "5rNk7m_zlAg",
      "video_title": "Spring Boot & Spring Data JPA – Complete Course",
      "video_info": {
        "video_type": "Tutorial",
        "video_topic": "Software Engineering",
        "technologies": ["Spring", "Spring Boot"]
      }
    }
  ]
}

The video ids and titles are below:

"""

### Function to generate response from Gemini

In [12]:
def generate_response(project_id: str, location: str, prompt: str) -> str:
    """
    Send `prompt` to the "gemini-1.0-pro" model on Vertex AI and return the reply text.

    Parameters:
        project_id (str): Project passed to `vertexai.init`.
        location (str): Region passed to `vertexai.init`.
        prompt: Prompt content; it is sent as the only element of the request.

    Returns:
        str: The `text` attribute of the model's response.
    """
    # Point the SDK at the requested project/region on every call
    vertexai.init(project=project_id, location=location)
    model = GenerativeModel("gemini-1.0-pro")
    # The request is a one-element content list holding the prompt
    contents = [prompt]
    result = model.generate_content(contents)
    return result.text

### Generate responses

In [14]:
def _strip_code_fences(raw_text):
    """Return raw_text without a surrounding Markdown code fence (``` or ```json)."""
    cleaned = raw_text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # The opening fence may carry a language tag, e.g. ```json
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def _iter_video_labels(parsed, allowed_ids):
    """Yield (video_id, video_type, video_topic, technologies_str) for each usable entry.

    Entries that are malformed, or whose video_id is not in `allowed_ids`, are skipped.
    """
    videos = parsed.get('videos') if isinstance(parsed, dict) else None
    if not isinstance(videos, list):
        return
    for video in videos:
        if not isinstance(video, dict):
            continue
        video_id = video.get('video_id', '')
        # Only accept ids that were actually sent in this chunk; this also ignores
        # ids echoed back from the example JSON embedded in the prompt.
        if not isinstance(video_id, str) or video_id not in allowed_ids:
            continue
        video_info = video.get('video_info', {})
        if not isinstance(video_info, dict):
            video_info = {}
        technologies = video_info.get('technologies', [])
        # str() guards against non-string list items (e.g. null) breaking the join
        technologies_str = (
            ', '.join(str(item) for item in technologies)
            if isinstance(technologies, list) else ''
        )
        yield (
            video_id,
            video_info.get('video_type', ''),
            video_info.get('video_topic', ''),
            technologies_str,
        )


# Get total number of videos
total_videos = len(videos_for_labelling_df)

# Define chunk size (number of titles sent per request)
chunk_size = 50

# Raw model responses, one per chunk, kept so that failed chunks can be inspected
responses = []

# Number of rows handled so far (labelled or skipped)
chunk_count = 0

# Loop through the DataFrame in chunks of `chunk_size` rows
for chunk_start in range(0, total_videos, chunk_size):
    # Get chunk of videos
    videos_chunk = videos_for_labelling_df.iloc[chunk_start:chunk_start + chunk_size]

    # Build the prompt: fixed instructions followed by one line per video
    prompt = prompt_str + ''.join(
        f'\nvideo_id : {video_id}, video_title: {video_title}'
        for video_id, video_title in zip(videos_chunk['video_id'], videos_chunk['video_title'])
    )

    # Generate response for the chunk
    response_text = generate_response(VERTEX_PROJECT_ID, vertex_region, prompt)
    responses.append(response_text)

    # Parse the response as JSON after removing any Markdown code fence.
    # On failure the chunk is really skipped: nothing from an earlier chunk is re-applied.
    try:
        video_info_dict = json.loads(_strip_code_fences(response_text))
    except json.JSONDecodeError:
        print(f"Error in JSON formatting of chunk {chunk_count}, skipping chunk...")
        video_info_dict = None

    # Map video_type, video_topic, and technologies to the existing df
    chunk_ids = set(videos_chunk['video_id'])
    for video_id, video_type, video_topic, technologies_str in _iter_video_labels(video_info_dict, chunk_ids):
        # Compute the row mask once and reuse it for all three columns
        row_mask = videos_for_labelling_df['video_id'] == video_id
        videos_for_labelling_df.loc[row_mask, 'video_type'] = video_type
        videos_for_labelling_df.loc[row_mask, 'video_topic'] = video_topic
        videos_for_labelling_df.loc[row_mask, 'technologies'] = technologies_str

    # Never report more rows than exist when the last chunk is short
    chunk_count = min(chunk_start + chunk_size, total_videos)

    print(f'Processed {chunk_count} rows of {total_videos}...')

    # Delay before processing the next chunk
    time.sleep(1)

print('Completed processing')

Processed 50 rows of 11000...
Processed 100 rows of 11000...
Processed 150 rows of 11000...
Processed 200 rows of 11000...
Processed 250 rows of 11000...
Processed 300 rows of 11000...
Processed 350 rows of 11000...
Processed 400 rows of 11000...
Processed 450 rows of 11000...
Processed 500 rows of 11000...
Processed 550 rows of 11000...
Processed 600 rows of 11000...
Processed 650 rows of 11000...
Processed 700 rows of 11000...
Processed 750 rows of 11000...
Processed 800 rows of 11000...
Processed 850 rows of 11000...
Processed 900 rows of 11000...
Processed 950 rows of 11000...
Processed 1000 rows of 11000...
Processed 1050 rows of 11000...
Processed 1100 rows of 11000...
Processed 1150 rows of 11000...
Processed 1200 rows of 11000...
Processed 1250 rows of 11000...
Processed 1300 rows of 11000...
Processed 1350 rows of 11000...
Processed 1400 rows of 11000...
Processed 1450 rows of 11000...
Processed 1500 rows of 11000...
Error in JSON formatting of chunk 1500, skipping chunk...
Pr

In [15]:
# Summarise columns, dtypes and non-null counts; missing values in the three
# label columns correspond to rows that were not assigned a label in the loop above.
videos_for_labelling_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11000 entries, 0 to 10999
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   channel_id       11000 non-null  object 
 1   video_id         11000 non-null  object 
 2   video_title      11000 non-null  object 
 3   description      11000 non-null  object 
 4   tags             11000 non-null  object 
 5   published        11000 non-null  object 
 6   view_count       11000 non-null  float64
 7   like_count       10938 non-null  float64
 8   favourite_count  11000 non-null  int64  
 9   comment_count    10990 non-null  float64
 10  duration         11000 non-null  int64  
 11  definition       11000 non-null  object 
 12  caption          11000 non-null  object 
 13  category_id      11000 non-null  int64  
 14  video_type       10313 non-null  object 
 15  video_topic      10290 non-null  object 
 16  technologies     10466 non-null  object 
dtypes: float64(3

In [17]:
# Persist the labelled dataset. The output directory is created first because
# DataFrame.to_csv does not create missing parent directories.
# The DataFrame index is still written as the first (unnamed) column, as before.
output_path = './training_dataset/labelled_video_dataset.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
videos_for_labelling_df.to_csv(output_path)

In [18]:
# Number of distinct channels in the sample (a missing channel_id counts as one value)
videos_for_labelling_df['channel_id'].nunique(dropna=False)

100