In [56]:
import pandas as pd
from sqlalchemy import create_engine,text
import os
import json
import vertexai
from vertexai.generative_models import GenerativeModel, Part
import time
import yaml

### Vertex config

In [16]:
# The Vertex AI region is fixed here; the project id is read from the environment
vertex_region = "us-west4"
VERTEX_PROJECT_ID = os.environ.get("VERTEX_PROJECT_ID")

### Get all video data from database

In [89]:
# SQLAlchemy URL of the SQLite database file
db_string = "sqlite:///../db/youtube.db"
# Build an engine for that URL, then open a connection on it
engine = create_engine(db_string)
conn = engine.connect()

In [90]:
# Read every row of the `video` table into a DataFrame
query = text("SELECT * FROM video")
video_df = pd.read_sql_query(sql=query, con=conn)

### Remove shorts and very long format videos

In [91]:
def filter_videos_by_duration(df, min_duration=60, max_duration=1800):
    """
    Keep only the videos whose duration lies between two bounds (both inclusive).

    Parameters:
        df (DataFrame): Video data with a 'duration' column.
        min_duration (int): Shortest duration to keep, in seconds. Default is 60.
        max_duration (int): Longest duration to keep, in seconds. Default is 1800.

    Returns:
        DataFrame: The rows of df whose 'duration' falls inside the range.
    """
    # Series.between is inclusive on both ends by default
    in_range = df['duration'].between(min_duration, max_duration)
    return df.loc[in_range]

In [92]:
# Apply filter_videos_by_duration with its default bounds to the full video table
videos_for_labelling_df = filter_videos_by_duration(video_df)

### Create training dataset

In [93]:
# Create training dataset
# .copy() detaches the subset from the frame it was sliced from, so the label
# columns written to it later are set on an independent DataFrame instead of on
# a slice (which triggers pandas' SettingWithCopyWarning).
videos_for_labelling_df = videos_for_labelling_df.head(10_000).copy() # defines the size of the training dataset

### Create Gemini prompt

In [9]:
# Show full cell contents when displaying DataFrames (no column-width truncation)
pd.options.display.max_colwidth = None

In [None]:
# Instruction prompt sent to Gemini ahead of each batch of video ids/titles.
# Fixes relative to the earlier wording:
#   * the third category is named "technologies" in the opening sentence, matching
#     the field requested further down and in the example JSON;
#   * the "Podcast/Interview" option has its closing quote.
prompt_str = """You are a classifier, and I want you to classify each of the following data science youtube video titles based on three overarching categories, video_type, video_topic, technologies

"video_type": the options for video type are: 

"Tutorial",
"Project",
"News",
"Tips",
"Challenge",
"Career Advice",
"Podcast/Interview"

"video_topic": the options for video topic are: 

"Statistics and Probability",
"Machine Learning / AI",
"Data Wrangling",
"Data Visualization",
"Data Mining",
"Software Engineering",
"Ethics and Privacy",
"Cloud Computing",
"Resume Building",
"Job Search Strategies",
"Interview Techniques",
"Career Development Paths",
"Balancing Work and Life",
"Business Acumen",

"technologies": also pull out any software/coding_language/packages as a list called 'technologies'. This should not include anything other than software / coding_language / packages though.

From now on, return the output in JSON format. Do not include the JSON identifier at the start of the output. Only output pure JSON.

The response should look like (for example):

{
  "videos": [
    {
      "video_id": "YdWkUdMxMvM",
      "video_title": "Career Change to Code - The Complete Guide",
      "video_info": {
        "video_type": "Career Advice",
        "video_topic": "Software Engineering",
        "technologies": []
      }
    },
    {
      "video_id": "5rNk7m_zlAg",
      "video_title": "Spring Boot & Spring Data JPA – Complete Course",
      "video_info": {
        "video_type": "Tutorial",
        "video_topic": "Software Engineering",
        "technologies": ["Spring", "Spring Boot"]
      }
    }
  ]
}

The video ids and titles are below:

"""

### Function to generate response from Gemini

In [72]:
def generate_response(project_id: str, location: str, prompt) -> str:
    """
    Send a prompt to the Gemini model on Vertex AI and return the reply text.

    Parameters:
        project_id (str): Project passed to vertexai.init.
        location (str): Location passed to vertexai.init.
        prompt: Content sent to the model as the only element of the request.

    Returns:
        str: The `text` attribute of the model's response.
    """
    # Point the SDK at the requested project/location before building the model
    vertexai.init(project=project_id, location=location)
    gemini = GenerativeModel("gemini-1.0-pro")
    # generate_content receives the prompt wrapped in a one-element list
    contents = [prompt]
    reply = gemini.generate_content(contents)
    return reply.text

In [68]:
# Prefer the project id configured in the environment; fall back to the literal
# only when the variable is unset or empty, so the env-based configuration is
# not silently overridden.
VERTEX_PROJECT_ID = os.getenv("VERTEX_PROJECT_ID") or 'astute-veld-414717'

### Generate responses

In [105]:
# Get total number of videos
total_videos = len(videos_for_labelling_df)

# Define chunk size
chunk_size = 50

# Raw model output for every chunk, kept so failed parses can be inspected afterwards
responses = []

# Number of rows sent to the model so far (variable name kept from the original cell)
chunk_count = 0


def _strip_code_fence(raw_text):
    """Return raw_text without a surrounding Markdown code fence (``` or ```json)."""
    cleaned = raw_text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # The opening fence may carry a language tag
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def _parse_model_json(raw_text):
    """Parse the model reply as JSON, falling back to YAML for near-JSON output."""
    cleaned = _strip_code_fence(raw_text)
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # The response often has a trailing comma after the last item, which strict
        # JSON rejects; PyYAML's flow-style parser accepts it. safe_load only builds
        # plain Python objects.
        return yaml.safe_load(cleaned)


# Loop through the DataFrame in chunks of `chunk_size` rows
for chunk_start in range(0, total_videos, chunk_size):
    # Get chunk of videos
    videos_chunk = videos_for_labelling_df.iloc[chunk_start:chunk_start + chunk_size]

    # Build the prompt: shared instructions followed by one line per video
    prompt = prompt_str + ''.join(
        f'\nvideo_id : {video_id}, video_title: {video_title}'
        for video_id, video_title in zip(videos_chunk['video_id'], videos_chunk['video_title'])
    )

    # Generate response for the chunk
    response_text = generate_response(VERTEX_PROJECT_ID, vertex_region, prompt)
    responses.append(response_text)

    # A malformed reply must not abort the whole run: report it and move on.
    # The affected rows simply stay unlabelled.
    try:
        labelled_videos = list(_parse_model_json(response_text)['videos'])
    except (yaml.YAMLError, KeyError, TypeError) as parse_error:
        last_row = chunk_start + len(videos_chunk) - 1
        print(f'Could not parse response for rows {chunk_start}-{last_row}: {parse_error}')
        labelled_videos = []

    # Map video_type, video_topic, and technologies to the existing df
    for video in labelled_videos:
        if not isinstance(video, dict):
            continue

        video_id = video.get('video_id', '')

        # Get video_info dictionary (tolerate null / non-dict values)
        video_info = video.get('video_info') or {}
        if not isinstance(video_info, dict):
            video_info = {}

        video_type = video_info.get('video_type', '')
        video_topic = video_info.get('video_topic', '')

        # Get technologies list; str() guards against non-string entries
        technologies = video_info.get('technologies', [])
        technologies_str = ', '.join(map(str, technologies)) if isinstance(technologies, list) else ''

        # Compute the row selector once and reuse it for all three columns
        row_mask = videos_for_labelling_df['video_id'] == video_id
        videos_for_labelling_df.loc[row_mask, 'video_type'] = video_type
        videos_for_labelling_df.loc[row_mask, 'video_topic'] = video_topic
        videos_for_labelling_df.loc[row_mask, 'technologies'] = technologies_str

    # Count the rows actually sent, so a final partial chunk is not over-counted
    chunk_count = chunk_start + len(videos_chunk)

    print(f'Processed {chunk_count} rows of {total_videos}...')

    # Delay before processing the next chunk
    time.sleep(1)

print('Completed processing')

Processed 9450 rows of 10000...
Processed 9500 rows of 10000...
Processed 9550 rows of 10000...
Processed 9600 rows of 10000...
Processed 9650 rows of 10000...
Processed 9700 rows of 10000...
Processed 9750 rows of 10000...
Processed 9800 rows of 10000...
Processed 9850 rows of 10000...
Processed 9900 rows of 10000...
Processed 9950 rows of 10000...
Processed 10000 rows of 10000...
Completed processing


In [108]:
# Inspect non-null counts to see how many rows received each label column
videos_for_labelling_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 57 to 16900
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   channel_id       10000 non-null  object 
 1   video_id         10000 non-null  object 
 2   video_title      10000 non-null  object 
 3   description      10000 non-null  object 
 4   tags             10000 non-null  object 
 5   published        10000 non-null  object 
 6   view_count       10000 non-null  float64
 7   like_count       9976 non-null   float64
 8   favourite_count  10000 non-null  int64  
 9   comment_count    9964 non-null   float64
 10  duration         10000 non-null  int64  
 11  definition       10000 non-null  object 
 12  caption          10000 non-null  object 
 13  category_id      10000 non-null  int64  
 14  video_type       9281 non-null   object 
 15  video_topic      8992 non-null   object 
 16  technologies     9375 non-null   object 
dtypes: float64(

In [111]:
# Drop any rows where video_type is empty.
# The labelling loop stores '' when a response omits video_type, so blank strings
# are treated as missing alongside NaN/None (dropna alone would keep them).
video_type_labels = videos_for_labelling_df['video_type']
has_video_type = video_type_labels.notna() & video_type_labels.astype(str).str.strip().ne('')
cleaned_labelled_video_df = videos_for_labelling_df[has_video_type]

In [112]:
# Confirm the row count and non-null counts after dropping unlabelled rows
cleaned_labelled_video_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9281 entries, 57 to 16900
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   channel_id       9281 non-null   object 
 1   video_id         9281 non-null   object 
 2   video_title      9281 non-null   object 
 3   description      9281 non-null   object 
 4   tags             9281 non-null   object 
 5   published        9281 non-null   object 
 6   view_count       9281 non-null   float64
 7   like_count       9257 non-null   float64
 8   favourite_count  9281 non-null   int64  
 9   comment_count    9265 non-null   float64
 10  duration         9281 non-null   int64  
 11  definition       9281 non-null   object 
 12  caption          9281 non-null   object 
 13  category_id      9281 non-null   int64  
 14  video_type       9281 non-null   object 
 15  video_topic      8936 non-null   object 
 16  technologies     9281 non-null   object 
dtypes: float64(3

In [114]:
# Make sure the output folder exists; to_csv does not create missing directories
os.makedirs('./training_dataset', exist_ok=True)
cleaned_labelled_video_df.to_csv('./training_dataset/labelled_video_dataset.csv')