# Create YouTube Title-Thumbnail Training Pairs

Code authored by: Shaw Talebi

[Video link](https://youtu.be/W4s6b2ZM6kI) | [Blog link](https://medium.com/towards-data-science/fine-tuning-multimodal-embedding-models-bf007b1c5da5) <br>
[Dataset](https://huggingface.co/datasets/shawhin/yt-title-thumbnail-pairs) | [Fine-tuned Model](https://huggingface.co/shawhin/clip-title-thumbnail-embeddings)

modified by: Soonwook Hwang

### imports

In [1]:
#!pip install isodate
#!pip install pandas
#!pip install sentence_transformers

In [2]:
# you hit the classic pyarrow-on-HPC wall 😅
# install pre-built binaries from conda-forge

#!conda install  "pyarrow==19.0.0" datasets



In [3]:
from top_secret import my_key
import requests
from isodate import parse_duration

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from datasets import DatasetDict, Dataset

### Extract

#### extract video ids

In [4]:
channel_id = 'UCa9gErQ9AE5jT2DZLjXBIdA' # my YouTube channel ID
url = 'https://www.googleapis.com/youtube/v3/search' # YouTube search API endpoint

# Collect the ID of every video returned by the channel search, following the
# paginated results until the API stops returning a `nextPageToken`.
video_id_list = []

# None on the first request: requests drops params whose value is None,
# so no pageToken is sent and the API returns the first page.
page_token = None

while True:
    params = {
        "key": my_key,
        'channelId': channel_id,
        'part': ["snippet","id"],
        'order': "date",
        'maxResults': 50,
        'pageToken': page_token
    }
    response = requests.get(url, params=params, timeout=30)
    # fail loudly on quota / auth / bad-request errors instead of hitting a
    # confusing KeyError on a payload that has no 'items'
    response.raise_for_status()
    payload = response.json()  # parse the body once per page

    for raw_item in payload.get('items', []):

        # only execute for youtube videos (search results can also contain
        # other kinds of items)
        if raw_item['id']['kind'] != "youtube#video":
            continue

        # grab video ids
        video_id_list.append(raw_item['id']['videoId'])

    # the last page carries no nextPageToken -> stop paginating
    page_token = payload.get('nextPageToken')
    if not page_token:
        break

In [5]:
# number of video IDs collected from the channel search
len(video_id_list)

160

#### extract titles and thumbnail urls

In [6]:
url = "https://www.googleapis.com/youtube/v3/videos"

# For every collected video ID, fetch its snippet and content details and keep
# the title and high-resolution thumbnail URL of videos that are long enough.
video_data_list = []

for video_id in video_id_list:

    params = {
        "part": ["snippet","contentDetails"],
        "id": video_id,
        "key": my_key,
    }
    response = requests.get(url, params=params, timeout=30)
    # surface quota / auth errors as an HTTPError rather than a KeyError below
    response.raise_for_status()

    items = response.json().get('items', [])
    if not items:
        # the API returned no record for this ID (presumably the video is no
        # longer available) -- skip it instead of crashing on items[0]
        continue
    raw_dict = items[0]

    # only process videos that are at least 3 minutes (180 s) long
    iso_duration = raw_dict['contentDetails']["duration"]
    if parse_duration(iso_duration).total_seconds() < 180:
        continue

    # extract video data
    video_data = {
        'video_id': video_id,
        'title': raw_dict['snippet']['title'],
        'thumbnail_url': raw_dict['snippet']['thumbnails']['high']['url'],
    }

    # append data to list
    video_data_list.append(video_data)

In [7]:
# number of videos kept after the duration filter
len(video_data_list)

115

### Transform

#### create dataframe

In [8]:
# one row per kept video, with columns video_id, title and thumbnail_url
df = pd.DataFrame(video_data_list)
df.head()

Unnamed: 0,video_id,title,thumbnail_url
0,G72pJMcYBns,How to Monetize Your AI Skills (outside a 9â€“5),https://i.ytimg.com/vi/G72pJMcYBns/hqdefault.jpg
1,XEMZniYKuaY,How to Build a Remote MCP Server (with Auth),https://i.ytimg.com/vi/XEMZniYKuaY/hqdefault.jpg
2,rTkm1eY0ezU,30 AI Engineering Terms Explained (in Plain En...,https://i.ytimg.com/vi/rTkm1eY0ezU/hqdefault.jpg
3,PCLu84VLF1w,How to Code 10X Faster Using AI (without theÂ c...,https://i.ytimg.com/vi/PCLu84VLF1w/hqdefault.jpg
4,w-Ml3NivoFo,How to Build (Custom) AI Agents with MCP,https://i.ytimg.com/vi/w-Ml3NivoFo/hqdefault.jpg


#### create negative pairs

In [9]:
# Load the sentence-transformers text model used below to embed and compare titles
model = SentenceTransformer("all-mpnet-base-v2")

In [10]:
%%time
# Encode all titles into one embedding vector per title.
# NOTE: despite its name, `job_embeddings` holds *title* embeddings.
job_embeddings = model.encode(df['title'].to_list())
print(job_embeddings.shape)

(115, 768)
CPU times: user 144 ms, sys: 118 ms, total: 262 ms
Wall time: 6.41 s


In [11]:
# compute pairwise title-to-title similarities with the model's similarity
# function; entry [i, j] compares title i with title j
similarities = model.similarity(job_embeddings, job_embeddings)
print(similarities.shape)

torch.Size([115, 115])


In [12]:
# For each title, choose as its negative the least similar *other* title that
# has not already been used as the negative of an earlier row.
similarities_argsorted = np.argsort(similarities.numpy(), axis=1)
negative_pair_index_list = []
used_indices = set()  # indices already handed out; set gives O(1) membership tests

for i in range(len(similarities)):

    # candidate indices in ascending similarity, excluding the row itself so a
    # title can never be paired with itself as its own "negative"
    candidates = [int(c) for c in similarities_argsorted[i] if int(c) != i]
    if not candidates:
        raise ValueError("need at least two titles to build negative pairs")

    # first (least similar) candidate that is still unused
    index = next((c for c in candidates if c not in used_indices), None)
    if index is None:
        # every other title is already taken (can only happen for a late row
        # whose own index is the sole unused one): reuse the least similar
        # other title rather than falling back to the title itself
        index = candidates[0]

    used_indices.add(index)
    negative_pair_index_list.append(index)

In [13]:
# add negative pairs to df: row i receives the title at negative_pair_index_list[i]
# (.values strips the index so the assignment is positional, not index-aligned)
df['title_neg'] = df['title'].iloc[negative_pair_index_list].values

In [14]:
# preview the frame, now including the title_neg column
df.head()

Unnamed: 0,video_id,title,thumbnail_url,title_neg
0,G72pJMcYBns,How to Monetize Your AI Skills (outside a 9â€“5),https://i.ytimg.com/vi/G72pJMcYBns/hqdefault.jpg,Persistent Homology | Introduction & Python Ex...
1,XEMZniYKuaY,How to Build a Remote MCP Server (with Auth),https://i.ytimg.com/vi/XEMZniYKuaY/hqdefault.jpg,What Nature Can Teach Us About Business...
2,rTkm1eY0ezU,30 AI Engineering Terms Explained (in Plain En...,https://i.ytimg.com/vi/rTkm1eY0ezU/hqdefault.jpg,I Quit My Jobâ€¦ Hereâ€™s How Much I Made 1 Year L...
3,PCLu84VLF1w,How to Code 10X Faster Using AI (without theÂ c...,https://i.ytimg.com/vi/PCLu84VLF1w/hqdefault.jpg,How to Build a Remote MCP Server (with Auth)
4,w-Ml3NivoFo,How to Build (Custom) AI Agents with MCP,https://i.ytimg.com/vi/w-Ml3NivoFo/hqdefault.jpg,Why Conflict Is Good & How You Can Use It


#### train-test split

In [15]:
# Shuffle the dataset (fixed seed so the split is reproducible on a fresh run;
# note that re-running this cell reshuffles the already shuffled frame)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into train, validation, and test sets (70% train, 15% valid, 15% test)
train_frac = 0.7
valid_frac = 0.15
test_frac = 0.15
assert abs(train_frac + valid_frac + test_frac - 1.0) < 1e-9, "split fractions must sum to 1"

# define train and validation size; the test set takes whatever rows remain,
# so rounding leftovers end up in the test split
train_size = int(train_frac * len(df))
valid_size = int(valid_frac * len(df))

# create train, validation, and test datasets (positional slices)
df_train = df.iloc[:train_size]
df_valid = df.iloc[train_size:train_size + valid_size]
df_test = df.iloc[train_size + valid_size:]

# every row must land in exactly one split
assert len(df_train) + len(df_valid) + len(df_test) == len(df)

### Load

In [16]:
# Wrap each pandas split in a Hugging Face Dataset
train_ds, valid_ds, test_ds = (
    Dataset.from_pandas(split_frame)
    for split_frame in (df_train, df_valid, df_test)
)

# Bundle the three datasets into a DatasetDict keyed by split name
dataset_dict = DatasetDict(
    dict(zip(('train', 'valid', 'test'), (train_ds, valid_ds, test_ds)))
)

In [17]:
# display the splits with their features and row counts
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['video_id', 'title', 'thumbnail_url', 'title_neg'],
        num_rows: 80
    })
    valid: Dataset({
        features: ['video_id', 'title', 'thumbnail_url', 'title_neg'],
        num_rows: 17
    })
    test: Dataset({
        features: ['video_id', 'title', 'thumbnail_url', 'title_neg'],
        num_rows: 18
    })
})

In [18]:
from huggingface_hub import login

# Interactive Hugging Face Hub login: prompts for an access token.
# Only needed if you intend to run the (commented-out) push_to_hub calls below.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [19]:
# push data to hub
#dataset_dict.push_to_hub("shawhin/yt-title-thumbnail-pairs")
#dataset_dict.push_to_hub("hwang2006/yt-title-thumbnail-pairs")