In [3]:
# Getting data from YouTube
import re
from nltk.corpus import stopwords
from googleapiclient.discovery import build

DEVELOPER_KEY = "<enter your key>"
YOUTUBE_API_SERVICE_NAME = 'youtube'
YOUTUBE_API_VERSION = 'v3'

youtube = build(
    YOUTUBE_API_SERVICE_NAME,
    YOUTUBE_API_VERSION,
    developerKey=DEVELOPER_KEY
)

video_search_response = youtube.search().list(
    q="python",
    part='id,snippet',
    maxResults=50,
    type='video',
    relevanceLanguage="en",
    regionCode="US"
).execute()

tkn = video_search_response["nextPageToken"]
videos = []
while tkn and len(video_search_response["items"]) > 0:
    videos += video_search_response["items"]
    video_search_response = youtube.search().list(
        q="python",
        part='id,snippet',
        maxResults=50,
        type='video',
        pageToken=tkn,
        relevanceLanguage="en",
        regionCode="US"
    ).execute()
    tkn = video_search_response.get("nextPageToken")

print("Total no. of videos", len(set([i["id"]["videoId"] for i in videos])))


dets = {
        i["id"]["videoId"]: {
            "title": i["snippet"]["title"], 
            "description": i["snippet"]["description"]
        }
        for i in videos
}
video_text = [[k, ". ".join([i["title"], i["description"]])] for k,i in dets.items()]
yt_text = [i[1] for i in video_text]

Total no. of videos 522


In [6]:
sample_1 = "Python Machine Learning Tutorial (Data Science) Python Machine Learning Tutorial - Learn how to predict the kind of music people like. Subscribe for more Python tutorials like ..."
sample_2 = "Ball python bite I decided to grab a snake out of its tank without asking, this was totally my fault. But now I can say I've been bit by a snake"


# TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vect = TfidfVectorizer()

# get tfidf of all samples in the corpus
tfidf = vect.fit_transform(yt_text)

def tfidf_sim(sample_doc):
    # get tfidf vector for sample document
    selected_itm = vect.transform([sample_doc])

    # similarity between sample doc and the rest of the corpus
    cosine_sim = [
        cosine_similarity(selected_itm, itm)[0][0] 
        for itm in tfidf
    ]

    # index of top 8 matches
    indx_top8 = sorted(
        range(len(cosine_sim)),
        key=lambda i: cosine_sim[i],
        reverse=True
    )[:8]
    return indx_top8


In [7]:
sample1_tfidf_top8 = tfidf_sim(sample_1)
sample2_tfidf_top8 = tfidf_sim(sample_2)

# SpaCy

In [10]:
!pip install spacy
!python -m spacy download en_core_web_lg

import spacy

nlp = spacy.load("en_core_web_lg")

def spacy_sim(sample_doc):
    docs_spacy = [nlp("u'"+itm+"'") for itm in yt_text]
    selected_itm = nlp("u'"+sample_doc+"'")

    # Similarity between sample doc and the rest of the corpus
    spacy_sim = [
        selected_itm.similarity(itm) for itm in docs_spacy
    ]

    # index of top 8 matches
    indx_top8 = sorted(
        range(len(spacy_sim)),
        key=lambda i: spacy_sim[i],
        reverse=True
    )[:8]
    return indx_top8

In [11]:
sample1_spacy_top8 = spacy_sim(sample_1)
sample2_spacy_top8 = spacy_sim(sample_2)

# BERT

In [12]:
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer, util

bert_model = SentenceTransformer('bert-base-nli-mean-tokens')

document_embeddings = bert_model.encode(yt_text)

def bert_sim(sample_doc):
    selected_itm = bert_model.encode(sample_doc)

    # Similarity between sample doc and the rest of the corpus
    bert_sim = [
        util.pytorch_cos_sim(selected_itm, itm).item() 
        for itm in document_embeddings
    ]

    # index of top 8 matches
    indx_top8 = sorted(
        range(len(bert_sim)), 
        key=lambda i: bert_sim[i], 
        reverse=True)[:8]
    return indx_top8

In [13]:
sample1_bert_top8 = bert_sim(sample_1)
sample2_bert_top8 = bert_sim(sample_2)

In [14]:
# Video data can be accessed by the above return indices as video_text[inx]