# Pre-requisites

### 1. Load Spacy and Download Language Model from Spacy
- pip install spacy
- python -m spacy download en_core_web_md

### 2. Initialise Youtube API
- https://developers.google.com/youtube/v3/quickstart/python
- Use this key in your application by passing it with the key=API_KEY parameter. AIzaSyBFB9_Plcj-N7tpF2p08IJngcOwNQFnvrI

In [33]:
# Import Libraries
import spacy
import pandas as pd
import os
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
import re
import numpy as np
import sklearn.cluster
import distance
from fuzzywuzzy import fuzz
import statistics

pd.set_option('display.max_colwidth', 0)
from nltk.corpus import wordnet
import nltk
scopes = ["https://www.googleapis.com/auth/youtube.readonly"]
API_KEY = 'AIzaSyCYhjAqZWZUkMDI4gd3rkspFHEoXX7KDi4'
nlp = spacy.load("en_core_web_md")  
youtube = googleapiclient.discovery.build('youtube', 'v3', developerKey=API_KEY)

In [34]:
def unusual_words(text):
    text_vocab = set(w.lower() for w in text.split() if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab - english_vocab
    return sorted(unusual)

def get_oov_words(query):
    doc = nlp(query)
    oov_words = [word.text for word in doc if word.is_oov]
    oov_words_string = ' '.join(word.lower() for word in list(oov_words))
    return oov_words_string

def get_oov_words2(query):
    query=query.lower()
    query_filtered = unusual_words(query)
    oov_words = [word for word in query_filtered if not wordnet.synsets(word)]
    oov_words_string = ' '.join(word.lower() for word in list(oov_words))
    return oov_words_string

def filter_query(query):
    doc = nlp(query)
    filtered_words = [word.text for word in doc if not word.is_oov]
    filtered_words_string = ' '.join(word.lower() for word in list(filtered_words))
    return filtered_words_string

def clean_up_texts(text):
    text = re.sub(r'https\S+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", text)
    text = re.sub(r'[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil)[\S]*\s?','',text)
    text = ''.join(e for e in text if e.isalpha() or e.isspace())
   
    return text.lower()


def get_videos(query, number_of_videos):
    

    oov_words_in_search_query = get_oov_words(query)
    filtered_query = filter_query(query)

    
    nextPageToken = None
    allVideos = []
    _counter = number_of_videos
    videoId = []
    title = []
    description = []
    oov_words = []
    
    while True:
        if _counter < 50:
            MAX_COUNT = _counter
        else:
            MAX_COUNT = 50

        req = youtube.search().list(q=filtered_query, part='snippet', type='video', maxResults=MAX_COUNT, pageToken=None)
        res = req.execute()
        _counter = _counter-50
        nextPageToken = res['nextPageToken']
        items = res['items']
        for each_item in items:
            allVideos += each_item
            
            _videoId = each_item['id']['videoId']
            _title = each_item['snippet']['title']
            _description = each_item['snippet']['description']
            
            _title = clean_up_texts(_title)
            _description = clean_up_texts(_description)
            _oov_words = get_oov_words2(_title) +' '+ get_oov_words2(_description)

            videoId.append(_videoId)
            title.append(_title)
            description.append(_description)
            oov_words.append(_oov_words)
        if res['nextPageToken'] == None:
            break;  # exit from the loop
        if _counter <=0 :
            break;  # exit from the loop

    data = {'videoId': videoId,'title': title, 'description': description, 'oov_words':oov_words
            ,'oov_lookup':oov_words_in_search_query }
    df = pd.DataFrame(data)

    return df

In [52]:
query = 'raghab hindi Valentine Cover'

In [53]:
get_oov_words2(query)

'raghab'

In [54]:
df = get_videos(query,10)
df.head()

Unnamed: 0,videoId,title,description,oov_words,oov_lookup
0,vFN3eNe0_Hs,pehla nasha valentines day special sanam,this valentines day come fall in love again with your favorite band sanam as they present yet another sanamrendition of the super hit song phela nasha,nasha pehla sanam nasha phela sanam sanamrendition,raghab
1,RyRgdKGSctU,valentines medley na tum jaano na hum pee lu tum jo aaye mast magan raghav chaitanya,subscribe to my youtube channel hi guys valentines day is special and on this special day here i am with my medley of,aaye chaitanya jaano magan raghav youtube,raghab
2,NbpdEetp6Hk,valentines medley raghav chaitanya,subscribe to my youtube channel hi im back with my valentines medley for the year happy valentines day to everyone,chaitanya raghav im youtube,raghab
3,j45TDsUSEVk,bodo and hindi mashup cover song happy valentines day to all my loving friends feb,music videography premanto narzary,mashup narzary premanto videography,raghab
4,tIv-tO8Ba-0,valentines day special bahut pyaar karte hain debolinaa nandy ft badal s cover,love never dies love will continue love keeps on beating when youre gone love never dies once it is in you life may be fleeting love lives on soo my,badal debolinaa karte nandy pyaar soo youre,raghab


In [55]:
from itertools import chain

# return list from series of comma-separated strings
def chainer(s):
    return list(chain.from_iterable(s.str.split(' ')))

# calculate lengths of splits
lens = df['oov_words'].str.split(' ').map(len)

# create new dataframe, repeating or chaining as appropriate
res = pd.DataFrame({'videoId': np.repeat(df['videoId'], lens),
                    'title': np.repeat(df['title'], lens),
                    'description': np.repeat(df['description'], lens),
                    'oov_words': chainer(df['oov_words']),
                   'oov_lookup': np.repeat(df['oov_lookup'], lens)})
df  = res.reset_index(drop=True)

df['oov_words'].replace('', np.nan, inplace=True)
df.dropna(subset=['oov_words'], inplace=True)

In [56]:
df.head()

Unnamed: 0,videoId,title,description,oov_words,oov_lookup
0,vFN3eNe0_Hs,pehla nasha valentines day special sanam,this valentines day come fall in love again with your favorite band sanam as they present yet another sanamrendition of the super hit song phela nasha,nasha,raghab
1,vFN3eNe0_Hs,pehla nasha valentines day special sanam,this valentines day come fall in love again with your favorite band sanam as they present yet another sanamrendition of the super hit song phela nasha,pehla,raghab
2,vFN3eNe0_Hs,pehla nasha valentines day special sanam,this valentines day come fall in love again with your favorite band sanam as they present yet another sanamrendition of the super hit song phela nasha,sanam,raghab
3,vFN3eNe0_Hs,pehla nasha valentines day special sanam,this valentines day come fall in love again with your favorite band sanam as they present yet another sanamrendition of the super hit song phela nasha,nasha,raghab
4,vFN3eNe0_Hs,pehla nasha valentines day special sanam,this valentines day come fall in love again with your favorite band sanam as they present yet another sanamrendition of the super hit song phela nasha,phela,raghab


In [57]:
_array = np.asarray([words for segments in df.oov_words for words in segments.split()]) 
words = np.insert( _array, len(_array), get_oov_words(query) )
#words

In [58]:
lev_similarity = -1*np.array([[distance.levenshtein(w1,w2) for w1 in words] for w2 in words])

In [59]:
lev_similarity

array([[ 0, -4, -4, ..., -6, -7, -3],
       [-4,  0, -5, ..., -7, -7, -5],
       [-4, -5,  0, ..., -6, -7, -4],
       ...,
       [-6, -7, -6, ...,  0, -7, -6],
       [-7, -7, -7, ..., -7,  0, -7],
       [-3, -5, -4, ..., -6, -7,  0]])

## 1.3  Using K-Means Clustering

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import fasttext
ft_model = fasttext.load_model('wiki.simple/wiki.simple.bin')



In [61]:
def cosine_similarity(vec_a, vec_b):
    """Compute cosine similarity between vec_a and vec_b"""
    return np.dot(vec_a, vec_b) / \
        (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

In [62]:
cos_similarity = -1*np.array([[cosine_similarity(
    ft_model.get_word_vector(w1),ft_model.get_word_vector(w2))
                               for w1 in words] for w2 in words])

In [63]:
cos_similarity

array([[-1.        , -0.4142262 , -0.36530843, ..., -0.11352272,
        -0.29796284, -0.4405388 ],
       [-0.4142262 , -1.        , -0.42266458, ..., -0.23738801,
        -0.424228  , -0.47339624],
       [-0.36530843, -0.42266458, -0.9999999 , ..., -0.15476146,
        -0.3166491 , -0.40475303],
       ...,
       [-0.11352272, -0.23738801, -0.15476146, ..., -1.        ,
        -0.25787985, -0.20113617],
       [-0.29796284, -0.424228  , -0.3166491 , ..., -0.25787985,
        -1.        , -0.35617706],
       [-0.4405388 , -0.47339624, -0.40475303, ..., -0.20113617,
        -0.35617706, -0.99999994]], dtype=float32)

In [64]:
true_k = 25
kmeans_model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
kmeans_model.fit(cos_similarity)

KMeans(max_iter=100, n_clusters=25, n_init=1)

In [65]:
#kmeans_model.labels_

In [66]:
model_score=[]
for cluster_id in np.unique(kmeans_model.labels_):
    exemplar = cluster_id
    cluster = np.unique(words[np.nonzero(kmeans_model.labels_==cluster_id)])
    cluster_str = ", ".join(cluster) 
    
    oov_lookup = get_oov_words(query)
    if oov_lookup in cluster_str:
        print(f"\033[1m{exemplar} {cluster_str }\033[0m:" )
        for cluster_word in cluster_str.split():
            if oov_lookup!= cluster_word:
                model_score.append(fuzz.ratio(oov_lookup, cluster_word))
    else:
        print(f"\033[1m{exemplar}\033[0m: {cluster_str }" )
        #pass

[1m0[0m: aspl
[1m1[0m: mulakat, singhthatsings
[1m2[0m: banita, magan, phela
[1m3[0m: youtube
[1m4[0m: mashup
[1m5 raghab, raghav[0m:
[1m6[0m: pyaar
[1m7[0m: nasha
[1m8[0m: chaitanya, jaitay
[1m9[0m: datelove, debolinaa, premanto
[1m10[0m: ajnabee, badal, haseena, jaano, pehla
[1m11[0m: im
[1m12[0m: curated
[1m13[0m: tseries
[1m14[0m: sanamrendition
[1m15[0m: kabir, narzary
[1m16[0m: aaye, youre
[1m17[0m: jangir
[1m18[0m: videography
[1m19[0m: ek
[1m20[0m: gai, soo
[1m21[0m: sanam
[1m22[0m: manchanda, nandy
[1m23[0m: adi
[1m24[0m: karte


In [69]:
statistics.mean(model_score)

87.5