## Import and download libraries, model and data

In [1]:
# ! pip install gensim
# ! pip install ipynb

In [2]:
from ipynb.fs.full.base import *
import gensim.downloader as api
import pandas as pd

import re
import nltk
from stop_words import get_stop_words
from nltk.corpus import stopwords
import numpy as np

In [3]:
# Load the 'glove-wiki-gigaword-300' word vectors through gensim's downloader
# API (`api`). The later cells call `glove_model.get_vector(word)` on it.
glove_model = api.load('glove-wiki-gigaword-300')

In [4]:
# Read the exported library and keep only the track-title column
# (as a one-column DataFrame, not a Series).
df = pd.read_csv('entire_library_have_fun.csv')[['Track Name']]

## Clean data

In [5]:
# English stop words from NLTK, kept under their own name as before.
nltk_words = list(stopwords.words('english'))
# Combined list: the `stop_words` package entries first, then NLTK's.
# Duplicates are not removed.
stop_words = list(get_stop_words('en')) + nltk_words

def clean(text):
    """Normalise a track name into a space-separated string of content words.

    Steps, in order: lowercase; drop the literal ``_x000d_`` marker; expand a
    few apostrophe-less contractions ("im" -> "i am", ...); delete digits;
    replace non-ASCII runs and punctuation with spaces; tokenise with
    ``nltk.word_tokenize``; drop tokens that are in the module-level
    ``stop_words`` collection and tokens of length 1.

    Args:
        text: Any value. It is converted with ``str()`` first, so a missing
            value such as NaN is processed as the string ``'nan'``.

    Returns:
        The cleaned text. May be an empty string when nothing survives.
    """
    # Apostrophe-less contractions and their expansions, applied in this order.
    contractions = (
        (r'\bid\b', 'i would'),
        (r'\bive\b', 'i have'),
        (r'\bim\b', 'i am'),
        (r'\bcant\b', 'can not'),
        (r'\bdont\b', 'do not'),
        (r'\bwont\b', 'will not'),
        (r'\bthats\b', 'that is'),
    )

    text = str(text).lower()

    # Remove the `_x000D_` marker as a whole, literal token. It is matched in
    # lowercase because the text was just lowercased, and before digits are
    # deleted because the marker itself contains digits. The previous pattern
    # `[_x000D_]+` was a character class and therefore blanked out every "x"
    # in every title.
    text = text.replace('_x000d_', ' ')

    for pattern, expansion in contractions:
        text = re.sub(pattern, expansion, text)

    text = re.sub('[0-9]+', '', text)  # delete numbers
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # non-ASCII runs -> one space

    # Punctuation (underscore included) and line breaks -> space. Non-ASCII
    # punctuation needs no entry here: it was already replaced above.
    text = re.sub(r'[<>{}=~.,:!?\-()\[\]#/@"_]+|[\r\n]', ' ', text)

    tokens = nltk.word_tokenize(text)
    kept = [tok for tok in tokens if tok not in stop_words and len(tok) > 1]
    return ' '.join(kept).strip()


In [6]:
# Store the normalised form of every title next to the raw one, then display the frame.
df['Cleaned Track Name'] = df['Track Name'].apply(clean)
df

## Variables

In [26]:
# Parameters used by the LSH cells below. `f`, `b` and `d` are passed
# positionally as LSHIndex(f, b, d); LSHIndex comes from `base`, which is not
# shown here, so the notes on their meaning are assumptions to confirm.
d = 300  # presumably the embedding dimensionality (the GloVe model name ends in "-300") - TODO confirm
b = 10  # NOTE(review): second LSHIndex argument; meaning not visible from this notebook
f = 100  # NOTE(review): first LSHIndex argument; meaning not visible from this notebook
k = 20  # second argument of lsh_model_song.query(q, k); presumably the number of results requested - confirm

## LSH Model and Index

In [32]:
# Index every cleaned track name under the mean of its words' GloVe vectors.
lsh_model_song = LSHIndex(f, b, d)

# Maps the printed form of each indexed vector (as a 1-row 2-D array) to its
# cleaned track name. The query cells look names up again with `str(i)`, so
# the key format must stay exactly `str(np.array([x]))`.
sentence_vector_map = {}

for index, value in enumerate(df['Cleaned Track Name']):
    vectors = []
    for word in value.split():
        try:
            vectors.append(glove_model.get_vector(word))
        except KeyError:
            # Word is not in the GloVe vocabulary: leave it out of the average.
            # Only KeyError is caught so that real failures (and Ctrl-C) are
            # no longer silently swallowed.
            continue

    # Titles with no known word (including empty cleaned titles) are not indexed.
    if vectors:
        x = np.array(sum(vectors)/len(vectors))
        sentence_vector_map[str(np.array([x]))] = value
        lsh_model_song.index(index, x)

## Query over 'end of world'
### top 3 similarities:
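-  Track names list: ['World’s About To End'] Similarity: 0.99999994 (after cleaning, both the query and this title reduce to the words `world` and `end`, so their averaged vectors are the same up to rounding)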
-  Track names list: ['My World', 'The World Is Yours', 'The World', 'The World', 'World', 'My World'] Similarity: 0.8698745
-  Track names list: ['Half the World'] Similarity: 0.85235816

In [33]:
# Embed the query the same way the titles were indexed: clean it, then
# average the GloVe vectors of the words that are in the vocabulary.
value = 'end of world'
clean_value = clean(value)

vectors = []
for word in clean_value.split():
    try:
        vectors.append(glove_model.get_vector(word))
    except KeyError:
        # Out-of-vocabulary word: skip it.
        continue

# Fail loudly instead of querying with an undefined `q` or one left over
# from a previously executed cell.
if not vectors:
    raise ValueError(f"No GloVe vector found for any word of query {value!r}")
q = np.array([np.array(sum(vectors)/len(vectors))])

results = lsh_model_song.query(q, k)

# A set collapses results that render to the same line; its display order
# is not the similarity ranking.
set_results = set()
for i in results:
    sentence = sentence_vector_map[str(i)]
    # All raw titles whose cleaned form equals the matched one.
    track_names = df.loc[df['Cleaned Track Name'] == sentence, 'Track Name'].tolist()
    similarity = 1 - cosine_distance(i,q)
    set_results.add(f" Track names list: {track_names} Similarity: {similarity}")
set_results

{" Track names list: ['Big World'] Similarity: [[0.81706697]]",
 " Track names list: ['End of Time', 'End Of Time'] Similarity: [[0.8323111]]",
 " Track names list: ['Half the World'] Similarity: [[0.85235816]]",
 " Track names list: ['Let The World Turn'] Similarity: [[0.7884188]]",
 " Track names list: ['My World', 'The World Is Yours', 'The World', 'The World', 'World', 'My World'] Similarity: [[0.8698745]]",
 " Track names list: ['New World'] Similarity: [[0.8213221]]",
 " Track names list: ['Night of Blood in a World Without End'] Similarity: [[0.8081635]]",
 " Track names list: ['Not With All the Hope in the World'] Similarity: [[0.8420321]]",
 " Track names list: ['The Day The World Went Away', 'The Day The World Went Away', 'The Day The World Went Away', 'The Day The World Went Away', 'The Day The World Went Away'] Similarity: [[0.79932874]]",
 " Track names list: ['World’s About To End'] Similarity: [[0.99999994]]"}

## Query over 'he and his friend'
### top 3 similarities:
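- Track names list: ['What a Friend', 'Friend To All', 'Friend', 'Only Friend'] Similarity: 1 (the query and each of these titles reduce to the single word `friend` once stop words are removed)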
- Track names list: ['Friend of the Night'] Similarity: 0.81013167
- Track names list: ['All My Friends', 'Friends', 'All My Friends'] Similarity: 0.7554451

In [34]:
# Embed the query the same way the titles were indexed: clean it, then
# average the GloVe vectors of the words that are in the vocabulary.
value = 'he and his friend'
clean_value = clean(value)

vectors = []
for word in clean_value.split():
    try:
        vectors.append(glove_model.get_vector(word))
    except KeyError:
        # Out-of-vocabulary word: skip it.
        continue

# Fail loudly instead of querying with an undefined `q` or one left over
# from a previously executed cell.
if not vectors:
    raise ValueError(f"No GloVe vector found for any word of query {value!r}")
q = np.array([np.array(sum(vectors)/len(vectors))])

results = lsh_model_song.query(q, k)

# A set collapses results that render to the same line; its display order
# is not the similarity ranking.
set_results = set()
for i in results:
    sentence = sentence_vector_map[str(i)]
    # All raw titles whose cleaned form equals the matched one.
    track_names = df.loc[df['Cleaned Track Name'] == sentence, 'Track Name'].tolist()
    similarity = 1 - cosine_distance(i,q)
    set_results.add(f" Track names list: {track_names} Similarity: {similarity}")
set_results

{" Track names list: ['All My Friends', 'Friends', 'All My Friends'] Similarity: [[0.7554451]]",
 " Track names list: ['Another Girl'] Similarity: [[0.51693785]]",
 " Track names list: ['Danny Boy'] Similarity: [[0.49132073]]",
 " Track names list: ['Family Song for the Leaving'] Similarity: [[0.5208501]]",
 " Track names list: ['Fortunate Son - Live'] Similarity: [[0.4717186]]",
 " Track names list: ['Friend of the Night'] Similarity: [[0.81013167]]",
 " Track names list: ['Half Sister'] Similarity: [[0.46893597]]",
 " Track names list: ['Hello, Good Night'] Similarity: [[0.47629404]]",
 " Track names list: ['Killing Your Love'] Similarity: [[0.4923262]]",
 " Track names list: ['Not Even Married'] Similarity: [[0.5281723]]",
 " Track names list: ['Not a Daughter', 'Daughter'] Similarity: [[0.59166664]]",
 " Track names list: ['The Night Me and Your Mama Met'] Similarity: [[0.4992473]]",
 " Track names list: ['The Poet You Never Were'] Similarity: [[0.54257655]]",
 " Track names list: 