# Explicit Entity Resolution: A Short Example

This short exercise is meant to show why explicit entity resolution is costly and ineffective on larger datasets. For a fair comparison, we will compare the same fields in each approach and estimate the total time needed to resolve the entire dataset.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, pairwise_kernels
from scipy import sparse
import numpy as np
from sklearn.preprocessing import normalize
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.metrics.distance import jaro_winkler_similarity
import time
import dask
%matplotlib inline

In [None]:
# Load the album records and build one concatenated text field per row
# (no separator between fields) to use as the comparison key.
data = pd.read_csv(r'Albums.csv')
key_fields = ['title', 'length', 'artist', 'album', 'language']
combined = data[key_fields[0]].astype(str)
for field in key_fields[1:]:
    combined = combined + data[field].astype(str)
data['all'] = combined
data.head()

## Illustration: String Comparison with an Explicit For Loop

In [None]:
# Baseline: score every record against the first record with a plain Python
# loop, then extrapolate that cost to comparing every record with every other.
song_list = data['all'].tolist()
start = time.time()
score_list = []
for song in song_list:
    score_list.append(jaro_winkler_similarity(song, song_list[0]))
# Take the elapsed time once so both messages report the same measurement.
elapsed = time.time() - start
print('Total time required to compare one song with all other songs {} Seconds'.format(elapsed))
# One pass costs `elapsed` seconds; a full comparison needs one pass per record.
print('Total estimated time to compute similarities {} hours'.format((elapsed*len(data))/60/60))

## Illustration: String Comparison with For Loop Parallelized with Dask

In [None]:
def score_string_holdout(record_list, index_holdout):
    """Score every record against one held-out record.

    Args:
        record_list: Sequence of strings to compare.
        index_holdout: Position in ``record_list`` of the record that all
            records (including itself) are compared against.

    Returns:
        A list with one ``jaro_winkler_similarity`` score per entry of
        ``record_list``, in the same order. Empty input yields an empty list.

    Raises:
        IndexError: If ``record_list`` is non-empty and ``index_holdout`` is
            out of range.
    """
    # Nothing to compare; also avoids indexing into an empty sequence below.
    if len(record_list) == 0:
        return []

    # Look the held-out record up once instead of on every iteration.
    holdout = record_list[index_holdout]
    return [jaro_winkler_similarity(record, holdout) for record in record_list]
# Wrap the scoring call in a lazy Dask task; no work happens until .compute().
# NOTE(review): this is a single delayed call, so it appears to build a single
# task -- confirm whether any parallel speed-up is actually obtained here.
compute = dask.delayed(score_string_holdout)(data['all'].tolist(),0)
start = time.time()
score_list = compute.compute()
# Take the elapsed time once so both messages report the same measurement.
elapsed = time.time() - start
print('Total time required to compare one song with all other songs {} Seconds'.format(elapsed))
# One pass costs `elapsed` seconds; a full comparison needs one pass per record.
print('Total estimated time to compute similarities {} hours'.format((elapsed*len(data))/60/60))

In [None]:
# argsort orders indices by ascending score, so the last five positions are
# the five highest-scoring records.
ranked_positions = np.argsort(score_list)
ranked_positions[-5:]

In [None]:
# Show the score at position 3978 next to the score of record 0 against itself.
candidate_pos, reference_pos = 3978, 0
(score_list[candidate_pos], score_list[reference_pos])

In [None]:
# Display the concatenated text of record 3978 and of record 0 side by side.
all_records = data['all'].tolist()
(all_records[3978], all_records[0])