In [1]:
# @Author: Rutvik Patel (17BCE0729)
# @Date: 29 August 2020, 4 September 2020
# @Description: Document similarity using cosine similarity and Euclidean distance

In [2]:
import nltk
from string import punctuation
import math
import requests 
import re
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
#Cleaning data (removing HTML tags and other special characters, references, etc.)
def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)
def remove_newline(text):
    clean = re.compile('\n')
    return re.sub(clean, '', text)
def remove_refs(text):
    clean = re.compile('\[.*\]')
    return re.sub(clean, '', text)

def extractContentByTag(soup, TAG):
    contents = []
    if TAG == 'p':
        contents = ''
        for tag in soup.find('div', {'class:', 'storyWrap'}).findAll(TAG):
            contents += (remove_refs(remove_newline(remove_html_tags(tag.getText()))))
        contents = contents[ : contents.rfind('\xa0To\xa0subscribe\xa0to\xa0National Geographic Traveller India\xa0and\xa0National\xa0Geographic')]
        
    else:
        for parentTag in soup.findAll('div', {'class:', 'cDescription'}):
            for tag in parentTag.findAll(TAG):
                if 'href' in tag.attrs.keys():
                    if not tag.attrs['href'].startswith('http', 0): #Filter useless URLs
                        continue
                    contents.append(tag.attrs['href'])
            
    return(contents)

def extractURLs(seedURL):
    req = requests.get(seedURL)
    soup = BeautifulSoup(req.content, 'html5lib')
    URLs = extractContentByTag(soup, 'a')
    return(URLs)

def extractArticles(URLs):
    corpus = {}
    reqs = [requests.get(URL) for URL in URLs]
    soups = [BeautifulSoup(req.content, 'html5lib') for req in reqs] 
    corpus = {URLs[i] : extractContentByTag(soups[i], 'p') for i in range(len(URLs))}
    return(corpus)
    

In [4]:
#Extracting data frtom the URL using beautiful soup
URLs = extractURLs('http://www.natgeotraveller.in/author/lakshmi-sankaran/')
corpus = extractArticles(URLs)
df = pd.DataFrame([(URL, len(corpus[URL])) for URL in corpus], index = ['Article ' + str(i + 1) for i in range(len(corpus.keys()))], columns = ['Article Link', 'Article Length'])
df

Unnamed: 0,Article Link,Article Length
Article 1,http://www.natgeotraveller.in/a-culture-ride-t...,9944
Article 2,http://www.natgeotraveller.in/splendour-at-sea/,4667
Article 3,http://www.natgeotraveller.in/in-thailand-nost...,4299
Article 4,http://www.natgeotraveller.in/a-citys-state-of...,2458
Article 5,http://www.natgeotraveller.in/the-great-outdoo...,2424
Article 6,http://www.natgeotraveller.in/searching-for-ou...,2431
Article 7,http://www.natgeotraveller.in/a-bombay-classic...,2449
Article 8,http://www.natgeotraveller.in/redefining-goals/,2517
Article 9,http://www.natgeotraveller.in/why-our-forests-...,2553
Article 10,http://www.natgeotraveller.in/a-soulmate-for-s...,2482


In [5]:
def getStopWords():
    StopWords = set(nltk.corpus.stopwords.words('english'))
    StopWords.update(set(punctuation))
    StopWords.update(set(['a','they','the','his','so','and','were','from','that','of','in','only','with','to']))
    return(StopWords)
    
def bagOfWords(corpus, processedCorpus, processedCorpusKeys):
    StopWords = getStopWords()
    for article in corpus:
        doc = nltk.tokenize.word_tokenize(corpus[article])
        doc = [word.lower() for word in doc if not word.lower() in StopWords]
        processedCorpus[article] = doc
        processedCorpusKeys.append(article)
    BOWR = {} #bag of words representation
    terms = []
    for article in processedCorpus:
        terms.extend(processedCorpus[article])
    terms = set(terms)
    for term in terms:
        row = []
        for article in processedCorpusKeys:
            count = 0
            for t in processedCorpus[article]:
                if t == term:
                    count += 1
            row.append(count)
        BOWR[term] = row
    BOWR['total_terms'] = [len(processedCorpus[article]) for article in processedCorpusKeys]
    return(BOWR)

In [6]:
processedCorpus = {}
processedCorpusKeys = []
bag = bagOfWords(corpus, processedCorpus, processedCorpusKeys)
df = pd.DataFrame(bag, index = [URL for URL in URLs])
df

Unnamed: 0,passage.the,newcomer,typical,mischief-makers.not,varanasi,tribal,actor,southwest,sneak,’,...,metallic,thrills,celebrated,faraway,rubber,york,tahoe.while,cooking,artists,total_terms
http://www.natgeotraveller.in/a-culture-ride-through-chiang-mai-and-chiang-rai/,0,0,0,0,0,3,0,0,0,30,...,0,0,0,1,0,0,0,0,0,1063
http://www.natgeotraveller.in/splendour-at-sea/,0,0,0,0,0,0,0,0,1,14,...,0,0,0,0,0,0,0,1,0,474
http://www.natgeotraveller.in/in-thailand-nostalgia-on-wheels/,0,0,0,0,0,0,0,0,0,17,...,0,0,0,0,0,0,0,1,0,432
http://www.natgeotraveller.in/a-citys-state-of-mind/,0,0,0,0,0,0,0,0,0,14,...,0,0,0,0,0,2,0,0,0,257
http://www.natgeotraveller.in/the-great-outdoors-may-not-be-great-for-long/,0,0,0,0,0,0,0,0,0,5,...,0,0,0,1,0,1,1,0,0,223
http://www.natgeotraveller.in/searching-for-our-pop-culture-gods/,0,0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,2,0,0,1,236
http://www.natgeotraveller.in/a-bombay-classic-that-endures/,0,0,0,0,0,0,0,0,0,3,...,0,0,0,0,0,1,0,0,0,233
http://www.natgeotraveller.in/redefining-goals/,0,0,0,0,0,0,0,0,0,10,...,0,0,0,1,0,0,0,0,0,249
http://www.natgeotraveller.in/why-our-forests-need-watching/,0,0,0,0,0,0,0,0,0,5,...,0,1,0,0,0,0,0,0,0,243
http://www.natgeotraveller.in/a-soulmate-for-summer/,0,0,0,0,0,0,0,0,0,7,...,0,0,0,0,0,0,0,0,0,220


In [7]:
#TF matrix
def getTFMatrix(bag, processedCorpus, processedCorpusKeys):
    totals = [bag['total_terms'][i] for i in range(len(processedCorpusKeys))]
    TFMatrix = {}
    TFMatrix = {term : [bag[term][i] / totals[i] for i in range(len(processedCorpusKeys))] for term in bag.keys()}
    del TFMatrix['total_terms']
    return(TFMatrix)

In [8]:
TF = getTFMatrix(bag, processedCorpus, processedCorpusKeys)
df = pd.DataFrame(TF, index = URLs)
df

Unnamed: 0,passage.the,newcomer,typical,mischief-makers.not,varanasi,tribal,actor,southwest,sneak,’,...,spin,metallic,thrills,celebrated,faraway,rubber,york,tahoe.while,cooking,artists
http://www.natgeotraveller.in/a-culture-ride-through-chiang-mai-and-chiang-rai/,0.0,0.0,0.0,0.0,0.0,0.002822,0.0,0.0,0.0,0.028222,...,0.0,0.0,0.0,0.0,0.000941,0.0,0.0,0.0,0.0,0.0
http://www.natgeotraveller.in/splendour-at-sea/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00211,0.029536,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00211,0.0
http://www.natgeotraveller.in/in-thailand-nostalgia-on-wheels/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039352,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002315,0.0
http://www.natgeotraveller.in/a-citys-state-of-mind/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054475,...,0.0,0.0,0.0,0.0,0.0,0.0,0.007782,0.0,0.0,0.0
http://www.natgeotraveller.in/the-great-outdoors-may-not-be-great-for-long/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022422,...,0.0,0.0,0.0,0.0,0.004484,0.0,0.004484,0.004484,0.0,0.0
http://www.natgeotraveller.in/searching-for-our-pop-culture-gods/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016949,...,0.0,0.0,0.0,0.0,0.0,0.0,0.008475,0.0,0.0,0.004237
http://www.natgeotraveller.in/a-bombay-classic-that-endures/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012876,...,0.0,0.0,0.0,0.0,0.0,0.0,0.004292,0.0,0.0,0.0
http://www.natgeotraveller.in/redefining-goals/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040161,...,0.0,0.0,0.0,0.0,0.004016,0.0,0.0,0.0,0.0,0.0
http://www.natgeotraveller.in/why-our-forests-need-watching/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020576,...,0.0,0.0,0.004115,0.0,0.0,0.0,0.0,0.0,0.0,0.0
http://www.natgeotraveller.in/a-soulmate-for-summer/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031818,...,0.004545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
def getIDFVector(bag, processedCorpus, processedCorpusKeys):
    IDF = {}
    terms = []
    for article in processedCorpusKeys:
        terms.extend(processedCorpus[article])
    terms = set(terms)
    for term in terms:
        appears = [0 for _ in range(len(processedCorpusKeys))]
        for i in range(len(processedCorpusKeys)):
            if term in processedCorpus[processedCorpusKeys[i]]:
                appears[i] = 1
        IDF[term] = 0 if sum(appears) == 0 else math.log((1 + len(processedCorpusKeys)) / sum(appears))
    return(IDF)

In [10]:
IDV = getIDFVector(bag, processedCorpus, processedCorpusKeys)
df = pd.DataFrame(IDV, index = ['IDF values'])
df

Unnamed: 0,passage.the,newcomer,typical,mischief-makers.not,varanasi,tribal,actor,southwest,sneak,’,...,spin,metallic,thrills,celebrated,faraway,rubber,york,tahoe.while,cooking,artists
IDF values,3.332205,3.332205,2.233592,3.332205,3.332205,2.639057,3.332205,3.332205,3.332205,0.036368,...,2.639057,3.332205,3.332205,2.639057,1.722767,3.332205,1.386294,3.332205,2.639057,3.332205


In [11]:
def getTF_IDFMatrix(TF, IDV, processedCorpusKeys):
    TF_IDF = {}
    for term in TF:
        TF_IDF[term] = [TF[term][i] * IDV[term] for i in range(len(processedCorpusKeys))]
    return(TF_IDF)

In [12]:
TF_IDF = getTF_IDFMatrix(TF, IDV, processedCorpusKeys)
df = pd.DataFrame(TF_IDF, index = URLs)
df

Unnamed: 0,passage.the,newcomer,typical,mischief-makers.not,varanasi,tribal,actor,southwest,sneak,’,...,spin,metallic,thrills,celebrated,faraway,rubber,york,tahoe.while,cooking,artists
http://www.natgeotraveller.in/a-culture-ride-through-chiang-mai-and-chiang-rai/,0.0,0.0,0.0,0.0,0.0,0.007448,0.0,0.0,0.0,0.001026,...,0.0,0.0,0.0,0.0,0.001621,0.0,0.0,0.0,0.0,0.0
http://www.natgeotraveller.in/splendour-at-sea/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00703,0.001074,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005568,0.0
http://www.natgeotraveller.in/in-thailand-nostalgia-on-wheels/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001431,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006109,0.0
http://www.natgeotraveller.in/a-citys-state-of-mind/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001981,...,0.0,0.0,0.0,0.0,0.0,0.0,0.010788,0.0,0.0,0.0
http://www.natgeotraveller.in/the-great-outdoors-may-not-be-great-for-long/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000815,...,0.0,0.0,0.0,0.0,0.007725,0.0,0.006217,0.014943,0.0,0.0
http://www.natgeotraveller.in/searching-for-our-pop-culture-gods/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000616,...,0.0,0.0,0.0,0.0,0.0,0.0,0.011748,0.0,0.0,0.01412
http://www.natgeotraveller.in/a-bombay-classic-that-endures/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000468,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00595,0.0,0.0,0.0
http://www.natgeotraveller.in/redefining-goals/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001461,...,0.0,0.0,0.0,0.0,0.006919,0.0,0.0,0.0,0.0,0.0
http://www.natgeotraveller.in/why-our-forests-need-watching/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000748,...,0.0,0.0,0.013713,0.0,0.0,0.0,0.0,0.0,0.0,0.0
http://www.natgeotraveller.in/a-soulmate-for-summer/,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001157,...,0.011996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
def normalize(TF_IDF, processedCorpusKeys):
    norm = {}
    denos = [0 for _ in range(len(processedCorpusKeys))]
    for i in range(len(processedCorpusKeys)):
        denos[i] += sum([TF_IDF[term][i] ** 2 for term in TF_IDF])
    for i in range(len(denos)):
        denos[i] = denos[i] ** 0.5
    for term in TF_IDF:
        norm[term] = [TF_IDF[term][i] / denos[i] for i in range(len(processedCorpusKeys))]
    return(norm)

In [14]:
norm = normalize(TF_IDF, processedCorpusKeys)
df = pd.DataFrame(norm, index = ['Article ' + str(i + 1) for i in range(len(processedCorpusKeys))])
df

Unnamed: 0,passage.the,newcomer,typical,mischief-makers.not,varanasi,tribal,actor,southwest,sneak,’,...,spin,metallic,thrills,celebrated,faraway,rubber,york,tahoe.while,cooking,artists
Article 1,0.0,0.0,0.0,0.0,0.0,0.069606,0.0,0.0,0.0,0.009592,...,0.0,0.0,0.0,0.0,0.015146,0.0,0.0,0.0,0.0,0.0
Article 2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047164,0.007206,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037353,0.0
Article 3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009545,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040746,0.0
Article 4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012402,...,0.0,0.0,0.0,0.0,0.0,0.0,0.067538,0.0,0.0,0.0
Article 5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004697,...,0.0,0.0,0.0,0.0,0.044505,0.0,0.035813,0.086082,0.0,0.0
Article 6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003122,...,0.0,0.0,0.0,0.0,0.0,0.0,0.059497,0.0,0.0,0.071506
Article 7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00182,...,0.0,0.0,0.0,0.0,0.0,0.0,0.023131,0.0,0.0,0.0
Article 8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006862,...,0.0,0.0,0.0,0.0,0.032508,0.0,0.0,0.0,0.0,0.0
Article 9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004161,...,0.0,0.0,0.076243,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Article 10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006424,...,0.066593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
#Cosine document similarity:
def getDocumentCosineSimilarityResult(norm, processedCorpusKeys):
    similarityRes = {}
    for i in range(0, len(processedCorpusKeys)):
        for j in range(i + 1, len(processedCorpusKeys)):
            similarity = 0
            for term in norm:
                similarity += norm[term][i] * norm[term][j]
            res = 'Cosine similarity of URL ' + str(i + 1) + ' with URL ' + str(j + 1) + ' is: '
            similarityRes[res] = similarity
    return(similarityRes)

In [16]:
result = getDocumentCosineSimilarityResult(norm, processedCorpusKeys)
df = pd.DataFrame([result.keys(), result.values()], index = ['Article Pair', 'Cosine Similarity'], columns = ['Comparision ' + str(i + 1) for i in range(len(result.values()))]).transpose()
df.sort_values('Cosine Similarity', axis = 0, ascending = False, inplace = True, kind = 'quicksort')
print('Key = higher the value of cosine similarity (angle - dot product), the more similar an article pair is')
df

Key = higher the value of cosine similarity (angle - dot product), the more similar an article pair is


Unnamed: 0,Article Pair,Cosine Similarity
Comparision 349,Cosine similarity of URL 25 with URL 26 is:,0.0967496
Comparision 107,Cosine similarity of URL 5 with URL 14 is:,0.092669
Comparision 1,Cosine similarity of URL 1 with URL 2 is:,0.0880026
Comparision 350,Cosine similarity of URL 25 with URL 27 is:,0.0813811
Comparision 325,Cosine similarity of URL 20 with URL 22 is:,0.069155
...,...,...
Comparision 136,Cosine similarity of URL 6 with URL 22 is:,0.00670025
Comparision 129,Cosine similarity of URL 6 with URL 15 is:,0.00579299
Comparision 241,Cosine similarity of URL 12 with URL 22 is:,0.00494994
Comparision 158,Cosine similarity of URL 7 with URL 24 is:,0.00485182


In [17]:
#Document euclidean distances:
def getDocumentEuclideanDistanceResult(norm, processedCorpusKeys):
    similarityRes = {}
    for i in range(0, len(processedCorpusKeys)):
        for j in range(i + 1, len(processedCorpusKeys)):
            similarity = 0
            for term in norm:
                similarity += math.pow(norm[term][i] - norm[term][j], 2)
            similarity = math.pow(similarity, 0.5)
            res = 'Euclidean distance of URL ' + str(i + 1) + ' with URL ' + str(j + 1) + ' is: '
            similarityRes[res] = similarity
    return(similarityRes)

In [18]:
result = getDocumentEuclideanDistanceResult(norm, processedCorpusKeys)
df = pd.DataFrame([result.keys(), result.values()], index = ['Document Pair', 'Euclidean Distance'], columns = ['Comparision ' + str(i + 1) for i in range(len(result.values()))]).transpose()
df.sort_values('Euclidean Distance', axis = 0, ascending = True, inplace = True, kind = 'quicksort')
print('Key = lesser the value of Euclidean distance (geometric distance in n-dimentional Euclidean space), the more similar an article pair is')
df

Key = lesser the value of Euclidean distance (geometric distance in n-dimentional Euclidean space), the more similar an article pair is


Unnamed: 0,Document Pair,Euclidean Distance
Comparision 349,Euclidean distance of URL 25 with URL 26 is:,1.34406
Comparision 107,Euclidean distance of URL 5 with URL 14 is:,1.34709
Comparision 1,Euclidean distance of URL 1 with URL 2 is:,1.35055
Comparision 350,Euclidean distance of URL 25 with URL 27 is:,1.35545
Comparision 325,Euclidean distance of URL 20 with URL 22 is:,1.36444
...,...,...
Comparision 136,Euclidean distance of URL 6 with URL 22 is:,1.40947
Comparision 129,Euclidean distance of URL 6 with URL 15 is:,1.41011
Comparision 241,Euclidean distance of URL 12 with URL 22 is:,1.41071
Comparision 158,Euclidean distance of URL 7 with URL 24 is:,1.41078


## Conclusion:

### As seen from the above results, the cosine similarity and the Euclidean distance both yield the same results (sorted) in terms of similarity.

### Moreover, the documents are hardly similar in the usage of the terms.

### It can be concluded that the articles are fresh every single time.