In [1]:
# @Author: Rutvik Patel (17BCE0729)
# @Date: 29 August 2020, 4 September 2020
# @Description: Document Similarity

In [None]:
import math
import re
from collections import Counter
from string import punctuation

import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
#Cleaning data (removing HTML tags and other special characters, references, etc.)
def remove_html_tags(text):
    """Return `text` with every `<...>` span removed.

    The match is non-greedy, so each tag is removed on its own and the text
    between two tags is kept. `.` does not match a newline here, so a tag
    that spans several lines is left in place.
    """
    tag_pattern = '<.*?>'
    return re.sub(tag_pattern, '', text)
def remove_newline(text):
    """Return `text` with every newline character deleted (not replaced by a space)."""
    return re.sub('\n', '', text)
def remove_refs(text):
    """Return `text` with every bracketed reference such as `[1]` removed.

    Each `[...]` group is matched non-greedily, so only the brackets and what
    they enclose are dropped. A greedy match would erase everything between
    the first `[` and the last `]` in the text, including ordinary prose.
    """
    # Raw string: the escaped brackets are regex syntax, not Python string escapes.
    return re.sub(r'\[.*?\]', '', text)

def extractContentByTag(soup, TAG):
    """Extract article text or outbound links from a parsed page.

    Parameters
    ----------
    soup : parsed document exposing BeautifulSoup's `find` / `find_all`.
    TAG : tag name to collect. `'p'` selects the article-text mode; any other
        value selects the link mode.

    Returns
    -------
    str
        For `TAG == 'p'`: the cleaned text of every `TAG` element inside the
        first `div` with class `storyWrap`, concatenated, with the trailing
        subscription footer cut off when it is present. An empty string is
        returned when the page has no such `div`.
    list of str
        Otherwise: the `href` of every `TAG` element found inside a `div` with
        class `cDescription`, keeping only values that start with `http`.
    """
    if TAG == 'p':
        # The attribute filter must be a dict; a set literal here would be
        # interpreted as "class is any of these strings".
        story = soup.find('div', {'class': 'storyWrap'})
        if story is None:
            # Page without an article body: report no text instead of crashing.
            return ''

        text = ''.join(
            remove_refs(remove_newline(remove_html_tags(tag.get_text())))
            for tag in story.find_all(TAG)
        )

        footer = '\xa0To\xa0subscribe\xa0to\xa0National Geographic Traveller India\xa0and\xa0National\xa0Geographic'
        footer_at = text.rfind(footer)
        # rfind returns -1 when the footer is absent; slicing with -1 would
        # silently drop the last character of the article.
        return text if footer_at == -1 else text[:footer_at]

    links = []
    for container in soup.find_all('div', {'class': 'cDescription'}):
        for tag in container.find_all(TAG):
            href = tag.attrs.get('href')
            # Keep absolute http(s) links only; relative and missing hrefs are skipped.
            if href is not None and href.startswith('http'):
                links.append(href)
    return links

def extractURLs(seedURL):
    """Download `seedURL` and return the article links found on that page.

    The page is parsed with the html5lib parser and handed to
    `extractContentByTag` in link mode (tag `'a'`).
    NOTE(review): the request is made without a timeout and the HTTP status
    is not checked.
    """
    response = requests.get(seedURL)
    page = BeautifulSoup(response.content, 'html5lib')
    return extractContentByTag(page, 'a')

def extractArticles(URLs):
    """Download every URL and return a dict mapping URL -> cleaned article text.

    All pages are fetched first, then all are parsed with html5lib, and finally
    the text is extracted with `extractContentByTag` in `'p'` mode. A URL that
    occurs more than once in `URLs` ends up as a single dictionary key.
    """
    responses = []
    for link in URLs:
        responses.append(requests.get(link))

    pages = []
    for response in responses:
        pages.append(BeautifulSoup(response.content, 'html5lib'))

    articles = {}
    for position in range(len(URLs)):
        articles[URLs[position]] = extractContentByTag(pages[position], 'p')
    return articles
    

In [None]:
#Extracting data from the URL using Beautiful Soup
# Collect the article links from the author page, then download each article.
URLs = extractURLs('http://www.natgeotraveller.in/author/lakshmi-sankaran/')
corpus = extractArticles(URLs)

# Overview table: one row per downloaded article with its link and text length.
df = pd.DataFrame(
    data = [(link, len(text)) for link, text in corpus.items()],
    index = ['Article ' + str(number) for number in range(1, len(corpus) + 1)],
    columns = ['Article Link', 'Article Length'],
)
df

In [None]:
def getStopWords():
    """Return the set of tokens to discard before counting terms.

    The set is the union of NLTK's English stop-word list, every character in
    `string.punctuation`, and a short hand-picked list of common words.
    """
    extra_words = ['a','they','the','his','so','and','were','from','that','of','in','only','with','to']
    english_words = nltk.corpus.stopwords.words('english')
    return set(english_words) | set(punctuation) | set(extra_words)
    
def bagOfWords(corpus, processedCorpus, processedCorpusKeys):
    """Tokenise every article and build the term-count table of the corpus.

    Parameters
    ----------
    corpus : dict mapping article key -> raw article text.
    processedCorpus : dict, filled in place with article key -> list of
        lower-cased tokens that are not stop words.
    processedCorpusKeys : list, extended in place with the article keys in
        processing order; it defines the row order of every count list.

    Returns
    -------
    dict
        term -> list of occurrence counts, one entry per key in
        `processedCorpusKeys`. Terms are inserted in sorted order so the table
        has the same column order on every run. The extra key `'total_terms'`
        holds the token count of each article and is stored last (a token
        spelled exactly `'total_terms'` would be overwritten by it).
    """
    StopWords = getStopWords()
    for article in corpus:
        tokens = nltk.tokenize.word_tokenize(corpus[article])
        lowered = [word.lower() for word in tokens]
        processedCorpus[article] = [word for word in lowered if word not in StopWords]
        # Re-running on the same list must not register an article twice.
        if article not in processedCorpusKeys:
            processedCorpusKeys.append(article)

    # Count each article once instead of rescanning its tokens for every term.
    counts = {article: Counter(tokens) for article, tokens in processedCorpus.items()}
    terms = sorted(set().union(*counts.values()))

    # Counter yields 0 for a term that an article does not contain.
    BOWR = {term: [counts[article][term] for article in processedCorpusKeys] for term in terms}
    BOWR['total_terms'] = [len(processedCorpus[article]) for article in processedCorpusKeys]
    return BOWR

In [None]:
# Tokenise the corpus and tabulate the raw term counts, one row per article.
processedCorpus = {}
processedCorpusKeys = []
bag = bagOfWords(corpus, processedCorpus, processedCorpusKeys)
# bagOfWords builds each count list in processedCorpusKeys order, so that list
# (not URLs, which can be longer if a link was scraped twice) labels the rows.
df = pd.DataFrame(bag, index = processedCorpusKeys)
df

In [None]:
#TF matrix
def getTFMatrix(bag, processedCorpus, processedCorpusKeys):
    """Convert raw term counts into term frequencies.

    Parameters
    ----------
    bag : dict of term -> per-article counts, plus a `'total_terms'` entry
        holding the token count of each article.
    processedCorpus : unused; kept so existing callers keep working.
    processedCorpusKeys : list of article keys; only its length is used.

    Returns
    -------
    dict
        term -> list of `count / total_terms` per article, without the
        `'total_terms'` entry. An article with zero tokens gets a frequency of
        0.0 for every term instead of raising ZeroDivisionError. `bag` is not
        modified.
    """
    doc_count = len(processedCorpusKeys)
    totals = [bag['total_terms'][i] for i in range(doc_count)]
    return {
        term: [counts[i] / totals[i] if totals[i] else 0.0 for i in range(doc_count)]
        for term, counts in bag.items()
        if term != 'total_terms'
    }

In [None]:
# Term frequencies, one row per article.
TF = getTFMatrix(bag, processedCorpus, processedCorpusKeys)
# Each TF list has one entry per key in processedCorpusKeys, so label rows with
# that list; URLs can be longer if the same link was scraped more than once.
df = pd.DataFrame(TF, index = processedCorpusKeys)
df

In [None]:
def getIDFVector(bag, processedCorpus, processedCorpusKeys):
    """Compute the inverse document frequency of every term in the corpus.

    Parameters
    ----------
    bag : unused; kept so existing callers keep working.
    processedCorpus : dict mapping article key -> list of tokens.
    processedCorpusKeys : list of article keys that make up the corpus.

    Returns
    -------
    dict
        term -> `log((1 + N) / df)`, where N is the number of keys and df is
        the number of articles containing the term. Terms are inserted in
        sorted order so the result has the same column order on every run.
    """
    doc_count = len(processedCorpusKeys)

    # Document frequency: each article contributes at most once per term.
    doc_freq = Counter()
    for article in processedCorpusKeys:
        doc_freq.update(set(processedCorpus[article]))

    # Every counted term occurs in at least one article, so df is never zero.
    return {term: math.log((1 + doc_count) / doc_freq[term]) for term in sorted(doc_freq)}

In [None]:
# IDF weight of every term, shown as a single-row table.
IDV = getIDFVector(bag, processedCorpus, processedCorpusKeys)
df = pd.DataFrame(data = IDV, index = ['IDF values'])
df

In [None]:
def getTF_IDFMatrix(TF, IDV, processedCorpusKeys):
    """Weight every term frequency by the IDF of its term.

    `TF` maps term -> per-article frequencies and `IDV` maps term -> IDF value.
    The result maps term -> per-article products, with one entry for each key
    in `processedCorpusKeys` (only the length of that list is used).
    """
    doc_count = len(processedCorpusKeys)
    return {
        term: [frequencies[column] * IDV[term] for column in range(doc_count)]
        for term, frequencies in TF.items()
    }

In [None]:
# TF-IDF weights, one row per article.
TF_IDF = getTF_IDFMatrix(TF, IDV, processedCorpusKeys)
# Each TF-IDF list has one entry per key in processedCorpusKeys, so label rows
# with that list; URLs can be longer if the same link was scraped more than once.
df = pd.DataFrame(TF_IDF, index = processedCorpusKeys)
df

In [None]:
def normalize(TF_IDF, processedCorpusKeys):
    """Scale every article's TF-IDF vector to unit Euclidean length.

    Parameters
    ----------
    TF_IDF : dict of term -> per-article TF-IDF weights.
    processedCorpusKeys : list of article keys; only its length is used.

    Returns
    -------
    dict
        term -> per-article weights divided by the L2 norm of that article's
        vector. An article whose vector is all zeros (norm 0) keeps 0.0 for
        every term instead of raising ZeroDivisionError.
    """
    doc_count = len(processedCorpusKeys)

    # L2 norm of each article's column across all terms.
    lengths = [
        sum([weights[i] ** 2 for weights in TF_IDF.values()]) ** 0.5
        for i in range(doc_count)
    ]

    return {
        term: [weights[i] / lengths[i] if lengths[i] else 0.0 for i in range(doc_count)]
        for term, weights in TF_IDF.items()
    }

In [None]:
# Unit-length TF-IDF vectors, one row per article.
norm = normalize(TF_IDF, processedCorpusKeys)
df = pd.DataFrame(
    data = norm,
    index = ['Article ' + str(number) for number in range(1, len(processedCorpusKeys) + 1)],
)
df

In [None]:
#Cosine document similarity:
def getDocumentCosineSimilarityResult(norm, processedCorpusKeys):
    """Return the dot product of the vectors of every unordered article pair.

    `norm` maps term -> per-article weights; only the length of
    `processedCorpusKeys` is used, to know how many articles there are. The
    result maps a label naming the pair (1-based article numbers) to the
    accumulated dot product.
    """
    doc_count = len(processedCorpusKeys)
    pairs = [(first, second) for first in range(doc_count) for second in range(first + 1, doc_count)]

    scores = {}
    for first, second in pairs:
        dot_product = 0
        # Accumulated term by term, in the dictionary's iteration order.
        for weights in norm.values():
            dot_product += weights[first] * weights[second]
        label = ''.join(['Cosine similarity of URL ', str(first + 1), ' with URL ', str(second + 1), ' is: '])
        scores[label] = dot_product
    return scores

In [None]:
# Rank every article pair from most to least similar by cosine similarity.
result = getDocumentCosineSimilarityResult(norm, processedCorpusKeys)
df = pd.DataFrame([result.keys(), result.values()], index = ['Article Pair', 'Cosine Similarity']).T
df = df.sort_values('Cosine Similarity', ascending = False)
print('Key = higher the value of cosine similarity (angle - dot product), the more similar an article pair is')
df

In [None]:
#Document euclidean distances:
def getDocumentEuclideanDistanceResult(norm, processedCorpusKeys):
    """Return the Euclidean distance between the vectors of every unordered article pair.

    `norm` maps term -> per-article weights; only the length of
    `processedCorpusKeys` is used, to know how many articles there are. The
    result maps a label naming the pair (1-based article numbers) to the
    square root of the summed squared differences.
    """
    doc_count = len(processedCorpusKeys)
    pairs = [(first, second) for first in range(doc_count) for second in range(first + 1, doc_count)]

    distances = {}
    for first, second in pairs:
        squared_total = 0
        # Accumulated term by term, in the dictionary's iteration order.
        for weights in norm.values():
            squared_total += math.pow(weights[first] - weights[second], 2)
        label = ''.join(['Euclidean distance of URL ', str(first + 1), ' with URL ', str(second + 1), ' is: '])
        distances[label] = math.pow(squared_total, 0.5)
    return distances

In [None]:
# Rank every article pair from closest to farthest by Euclidean distance.
result = getDocumentEuclideanDistanceResult(norm, processedCorpusKeys)
df = pd.DataFrame([result.keys(), result.values()], index = ['Document Pair', 'Euclidean Distance']).T
df = df.sort_values('Euclidean Distance', ascending = True)
print('Key = lesser the value of Euclidean distance (geometric distance in n-dimentional Euclidean space), the more similar an article pair is')
df