In [1]:
#@Author: Rutvik Patel (17BCE0729)
#@Description: Document similarity task

import nltk
from string import punctuation
import math
import requests 
import re
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)
def remove_newline(text):
    clean = re.compile('\n')
    return re.sub(clean, '', text)
def remove_refs(text):
    clean = re.compile('\[.*\]')
    return re.sub(clean, '', text)
    
def extractContent(URL):
    request = requests.get(URL)
    soup = BeautifulSoup(request.content, 'html.parser')
    content = ''
    for tag in soup.findAll('p'):
        content += remove_refs(remove_newline(remove_html_tags(tag.getText())))
    return(content)
    
def fetchArticles(URLs):
    docs = []
    for URL in URLs:
        docs.append(extractContent(URL))
    return(docs)

In [3]:
URLs = ['https://en.wikipedia.org/wiki/Apple',
        'https://en.wikipedia.org/wiki/Pineapple',
        'https://en.wikipedia.org/wiki/Watermelon',
        'https://en.wikipedia.org/wiki/Muskmelon',
        'https://en.wikipedia.org/wiki/Mango',
        'https://en.wikipedia.org/wiki/Banana',
        'https://en.wikipedia.org/wiki/Fruit']

docs = fetchArticles(URLs)
StopWords = set(nltk.corpus.stopwords.words('english'))
StopWords.update(set(punctuation))
StopWords.update(set(['.',',','a','they','the','his','so','and','were','from','that','of','in','only','with','to']))

In [4]:
newdocs = []
def bagOfWords(docs):
    for i in range(len(docs)):
        doc = nltk.tokenize.word_tokenize(docs[i])
        doc = [word for word in doc if not word in StopWords]
        newdocs.append(doc)
        BOWR = {} #bag of words representation
        terms = []
        for doc in newdocs:
            terms.extend(doc)
        terms = set(terms)
        for term in terms:
            row = []
            for doc in newdocs:
                count = 0
                for t in doc:
                    if t == term:
                        count += 1
                row.append(count)
            BOWR[term] = row
        BOWR['total_terms'] = [len(doc) for doc in newdocs]
    return(BOWR)

In [5]:
bag = bagOfWords(docs)
df = pd.DataFrame(bag, index = ['Document ' + str(i + 1) for i in range(len(docs))])
df

Unnamed: 0,increases,emperor,skald,Crushed,lap,openedWild,carry,The,years,problematic,...,ground,center,Apple,method,added,sown,desirable,economic,carotenoids,total_terms
Document 1,0,0,1,0,1,0,1,8,4,0,...,0,1,4,1,0,0,2,0,0,1930
Document 2,0,0,0,1,0,0,0,12,0,0,...,0,0,0,0,1,0,0,1,0,1000
Document 3,1,0,0,0,0,0,0,16,0,0,...,1,1,0,0,0,1,0,0,1,1090
Document 4,0,0,0,0,0,0,0,5,0,0,...,0,0,0,0,0,0,0,0,0,314
Document 5,0,2,0,0,0,0,0,7,2,0,...,0,1,0,0,1,0,0,1,0,1099
Document 6,1,0,0,0,0,1,0,25,1,1,...,1,2,0,0,1,0,0,0,0,2629
Document 7,0,0,0,0,0,0,0,4,0,0,...,0,0,0,0,1,0,0,0,0,1172


In [6]:
#TF matrix
totals = [bag['total_terms'][i] for i in range(len(docs))]
def getTFMatrix(bag):
    TFMatrix = {}
    TFMatrix = {term : [bag[term][i] / totals[i] for i in range(len(docs))] for term in bag.keys()}
    del TFMatrix['total_terms']
    return(TFMatrix)

In [7]:
TF = getTFMatrix(bag)
df = pd.DataFrame(TF, index = ['Document ' + str(i + 1) for i in range(len(docs))])
df

Unnamed: 0,increases,emperor,skald,Crushed,lap,openedWild,carry,The,years,problematic,...,experience,ground,center,Apple,method,added,sown,desirable,economic,carotenoids
Document 1,0.0,0.0,0.000518,0.0,0.000518,0.0,0.000518,0.004145,0.002073,0.0,...,0.0,0.0,0.000518,0.002073,0.000518,0.0,0.0,0.001036,0.0,0.0
Document 2,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.012,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.001,0.0
Document 3,0.000917,0.0,0.0,0.0,0.0,0.0,0.0,0.014679,0.0,0.0,...,0.0,0.000917,0.000917,0.0,0.0,0.0,0.000917,0.0,0.0,0.000917
Document 4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015924,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Document 5,0.0,0.00182,0.0,0.0,0.0,0.0,0.0,0.006369,0.00182,0.0,...,0.00091,0.0,0.00091,0.0,0.0,0.00091,0.0,0.0,0.00091,0.0
Document 6,0.00038,0.0,0.0,0.0,0.0,0.00038,0.0,0.009509,0.00038,0.00038,...,0.00038,0.00038,0.000761,0.0,0.0,0.00038,0.0,0.0,0.0,0.0
Document 7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003413,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000853,0.0,0.0,0.0,0.0


In [8]:
def getIDFVector(bag):
    IDF = {}
    terms = []
    for doc in newdocs:
        terms.extend(doc)
    terms = set(terms)
    for term in terms:
        appears = [0 for _ in range(len(docs))]
        for i in range(len(newdocs)):
            if term in newdocs[i]:
                appears[i] = 1
        IDF[term] = 0 if sum(appears) == 0 else math.log((1 + len(newdocs)) / sum(appears))
    return(IDF)

In [9]:
IDV = getIDFVector(bag)
df = pd.DataFrame(IDV, index = ['IDF values'])
df

Unnamed: 0,increases,emperor,skald,Crushed,lap,openedWild,carry,The,years,problematic,...,experience,ground,center,Apple,method,added,sown,desirable,economic,carotenoids
IDF values,1.386294,2.079442,2.079442,2.079442,2.079442,2.079442,2.079442,0.133531,0.980829,2.079442,...,1.386294,1.386294,0.693147,2.079442,2.079442,0.693147,2.079442,2.079442,1.386294,2.079442


In [10]:
def getTF_IDFMatrix(TF, IDV):
    TF_IDF = {}
    for term in TF:
        TF_IDF[term] = [TF[term][i] * IDV[term] for i in range(len(newdocs))]
    return(TF_IDF)

In [11]:
TF_IDF = getTF_IDFMatrix(TF, IDV)
df = pd.DataFrame(TF_IDF, index = ['Document ' + str(i + 1) for i in range(len(docs))])
df

Unnamed: 0,increases,emperor,skald,Crushed,lap,openedWild,carry,The,years,problematic,...,experience,ground,center,Apple,method,added,sown,desirable,economic,carotenoids
Document 1,0.0,0.0,0.001077,0.0,0.001077,0.0,0.001077,0.000553,0.002033,0.0,...,0.0,0.0,0.000359,0.00431,0.001077,0.0,0.0,0.002155,0.0,0.0
Document 2,0.0,0.0,0.0,0.002079,0.0,0.0,0.0,0.001602,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000693,0.0,0.0,0.001386,0.0
Document 3,0.001272,0.0,0.0,0.0,0.0,0.0,0.0,0.00196,0.0,0.0,...,0.0,0.001272,0.000636,0.0,0.0,0.0,0.001908,0.0,0.0,0.001908
Document 4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002126,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Document 5,0.0,0.003784,0.0,0.0,0.0,0.0,0.0,0.000851,0.001785,0.0,...,0.001261,0.0,0.000631,0.0,0.0,0.000631,0.0,0.0,0.001261,0.0
Document 6,0.000527,0.0,0.0,0.0,0.0,0.000791,0.0,0.00127,0.000373,0.000791,...,0.000527,0.000527,0.000527,0.0,0.0,0.000264,0.0,0.0,0.0,0.0
Document 7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000456,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000591,0.0,0.0,0.0,0.0


In [12]:
def normalize(TF_IDF):
    norm = {}
    denos = [0 for _ in range(len(newdocs))]
    for i in range(len(newdocs)):
        denos[i] += sum([TF_IDF[term][i] ** 2 for term in TF_IDF])
    for i in range(len(denos)):
        denos[i] = denos[i] ** 0.5
    for term in TF_IDF:
        norm[term] = [TF_IDF[term][i] / denos[i] for i in range(len(newdocs))]
    return(norm)
    

In [13]:
norm = normalize(TF_IDF)
df = pd.DataFrame(norm, index = ['Document ' + str(i + 1) for i in range(len(docs))])
df

Unnamed: 0,increases,emperor,skald,Crushed,lap,openedWild,carry,The,years,problematic,...,experience,ground,center,Apple,method,added,sown,desirable,economic,carotenoids
Document 1,0.0,0.0,0.017576,0.0,0.017576,0.0,0.017576,0.009029,0.033161,0.0,...,0.0,0.0,0.005859,0.070305,0.017576,0.0,0.0,0.035153,0.0,0.0
Document 2,0.0,0.0,0.0,0.029113,0.0,0.0,0.0,0.022434,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.009704,0.0,0.0,0.019409,0.0
Document 3,0.019272,0.0,0.0,0.0,0.0,0.0,0.0,0.029701,0.0,0.0,...,0.0,0.019272,0.009636,0.0,0.0,0.0,0.028908,0.0,0.0,0.028908
Document 4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021188,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Document 5,0.0,0.039866,0.0,0.0,0.0,0.0,0.0,0.00896,0.018804,0.0,...,0.013289,0.0,0.006644,0.0,0.0,0.006644,0.0,0.0,0.013289,0.0
Document 6,0.007651,0.0,0.0,0.0,0.0,0.011477,0.0,0.018424,0.005413,0.011477,...,0.007651,0.007651,0.007651,0.0,0.0,0.003826,0.0,0.0,0.0,0.0
Document 7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007264,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.009427,0.0,0.0,0.0,0.0


In [14]:
norm

{'increases': [0.0,
  0.0,
  0.019271956434498735,
  0.0,
  0.0,
  0.0076511601619633895,
  0.0],
 'emperor': [0.0, 0.0, 0.0, 0.0, 0.039865538926210885, 0.0, 0.0],
 'skald': [0.01757626401305808, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'Crushed': [0.0, 0.0291129619011076, 0.0, 0.0, 0.0, 0.0, 0.0],
 'lap': [0.01757626401305808, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'openedWild': [0.0, 0.0, 0.0, 0.0, 0.0, 0.011476740242945085, 0.0],
 'carry': [0.01757626401305808, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'The': [0.009029282001951201,
  0.022433875258291495,
  0.02970118039541172,
  0.021187808947147436,
  0.008959883162598097,
  0.01842447932188989,
  0.007264208379577485],
 'years': [0.03316143023426992,
  0.0,
  0.0,
  0.0,
  0.01880374416984962,
  0.005413339271083294,
  0.0],
 'problematic': [0.0, 0.0, 0.0, 0.0, 0.0, 0.011476740242945085, 0.0],
 'normally': [0.0,
  0.013731977601714518,
  0.0,
  0.0,
  0.0,
  0.005413339271083294,
  0.013339462613666388],
 'nutty': [0.0, 0.0, 0.028907934651748096, 0.0, 0.0

In [15]:
#Document similarity:
def getSimilarityResult(norm):
    similarityRes = {}
    for i in range(0, len(newdocs)):
        for j in range(i + 1, len(newdocs)):
            similarity = 0
            for term in norm:
                similarity += norm[term][i] * norm[term][j]
            res = 'Similarity of document ' + str(i + 1) + ' with document ' + str(j + 1)
            similarityRes[res] = similarity
    return(similarityRes)

In [16]:
result = getSimilarityResult(norm)
df = pd.DataFrame(
    [result.keys(), result.values()],
    index = ['Document Pair', 'Cosine Similarities'],
    columns = ['Comparision ' + str(i) for i in range(1, len(result.values()) + 1)]
                 ).transpose()
df.sort_values('Cosine Similarities', axis = 0, ascending = False, inplace = True, kind = 'quicksort')
df

Unnamed: 0,Document Pair,Cosine Similarities
Comparision 12,Similarity of document 3 with document 4,0.093121
Comparision 6,Similarity of document 1 with document 7,0.0839856
Comparision 21,Similarity of document 6 with document 7,0.0680922
Comparision 5,Similarity of document 1 with document 6,0.0473762
Comparision 11,Similarity of document 2 with document 7,0.0468659
Comparision 15,Similarity of document 3 with document 7,0.0450652
Comparision 10,Similarity of document 2 with document 6,0.0449699
Comparision 14,Similarity of document 3 with document 6,0.043027
Comparision 4,Similarity of document 1 with document 5,0.0412131
Comparision 2,Similarity of document 1 with document 3,0.0397933
