In [1]:
db = [
    'information requirement : query considers the user feedback as information requirement to search',
    'information retrieval : query depends on the model of information retrieval used',
    'prediction problem : Many problems in information retrieval can be viewed as prediction problems',
    'search : A search engine is one of applications of information retrieval models',
    'Feedback : feedback is typically used by the system to modify the query and improve prediction'.
]

In [2]:
### A) Verify if the titles are exactly same (Apply BinaryDistance(u,v), which gives the
# binary distance between vectors u and v, equal to 0 if they are identical and 1
# otherwise.). If same, label the document as duplicate and discard it else proceed to
# second part of the Checker.

dbDocTitles = [doc.split(':')[0] for doc in db]

for doc in docsToAdd:
   if doc.split(':')[0] not in dbDocTitles:
    db.append(doc)
    dbDocTitles.append(doc.split(':')[0])

For B: https://www.geeksforgeeks.org/understanding-tf-idf-term-frequency-inverse-document-frequency/

In [3]:
import math
# Term Frequency
def tf(t, d):
    return d.split().count(t)/len(d.split())

# Document Frequency
def docfreq(t):
    termCount = 0
    for doc in db:
        termCount += doc.split(':')[1].split().count(t)
    return termCount

# Modified Inverse Document Frequency
def mod_idf(t):
    # Number of documents containing term t
    N = 0
    for doc in db:
        if t in doc.split(':')[1].split():
            N += 1
    return math.log((N + 1) / (0.5 + docfreq(t)))

# Given weight formula
# tf-idf(t, d) = tf(t, d) * mod_idf(t)
def tf_mod_idf(t, d):
    return tf(t, d) * mod_idf(t)

In [4]:
# Has to be run during the first run in a new env
# import nltk 
# nltk.download('all')

In [5]:
### B) Represent documents (excluding the title) as term document vectors with weight of
# a term in a document computed as <the-given-formula>

import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stopWords = set(stopwords.words('english'))

docsMap = {doc.split(':')[0]:doc.split(':')[1] for doc in db}


# Stop word removal
stopWordRemovedResult = {}
for title, content in docsMap.items():
    words = word_tokenize(content)
    filtered_words = [w for w in words if not w.lower() in stopWords and w != '.']
    stopWordRemovedResult[title] = filtered_words
    
# Stemming
porter = PorterStemmer()
stemmedListMap = {}
for title, wordList in stopWordRemovedResult.items():
    stemmedWords = []
    for word in wordList:
        stemmedWord = porter.stem(word)
        stemmedWords.append(stemmedWord)
    stemmedListMap[title] = stemmedWords
stemmedListMap

# Joining
stemmedDB = {title: ' '.join(content) for title, content in stemmedListMap.items()}
allTerms = sorted(list(set([term for doc in stemmedListMap.values() for term in doc])))

# Weight calc
documentVectorMatrix = [[tf_mod_idf(term, doc) for term in allTerms] for doc in stemmedDB.values()]
documentVectorMatrix

df = pd.DataFrame(documentVectorMatrix)
df['title'] = dbDocTitles
df.set_index('title', inplace=True)
df.columns = allTerms
df

Unnamed: 0_level_0,applic,consid,depend,engin,feedback,improv,inform,mani,model,modifi,...,problem,queri,requir,retriev,search,system,typic,use,user,view
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
information requirement,0.0,0.099021,0.0,0.0,0.026046,0.0,0.099021,0.0,0.0,0.0,...,0.0,0.099021,0.099021,0.0,0.026046,0.0,0.0,0.0,0.041097,0.0
information retrieval,0.0,0.0,0.115525,0.0,0.0,0.0,0.115525,0.0,0.047947,0.0,...,0.0,0.115525,0.0,0.115525,0.0,0.0,0.0,0.115525,0.0,0.0
prediction problem,0.0,0.0,0.0,0.0,0.0,0.0,0.099021,0.099021,0.0,0.0,...,0.198042,0.0,0.0,0.099021,0.0,0.0,0.0,0.0,0.0,0.099021
search,0.099021,0.0,0.0,0.099021,0.0,0.0,0.099021,0.0,0.041097,0.0,...,0.0,0.0,0.0,0.099021,0.026046,0.0,0.0,0.0,0.0,0.0
Feedback,0.0,0.0,0.0,0.0,0.02279,0.086643,0.0,0.0,0.0,0.086643,...,0.0,0.086643,0.0,0.0,0.0,0.03596,0.086643,0.086643,0.0,0.0


For C : https://www.geeksforgeeks.org/measuring-the-document-similarity-in-python/

In [6]:
### C) Identify a document as duplicate if the similarity of the document is more than the
# threshold α. (α=0.85). Apply cosine similarity for similarity computations.

import math

def dotProd(A, B):
    s = 0
    for word in A[1]:
        if word in B[1]:
            s += (df[word][A[0]] * df[word][B[0]])
    return s

def cosineSimilarity(D1, D2):  
    numerator = dotProd(D1, D2) 
    denominator = math.sqrt(dotProd(D1, D1)*dotProd(D2, D2)) 
    return numerator / denominator

cosineSimilarityResults = []
for docA in stemmedListMap.items():
    for docB in stemmedListMap.items():
        if docA[0] != docB[0]:
            res = cosineSimilarity(docA, docB)
            cosineSimilarityResults.append([docA[0], docB[0], res, 'Yes' if res > 0.85 else 'No'])

dfC = pd.DataFrame(cosineSimilarityResults)
dfC.columns = ['Doc A', 'Doc B', 'Cosine Similarity', 'Plaigarised']
dfC

Unnamed: 0,Doc A,Doc B,Cosine Similarity,Plaigarised
0,information requirement,information retrieval,0.423565,No
1,information requirement,prediction problem,0.133585,No
2,information requirement,search,0.245122,No
3,information requirement,Feedback,0.20613,No
4,information retrieval,information requirement,0.423565,No
5,information retrieval,prediction problem,0.243904,No
6,information retrieval,search,0.454642,No
7,information retrieval,Feedback,0.352004,No
8,prediction problem,information requirement,0.133585,No
9,prediction problem,information retrieval,0.243904,No
