# PS2 Plagiarism Detector

In [145]:
docs = ["information requirement: query considers the user feedback as information requirement to search",
        "information retrieval: query depends on the model of information retrieval",
        "prediction problem: Many problems in information retrieval can be viewed as prediction problems",
        "search: A search engine is one of applications of information retrieval models"]

new_docs = ["Feedback: feedback is typically used by the system to modify the query and improve prediction",
            "information retrieval: ranking in information retrieval algorithms depends on user query",
            "information problem: query consider on the modey of information retriebal"]


1, A) sub_div

In [146]:
def calculate_binary_distance(vector1, vector2):
    dist = 0
    
    for x1, x2 in zip(vector1, vector2):
        dist += abs(x1-x2)
    
    return dist
        

def check_duplicate(vector, ref_matrix):
    for ref_vec in ref_matrix:
        if calculate_binary_distance(vector, ref_vec)==0:
            return True
    
    return False

In [147]:
existing_titles = []

for doc in docs:
    existing_titles.append(doc.split(":")[0])

new_titles = []

for doc in new_docs:
    new_titles.append(doc.split(":")[0])

In [148]:
existing_titles

['information requirement',
 'information retrieval',
 'prediction problem',
 'search']

In [149]:
new_titles

['Feedback', 'information retrieval', 'information problem']

In [150]:
doc_map = {}

for title in existing_titles:
    for word in title.split():
        if word in doc_map:
            doc_map[word] += 1
        else:
            doc_map[word] = 1

doc_matrix = []

for title in existing_titles:
    vector = [0]*len(doc_map)
    
    for word in title.split():
        if word in doc_map:
            index = list(doc_map.keys()).index(word)
            vector[index] = 1
    
    doc_matrix.append(vector)
    
doc_matrix

[[1, 1, 0, 0, 0, 0],
 [1, 0, 1, 0, 0, 0],
 [0, 0, 0, 1, 1, 0],
 [0, 0, 0, 0, 0, 1]]

In [151]:
new_matrix = []

for title in new_titles:
    vector = [0]*len(doc_map)
    
    for word in title.split():
        if word in doc_map:
            index = list(doc_map.keys()).index(word)
            vector[index] = 1
    
    new_matrix.append(vector)
    
new_matrix

[[0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0], [1, 0, 0, 0, 1, 0]]

In [152]:
for i in range(len(new_titles)):
    if check_duplicate(new_matrix[i], doc_matrix):
        print(f"{new_titles[i]} is an existing title.")

information retrieval is an existing title.


1) B subdiv

In [153]:
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')

In [154]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [155]:
def transform_text(td):
    td = td.lower()
    td = nltk.word_tokenize(td)
    
    y = []
    
    for word in td:
        if word not in nltk.corpus.stopwords.words('english'):
            y.append(word)
    
    text = y[:]
    y = []
    
    for word in td:
        y.append(ps.stem(word))
        
    return " ".join(y)

In [156]:
term_documents = []

for doc in docs:
    term_documents.append(doc.split(": ")[1])

term_documents

['query considers the user feedback as information requirement to search',
 'query depends on the model of information retrieval',
 'Many problems in information retrieval can be viewed as prediction problems',
 'A search engine is one of applications of information retrieval models']

In [157]:
processed_docs = []

for doc in term_documents:
    processed_docs.append(transform_text(doc))
    
processed_docs

['queri consid the user feedback as inform requir to search',
 'queri depend on the model of inform retriev',
 'mani problem in inform retriev can be view as predict problem',
 'a search engin is one of applic of inform retriev model']

In [158]:
# Wik = (TFik/LENi)*log(N+1)/0.5+DFk
# TFik - term frequency within doc.
# LENi - len of current doc.
# N - total docs
# DFk - document frequency of the word (no. of documents for which the word exists)

In [159]:
import math

In [160]:
def TF(word, mymap):
    return mymap[word]

def DF(word, corpus_maps):
    count = 0
    
    for hm in corpus_maps:
        if word in hm:
            count += 1
    
    return count

def calculate_weights(doc, doc_map, corpus_maps):
    vector = [0]*len(doc_map)
    
    mymap = {}
    
    for i in doc.split():
        if i in mymap:
            mymap[i] +=1
        else:
            mymap[i] = 1
    
    for word in doc.split():
        TFik = TF(word, mymap)
        LENi = len(doc)
        N = len(corpus_maps)
        DFk = DF(word, corpus_maps)
        
        try:
            index = list(doc_map.keys()).index(word)
            vector[index] = (TFik/LENi)*(math.log(N+1)/(0.5+DFk))
        except ValueError:
            pass
    
    return vector

In [161]:
doc_map = {}

for doc in processed_docs:
    for word in doc.split():
        if word in doc_map:
            doc_map[word] += 1
        else:
            doc_map[word] = 1

doc_matrix = []

list_of_maps = []

for doc in processed_docs:
    hm = {}
    
    for word in doc.split():
        if word in hm:
            hm[word] += 1
        else:
            hm[word] = 1
    
    list_of_maps.append(hm)

for doc in processed_docs:
    vector = calculate_weights(doc, doc_map, list_of_maps)
    doc_matrix.append(vector)

In [162]:
print(doc_matrix)

[[0.011294301139888422, 0.018823835233147374, 0.011294301139888422, 0.018823835233147374, 0.018823835233147374, 0.011294301139888422, 0.006274611744382457, 0.018823835233147374, 0.018823835233147374, 0.011294301139888422, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0.014971515464503257, 0, 0.014971515464503257, 0, 0, 0, 0.0083175085913907, 0, 0, 0, 0.024952525774172098, 0.024952525774172098, 0.014971515464503257, 0.014971515464503257, 0.010693939617502327, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0.010553691229076068, 0.005863161793931149, 0, 0, 0, 0, 0, 0, 0, 0.007538350877911477, 0.01758948538179345, 0.0351789707635869, 0.01758948538179345, 0.01758948538179345, 0.01758948538179345, 0.01758948538179345, 0.01758948538179345, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0.006502779444178183, 0, 0, 0.011705002999520727, 0, 0, 0.011705002999520727, 0.023410005999041455, 0.008360716428229092, 0, 0, 0, 0, 0, 0, 0, 0.01950833833253455, 0.01950833833253455, 0.01950833833253455, 0.0

1) C) Subdiv C

In [163]:
new_term_docs = []

for doc in new_docs:
    new_term_docs.append(doc.split(": ")[1])
    
new_term_docs

['feedback is typically used by the system to modify the query and improve prediction',
 'ranking in information retrieval algorithms depends on user query',
 'query consider on the modey of information retriebal']

In [164]:
processed_new_docs = []

for doc in new_term_docs:
    processed_new_docs.append(transform_text(doc))
    
processed_new_docs

['feedback is typic use by the system to modifi the queri and improv predict',
 'rank in inform retriev algorithm depend on user queri',
 'queri consid on the modey of inform retrieb']

In [165]:
new_matrix = []

for doc in processed_new_docs:
    vector = calculate_weights(doc, doc_map, list_of_maps)
    new_matrix.append(vector)

In [166]:
print(new_matrix)

[[0.008699664391535676, 0, 0.017399328783071353, 0, 0.014499440652559464, 0, 0, 0, 0.014499440652559464, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.014499440652559464, 0, 0, 0.014499440652559464, 0, 0], [0.012146701225917737, 0, 0, 0.02024450204319623, 0, 0, 0.006748167347732077, 0, 0, 0, 0.02024450204319623, 0.02024450204319623, 0, 0, 0.008676215161369813, 0, 0, 0.02024450204319623, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0.014971515464503257, 0.024952525774172098, 0.014971515464503257, 0, 0, 0, 0.0083175085913907, 0, 0, 0, 0, 0.024952525774172098, 0, 0.014971515464503257, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [167]:
# def cosine_similarity(vec1, vec2):
#     mag1 = 0
#     mag2 = 0
#     v1_dot_v2 = 0
    
#     for i in vec1:
#         mag1 += i
#     for i in vec2:
#         mag2 += i
    
#     mag1 = mag1**0.5
#     mag2 = mag2**0.5
    
#     for i,j in zip(vec1, vec2):
#         v1_dot_v2 += i*j
        
#     return v1_dot_v2/(mag1*mag2)

In [168]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

for i in range(len(new_matrix)):
    for j in range(len(doc_matrix)):
        vec1 = np.array(doc_matrix[j])
        vec2 = np.array(new_matrix[i])
#         print(cosine_similarity([vec1], [vec2]), i+1, j+1)
        if cosine_similarity([vec1], [vec2])>=0.85:
            print(f"{i+1}th doc in the system and {j+1}th new doc are similar.")

[[0.49966535]] 1 1
[[0.23204755]] 1 2
[[0.12720778]] 1 3
[[0.15207872]] 1 4
[[0.26640244]] 2 1
[[0.63671739]] 2 2
[[0.18385498]] 2 3
[[0.05004232]] 2 4
[[0.400485]] 3 1
[[0.63468854]] 3 2
[[0.01905523]] 3 3
[[0.17040039]] 3 4
