In [78]:
# Sample texts used by every similarity experiment below.
# s1: reference passage describing plagiarism detection and text-matching software.
s1 = 'Plagiarism detection is the process of locating instances of plagiarism within a work or document. The widespread use of computers and the advent of the Internet have made it easier to plagiarize the work of others. Detection of plagiarism can be undertaken in a variety of ways. Human detection is the most traditional form of identifying plagiarism from written work. This can be a lengthy and time-consuming task for the reader and can also result in inconsistencies in how plagiarism is identified within an organization. Text-matching software (TMS), which is also referred to as "plagiarism detection software" or "anti-plagiarism" software, has become widely available, in the form of both commercially available products as well as open-source software. TMS does not actually detect plagiarism per se, but instead finds specific passages of text in one document that match text in another document.'

# s2: shortened, lightly reworded and reordered version of sentences taken from s1.
s2 = 'Plagiarism detection is defined the process of locating instances of plagiarism within a document. TMS does not actually detect plagiarism exactly, but it finds specific passages of text in one document that match text in another document. Traditional form of identifying plagiarism from written work is human detection.'

# s3: a different passage on the same topic (survey-style abstract), sharing vocabulary but not sentences with s1.
s3 = 'To detect plagiarism of any form, it is essential to have broad knowledge of its possible forms and classes, and existence of various tools and systems for its detection. Based on impact or severity of damages, plagiarism may occur in an article or in any production in a number of ways. This survey presents a taxonomy of various plagiarism forms and include discussion on each of these forms. Over the years, a good number tools and techniques have been introduced to detect plagiarism. This paper highlights few promising methods for plagiarism detection based on machine learning techniques. We analyse the pros and cons of these methods and finally we highlight a list of issues and research challenges related to this evolving research problem.'

# s4: very short string that is the opening two words of both s1 and s2.
s4 = 'Plagiarism detection'

In [79]:
from similarity.normalized_levenshtein import NormalizedLevenshtein
# Normalized Levenshtein *distance* for each pair of sample texts,
# printed one value per line in the order listed below.
normalized_levenshtein = NormalizedLevenshtein()
print(
    *(
        normalized_levenshtein.distance(left, right)
        for left, right in (
            (s1, s2),
            (s2, s3),
            (s3, s1),
            (s4, s1),
            (s4, s2),
            (s4, s3),
        )
    ),
    sep="\n",
)

0.7284768211920529
0.7493333333333333
0.7141280353200883
0.977924944812362
0.9375
0.9746666666666667


In [80]:
from similarity.longest_common_subsequence import LongestCommonSubsequence
from similarity.metric_lcs import MetricLCS

def met_lcs(s1,s2):
    """Return an LCS-based similarity between two strings.

    The value is ``1 - MetricLCS().distance(s1, s2)``, i.e. the library's
    normalized LCS distance turned into a similarity.

    Args:
        s1: first string.
        s2: second string.

    Returns:
        float: the similarity score (higher means more similar).
    """
    # Only the normalized metric is needed; the raw LongestCommonSubsequence
    # distance that used to be computed here was never used and merely
    # repeated the expensive dynamic-programming pass.
    return 1 - MetricLCS().distance(s1, s2)
# LCS-based similarity for each pair of sample texts, one value per line.
print(
    *(
        met_lcs(left, right)
        for left, right in (
            (s1, s2),
            (s2, s3),
            (s3, s1),
            (s1, s4),
            (s2, s4),
            (s3, s4),
        )
    ),
    sep="\n",
)

0.2814569536423841
0.2666666666666666
0.41501103752759383
0.02207505518763797
0.0625
0.02533333333333332


In [55]:
from similarity.cosine import Cosine

def met_cosine(s1,s2,n):
    """Cosine similarity between the character n-gram profiles of two strings.

    Args:
        s1: first string.
        s2: second string.
        n: shingle (character n-gram) size passed to ``Cosine``.

    Returns:
        float: cosine similarity of the two profiles, or ``0.0`` when either
        profile is empty.
    """
    cosine = Cosine(n)
    p1 = cosine.get_profile(s1)
    p2 = cosine.get_profile(s2)
    # A string shorter than n yields an empty profile whose norm is zero;
    # similarity_profiles would then divide by zero. Treat "nothing to
    # compare" as zero similarity, matching cosangle()'s convention.
    if not p1 or not p2:
        return 0.0
    return cosine.similarity_profiles(p1, p2)
    
# Character-bigram cosine similarity for each pair of texts, one value per line.
n = 2
print(
    *(
        met_cosine(left, right, n)
        for left, right in (
            (s1, s2),
            (s2, s3),
            (s3, s1),
            (s1, s4),
            (s2, s4),
            (s3, s4),
        )
    ),
    sep="\n",
)

0.4082482904638631
0.4082482904638631
1.0
0.4082482904638631
1.0000000000000002
0.4082482904638631


In [70]:
from similarity.jaccard import Jaccard

def met_jaccard(s1,s2,n):
    """Jaccard similarity of two strings using shingles of size ``n``.

    Builds a fresh ``Jaccard(n)`` scorer and returns its ``similarity`` for
    the pair ``(s1, s2)``.
    """
    return Jaccard(n).similarity(s1, s2)
    
# Character-trigram Jaccard similarity for each pair of texts, one value per line.
n = 3
print(
    *(
        met_jaccard(left, right, n)
        for left, right in (
            (s1, s2),
            (s2, s3),
            (s3, s1),
            (s1, s4),
            (s2, s4),
            (s3, s4),
        )
    ),
    sep="\n",
)

0.875
0.0
0.0
0.0
0.0
0.2


In [83]:
def met_weighted(str1, str2, weights=(0.5, 0.5, 0.0), max_n=4, p=1.2):
    """Blend several n-gram similarity metrics into a single score.

    Three metrics are considered, in this order: ``met_cosine`` (character
    n-grams), ``met_jaccard`` (character n-grams) and ``met_cosine_word``
    (word n-grams). Each metric is evaluated for every n-gram size
    ``1..max_n`` and averaged with per-size weight ``size ** p``; the three
    averages are then combined linearly with ``weights``.

    With the default arguments the result is identical to the previous
    hard-coded behaviour (sizes 1-4, ``p = 1.2``, weights 0.5 / 0.5 / 0.0).

    Args:
        str1: first text.
        str2: second text.
        weights: three coefficients applied to the cosine, Jaccard and
            word-cosine averages respectively.
        max_n: largest n-gram size to evaluate (sizes start at 1).
        p: exponent of the per-size weight ``size ** p``.

    Returns:
        float: the weighted similarity score.

    Raises:
        ValueError: if ``weights`` does not have exactly three entries or
            ``max_n`` is smaller than 1.
    """
    # NOTE: met_cosine_word and the helpers used here must already be defined
    # in the kernel when this function is called.
    metrics = (met_cosine, met_jaccard, met_cosine_word)
    if len(weights) != len(metrics):
        raise ValueError("weights must contain exactly %d values" % len(metrics))
    if max_n < 1:
        raise ValueError("max_n must be at least 1")

    # Pre-compute (size, size ** p) once; it is shared by all metrics.
    size_weights = [(size, size ** p) for size in range(1, max_n + 1)]
    norm = 0
    for _, k in size_weights:
        norm += k

    score = 0.0
    for metric, weight in zip(metrics, weights):
        # A zero coefficient contributes nothing, so skip the (potentially
        # expensive) metric evaluation altogether.
        if weight == 0:
            continue
        # Plain accumulation (not sum()) keeps the floating-point result
        # bit-for-bit equal to the original implementation.
        accumulated = 0
        for size, k in size_weights:
            accumulated += metric(str1, str2, size) * k
        score += (accumulated / norm) * weight
    return score


# Normalise the four sample texts (punctuation stripped, lower-cased), then
# print the blended similarity for each pair, one value per line.
# NOTE: this rebinds s1..s4 to their pre-processed form.
s1, s2, s3, s4 = (pre_process(text) for text in (s1, s2, s3, s4))
print(
    *(
        met_weighted(left, right)
        for left, right in (
            (s1, s2),
            (s2, s3),
            (s3, s1),
            (s1, s4),
            (s2, s4),
            (s3, s4),
        )
    ),
    sep="\n",
)

0.6049519752084935
0.3426514449414421
0.40761883623746475
0.35606211812585503
0.3782826230835536
0.29904896777007306


In [79]:
import functools
import math
from collections import Counter

from nltk import word_tokenize
from nltk.util import ngrams

def dotproduct(v1, v2):
    """Return the dot product of two numeric sequences.

    Elements are paired with ``zip``, so any surplus elements of the longer
    sequence are ignored.
    """
    products = [x * y for x, y in zip(v1, v2)]
    return sum(products)

def length(v):
    """Return the Euclidean norm of ``v`` (square root of ``v . v``)."""
    squared_norm = dotproduct(v, v)
    return math.sqrt(squared_norm)

def cosangle(v1, v2):
    """Return the cosine of the angle between vectors ``v1`` and ``v2``.

    Args:
        v1: first numeric sequence.
        v2: second numeric sequence.

    Returns:
        float: ``dotproduct(v1, v2) / (length(v1) * length(v2))``, or ``0.0``
        when that norm product is zero (e.g. an all-zero or empty vector), so
        callers never hit a division by zero.
    """
    # Compute the norm product once instead of once for the test and once
    # more for the division.
    norm_product = length(v1) * length(v2)
    if norm_product == 0:
        # Return a float so both branches yield the same type.
        return 0.0
    return dotproduct(v1, v2) / norm_product

def _word_ngram_counts(text, n):
    """Count the word-level n-grams of ``text`` (tokenised by ``word_tokenize``)."""
    # n-grams are kept as tuples of tokens: unlike joining with a separator
    # character, distinct n-grams can never collapse into the same key.
    return Counter(ngrams(word_tokenize(text), n))


def met_cosine_word(s1,s2,n):
    """Cosine similarity between the word n-gram frequency vectors of two texts.

    Args:
        s1: first text.
        s2: second text.
        n: number of consecutive word tokens per n-gram.

    Returns:
        The value of ``cosangle`` applied to the two frequency vectors, which
        are aligned over the union of n-grams seen in either text. When a
        text has fewer than ``n`` tokens its vector is all zeros and the
        result is ``cosangle``'s zero-norm fallback.
    """
    freq1 = _word_ngram_counts(s1, n)
    freq2 = _word_ngram_counts(s2, n)
    # Union of all n-grams; iterating the same set twice keeps both vectors
    # aligned position by position.
    vocabulary = freq1.keys() | freq2.keys()
    # Counter returns 0 for n-grams it has not seen.
    vec1 = [freq1[gram] for gram in vocabulary]
    vec2 = [freq2[gram] for gram in vocabulary]
    return cosangle(vec1, vec2)

# Word-trigram cosine similarity for each pair of texts, one value per line.
n = 3
print(
    *(
        met_cosine_word(left, right, n)
        for left, right in (
            (s1, s2),
            (s2, s3),
            (s3, s1),
            (s1, s4),
            (s2, s4),
            (s3, s4),
        )
    ),
    sep="\n",
)

0.3487772492870674
0.0
0.006950816264905735
0
0
0


In [21]:
import string
def pre_process(s):
    """Normalise text: delete every ``string.punctuation`` character, then lower-case.

    Whitespace is left untouched.
    """
    # Mapping each punctuation character to None makes translate() drop it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    without_punctuation = s.translate(drop_punctuation)
    return without_punctuation.lower()

In [76]:
# Short toy strings for quick sanity checks of the metrics.
# s1/s2 differ only in punctuation and letter case, so they become equal once
# pre-processed; s3/s4 differ in a single character.
# NOTE: this rebinds s1..s4, replacing whatever those names held before.
# Each string is normalised exactly once (pre_process is idempotent, so the
# earlier repeated calls on s1 and s2 had no effect).
s1 = pre_process('hey there, I am using whatsapp!')
s2 = pre_process('hey there; i am using whatsapp')
s3 = pre_process('h a l l l')
s4 = pre_process('h e l l l')