In [None]:
# !git clone https://github.com/AashitaK/Plagiarism-Detection.git

In [None]:
# %cd Plagiarism-Detection

In [None]:
import numpy as np
import pandas as pd
import re

import os
path = "input/" # Update path

import nltk
from nltk import trigrams, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

In [2]:
def clean_file(myfile):
    """Read an open text file and return its normalised contents.

    The text is lower-cased, each line break (plus any whitespace that
    follows it) is collapsed into a single space, apostrophes, colons, plus
    signs, hyphens and digit runs are removed, empty ``()`` pairs are
    dropped, and two full stops separated only by whitespace are merged into
    one. Leading and trailing whitespace is stripped from the result.

    Parameters
    ----------
    myfile : file-like object
        Anything exposing ``read()`` that returns a ``str``.

    Returns
    -------
    str
        The cleaned text.
    """
    # Applied strictly in this order; later patterns see the output of earlier ones.
    substitutions = (
        (r'[\n]\s*', r' '),
        (r'[\']|[:]|[+]|\d+|[--]', ''),
        (r'\(\)', r''),
        (r'\.\s+\.', r'.'),
    )
    text = myfile.read().lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def get_dataframe(files):
    """Load and clean the named files into a single-column DataFrame.

    Each name in ``files`` is appended to the module-level ``path`` prefix,
    opened as UTF-8 (a leading BOM is dropped by the ``utf-8-sig`` codec) and
    passed through ``clean_file``.

    Parameters
    ----------
    files : iterable of str
        File names relative to ``path``.

    Returns
    -------
    pandas.DataFrame
        One row per file, in the order given, with the cleaned text in the
        ``Text`` column.
    """
    cleaned_texts = []
    for filename in files:
        with open(path + filename, mode='r', encoding='utf-8-sig') as handle:
            cleaned_texts.append(clean_file(handle))
    return pd.DataFrame(cleaned_texts, columns=['Text'])

In [3]:
# Collect the suspicious documents in sorted filename order and load their cleaned text.
suspicious_files = sorted([f for f in os.listdir(path) if f.startswith('suspicious-document')])
suspicious = get_dataframe(suspicious_files)
# The 5 characters after the 19-character 'suspicious-document' prefix are kept as the file index.
suspicious['File_index'] = [f[19:24] for f in suspicious_files]
plagiarized = pd.read_csv(path + "Plagiarized.csv")
# NOTE(review): the labels are attached by matching row index, so this assumes the CSV rows
# are in the same order as the sorted filenames -- confirm against Plagiarized.csv.
suspicious['Plagiarized'] = plagiarized.Plagiarized
suspicious.head()

Unnamed: 0,Text,File_index,Plagiarized
0,bible studies in the life of paul historical a...,1,1
1,my impatience to inhabit the hermitage not per...,2,1
2,morning on the beachthe three letters i...,3,1
3,this morning it rained so hard (though it was ...,4,0
4,deadham hard a romance by lucas malet (mary st...,5,1


In [4]:
# Collect the source documents in sorted filename order and load their cleaned text.
source_files = sorted([f for f in os.listdir(path) if f.startswith('source-document')])
source = get_dataframe(source_files)
# The 5 characters after the 15-character 'source-document' prefix are kept as the file index.
source['File_index'] = [f[15:20] for f in source_files]
source.head()

Unnamed: 0,Text,File_index
0,"our next day was a pleasant, lazy day, during ...",18
1,she stepped back to scyllas side. there was a ...,40
2,"punch, or the london charivari. volume . may ,...",47
3,the leicestershires beyond baghdad by edward j...,55
4,"""we soon began to find stones and dirt in the ...",88


In [5]:
def process_text(df):
    """Add ``Tokens`` and ``Trigrams`` columns to ``df`` and return it.

    ``Text`` is tokenised with NLTK's ``word_tokenize``; English stop words
    and a fixed list of punctuation tokens are then removed. ``Trigrams``
    holds, per row, the set of distinct consecutive token triples that remain
    after filtering.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Text`` column of strings. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two columns added.
    """
    punctuation = (".", ",", "?", "-", "!", "'", '"', "\\", "/", ";", "{", "}", "(", ")",
                   "[", "]", "''", "``", "*", "$", "%")
    excluded = set(stopwords.words('english')) | set(punctuation)

    def keep_content_tokens(tokens):
        # Drop stop words and punctuation-only tokens, preserving order.
        return [tok for tok in tokens if tok not in excluded]

    def distinct_trigrams(tokens):
        # A set, so repeated trigrams within one document count once.
        return set(trigrams(tokens))

    df['Tokens'] = df['Text'].apply(word_tokenize).apply(keep_content_tokens)
    df['Trigrams'] = df['Tokens'].apply(distinct_trigrams)
    return df

In [8]:
# Tokenise the suspicious documents; process_text adds the Tokens and Trigrams columns.
suspicious = process_text(suspicious)
suspicious.head()

Unnamed: 0,Text,File_index,Plagiarized,Tokens,Trigrams
0,bible studies in the life of paul historical a...,1,1,"[bible, studies, life, paul, historical, const...","{(worship, received, given), (christian, breth..."
1,my impatience to inhabit the hermitage not per...,2,1,"[impatience, inhabit, hermitage, permitting, w...","{(time, bar, squirmed), (damaged, record, marc..."
2,morning on the beachthe three letters i...,3,1,"[morning, beachthe, three, letters, iii, old, ...","{(ill, make, ship), (house, behind, cast), (lo..."
3,this morning it rained so hard (though it was ...,4,0,"[morning, rained, hard, though, fair, yesterda...","{(dined, dinner, comes), (matters, walked, bac..."
4,deadham hard a romance by lucas malet (mary st...,5,1,"[deadham, hard, romance, lucas, malet, mary, s...","{(entitle, good, title), (history, new, york),..."


In [9]:
# Tokenise the source documents; process_text adds the Tokens and Trigrams columns.
source = process_text(source)
source.head()

Unnamed: 0,Text,File_index,Tokens,Trigrams
0,"our next day was a pleasant, lazy day, during ...",18,"[next, day, pleasant, lazy, day, inspected, ka...","{(mud, hut, companions), (party, one, hundred)..."
1,she stepped back to scyllas side. there was a ...,40,"[stepped, back, scyllas, side, deathly, doubt,...","{(brother, safe, herd), (bodies, seem, turn), ..."
2,"punch, or the london charivari. volume . may ,...",47,"[punch, london, charivari, volume, may, play, ...","{(speaker, rose, cry), (royal, quartpotarium, ..."
3,the leicestershires beyond baghdad by edward j...,55,"[leicestershires, beyond, baghdad, edward, j.,...","{(english, kept, insisting), (left, front, ene..."
4,"""we soon began to find stones and dirt in the ...",88,"[soon, began, find, stones, dirt, ice, gone, t...","{(crags, within, ship), (stop, train, said), (..."


In [10]:
def Jaccard_similarity_coefficient(A, B):
    """Return the Jaccard similarity of two sets: |A intersect B| / |A union B|.

    Parameters
    ----------
    A, B : set
        The two collections to compare.

    Returns
    -------
    float
        A value in [0, 1]. When both sets are empty the ratio is undefined;
        0.0 is returned rather than raising ``ZeroDivisionError``.
    """
    union_size = len(A.union(B))
    if union_size == 0:
        # Two empty sets share nothing measurable; treat as "no similarity".
        return 0.0
    return len(A.intersection(B)) / union_size

def containment_measure(A, B):
    """Return the fraction of ``B``'s elements that also occur in ``A``.

    Computed as |A intersect B| / |B|, so the measure is not symmetric in
    its arguments.

    Parameters
    ----------
    A, B : set
        ``B`` is the set whose coverage by ``A`` is measured.

    Returns
    -------
    float
        A value in [0, 1]. When ``B`` is empty the ratio is undefined; 0.0 is
        returned rather than raising ``ZeroDivisionError``.
    """
    size_b = len(B)
    if size_b == 0:
        # Nothing in B to be contained; treat as "no containment".
        return 0.0
    return len(A.intersection(B)) / size_b

In [11]:
def check_plagiarism_Jaccard(doc_trigrams):
    """Return the highest Jaccard score between ``doc_trigrams`` and any source.

    Every trigram set in the module-level ``source`` frame is compared with
    ``doc_trigrams``. Only the best score is returned; the matching source
    row is not reported.
    """
    def score_against(source_trigrams):
        return Jaccard_similarity_coefficient(source_trigrams, doc_trigrams)

    scores = source.Trigrams.apply(score_against)
    best_row = scores.idxmax()
    return scores[best_row]

def check_plagiarism_containment(doc_trigrams):
    """Return the highest containment score of ``doc_trigrams`` in any source.

    For every trigram set in the module-level ``source`` frame, the share of
    ``doc_trigrams`` that also appears in that source is computed. Only the
    best score is returned; the matching source row is not reported.
    """
    def score_against(source_trigrams):
        return containment_measure(source_trigrams, doc_trigrams)

    scores = source.Trigrams.apply(score_against)
    best_row = scores.idxmax()
    return scores[best_row]

In [12]:
# For each suspicious document, keep its best trigram-overlap score against all sources.
suspicious['Jaccard_similarity_score'] = suspicious.Trigrams.apply(check_plagiarism_Jaccard)
suspicious['Containment_measure_score'] = suspicious.Trigrams.apply(check_plagiarism_containment)

In [None]:
# Distribution of the best Jaccard score, split by the Plagiarized label.
sns.swarmplot(x="Plagiarized", y="Jaccard_similarity_score", data=suspicious)

In [None]:
# Distribution of the best containment score, split by the Plagiarized label.
sns.swarmplot(x="Plagiarized", y="Containment_measure_score", data=suspicious)

In [None]:
# Jaccard against containment for each suspicious document, coloured by the Plagiarized label.
sns.relplot(x="Jaccard_similarity_score", y="Containment_measure_score", hue="Plagiarized", data=suspicious)

In [16]:
def LCS(A, B):
    """Return the length of the longest run of consecutive items shared by A and B.

    A match only extends a run when the immediately preceding items also
    matched, so this measures the longest common *contiguous* stretch
    (longest common substring), not a gapped subsequence.

    Parameters
    ----------
    A, B : sequence
        Items are compared with ``==``. Both inputs are materialised with
        ``list()``.

    Returns
    -------
    int
        Length of the longest shared run; 0 if either input is empty or
        nothing matches.
    """
    A, B = list(A), list(B)
    # The result is symmetric, so iterate the shorter sequence in the inner
    # loop to keep the row buffers as small as possible.
    if len(B) > len(A):
        A, B = B, A
    width = len(B) + 1

    # Only the previous row of the DP table is ever read, so two rows replace
    # the full (len(A)+1) x (len(B)+1) table.
    previous = [0] * width
    longest = 0
    for item_a in A:
        current = [0] * width
        for j, item_b in enumerate(B, start=1):
            if item_a == item_b:
                run = previous[j - 1] + 1
                current[j] = run
                if run > longest:
                    longest = run
        previous = current
    return longest

def check_plagiarism_LCS(doc):
    """Return the largest ``LCS`` value between ``doc`` and any source document.

    ``doc`` is compared against every entry of ``source.Tokens`` (module-level
    frame). Only the best value is returned; the matching source row is not
    reported.
    """
    def score_against(source_tokens):
        return LCS(source_tokens, doc)

    scores = source.Tokens.apply(score_against)
    best_row = scores.idxmax()
    return scores[best_row]

In [None]:
# Compare token lists with token lists: check_plagiarism_LCS matches against
# source.Tokens, so feeding it trigram sets (tuples) would never match a token
# (string) and every score would be 0.
# Note: LCS runs two nested loops over both token lists for every
# suspicious/source pair, so this cell can be slow on large corpora.
suspicious['Longest_common_sequence'] = suspicious.Tokens.apply(check_plagiarism_LCS)

In [None]:
# Replace every token with its WordNet lemma in both frames.
# Only the Tokens columns are rewritten here; the Trigrams columns built earlier
# are left as they were (i.e. based on the un-lemmatised tokens).
lemmatizer = WordNetLemmatizer()
source.Tokens = source.Tokens.apply(lambda x: [lemmatizer.lemmatize(w) for w in x])
suspicious.Tokens = suspicious.Tokens.apply(lambda x: [lemmatizer.lemmatize(w) for w in x])

In [None]:
# The documents are already lists of tokens, so the vectorizer's tokenizer and
# preprocessor are replaced by an identity function.
dummy_function = lambda x: x

# TF-IDF over 1- to 4-grams of tokens, capped at 1000 features.
vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=None,
    tokenizer=dummy_function,
    preprocessor=dummy_function,
    ngram_range=(1, 4),
    max_features=1000,
)

# Concatenate the tokens from both suspicious and source texts
# (suspicious rows come first, then source rows).
combined_tokens = pd.concat([suspicious.Tokens, source.Tokens])

# Fit and transform the concatenated tokens
DTM = vectorizer.fit_transform(combined_tokens)


In [None]:
# Project the TF-IDF matrix onto 200 components with truncated SVD (ARPACK solver).
# NOTE(review): no random_state is passed -- confirm whether results are stable across runs.
LSA = TruncatedSVD(200, algorithm = 'arpack')
DTM_LSA = LSA.fit_transform(DTM)
# Rescale each row in place with Normalizer's default settings; presumably this is so
# that dot products between rows can be read as cosine similarities -- TODO confirm.
DTM_LSA = Normalizer(copy=False).fit_transform(DTM_LSA)

In [None]:
# Pairwise dot products between all document vectors (rows of DTM_LSA).
# Uses the @ operator on the ndarray directly instead of a round-trip through
# np.matrix, which NumPy discourages for new code.
similarity_matrix = DTM_LSA @ DTM_LSA.T

In [None]:
# Blank out self-similarity and all suspicious-vs-suspicious pairs; the first L
# rows/columns are the suspicious documents because they were concatenated first.
np.fill_diagonal(similarity_matrix, 0)
L = len(suspicious_files)
similarity_matrix[:L, :L] = 0
# Take each suspicious document's best match over the source columns only.
# Taking the max over the whole row would include the zeroed entries above and
# silently floor the score at 0 whenever every source similarity is negative.
suspicious['LSA_similarity'] = similarity_matrix[:L, L:].max(axis=1)

In [None]:
# Distribution of the best LSA similarity, split by the Plagiarized label.
sns.swarmplot(x="Plagiarized", y="LSA_similarity", data=suspicious)

In [None]:
# Pairwise correlations between the three similarity features and the label.
suspicious[['LSA_similarity', 'Jaccard_similarity_score', 'Containment_measure_score', 'Plagiarized']].corr()

In [None]:
# Target and feature matrix for the classifier; the Longest_common_sequence
# column is currently left out of the features (see the trailing comment).
y = suspicious.Plagiarized
X = suspicious[['LSA_similarity', 'Jaccard_similarity_score', 'Containment_measure_score']]#, 'Longest_common_sequence']]

# Per-fold scores from 10-fold cross-validation of a default logistic regression.
clf = LogisticRegression()
cross_val_score(clf, X, y, cv=10)

In [None]:
# Mean score over the 10 folds (re-runs the cross-validation).
np.mean(cross_val_score(clf, X, y, cv=10))

In [None]:
# Hold out 20% of the documents for evaluation. The split is seeded so the
# reported metrics are the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Number of documents per label value, to check class balance.
y.value_counts()