In [None]:
from __future__ import unicode_literals
from sklearn.feature_extraction import DictVectorizer
import pymysql
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
from langdetect import detect
import csv
import MySQLdb


In [None]:
# load data from database
#
# Reads every row of the `fragment` table and builds three module-level
# structures that the later cells rely on:
#   originals - list of source texts
#   plagiats  - list of plagiat texts, parallel to `originals`
#   groups    - fragment_identifier -> list of positions into the two lists
conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd="root", db='vroniplag')

originals = []
plagiats = []

groups = {}

try:
    cur = conn.cursor(MySQLdb.cursors.DictCursor)
    try:
        cur.execute("SELECT * FROM fragment ORDER BY fragment_identifier")

        # `i` is the position of the row in originals/plagiats
        for i, r in enumerate(cur):
            originals.append(r["source_text"])
            plagiats.append(r["plagiat_text"])
            groups.setdefault(r["fragment_identifier"], []).append(i)
    finally:
        cur.close()
finally:
    # release the connection even when the query or the row loop fails
    conn.close()

In [None]:
# sanity check: show the first plagiat text (raises IndexError if the list is empty)
print(plagiats[0])

In [None]:
# language detection
#
# Runs langdetect on the source text and the plagiat text of every `fragment`
# row and writes both language codes back to the database, matching on `url`.
# A failing row is reported and skipped (best effort); all successful updates
# are committed together once the loop has finished.
#
# NOTE(review): rows are read from `fragment` but the UPDATE targets the
# `plagiat` table - confirm that this is the intended table.
conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd="root", db='vroniplag')
try:
    cur = conn.cursor(MySQLdb.cursors.DictCursor)
    # one cursor reused for all UPDATE statements
    update_cur = conn.cursor()
    try:
        cur.execute("SELECT * FROM fragment ORDER BY fragment_identifier")

        for r in cur:
            # fetched outside the try so the error message below never refers
            # to an unbound name or to the url of the previous row
            url = r.get("url")

            try:
                lang_original = detect(r["source_text"])
                lang_plagiat = detect(r["plagiat_text"])

                # parameterized query: the driver escapes the values, so urls
                # containing quotes neither break nor alter the statement
                update_cur.execute(
                    "UPDATE plagiat SET lang_source=%s, lang_plagiat=%s WHERE url=%s",
                    (lang_original, lang_plagiat, url),
                )
            except Exception as err:
                print("Exception: " + str(url) + " (" + str(err) + ")")

        conn.commit()
    finally:
        update_cur.close()
        cur.close()
finally:
    conn.close()

In [None]:
# read stopwords
# assumes stopwords_de.txt holds one stopword per line - TODO confirm
stopwords = set()
with open("stopwords_de.txt") as f:
    for line in f:
        word = line.strip()
        if word:
            # add() stores the whole word; update() with a string would insert
            # every single character of the word instead
            stopwords.add(word)


# do tf idf
# The corpus is all source texts followed by all plagiat texts, so row k of
# `tfidf` is originals[k] and row len(originals) + k is plagiats[k].
all_docs = originals + plagiats
# stop_words is passed as a list (sorted only to make the order reproducible)
v = TfidfVectorizer(analyzer="word", stop_words=sorted(stopwords), ngram_range=(1,1), max_df=1000, min_df=3)
tfidf = v.fit_transform(all_docs)


In [None]:
# evaluation - target + up to 19 random docs from the same group
#
# For every source text the candidate set is its own plagiat (candidate 0)
# plus random plagiats of OTHER fragments with the same fragment_identifier.
# A trial counts as correct when candidate 0 has the highest cosine similarity
# to the source text (ties are resolved in favour of the earliest candidate).
# Prints one line per group (key, size, accuracy) and the overall accuracy.
#
# Distractors are taken from the plagiat rows (offset len(originals)), the same
# half of the tf-idf matrix the correct candidate comes from, and the current
# fragment is never drawn as its own distractor.
n_originals = len(originals)
correct = 0

for key in groups.keys():

    members = groups[key]
    eval_size = len(members)
    # small groups cannot supply 19 distinct distractors
    test_length = min(20, eval_size)
    correctInGroup = 0

    for position, index in enumerate(members):

        originalDoc = tfidf.getrow(index)

        # Draw distinct group positions from the eval_size - 1 positions that
        # are not `position`: sample from a range that is one shorter and shift
        # every pick at or above `position` up by one.
        picks = random.sample(range(eval_size - 1), test_length - 1)
        distractors = [members[p if p < position else p + 1] for p in picks]

        # candidate 0 is the plagiat that belongs to the current source text
        candidate_rows = [n_originals + index]
        candidate_rows.extend(n_originals + other for other in distractors)

        # compare test set: one similarity per candidate
        sims = cosine_similarity(originalDoc, tfidf[candidate_rows])[0]

        # argmax returns the first maximum, so an all-zero row still selects 0
        if int(np.argmax(sims)) == 0:
            correctInGroup += 1

    correct += correctInGroup
    accuracy = correctInGroup / float(eval_size)
    print(str(key) + "\t" + str(eval_size) + "\t" + str(accuracy))

if n_originals:
    accuracy = correct / float(n_originals)
    print("Total: \t" + str(accuracy))
else:
    # nothing was loaded; avoid dividing by zero
    print("Total: \tn/a (no documents loaded)")


In [None]:
# evaluation - target + 19 random docs
#
# For every source text the candidate set is its own plagiat (candidate 0)
# plus random plagiats of other fragments drawn from the whole corpus. A trial
# counts as correct when candidate 0 has the highest cosine similarity to the
# source text (ties are resolved in favour of the earliest candidate).
# Prints the fraction of correct trials.

correct = 0
eval_size = len(originals)
# with fewer than 20 documents there are not enough distinct distractors
test_length = min(20, eval_size)

for i in range(eval_size):

    if i % 500 == 0:
        print("Processed " + str(i) + " docs")

    originalDoc = tfidf.getrow(i)

    # Draw distinct fragment positions from the eval_size - 1 positions that
    # are not `i`: sample from a range that is one shorter and shift every
    # pick at or above `i` up by one. The target plagiat can therefore never
    # appear a second time among the distractors.
    picks = random.sample(range(eval_size - 1), test_length - 1)

    # plagiat rows start at offset eval_size in the tf-idf matrix
    candidate_rows = [eval_size + i]
    candidate_rows.extend(eval_size + (p if p < i else p + 1) for p in picks)

    # compare test set: one similarity per candidate
    sims = cosine_similarity(originalDoc, tfidf[candidate_rows])[0]

    # argmax returns the first maximum, so an all-zero row still selects 0
    if int(np.argmax(sims)) == 0:
        correct += 1


if eval_size:
    print(correct/float(eval_size))
else:
    # nothing was loaded; avoid dividing by zero
    print("n/a (no documents loaded)")

In [None]:
# convert to csv - old lrec code
#
# Dumps the `plagiat` table of the `paraphrases` database into the
# tab-separated file vroniplag-corpus.csv (header row first) and prints the
# number of exported rows.

# A list, not a set: the column order of the file must be the same on every run.
keys = [
    "url",
    "fragment_identifier",
    "author",
    "source_text",
    "full_html",
    "category",
    "lang_source",
    "lang_plagiat",
    "plagiat_text",
    "peer_reviewed"
]

counter = 0
conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd="root", db='paraphrases')
try:
    cur = conn.cursor(MySQLdb.cursors.DictCursor)
    try:
        cur.execute("SELECT * FROM plagiat ORDER BY author")

        with open('vroniplag-corpus.csv', 'w') as csvfile:
            writer = csv.writer(csvfile, delimiter='\t',
                                quotechar='"', quoting=csv.QUOTE_MINIMAL)

            writer.writerow(keys)

            for r in cur:
                # NOTE(review): a stored b'\x00' is exported as 1 and anything
                # else as 0 - this looks inverted; confirm before relying on
                # the peer_reviewed column.
                if r["peer_reviewed"] == b'\x00':
                    r["peer_reviewed"] = 1
                else:
                    r["peer_reviewed"] = 0

                writer.writerow([r[key] for key in keys])
                counter += 1
    finally:
        cur.close()
finally:
    # release the connection even when the query or the export fails
    conn.close()

print(counter)