In [1]:
import random
import re
from collections import Counter
from collections import defaultdict

import jieba
import numpy as np
import pandas as pd
import xgboost as xgb
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier



In [2]:
# Load trained w2v model
def get_w2v_model(filename):
    model = Word2Vec.load(filename)
    return model

In [3]:
# Remove special characters
def rm_spec(sent):
    ret = re.sub('[\\n\s+\.\!\/_,$%^*(+\"\')]+|[+——\-()?【】《》“”！，。？、~@#￥%……&*（）]+', '', sent)
    if ret:
        return ret
    return ''

In [4]:
# Split sentence
def split_sent(sent):
    sents = re.split('[。？！\n]', sent)
    ret = []
    for s in sents:
        sl = jieba.lcut(s)
        slt = []
        for item in sl:
            sr = rm_spec(item)
            if sr:
                slt.append(sr)
        if slt:
            ret.append(' '.join(slt))
    return ret

In [5]:
# Process news
def process_news_corpus(filename, news, lst):
    di = defaultdict(list)
    xna_lst = []
    count = 0
    with open(filename, 'w', encoding='utf-8') as fout:
        for i, item in enumerate(lst):
            sent = news['content'][item]
            sents = split_sent(sent)
            if sents:
                di[i].append(count)
                count += len(sents)
                di[i].append(count)
                if '新华社' in news['source'][item] or '新华网' in news['source'][item]:
                    xna_lst.append(i)
                fout.write('\n'.join(sents) + '\n')
    return di, xna_lst

In [6]:
# Add new corpus to train w2v model
def add_more_train_corpus(model, filename):
    sen_list = []
    with open(filename, 'r', encoding='utf-8') as fin:
        for line in fin.readlines():
            sen_list.append(line.split())
    model.train(sentences=sen_list, total_examples=len(sen_list), epochs=1)

In [7]:
# Get news sentences index list from a dict structure
def get_index_lst_from_dict(di, lst):
    ret = []
    for i in lst:
        start = di[i][0]
        end = di[i][1]
        for t in range(start, end):
            ret.append(t)
    return ret

In [8]:
def get_idx_lst_from_dict(di, idx):
    ret = []
    start = di[idx][0]
    end = di[idx][1]
    for t in range(start, end):
        ret.append(t)
    return ret

In [9]:
# Calculate word frequency
def word_freq(corpus_file):
    word_list = []
    with open(corpus_file, 'r', encoding='utf-8') as fin:
        for line in fin.readlines():
            word_list += line.split()
    cc = Counter(word_list)
    num_all = sum(cc.values())

    def get_word_freq(word):
        return cc[word] / num_all

    return get_word_freq

In [10]:
# Get sentence vector matrix
def get_sent_vec(model, get_wd_freq, corpus_file):
    a = 0.001
    col = model.wv.vector_size
    with open(corpus_file, 'r', encoding='utf-8') as fin:
        all_lines = fin.readlines()
        ret = np.zeros((len(all_lines), col))
        for idx, line in enumerate(all_lines):
            sent_vec = np.zeros(col)
            wd_lst = line.split()
            for wd in wd_lst:
                try:
                    pw = get_wd_freq(wd)
                    w = a / (a + pw)
                    sent_vec += w * np.array(model.wv[wd])
                except:
                    pass
            ret[idx] = sent_vec
        return ret

In [11]:
def get_list_with_content(news):
    ret = []
    for i in range(len(news)):
        if pd.isna(news['source'][i]) or pd.isna(news['content'][i]):
            continue
        ret.append(i)
    return ret

In [12]:
def get_remain_list(all_samples, part_samples):
    ret = []
    for i in all_samples:
        if i not in part_samples:
            ret.append(i)
    return ret

In [13]:
# Calculate model's precision and recall rate
def get_precision_and_recall(xna_test_res, otr_test_res):
    cc1 = Counter(xna_test_res)
    cc2 = Counter(otr_test_res)
    tp = cc1[1]
    fp = cc2[1]
    tn = cc2[0]
    fn = cc1[0]
    preci = tp / (tp + fp)
    recal = tp / (tp + fn)
    return preci, recal

In [14]:
def KNNClassifier(xTrain, yTrain):
    neigh = KNeighborsClassifier(n_neighbors=10)
    neigh.fit(xTrain, yTrain)
    return neigh

In [15]:
def SVMClassifier(xTrain, yTrain):
    clf = SVC(gamma='auto')
    clf.fit(xTrain, yTrain)
    return clf

In [16]:
def DTClassifier(xTrain, yTrain):
    clf = DecisionTreeClassifier()
    clf.fit(xTrain, yTrain)
    return clf

In [17]:
def RFClassifier(xTrain, yTrain):
    clf = RandomForestClassifier(n_estimators=10, max_depth=10)
    clf.fit(xTrain, yTrain)
    return clf

In [18]:
class XGBClassifier:
    def __init__(self, xTrain, yTrain):
        dtrain = xgb.DMatrix(xTrain, label=yTrain)
        params = {'objective': 'binary:logitraw', 'silent': 1, 'n_estimators': 1000,
                  'max_depth': 8}
        self.clf = xgb.train(params, dtrain)

    def predict(self, mat):
        test = xgb.DMatrix(mat)
        res = self.clf.predict(test)
        return res.astype(int)

In [19]:
# Classify main process
def main_func(classifier, mat, idx_dict, xna_trn_lst, otr_trn_lst, xna_test_lst, otr_test_lst):
    xna_trn_sent_lst = get_index_lst_from_dict(idx_dict, xna_trn_lst)
    otr_trn_sent_lst = get_index_lst_from_dict(idx_dict, otr_trn_lst)
    print('XNA news sentences for training:', len(xna_trn_sent_lst))
    print('Other news sentences for training:', len(otr_trn_sent_lst))

    X_train = mat[xna_trn_sent_lst + otr_trn_sent_lst, :]
    Y = np.array([0] * mat.shape[0])
    for i in xna_trn_sent_lst: Y[i] = 1
    y_train = Y[xna_trn_sent_lst + otr_trn_sent_lst]

    trainedModel = classifier(X_train, y_train)

    xna_test = []
    otr_test = []
    threshold = 0.3  # Similiar sentence amount over 30% is classified as plagiarized

    for xt in xna_test_lst:
        sent_lst = get_idx_lst_from_dict(idx_dict, xt)
        scores = []
        for si in sent_lst:
            sco = trainedModel.predict([mat[si]])[0]
            scores.append(sco)
        cc = Counter(scores)
        if cc[1] / len(sent_lst) > threshold:
            xna_test.append(1)
        else:
            xna_test.append(0)

    for ot in otr_test_lst:
        sent_lst = get_idx_lst_from_dict(idx_dict, ot)
        scores = []
        for si in sent_lst:
            sco = trainedModel.predict([mat[si]])[0]
            scores.append(sco)
        cc = Counter(scores)
        if cc[1] / len(sent_lst) > threshold:
            otr_test.append(1)
        else:
            otr_test.append(0)

    precision, recall = get_precision_and_recall(xna_test, otr_test)
    print(classifier.__name__ + ' precision {}, recall {}'.format(precision, recall))

    # lst2file('trn_result.txt', xna_test)
    # lst2file('otr_result.txt', otr_test)

In [20]:
# Import original corpus
news_df = pd.read_csv('sqlResult_1558435.csv', encoding='gb18030')

In [21]:
# Get news list which has source and content
lst_with_content = get_list_with_content(news_df)

In [22]:
# Build news index dict of corpus file, and filter XNA news
index_dict, xna_news_lst = process_news_corpus('news_corpus.txt', news_df, lst_with_content)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\King\AppData\Local\Temp\jieba.cache
Loading model cost 0.678 seconds.
Prefix dict has been built succesfully.


In [23]:
otr_news_lst = list(set(index_dict.keys()) - set(xna_news_lst))

In [52]:
sample_train_n = 1000
sample_test_n = 100

In [53]:
xna_samples_train = random.sample(xna_news_lst, sample_train_n)

In [54]:
xna_samples_test = random.sample(set(xna_news_lst) - set(xna_samples_train), sample_test_n)

In [55]:
otr_samples_train = random.sample(otr_news_lst, sample_train_n)

In [56]:
otr_samples_test = random.sample(set(otr_news_lst) - set(otr_samples_train), sample_test_n)

In [43]:
w2v_model = get_w2v_model('wiki_w2v.model')

In [44]:
add_more_train_corpus(w2v_model, 'news_corpus.txt')

In [45]:
get_word_prob = word_freq('news_corpus.txt')

In [46]:
all_sent_mat = get_sent_vec(w2v_model, get_word_prob, 'news_corpus.txt')

In [57]:
## KNN
main_func(KNNClassifier, all_sent_mat, index_dict, xna_samples_train, otr_samples_train,
          xna_samples_test, otr_samples_test)

XNA news sentences for training: 7610
Other news sentences for training: 22870
KNNClassifier precision 0.9523809523809523, recall 0.8


In [58]:
## Decision tree
main_func(DTClassifier, all_sent_mat, index_dict, xna_samples_train, otr_samples_train,
          xna_samples_test, otr_samples_test)

XNA news sentences for training: 7610
Other news sentences for training: 22870
DTClassifier precision 0.7203389830508474, recall 0.85


In [59]:
## Random forest
main_func(RFClassifier, all_sent_mat, index_dict, xna_samples_train, otr_samples_train,
          xna_samples_test, otr_samples_test)

XNA news sentences for training: 7610
Other news sentences for training: 22870
RFClassifier precision 0.9444444444444444, recall 0.68


In [60]:
## XGBoost
main_func(XGBClassifier, all_sent_mat, index_dict, xna_samples_train, otr_samples_train,
          xna_samples_test, otr_samples_test)

XNA news sentences for training: 7610
Other news sentences for training: 22870
XGBClassifier precision 1.0, recall 0.27


In [51]:
## SVM
main_func(SVMClassifier, all_sent_mat, index_dict, xna_samples_train, otr_samples_train,
          xna_samples_test, otr_samples_test)

XNA news sentences for training: 6724
Other news sentences for training: 23659
SVMClassifier precision 1.0, recall 0.44
