In [1]:
# Action 1 文本抄袭自动检测分析
import numpy as np
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

# load stop words
# Raw strings keep the Windows backslashes literal: sequences such as '\D',
# '\B' and '\H' are invalid escapes and trigger warnings on recent Pythons.
with open(r'C:\Desktop\开课吧\BI_core\HW3\chinese_stopwords.txt', 'r', encoding='utf-8') as file:
    # Strip only the line terminator; slicing off the last character would
    # truncate the final stop word when the file has no trailing newline.
    stopwords = [line.rstrip('\n') for line in file]

# load data
news = pd.read_csv(r'C:\Desktop\开课吧\BI_core\HW3\sqlResult.csv', encoding='gb18030')
news.head(3)

# drop rows without article content; later cells address rows by position
news = news.dropna(subset=['content'])
print(news.shape)

(87054, 7)


In [2]:
def split_text(text):
    '''Segment text with jieba and drop stop words.

    Spaces, line feeds and carriage returns are removed first; the remaining
    text is cut with ``jieba.cut`` and every token that is not in the
    module-level ``stopwords`` collection is kept.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The kept tokens joined by single spaces.
    '''
    text = text.replace(' ', '').replace('\n', '').replace('\r', '')
    # Hashed membership test instead of scanning the whole stop-word
    # collection once per token; the filtering result is unchanged.
    stopword_set = set(stopwords)
    return ' '.join(w for w in jieba.cut(text.strip()) if w not in stopword_set)

# Sanity check: segment the content of the first article.
split_text(news['content'].iloc[0])

Building prefix dict from the default dictionary ...
Dumping model to file cache E:\AppData\Local\Temp\jieba.cache
Loading model cost 3.041 seconds.
Prefix dict has been built succesfully.


'此外 本周 除 小米 手机 款 机型 外 机型 暂停 更新 发布 含 开发 版 体验版 内测 稳定版 暂不受 影响 确保 工程师 集中 全部 精力 进行 系统优化 工作 有人 猜测 精力 主要 用到 MIUI9 研发 之中 MIUI8 去年 发布 距今已有 一年 有余 更新换代 当然 MIUI9 确切 信息 等待 官方消息'

In [3]:
import pickle, os

# Location of the cached, already-segmented corpus. Defined once as a raw
# string so the Windows backslashes are never read as escape sequences.
corpus_path = r'C:\Desktop\开课吧\BI_core\HW3\corpus.pkl'

if not os.path.exists(corpus_path):
    # if corpus not exist, build it: segment every article's content
    corpus = [split_text(str(content)) for content in news.content]
    with open(corpus_path, 'wb') as file:
        pickle.dump(corpus, file)
else:
    # use the existing one
    # NOTE(review): pickle.load can execute arbitrary code; only load cache
    # files written by this notebook. The cache is not invalidated when
    # `news` changes - delete the file to force a rebuild.
    with open(corpus_path, 'rb') as file:
        corpus = pickle.load(file)

In [4]:
# get TF-IDF matrix of the corpus
# NOTE(review): `encoding` is only consulted when documents are given as bytes
# or files; presumably `corpus` holds str, in which case it has no effect.
# min_df=0.015 drops terms that appear in fewer than 1.5% of the documents.
countvectorizer = CountVectorizer(encoding='gb18030', min_df=0.015)
tfidftransformer = TfidfTransformer()

countvector = countvectorizer.fit_transform(corpus)
tfidf = tfidftransformer.fit_transform(countvector)

# set label: 1 when the source field contains the Xinhua marker, otherwise 0
label = [1 if '新华' in str(source) else 0 for source in news.source]

# split dataset
# A fixed random_state makes the split, and therefore the trained model and
# the set of flagged articles, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf, label, test_size=0.3, random_state=42
)

# Train a multinomial naive Bayes classifier on the training split.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train, y_train)

# Predict for every article (training and test rows alike) and put each
# prediction next to its label, one row per article position.
labels = np.asarray(label)
pred = clf.predict(tfidf)
compare_news_index = pd.DataFrame({'prediction': pred, 'labels': labels})

# Possible copies: predicted as class 1 although the label is not 1.
copy_news_index = compare_news_index.loc[
    compare_news_index['prediction'].eq(1) & compare_news_index['labels'].ne(1)
]
# Row positions of the articles whose label is 1.
true_news_index = compare_news_index.loc[compare_news_index['labels'].eq(1)].index

In [5]:
# Report the number of label-1 (Xinhua) articles and the number of other
# articles flagged as possible copies.
print('共有{}篇新华社的文章，另外有{}篇其它文章可能抄袭'.format(len(true_news_index), len(copy_news_index)))

共有78855篇新华社的文章，另外有2785篇其它文章可能抄袭


In [6]:
# data normalizer: rescale every TF-IDF row with sklearn's Normalizer
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
scaled_array = normalizer.fit_transform(tfidf)

# use Kmeans for clustering
# A fixed random_state makes the centroid initialisation, and therefore the
# cluster assignment computed when no cache exists, reproducible.
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=25, random_state=42)

# Cache the cluster assignment of every article on disk.
# Raw string so the Windows backslashes are never read as escape sequences.
label_path = r'C:\Desktop\开课吧\BI_core\HW3\label.pkl'
if not os.path.exists(label_path):
    # Fit before opening the file: if clustering fails, no empty pickle is
    # left behind to make the next run take the load branch and crash.
    k_labels = kmeans.fit_predict(scaled_array)
    with open(label_path, 'wb') as file:
        pickle.dump(k_labels, file)
else:
    # NOTE(review): pickle.load can execute arbitrary code; only load cache
    # files written by this notebook.
    with open(label_path, 'rb') as file:
        k_labels = pickle.load(file)

# Mapping from article position to its cluster id, cached on disk.
# Raw string so the Windows backslashes are never read as escape sequences.
id_class_path = r'C:\Desktop\开课吧\BI_core\HW3\id_class.pkl'
if not os.path.exists(id_class_path):
    # Build the mapping first so a failure cannot leave an empty cache file.
    id_class = dict(enumerate(k_labels))
    with open(id_class_path, 'wb') as file:
        pickle.dump(id_class, file)
else:
    # NOTE(review): pickle.load can execute arbitrary code; only load cache
    # files written by this notebook.
    with open(id_class_path, 'rb') as file:
        id_class = pickle.load(file)

from collections import defaultdict
# Mapping from cluster id to the set of label-1 (Xinhua) article positions in
# that cluster, cached on disk.
# Raw string so the Windows backslashes are never read as escape sequences.
class_id_path = r'C:\Desktop\开课吧\BI_core\HW3\class_id.pkl'
if not os.path.exists(class_id_path):
    # Build the lookup set once instead of converting the index to a list and
    # scanning it for every article.
    true_ids = set(true_news_index.tolist())
    class_id = defaultdict(set)
    for index, class_ in id_class.items():
        # only count the true class_id
        if index in true_ids:
            class_id[class_].add(index)
    # Write the cache once, after the mapping is complete, rather than
    # rewriting the whole file on every loop iteration.
    with open(class_id_path, 'wb') as file:
        pickle.dump(class_id, file)
else:
    # NOTE(review): pickle.load can execute arbitrary code; only load cache
    # files written by this notebook.
    with open(class_id_path, 'rb') as file:
        class_id = pickle.load(file)

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
# search for similar text
def find_similar_text(cpindex, top=10):
    '''Rank the candidate articles in ``cpindex``'s cluster by cosine similarity.

    Candidates are the article positions stored in ``class_id`` for the
    cluster ``id_class[cpindex]`` (the articles released by Xinhua).

    Parameters
    ----------
    cpindex : int
        Row position in ``tfidf`` of the article to compare.
    top : int, default 10
        Maximum number of matches to return.

    Returns
    -------
    list of (int, numpy.ndarray)
        ``(article position, similarity)`` pairs, most similar first. Each
        similarity is a 1x1 array. Empty when the cluster has no candidates.
    '''
    # only search for articles which are released by Xinhua
    candidates = list(class_id[id_class[cpindex]])
    if not candidates:
        return []
    # One batched call instead of one cosine_similarity call per candidate;
    # the result has one column per candidate, in the same order.
    sims = cosine_similarity(tfidf[cpindex], tfidf[candidates])
    # Slice instead of index so every value keeps the 1x1 array shape.
    dist = {i: sims[:, col:col + 1] for col, i in enumerate(candidates)}
    return sorted(dist.items(), key=lambda x: x[1][0], reverse=True)[:top]

# pick one in copy_news_index
cpindex = 3352
similar_list = find_similar_text(cpindex)
print(similar_list)
print('\n怀疑抄袭:\n', news.iloc[cpindex].content)
if similar_list:
    # Entries are (article position, similarity), most similar first.
    similar2 = similar_list[0][0]
    print('相似原文:\n', news.iloc[similar2].content)
else:
    # The cluster of cpindex holds no candidate article; report it instead of
    # failing with an IndexError on similar_list[0].
    print('相似原文:\n', '(未找到可比对的新华社文章)')

[(3134, array([[0.96849134]])), (63511, array([[0.94643202]])), (29441, array([[0.9428342]])), (3218, array([[0.87621844]])), (980, array([[0.87535112]])), (29615, array([[0.86936332]])), (29888, array([[0.86215823]])), (64046, array([[0.85278237]])), (29777, array([[0.84875426]])), (63974, array([[0.73415186]]))]

怀疑抄袭:
 　　中国5月份56座城市新建商品住宅价格环比上涨，4月份为58座上涨。5月份15个一线和热点二线城市房地产市场基本稳定，5月份房地产调控政策效果继续显现。
　　统计局：15个一线和热点二线城市房价同比涨幅全部回落
　　国家统计局城市司高级统计师刘建伟解读5月份房价数据
　　5月份一二线城市房价平均涨幅继续回落
　　国家统计局今日发布了2017年5月份70个大中城市住宅销售价格统计数据。对此，国家统计局城市司高级统计师刘建伟进行了解读。
　　一、15个一线和热点二线城市新建商品住宅价格同比涨幅全部回落、9个城市环比下降或持平
　　5月份，因地制宜、因城施策的房地产调控政策效果继续显现，15个一线和热点二线城市房地产市场基本稳定。从同比看，15个城市新建商品住宅价格涨幅均比上月回落，回落幅度在0.5至6.4个百分点之间。从环比看，9个城市新建商品住宅价格下降或持平；5个城市涨幅在0.5%以内。
　　二、70个大中城市中一二线城市房价同比涨幅持续回落
　　5月份，70个城市中新建商品住宅和二手住宅价格同比涨幅比上月回落的城市分别有29和18个。其中，一二线城市同比涨幅回落尤其明显。据测算，一线城市新建商品住宅和二手住宅价格同比涨幅均连续8个月回落，5月份比4月份分别回落2.2和1.7个百分点；二线城市新建商品住宅和二手住宅价格同比涨幅分别连续6个月和4个月回落，5月份比4月份分别回落0.8和0.5个百分点。
　　三、70个大中城市中房价环比下降及涨幅回落城市个数均有所增加
　　5月份，70个城市中新建商品住宅价格环比下降的城市有9个，比