# 一、中文文本相似度分析

## 1.读取两篇文档（自己设计）：
- 原文.txt
- 抄袭.txt

In [1]:
# Create the two Chinese sample documents on disk: the source sentence and
# its reworded counterpart. Each file is (re)written from scratch as UTF-8.
chinese_samples = (
    ('原文.txt', '自然语言处理是人工智能的重要分支，它让计算机能够理解人类的语言。'),
    ('抄袭.txt', '自然语言处理是人工智能的重要分支，计算机可以理解人类语言。'),
)
for file_name, sentence in chinese_samples:
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(sentence)

In [10]:
# Create the two English sample documents on disk: the source sentence and
# its reworded counterpart. Each file is (re)written from scratch as UTF-8.
english_samples = (
    ('original.txt', 'Natural language processing is an important branch of artificial intelligence that enables computers to understand human language.'),
    ('plagiarism.txt', 'Natural language processing is a key field in AI allowing computers to comprehend human languages.'),
)
for file_name, sentence in english_samples:
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(sentence)

## 2.对要计算的多篇文档进行分词

In [3]:
import jieba

# Load both Chinese documents into memory.
loaded = []
for file_name in ('原文.txt', '抄袭.txt'):
    with open(file_name, 'r', encoding='utf-8') as f:
        loaded.append(f.read())
text1, text2 = loaded

# Segment each text into words; jieba.lcut returns the tokens as a list.
words1 = jieba.lcut(text1)
words2 = jieba.lcut(text2)

print("原文分词:", words1)
print("抄袭分词:", words2)

原文分词: ['自然语言', '处理', '是', '人工智能', '的', '重要', '分支', '，', '它', '让', '计算机', '能够', '理解', '人类', '的', '语言', '。']
抄袭分词: ['自然语言', '处理', '是', '人工智能', '的', '重要', '分支', '，', '计算机', '可以', '理解', '人类', '语言', '。']


In [12]:
# Load both English documents into memory.
# NOTE: this rebinds text1/text2, which the Chinese tokenization cell also assigns.
loaded = []
for file_name in ('original.txt', 'plagiarism.txt'):
    with open(file_name, 'r', encoding='utf-8') as f:
        loaded.append(f.read())
text1, text2 = loaded

# Lower-case each text and split it on whitespace. Punctuation stays attached
# to the neighbouring word at this stage (e.g. 'language.').
words3, words4 = (text.lower().split() for text in (text1, text2))

print("Original words:", words3)
print("Plagiarism words:", words4)

Original words: ['natural', 'language', 'processing', 'is', 'an', 'important', 'branch', 'of', 'artificial', 'intelligence', 'that', 'enables', 'computers', 'to', 'understand', 'human', 'language.']
Plagiarism words: ['natural', 'language', 'processing', 'is', 'a', 'key', 'field', 'in', 'ai', 'allowing', 'computers', 'to', 'comprehend', 'human', 'languages.']


## 3.停用词处理


In [4]:
# Load the stop-word list: one entry per line, surrounding whitespace removed.
with open('stopWords.txt', 'r', encoding='utf-8') as f:
    stopwords = {entry.strip() for entry in f}


def _is_content_word(token):
    """Return True for tokens longer than one character that are not stop words."""
    return len(token) > 1 and token not in stopwords


# Drop stop words and single-character tokens (this also removes the
# one-character punctuation tokens produced by segmentation).
words1 = list(filter(_is_content_word, words1))
words2 = list(filter(_is_content_word, words2))

print("去停用词后原文:", words1)
print("去停用词后抄袭:", words2)

去停用词后原文: ['自然语言', '处理', '人工智能', '重要', '分支', '计算机', '能够', '理解', '人类', '语言']
去停用词后抄袭: ['自然语言', '处理', '人工智能', '重要', '分支', '计算机', '理解', '人类', '语言']


In [13]:
def _clean_english(tokens):
    """Strip edge punctuation from each token, then drop stop words and short tokens.

    Punctuation is removed *before* filtering. Filtering first would test the
    raw token (e.g. 'is.' or 'the,'), so a stop word or a too-short word
    followed by punctuation would slip through, and a punctuation-only token
    such as '...' would survive as an empty string.
    """
    stripped = (token.strip('.,!?') for token in tokens)
    return [token for token in stripped
            if token not in stopwords and len(token) > 2]


words3 = _clean_english(words3)
words4 = _clean_english(words4)

print("Original filtered:", words3)
print("Plagiarism filtered:", words4)

Original filtered: ['natural', 'language', 'processing', 'important', 'branch', 'artificial', 'intelligence', 'that', 'enables', 'computers', 'understand', 'human', 'language']
Plagiarism filtered: ['natural', 'language', 'processing', 'key', 'field', 'allowing', 'computers', 'comprehend', 'human', 'languages']


## 4.向量化+计算相似度

In [9]:
import math
from collections import Counter

# Count every token once up front instead of calling list.count() for each
# vocabulary word (which rescans the whole list every time).
counts1 = Counter(words1)
counts2 = Counter(words2)

# Shared vocabulary of both documents; sorted so the vector layout is
# reproducible from run to run.
words_all = sorted(set(words1 + words2))
vec1 = [counts1[word] for word in words_all]
vec2 = [counts2[word] for word in words_all]

dot_product = sum(a * b for a, b in zip(vec1, vec2))
magnitude1 = math.sqrt(sum(a * a for a in vec1))
magnitude2 = math.sqrt(sum(b * b for b in vec2))

# Cosine similarity of the two term-frequency vectors. If either document has
# no tokens left after filtering, its vector has zero length; report 0.0
# instead of raising ZeroDivisionError.
norm_product = magnitude1 * magnitude2
dot_product / norm_product if norm_product else 0.0


0.9486832980505138

In [15]:
# Imported here so this cell does not depend on another cell having run first.
import math
from collections import Counter

# Count every token once up front instead of calling list.count() for each
# vocabulary word (which rescans the whole list every time).
counts3 = Counter(words3)
counts4 = Counter(words4)

# Shared vocabulary of both documents; sorted so the vector layout is
# reproducible from run to run.
words_all_eng = sorted(set(words3 + words4))
vec3 = [counts3[word] for word in words_all_eng]
vec4 = [counts4[word] for word in words_all_eng]

dot_product = sum(a * b for a, b in zip(vec3, vec4))
magnitude3 = math.sqrt(sum(a * a for a in vec3))
magnitude4 = math.sqrt(sum(b * b for b in vec4))

# Cosine similarity of the two term-frequency vectors. If either document has
# no tokens left after filtering, its vector has zero length; report 0.0
# instead of raising ZeroDivisionError.
norm_product = magnitude3 * magnitude4
dot_product / norm_product if norm_product else 0.0


0.48989794855663554