In [2]:
import os
import re, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk import trigrams, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Initialize
path = "input/"  # 設定資料路徑
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
%matplotlib inline
sns.set(style="whitegrid")
warnings.filterwarnings('ignore')

# 檔案清理函數
def clean_file(myfile):
    text = myfile.read().lower()
    text = re.sub(r'[\n]\s*|[\']|[:]|[+]|\d+|[--]', '', text)
    text = re.sub(r'\(\)|\.\s+\.', '.', text).strip()
    return text

# 讀取文件並建立DataFrame
def get_dataframe(files):
    data = []
    for f in files:
        with open(path + f, mode='r', encoding='utf-8-sig') as myfile:
            data.append(clean_file(myfile))
    return pd.DataFrame(data, columns=['Text'])

[nltk_data] Downloading package punkt to /home/undergrad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/undergrad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/undergrad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# 載入檔案並建立標註
suspicious_files = sorted([f for f in os.listdir(path) if f.startswith('suspicious-document')])
suspicious = get_dataframe(suspicious_files)
suspicious['File_index'] = [f[19:24] for f in suspicious_files]
suspicious['Plagiarized'] = pd.read_csv(path + "Plagiarized.csv").Plagiarized

source_files = sorted([f for f in os.listdir(path) if f.startswith('source-document')])
source = get_dataframe(source_files)
source['File_index'] = [f[15:20] for f in source_files]

In [4]:
# 文本處理函數
def process_text(df):
    punc_stop = set(stopwords.words('english')).union(
        {".", ",", "?", "-", "!", "'", '"', "\\", "/", ";", "{", "}", "(", ")", "[", "]", "''", "``", "*", "$", "%"}
    )
    df['Tokens'] = df['Text'].apply(word_tokenize).apply(lambda x: [w for w in x if w not in punc_stop])
    # 限制三元組的計算至最前面的 500 個詞
    df['Trigrams'] = df['Tokens'].apply(lambda x: set(trigrams(x[:500])))
    return df

# 計算 Jaccard 相似度和 containment 度量
def Jaccard_similarity_coefficient(A, B): return len(A.intersection(B)) / len(A.union(B))
def containment_measure(A, B): return len(A.intersection(B)) / len(B)

def check_plagiarism_Jaccard(doc_trigrams): 
    return source.Trigrams.apply(lambda s: Jaccard_similarity_coefficient(s, doc_trigrams)).max()

def check_plagiarism_containment(doc_trigrams): 
    return source.Trigrams.apply(lambda s: containment_measure(s, doc_trigrams)).max()

# LCS 度量
def LCS(A, B):
    m, n, longest = len(A), len(B), 0
    counter = [[0]*(n+1) for _ in range(m+1)]
    for i in range(m):
        for j in range(n):
            if A[i] == B[j]:
                count = counter[i][j] + 1
                counter[i+1][j+1] = count
                longest = max(longest, count)
    return longest

def check_plagiarism_LCS(doc): 
    return source.Tokens.apply(lambda s: LCS(s, doc)).max()

In [5]:
suspicious, source = process_text(suspicious), process_text(source)

In [12]:
suspicious['Jaccard_similarity_score'] = suspicious.Trigrams.apply(check_plagiarism_Jaccard)
suspicious['Containment_measure_score'] = suspicious.Trigrams.apply(check_plagiarism_containment)

In [None]:
suspicious['Longest_common_sequence'] = suspicious.Tokens.apply(check_plagiarism_LCS)

In [6]:
# 語義分析（LSA）步驟
lemmatizer = WordNetLemmatizer()
source.Tokens, suspicious.Tokens = source.Tokens.apply(lambda x: [lemmatizer.lemmatize(w) for w in x]), suspicious.Tokens.apply(lambda x: [lemmatizer.lemmatize(w) for w in x])

vectorizer = TfidfVectorizer(analyzer='word', token_pattern=None, tokenizer=lambda x: x, preprocessor=lambda x: x, ngram_range=(1, 4), max_features=500)  # 降低max_features
combined_tokens = pd.concat([suspicious.Tokens, source.Tokens])
DTM = vectorizer.fit_transform(combined_tokens)

In [10]:
LSA = TruncatedSVD(50, algorithm='arpack')  # 降低成分數量
DTM_LSA = Normalizer(copy=False).fit_transform(LSA.fit_transform(DTM))

# 分塊計算相似度矩陣
def compute_similarity_matrix_in_batches(matrix, batch_size=250):
    similarity_scores = []
    for i in range(0, matrix.shape[0], batch_size):
        batch = matrix[i:i + batch_size]
        scores = (batch @ matrix.T)
        similarity_scores.append(np.max(scores, axis=1))
    return np.concatenate(similarity_scores)

suspicious['LSA_similarity'] = compute_similarity_matrix_in_batches(DTM_LSA[:len(suspicious)])

In [13]:
# 可視化
sns.swarmplot(x="Plagiarized", y="Jaccard_similarity_score", data=suspicious)

<Axes: xlabel='Plagiarized', ylabel='Jaccard_similarity_score'>

In [14]:
sns.swarmplot(x="Plagiarized", y="Containment_measure_score", data=suspicious)

<Axes: xlabel='Plagiarized', ylabel='Jaccard_similarity_score'>

In [15]:
sns.relplot(x="Jaccard_similarity_score", y="Containment_measure_score", hue="Plagiarized", data=suspicious)

<seaborn.axisgrid.FacetGrid at 0x7cf7cd21a960>

In [16]:
# 相似度特徵與標註的相關性分析
print(suspicious[['LSA_similarity', 'Jaccard_similarity_score', 'Containment_measure_score', 'Plagiarized']].corr())

                           LSA_similarity  Jaccard_similarity_score  \
LSA_similarity                   1.000000                 -0.041913   
Jaccard_similarity_score        -0.041913                  1.000000   
Containment_measure_score       -0.043257                  0.995360   
Plagiarized                     -0.082941                  0.146659   

                           Containment_measure_score  Plagiarized  
LSA_similarity                             -0.043257    -0.082941  
Jaccard_similarity_score                    0.995360     0.146659  
Containment_measure_score                   1.000000     0.148125  
Plagiarized                                 0.148125     1.000000  


In [17]:
# 模型訓練與測試
X, y = suspicious[['LSA_similarity', 'Jaccard_similarity_score', 'Containment_measure_score']], suspicious.Plagiarized
clf = LogisticRegression()

# 使用分層隨機分割的交叉驗證以減少內存需求
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2)
cross_val_scores = cross_val_score(clf, X, y, cv=sss)
print(np.mean(cross_val_scores))

0.62


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

print(y.value_counts())

              precision    recall  f1-score   support

           0       0.62      1.00      0.77        62
           1       0.00      0.00      0.00        38

    accuracy                           0.62       100
   macro avg       0.31      0.50      0.38       100
weighted avg       0.38      0.62      0.47       100

Plagiarized
0    311
1    189
Name: count, dtype: int64
