# Colab設置

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# !git clone https://github.com/AashitaK/Plagiarism-Detection.git

In [None]:
# !pip install nltk pandas scikit-learn joblib

In [None]:
# import nltk
# nltk.download('punkt_tab')

# Server設置

In [None]:
import os
import re, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk import trigrams, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Initialize: data location, NLTK resources, and notebook display settings

# Colab path
# path = "/content/Plagiarism-Detection/input/"  

# Local/server path. The file-loading code joins it to file names with plain
# string concatenation (path + name), so the trailing slash is required.
path = "input/"  

# Download the NLTK resources named below (tokenizer data, stop-word list, WordNet).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
%matplotlib inline
sns.set(style="whitegrid")
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and convergence warnings — confirm that is intended.
warnings.filterwarnings('ignore')

# File-cleaning helper
def clean_file(myfile):
    """Read an open text file and return its normalised, lower-cased content.

    Parameters
    ----------
    myfile : file-like object
        Anything with a ``read()`` method that returns ``str``.

    Returns
    -------
    str
        The cleaned text with surrounding whitespace stripped.

    Cleaning steps, in order:
      1. lower-case the text;
      2. collapse every line break (plus any whitespace that follows it) into
         a single space, so the last word of one line is not fused with the
         first word of the next;
      3. delete apostrophes, colons, plus signs, hyphens and digit runs;
      4. turn ``()`` and ``. .`` (two periods separated only by whitespace)
         into a single period.
    """
    text = myfile.read().lower()
    # Line breaks become a word separator instead of being deleted outright.
    text = re.sub(r'\n\s*', ' ', text)
    # Same character set the original pattern removed ("[--]" matched only "-").
    text = re.sub(r"[':+\-]|\d+", '', text)
    text = re.sub(r'\(\)|\.\s+\.', '.', text).strip()
    return text

# Read the files and build a DataFrame
def get_dataframe(files):
    """Load and clean each named file, returning one row of text per file.

    Parameters
    ----------
    files : iterable of str
        File names, resolved by prefixing the module-level ``path``.

    Returns
    -------
    pandas.DataFrame
        A single ``'Text'`` column holding the cleaned content, in the same
        order as ``files``.
    """
    def _read_cleaned(filename):
        # 'utf-8-sig' is the encoding the original code opens every file with.
        with open(path + filename, mode='r', encoding='utf-8-sig') as handle:
            return clean_file(handle)

    cleaned_texts = [_read_cleaned(filename) for filename in files]
    return pd.DataFrame(cleaned_texts, columns=['Text'])

In [None]:
# Load the files and attach the labels
# Suspicious documents: names start with 'suspicious-document' (19 characters),
# and the five characters that follow are kept as the file index.
suspicious_files = sorted(
    name for name in os.listdir(path) if name.startswith('suspicious-document')
)
suspicious = get_dataframe(suspicious_files)
suspicious['File_index'] = [name[19:24] for name in suspicious_files]
# NOTE(review): labels are attached by row position — presumably the CSV rows
# follow the same sorted file order; verify against the data.
suspicious['Plagiarized'] = pd.read_csv(path + "Plagiarized.csv")['Plagiarized']

# Source documents: names start with 'source-document' (15 characters),
# and the five characters that follow are kept as the file index.
source_files = sorted(
    name for name in os.listdir(path) if name.startswith('source-document')
)
source = get_dataframe(source_files)
source['File_index'] = [name[15:20] for name in source_files]

In [None]:
# Text-processing helper
def process_text(df):
    """Tokenise ``df['Text']`` and derive trigram sets.

    Adds two columns to ``df`` in place and returns the same frame:

    * ``'Tokens'``   – word tokens with English stop words and the listed
      punctuation marks removed;
    * ``'Trigrams'`` – the set of token trigrams built from the first 500
      tokens only.
    """
    punctuation = {
        ".", ",", "?", "-", "!", "'", '"', "\\", "/", ";", "{", "}",
        "(", ")", "[", "]", "''", "``", "*", "$", "%",
    }
    excluded = set(stopwords.words('english')) | punctuation

    def _tokenize(text):
        return [token for token in word_tokenize(text) if token not in excluded]

    def _leading_trigrams(tokens):
        # Limit the trigram computation to the first 500 tokens.
        return set(trigrams(tokens[:500]))

    df['Tokens'] = df['Text'].apply(_tokenize)
    df['Trigrams'] = df['Tokens'].apply(_leading_trigrams)
    return df

# Jaccard similarity and containment measures
def Jaccard_similarity_coefficient(A, B):
    """Return |A ∩ B| / |A ∪ B| for two sets.

    Returns 0.0 when both sets are empty (nothing can be shared), instead of
    raising ZeroDivisionError.
    """
    union_size = len(A.union(B))
    if union_size == 0:
        return 0.0
    return len(A.intersection(B)) / union_size
def containment_measure(A, B):
    """Return |A ∩ B| / |B|: the fraction of ``B`` that also appears in ``A``.

    Returns 0.0 when ``B`` is empty (e.g. a document too short to yield any
    trigram), instead of raising ZeroDivisionError.
    """
    if len(B) == 0:
        return 0.0
    return len(A.intersection(B)) / len(B)

def check_plagiarism_Jaccard(doc_trigrams):
    """Return the highest Jaccard score between ``doc_trigrams`` and the
    trigram set of any row in the module-level ``source`` frame."""
    def _score_against(source_trigrams):
        return Jaccard_similarity_coefficient(source_trigrams, doc_trigrams)

    per_source_scores = source['Trigrams'].apply(_score_against)
    return per_source_scores.max()

def check_plagiarism_containment(doc_trigrams):
    """Return the highest containment score of ``doc_trigrams`` within the
    trigram set of any row in the module-level ``source`` frame."""
    def _score_against(source_trigrams):
        # The document's trigrams are the denominator (second argument).
        return containment_measure(source_trigrams, doc_trigrams)

    per_source_scores = source['Trigrams'].apply(_score_against)
    return per_source_scores.max()

# LCS measure
def LCS(A, B):
    """Return the length of the longest run of consecutive equal elements
    shared by sequences ``A`` and ``B`` (longest common *contiguous* run).

    Parameters
    ----------
    A, B : sequences supporting iteration and ``len`` (e.g. token lists, str)

    Returns
    -------
    int
        0 when either sequence is empty or nothing matches.

    Only the previous row of the dynamic-programming table is ever read, so
    two rows of length ``len(B) + 1`` are kept instead of the full
    ``(len(A)+1) x (len(B)+1)`` table.
    """
    width = len(B) + 1
    longest = 0
    previous_row = [0] * width
    for item_a in A:
        current_row = [0] * width
        for j, item_b in enumerate(B):
            if item_a == item_b:
                # Extend the run that ended at the previous pair of positions.
                run_length = previous_row[j] + 1
                current_row[j + 1] = run_length
                if run_length > longest:
                    longest = run_length
        previous_row = current_row
    return longest

def check_plagiarism_LCS(doc):
    """Return the largest ``LCS`` value between ``doc`` and the token list of
    any row in the module-level ``source`` frame."""
    def _shared_run(source_tokens):
        return LCS(source_tokens, doc)

    per_source_lengths = source['Tokens'].apply(_shared_run)
    return per_source_lengths.max()

In [None]:
# Add the 'Tokens' and 'Trigrams' columns to both corpora (suspicious first, then source).
suspicious = process_text(suspicious)
source = process_text(source)

In [None]:
# For every suspicious document keep its best score over all source documents.
suspicious['Jaccard_similarity_score'] = (
    suspicious['Trigrams'].apply(check_plagiarism_Jaccard)
)
suspicious['Containment_measure_score'] = (
    suspicious['Trigrams'].apply(check_plagiarism_containment)
)

In [None]:
# suspicious['Longest_common_sequence'] = suspicious.Tokens.apply(check_plagiarism_LCS)

In [None]:
# Semantic analysis (LSA) step
lemmatizer = WordNetLemmatizer()

# Replace each token list with its lemmatised form (source first, then suspicious).
source['Tokens'] = source['Tokens'].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)
suspicious['Tokens'] = suspicious['Tokens'].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)

# The documents are already token lists, so tokenizer and preprocessor are
# identity functions. max_features was reduced to 500.
vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=None,
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    ngram_range=(1, 4),
    max_features=500,
)

# Row order matters downstream: suspicious documents first, then source documents.
combined_tokens = pd.concat([suspicious['Tokens'], source['Tokens']])
DTM = vectorizer.fit_transform(combined_tokens)

In [None]:
# Run LSA: reduce the document-term matrix to 50 components (a reduced
# component count), then normalise the projected rows in place.
LSA = TruncatedSVD(n_components=50, algorithm='arpack')
DTM_LSA = LSA.fit_transform(DTM)
DTM_LSA = Normalizer(copy=False).fit_transform(DTM_LSA)

# Compute the similarity matrix in batches
def compute_similarity_matrix_in_batches(matrix, batch_size=250, reference=None):
    """Return, for every row of ``matrix``, its largest dot product with any
    row of ``reference``.

    Parameters
    ----------
    matrix : 2-D numpy array
        Query rows, processed ``batch_size`` rows at a time so that only a
        ``batch_size x len(reference)`` block of scores exists at once.
    batch_size : int, default 250
        Number of query rows per block.
    reference : 2-D numpy array, optional
        Rows to compare against. Defaults to ``matrix`` itself, which is the
        original behaviour; in that case each row is also compared with
        itself.

    Returns
    -------
    numpy.ndarray
        One value per row of ``matrix``; an empty array when ``matrix`` has
        no rows.

    The dot products are cosine similarities only if the rows are unit
    length; the call below passes the output of ``Normalizer``.
    """
    if reference is None:
        reference = matrix
    if matrix.shape[0] == 0:
        return np.empty(0)

    best_scores = []
    for start in range(0, matrix.shape[0], batch_size):
        batch = matrix[start:start + batch_size]
        scores = batch @ reference.T
        best_scores.append(np.max(scores, axis=1))
    return np.concatenate(best_scores)

# DTM_LSA holds the suspicious rows first, followed by the source rows.
# Score each suspicious document against the SOURCE rows: comparing the
# suspicious rows with themselves makes every row match itself, so the
# feature would be the same maximal value for every document.
suspicious['LSA_similarity'] = compute_similarity_matrix_in_batches(
    DTM_LSA[:len(suspicious)],
    reference=DTM_LSA[len(suspicious):],
)

In [None]:
# # 可視化
# sns.swarmplot(x="Plagiarized", y="Jaccard_similarity_score", data=suspicious)
# sns.swarmplot(x="Plagiarized", y="Containment_measure_score", data=suspicious)
# sns.relplot(x="Jaccard_similarity_score", y="Containment_measure_score", hue="Plagiarized", data=suspicious)

# # 相似度特徵與標註的相關性分析
# print(suspicious[['LSA_similarity', 'Jaccard_similarity_score', 'Containment_measure_score', 'Plagiarized']].corr())

# train and save model

In [None]:
# # 模型訓練與測試
# X, y = suspicious[['LSA_similarity', 'Jaccard_similarity_score', 'Containment_measure_score']], suspicious.Plagiarized
# clf = LogisticRegression()

# # 使用分層隨機分割的交叉驗證以減少內存需求
# sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2)
# cross_val_scores = cross_val_score(clf, X, y, cv=sss)
# print(np.mean(cross_val_scores))

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# print(classification_report(y_test, y_pred))

# print(y.value_counts())


# from joblib import dump

# # Save the trained model to a file

# # Colab路徑
# # model_path = '/content/drive/MyDrive/Colab Notebooks/logistic_regression_model.joblib'

# # Server路徑
# model_path = 'logistic_regression_model.joblib'

# dump(clf, model_path)
# print(f"Model saved to {model_path}")


# load model

In [None]:
from joblib import load

# Load the saved model if a previous run produced one. The file is written by
# the training cell further down, so on a fresh checkout it does not exist yet
# and this cell must not abort "Run All".
# NOTE: joblib files are pickles — only load files from a trusted source.
model_path = 'logistic_regression_model.joblib'
if os.path.exists(model_path):
    clf = load(model_path)
else:
    clf = None
    print(f"No saved model at {model_path}; run the training cell to create it.")

# Now you can use the loaded model to make predictions

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from joblib import dump, load
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Model Training and Testing
# Fixed seed so the hold-out split, the saved model and the cross-validation
# folds are identical after every kernel restart.
RANDOM_STATE = 42

X, y = suspicious[['LSA_similarity', 'Jaccard_similarity_score', 'Containment_measure_score']], suspicious.Plagiarized
clf = LogisticRegression()

# Stratified Shuffle Split
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=RANDOM_STATE)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=RANDOM_STATE)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Class balance of the full label column.
print(y.value_counts())

# Save the trained model to a file
model_path = 'logistic_regression_model.joblib'
dump(clf, model_path)
print(f"Model saved to {model_path}")

# Cross-validation scores (cross_val_score fits clones, so the saved clf is untouched)
cross_val_scores = cross_val_score(clf, X, y, cv=sss)
print(f"Mean cross-validation score: {np.mean(cross_val_scores):.4f}")

# Function to calculate the plagiarism ratio using the loaded model
def calculate_plagiarism_ratio_with_model(input_text, model_path, source_texts=None):
    """Score ``input_text`` with the classifier stored at ``model_path``.

    Parameters
    ----------
    input_text : str
        Text to check.
    model_path : str
        Path of a joblib file containing a fitted classifier that exposes
        ``predict``, ``predict_proba`` and ``classes_``.
    source_texts : iterable of str, optional
        Reference texts to compare against. When omitted, the original
        one-element placeholder list is used, which makes the scores
        meaningless — pass the real reference corpus.

    Returns
    -------
    (float, bool)
        ``probability`` – the model's probability for the plagiarised class
        (label ``1``), or 0.0 if the model has no such class;
        ``is_plagiarized`` – True when the predicted label equals ``1``.

    NOTE(review): the features built here (plain TF-IDF cosine similarity and
    word-set overlap) are not computed the same way as the training columns
    (LSA-projected similarity and trigram-set overlap). Confirm whether the
    inference features should reuse the training pipeline.
    """
    # Load the trained model
    # NOTE: joblib files are pickles — only load files from a trusted source.
    clf = load(model_path)
    print(f"Model loaded from {model_path}")

    if source_texts is None:
        source_texts = ["Add your reference source texts here"]  # Replace with actual source texts
    source_texts = list(source_texts)

    # Preprocess the input text: tokenise, drop stop words, lemmatise.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [
        lemmatizer.lemmatize(token.lower())
        for token in word_tokenize(input_text)
        if token.lower() not in stop_words
    ]
    input_text_processed = ' '.join(tokens)

    # Calculate TF-IDF and similarity features
    vectorizer = TfidfVectorizer()
    source_vectors = vectorizer.fit_transform(source_texts)
    input_vector = vectorizer.transform([input_text_processed])
    lsa_similarity = cosine_similarity(input_vector, source_vectors).max()

    # Build both word sets once; guard the divisions so an input with no
    # remaining tokens scores 0.0 instead of raising ZeroDivisionError.
    token_set = set(tokens)
    source_vocabulary = set(' '.join(source_texts).split())
    shared = token_set & source_vocabulary
    combined = token_set | source_vocabulary
    jaccard_similarity = len(shared) / len(combined) if combined else 0.0
    containment_score = len(shared) / len(token_set) if token_set else 0.0

    # Create the feature set for prediction
    features = [[lsa_similarity, jaccard_similarity, containment_score]]

    # Predict plagiarism
    prediction = clf.predict(features)
    is_plagiarized = bool(prediction[0] == 1)

    # Report the probability of the plagiarised class itself. Taking the
    # maximum over classes would return the confidence of whichever class won,
    # so a confident "not plagiarised" would show up as a high ratio.
    class_labels = list(clf.classes_)
    class_probabilities = clf.predict_proba(features)[0]
    if 1 in class_labels:
        probability = float(class_probabilities[class_labels.index(1)])
    else:
        probability = 0.0

    return probability, is_plagiarized


In [None]:
# # Test the function with user input
# while True:
#     input_text = input("Enter text to check for plagiarism (type 'exit' to quit): ")
#     if input_text.lower() == 'exit':
#         break
#     probability, is_plagiarized = calculate_plagiarism_ratio_with_model(input_text, model_path)
#     output = f"Plagiarism Ratio: {probability:.2f}, Prediction: {'Plagiarized' if is_plagiarized else 'Not Plagiarized'}"
#     print(output)

In [None]:
# launch gradio
import gradio as gr

# Callbacks for the individual tabs
def Human_Plagiarism_Detection(input_text):
    """Score ``input_text`` with the saved model and return a one-line summary
    containing the ratio (two decimals) and the predicted label."""
    probability, is_plagiarized = calculate_plagiarism_ratio_with_model(input_text, model_path)
    verdict = 'Plagiarized' if is_plagiarized else 'Not Plagiarized'
    return f"Plagiarism Ratio: {probability:.2f}, Prediction: {verdict}"

def AI_Plagiarism_Detection(input_text):
    """Placeholder callback: ignores ``input_text`` and returns a fixed string."""
    placeholder_reply = "hello"
    return placeholder_reply

# Build the interface
with gr.Blocks() as demo:
    with gr.Tabs():
        # Tab 1: Human Plagiarism Detection
        with gr.TabItem("Human Plagiarism Detection"):
            with gr.Row():
                text_input = gr.Textbox(label="Input Text", lines=5, placeholder="Enter text here...")
            process_btn = gr.Button("Calculate")
            result = gr.Textbox(label="Plagiarism Ratio")
            process_btn.click(Human_Plagiarism_Detection, inputs=text_input, outputs=result)


        # Tab 2: AI Plagiarism Detection
        with gr.TabItem("AI Plagiarism Detection"):
            with gr.Row():
                text_input = gr.Textbox(label="Input Text", lines=5, placeholder="Enter text here...")
            process_btn = gr.Button("Calculate")
            result = gr.Textbox(label="Plagiarism Ratio")
            # This tab must call its own handler; it was previously wired to the
            # human-detection callback, leaving AI_Plagiarism_Detection unused.
            process_btn.click(AI_Plagiarism_Detection, inputs=text_input, outputs=result)

In [None]:
# Start the Gradio app built in the previous cell.
# NOTE(review): share=True asks Gradio for a publicly reachable link — confirm
# that exposing the demo outside this machine is intended.
demo.launch(share=True)

In [None]:
# Shut the running demo down.
# NOTE(review): under "Run All" this executes right after the launch cell —
# presumably it is meant to be run manually once finished; confirm.
demo.close()