# CMPS285-AI-Project
Done by: Jad Raad, Ali Younes, Ali Hamdan, Ahmad Termos
Presented to: Dr. Ahmad Elhaj
Fall 2023-2024

# This part of the code is responsible for creating a simple Tkinter-based graphical user interface (GUI) for a plagiarism checker application.

Let's go through each component and method in detail:

Import Statements:

import tkinter as tk: Imports the Tkinter module, which provides a set of tools for creating graphical user interfaces.
from tkinter import filedialog: Imports the filedialog submodule from Tkinter, which provides functions for opening and saving files.
import requests: Imports the requests module, which is used for making HTTP requests to the FastAPI backend.

Class Definition: PlagiarismCheckerApp

__init__ method:

Initializes the main application window (root) and sets its title to "Plagiarism Checker".
Creates various UI elements using Tkinter widgets such as buttons and labels.
Sets up the layout by packing these widgets with specified padding.
Initializes the file_path attribute to None, which will be used to store the path of the uploaded document.

upload_file method:

Invokes the file dialog to allow the user to select a file for upload.
If a file is selected, the file path is stored in the file_path attribute. 
check_plagiarism method:

Checks if a file has been uploaded. If not, it updates the result label with a message indicating that a document needs to be uploaded.
If a file is uploaded, it sends a POST request to the FastAPI backend (http://127.0.0.1:8000/detect) with the file content.
Parses the response and updates the result label with the plagiarism detection result.

reset method:

Resets the state of the application, allowing the user to upload a new file. It resets the result label and sets the file_path attribute to None.

Main Execution Block:
Checks if the script is being run as the main program (if __name__ == "__main__":).
Creates a Tkinter root window (root), and an instance of the PlagiarismCheckerApp class (app) is created with this root window.
The mainloop() method is called on the root window, which starts the Tkinter event loop, allowing the user to interact with the GUI.

In [None]:
import tkinter as tk
from tkinter import filedialog
import requests

class PlagiarismCheckerApp:
    """Tkinter front end for the plagiarism checker.

    The window offers three actions: pick a document, send it to the
    detection backend at ``http://127.0.0.1:8000/detect`` and display the
    returned similarity percentage, and reset the state.
    """

    def __init__(self, root):
        """Build the widgets inside *root* and start with no file selected.

        Args:
            root: The Tk window that hosts the widgets; its title is set here.
        """
        self.root = root
        self.root.title("Plagiarism Checker")

        # UI Elements
        self.upload_button = tk.Button(root, text="Upload Document", command=self.upload_file)
        self.check_button = tk.Button(root, text="Check Plagiarism", command=self.check_plagiarism)
        self.result_label = tk.Label(root, text="Plagiarism Result: ")
        self.reset_button = tk.Button(root, text="Reset", command=self.reset)

        # Layout: stack the widgets vertically with uniform padding.
        self.upload_button.pack(pady=10)
        self.check_button.pack(pady=10)
        self.result_label.pack(pady=10)
        self.reset_button.pack(pady=10)

        # Path of the document chosen by the user; None until one is picked.
        self.file_path = None

    def upload_file(self):
        """Ask the user for a file and remember its path.

        Cancelling the dialog yields an empty result, in which case any
        previously selected file is kept.
        """
        file_path = filedialog.askopenfilename()
        if file_path:
            self.file_path = file_path

    def _show_failure(self):
        """Display the generic failure message in the result label."""
        self.result_label.config(text="Plagiarism check failed")

    def check_plagiarism(self):
        """Send the selected document to the backend and show the result.

        Every failure (no readable file, backend unreachable, non-200
        status, malformed response body) is reported through the result
        label rather than raised, because this method runs as a Tk button
        callback.
        """
        if not self.file_path:
            self.result_label.config(text="Upload a document first.")
            return

        url = "http://127.0.0.1:8000/detect"

        # Open the document separately so an unreadable or vanished file is
        # reported to the user instead of raising inside the callback.
        try:
            document = open(self.file_path, 'rb')
        except OSError:
            self._show_failure()
            return

        # The context manager guarantees the handle is closed once the
        # request has finished, whether it succeeded or not.
        with document:
            files = {'file': document}
            data = {'file_path': self.file_path}
            try:
                # No timeout is set on purpose: the backend trains a model
                # per request, so responses can take a long time.
                response = requests.post(url, files=files, data=data)
            except requests.RequestException:
                self._show_failure()
                return

        if response.status_code != 200:
            self._show_failure()
            return

        # Guard against a body that is not JSON or lacks the expected key.
        try:
            similarity_percentage = float(response.json()['similarity_percentage'])
        except (KeyError, TypeError, ValueError):
            self._show_failure()
            return

        result_text = f"Plagiarism Result: {similarity_percentage:.2f}%"
        self.result_label.config(text=result_text)

    def reset(self):
        """Clear the displayed result and forget the selected file."""
        self.result_label.config(text="Plagiarism Result: ")
        self.file_path = None

if __name__ == "__main__":
    # Script entry point: create the top-level window, attach the app to it,
    # and hand control to Tk's event loop until the window is closed.
    main_window = tk.Tk()
    checker_gui = PlagiarismCheckerApp(main_window)  # bound to a name so it stays referenced
    main_window.mainloop()


# This part of the code is responsible for the FastAPI backend of the plagiarism checker. 

It includes functions for loading a dataset, training a new plagiarism model, and handling the plagiarism detection endpoint.

Let's break down each component:

Import Statements:
import pandas as pd: Imports the Pandas library for data manipulation and analysis.
from fastapi import FastAPI, File, UploadFile, Form, HTTPException: Imports necessary modules and classes from FastAPI for creating the API.
from fastapi.responses import JSONResponse: Imports JSONResponse class from FastAPI for returning JSON responses.
import shutil: Imports the shutil module for file operations.
import os: Imports the os module for operating system-related functions.
from PlagiarismChecker import train_plagiarism_model: Imports the train_plagiarism_model function from a module named PlagiarismChecker.

FastAPI Setup:

app = FastAPI(): Initializes a FastAPI application.

dataset_path = 'C:\\Users\\user\\Downloads\\CMPS285-AI-Project\\fake.xlsx': Specifies the path to the Excel dataset. You should replace this with the actual path to your dataset.

Dataset Loading Function: load_dataset

This function attempts to load the Excel dataset specified by dataset_path.
If successful, it checks whether the required columns ('text' and 'label') are present in the dataset.
If everything is in order, it extracts the 'text' and 'label' columns from the DataFrame and returns them as lists.
If any error occurs during dataset loading, it prints an error message and returns empty lists.

Model Training Function: train_new_model

Calls the load_dataset function to get the 'text' and 'label' lists.
Calls the train_plagiarism_model function from the imported module, passing the 'text' and 'label' lists.
Returns the trained plagiarism model.

Plagiarism Detection Endpoint: @app.post("/detect")

This is a FastAPI endpoint for handling POST requests to the "/detect" path.
It receives an uploaded file (file) and a file path (file_path) as form data.
Calls train_new_model to obtain a trained plagiarism model.
Temporarily saves the uploaded file to "temp_file.txt" using shutil.copyfileobj.
Tries to predict plagiarism using the trained model and a sample text. If an exception occurs during prediction, it returns an error response.
Finally, removes the temporary file.
Returns a JSON response containing the calculated similarity percentage.

In [None]:
import pandas as pd
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse
import shutil
import os
from PlagiarismChecker import train_plagiarism_model

# FastAPI application instance; endpoints are registered on it via @app.post.
app = FastAPI()

# Location of the Excel training dataset that train_new_model() passes to load_dataset().
dataset_path = 'C:\\Users\\user\\Downloads\\CMPS285-AI-Project\\fake.xlsx'  # Replace with your Excel dataset path

def load_dataset(dataset_path):
    """Read the training spreadsheet and return its texts and labels.

    Args:
        dataset_path: Path of the Excel workbook to read with openpyxl.

    Returns:
        A ``(texts, labels)`` pair of lists taken from the 'text' and
        'label' columns. Two empty lists are returned when the file cannot
        be loaded or either column is missing; the reason is printed.
    """
    try:
        frame = pd.read_excel(dataset_path, engine='openpyxl')
        print("Dataset loaded successfully.")

        # Both columns must exist before anything is extracted.
        if not all(column in frame.columns for column in ('text', 'label')):
            print("Columns 'text' and 'label' are required in the dataset.")
            return [], []

        samples = frame['text'].tolist()
        targets = frame['label'].tolist()
        return samples, targets

    except FileNotFoundError:
        print(f"Error: File '{dataset_path}' not found.")

    except pd.errors.EmptyDataError:
        print("Error: The Excel file is empty.")

    except Exception as e:
        print(f"Error loading dataset: {e}")

    # Reached only after one of the handlers above reported a failure.
    return [], []

def train_new_model():
    """Load the configured dataset and train a fresh plagiarism model on it.

    Returns:
        Whatever ``train_plagiarism_model`` returns for the texts and labels
        read from ``dataset_path``.
    """
    # The dataset is re-read on every call, so each call trains from scratch.
    texts, labels = load_dataset(dataset_path)
    return train_plagiarism_model(texts, labels)

@app.post("/detect")
async def detect_plagiarism(file: UploadFile = File(...), file_path: str = Form(...)):
    """Score an uploaded document with a freshly trained plagiarism model.

    Args:
        file: The uploaded document whose text is scored.
        file_path: Client-side path of the document. It is required by the
            form contract but not used by the server.

    Returns:
        A JSON response with ``similarity_percentage`` on success, or an
        ``error`` message with status 500 when training or prediction fails.
    """
    # Local import keeps this block self-contained without touching the
    # module's import section.
    import tempfile

    # Train a new model for each new file.
    # NOTE(review): training runs synchronously inside this async handler and
    # is repeated on every request; consider caching the trained model.
    plagiarism_model = train_new_model()

    if plagiarism_model is None:
        return JSONResponse(content={"error": "Failed to load or train the plagiarism model."}, status_code=500)

    # A unique temporary file avoids concurrent requests overwriting each
    # other's upload, which a fixed file name would allow.
    temp_fd, temp_path = tempfile.mkstemp(suffix=".txt")

    try:
        # Save the uploaded file temporarily.
        with os.fdopen(temp_fd, "wb") as temp_file:
            shutil.copyfileobj(file.file, temp_file)

        # Read the document back as text; undecodable bytes are replaced so
        # that a non-UTF-8 upload does not abort the request.
        with open(temp_path, "r", encoding="utf-8", errors="replace") as saved_document:
            document_text = saved_document.read()

        # Score the uploaded document itself. float() converts the NumPy
        # scalar returned by predict() into a JSON-serializable number.
        similarity_percentage = float(plagiarism_model.predict([document_text])[0]) * 100
    except Exception as e:
        # Handle any errors that might occur while saving, reading or predicting.
        return JSONResponse(content={"error": str(e)}, status_code=500)
    finally:
        # Remove the temporary file.
        os.remove(temp_path)

    # Return the similarity percentage in the response.
    return JSONResponse(content={"similarity_percentage": similarity_percentage})


# This part of the code is responsible for the Plagiarism Checker Module:

It starts by defining a dataset class (PlagiarismDataset), a plagiarism model class (PlagiarismModel), and a function (train_plagiarism_model) to train the plagiarism detection model using BERT (Bidirectional Encoder Representations from Transformers) from the Hugging Face Transformers library.

Let's break down each component:

PlagiarismDataset Class:

This class inherits from the Dataset class in PyTorch and is responsible for preparing the data for training the plagiarism model.

__init__ Method:
Initializes the dataset with texts, labels, a tokenizer, and a maximum sequence length.

__len__ Method:
Returns the number of samples in the dataset.

__getitem__ Method:
Retrieves a specific sample at the given index (idx).
Converts the text and label to the appropriate format for model input.
Tokenizes the text using the provided tokenizer (BertTokenizer).
Returns a dictionary containing 'input_ids', 'attention_mask', and 'label'.

PlagiarismModel Class:

This class defines the plagiarism detection model using BERT.

__init__ Method:
Initializes the model with a tokenizer, the BERT model, and a device (default is set to 'cuda' if GPU is available, otherwise 'cpu').

train Method:
Trains the model using the provided training texts and labels.
Tokenizes the texts using the tokenizer and creates a DataLoader for training.
Uses the AdamW optimizer for training.
Iterates over the specified number of epochs and, within each epoch, over the batches, updating the model parameters based on the training loss.

predict Method:
Evaluates the model on a set of texts and returns the predicted probabilities for plagiarism.

predict_with_similar_text Method:
Evaluates the model on a set of texts and returns both the predicted probabilities for plagiarism and a list of similar texts.

train_plagiarism_model Function:
This function creates an instance of the PlagiarismModel class, loads the BERT tokenizer, and initializes the BERT model for sequence classification.

Error Handling:
Checks if there is any training data provided. If not, it prints an error message and returns None.

Model Training:
Calls the train method of the plagiarism model to train the model using the provided texts and labels.
Return:

Returns the trained plagiarism model.

# BERT Tokenizer
BERT (Bidirectional Encoder Representations from Transformers) is employed as the underlying model for plagiarism detection. BERT is a powerful pre-trained natural language processing model developed by Google. It has been pre-trained on a large corpus of text data and has shown remarkable performance in various natural language understanding tasks.

Let's break down the role of BERT in this project:

Tokenizer (BertTokenizer):

BERT operates on fixed-size sequences of tokens. The BertTokenizer is responsible for tokenizing the input text into subwords that BERT can understand.
It also adds special tokens, such as [CLS] (classification) and [SEP] (separator), to the beginning and end of the tokenized sequence.
Model Architecture (BertForSequenceClassification):

The BertForSequenceClassification model is a variant of BERT fine-tuned for sequence classification tasks, such as sentiment analysis or, in your case, plagiarism detection.
It consists of the BERT base model architecture with an additional classification layer on top.
The classification layer is trained to predict the class labels (in your case, whether a given text is plagiarized or not) based on the representation learned by BERT.
Training:

The train_plagiarism_model function initializes an instance of PlagiarismModel, where BERT is used as the underlying model.
The model is then trained using a dataset that contains labeled examples of text and their corresponding labels (plagiarized or not).
During training, the model's weights are updated to minimize the classification loss, improving its ability to distinguish between plagiarized and non-plagiarized text.
Prediction:

The trained model can be used for plagiarism detection on new, unseen texts. The predict method takes a list of texts and returns the predicted probabilities of plagiarism for each text.
The predict_with_similar_text method returns the plagiarism probabilities together with a list that, for each input, holds the input text itself when it is predicted as plagiarized (label 1) and an empty string otherwise.
Embedding Semantic Information:

BERT captures rich semantic information and context from the input text. The pre-trained BERT model has learned contextualized embeddings for each token, allowing it to understand the relationships between words and their contexts.
This contextualized information is crucial for understanding the nuances in natural language and is beneficial for tasks like plagiarism detection where the subtle differences in language usage are important.

# AdamW optimizer

Initialization:
self.model.parameters(): Retrieves the parameters (weights) of the plagiarism detection model.
lr=learning_rate: Sets the learning rate, which controls the step size during optimization.

Zeroing Gradients:
Before computing the gradients, the optimizer is instructed to zero out the gradients of the model parameters. This is necessary to avoid accumulating gradients from previous iterations.

Forward Pass:
Performs a forward pass through the plagiarism detection model (self.model).
Calculates the loss based on the model's predictions (outputs) and the ground truth labels (labels).

Backward Pass:
Performs a backward pass to compute the gradients of the loss with respect to the model parameters.

Weight Decay:
Updates the model parameters using the calculated gradients and the specified learning rate.
Additionally, AdamW supports decoupled weight decay during parameter updates: a small fraction of each weight is subtracted directly from the weight itself, rather than being added to the gradient as in L2-regularized Adam. This penalizes large weights and helps prevent overfitting by discouraging overly complex models.

Repeating the Process:
The steps from Zeroing Gradients through Weight Decay are repeated for each batch in the training data.

Monitoring Training:
Accumulates the loss for each batch to calculate the average loss later.

Epoch Summary:
Prints the average loss for the epoch, providing feedback on the training progress.


The hyperparameters are set as follows:

-epochs=3: The model is trained for three epochs. An epoch is one complete pass through the entire training dataset.
-batch_size=16: Training is performed in batches, and each batch contains 16 samples. Batch training is a common technique to make the optimization process more computationally efficient and to leverage parallel processing capabilities.




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch 
from torch.utils.data import DataLoader, Dataset


class PlagiarismDataset(Dataset):
    """Dataset that pairs each text with its label and tokenizes on access.

    Items are dictionaries with the keys 'input_ids', 'attention_mask' and
    'label'.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        """Store the raw samples, the tokenizer and the padding length."""
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def _label_at(self, idx):
        """Return the label at *idx* as an int, falling back to 0 with a warning."""
        raw_label = self.labels[idx]
        try:
            return int(raw_label)
        except ValueError:
            # Handle the case where the label is not a valid integer
            print(f"Warning: Invalid label '{raw_label}' at index {idx}. Setting label to 0.")
            return 0

    def __getitem__(self, idx):
        """Tokenize the sample at *idx* and return its tensors and label."""
        sample_text = str(self.texts[idx])
        sample_label = self._label_at(idx)

        # Pad or truncate every sample to exactly max_length tokens.
        tokenized = self.tokenizer.encode_plus(
            sample_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            add_special_tokens=True,
            return_tensors='pt',
        )

        item = {}
        # The tokenizer's tensors are flattened to one dimension per sample.
        item['input_ids'] = tokenized['input_ids'].flatten()
        item['attention_mask'] = tokenized['attention_mask'].flatten()
        item['label'] = torch.tensor(sample_label, dtype=torch.long)
        return item

class PlagiarismModel:
    """Pair a tokenizer with a sequence-classification model for training and inference.

    Class index 1 of the model's output is treated as the "plagiarized"
    class by the prediction methods.
    """

    def __init__(self, tokenizer, model, device='cuda' if torch.cuda.is_available() else 'cpu'):
        """Store the tokenizer and move *model* to *device*.

        Args:
            tokenizer: Callable tokenizer that returns 'input_ids' and
                'attention_mask' tensors.
            model: Classification model exposing ``.loss`` and ``.logits``
                on its outputs.
            device: Target device; defaults to CUDA when available.
        """
        self.tokenizer = tokenizer
        self.model = model.to(device)
        self.device = device

    def train(self, train_texts, train_labels, epochs=3, batch_size=16, learning_rate=2e-5):
        """Fine-tune the model on the given texts and labels.

        Prints the average loss after every epoch.

        Args:
            train_texts: Sequence of training texts.
            train_labels: Sequence of labels aligned with *train_texts*.
            epochs: Number of full passes over the data.
            batch_size: Number of samples per optimization step.
            learning_rate: Learning rate passed to the optimizer.
        """
        train_dataset = PlagiarismDataset(train_texts, train_labels, self.tokenizer)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # torch.optim.AdamW replaces the deprecated transformers.AdamW.
        # eps and weight_decay are pinned to what are believed to be the old
        # class's defaults so training stays comparable.
        # NOTE(review): confirm those defaults against the transformers
        # version this project was written for. The module-level AdamW import
        # from transformers is no longer needed by this class.
        optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=learning_rate,
            eps=1e-6,
            weight_decay=0.0,
        )

        for epoch in range(epochs):
            self.model.train()
            total_loss = 0.0

            for batch in train_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                # Clear gradients left over from the previous step.
                optimizer.zero_grad()
                outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_loss += loss.item()

                loss.backward()
                optimizer.step()

            average_loss = total_loss / len(train_loader)
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {average_loss:.4f}')

    def _class_probabilities(self, texts):
        """Return the softmax class probabilities for *texts* as a tensor.

        Shared by both prediction methods so that tokenization and inference
        are implemented in one place.
        """
        self.model.eval()
        encoded_texts = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
        input_ids = encoded_texts['input_ids'].to(self.device)
        attention_mask = encoded_texts['attention_mask'].to(self.device)

        # Inference only: gradients are not needed.
        with torch.no_grad():
            logits = self.model(input_ids, attention_mask=attention_mask).logits

        return torch.softmax(logits, dim=1)

    def predict(self, texts):
        """Return the probability of class 1 for each text as a NumPy array.

        Args:
            texts: List of texts to score.
        """
        probabilities = self._class_probabilities(texts)
        return probabilities[:, 1].cpu().numpy()

    def predict_with_similar_text(self, texts):
        """Return class-1 probabilities plus the texts predicted as class 1.

        Args:
            texts: List of texts to score.

        Returns:
            A ``(probabilities, similar_texts)`` pair. ``similar_texts`` has
            one entry per input: the input text itself when its predicted
            label is 1, otherwise an empty string.
        """
        probabilities = self._class_probabilities(texts)
        predicted_labels = torch.argmax(probabilities, dim=1).cpu().numpy()

        # Keep the text only where the model predicted the positive class.
        similar_texts = [text if label == 1 else "" for text, label in zip(texts, predicted_labels)]
        return probabilities[:, 1].cpu().numpy(), similar_texts

def train_plagiarism_model(train_texts, train_labels):
    """Create a BERT-based plagiarism model and train it on the given data.

    Args:
        train_texts: Sequence of training texts.
        train_labels: Sequence of labels aligned with *train_texts*.

    Returns:
        The trained ``PlagiarismModel``, or ``None`` when either input is
        empty (an error message is printed in that case).
    """
    # Validate first: loading the tokenizer and BERT weights is expensive and
    # pointless when there is nothing to train on.
    if len(train_texts) == 0 or len(train_labels) == 0:
        print("Error: No training data provided.")
        return None

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Two output classes are configured via num_labels=2.
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    plagiarism_model = PlagiarismModel(tokenizer, model)
    plagiarism_model.train(train_texts, train_labels)

    return plagiarism_model
