In [35]:
import pandas as pd
from tqdm import tqdm

In [36]:
import os
import json

def preprocess_data(data_path):
    data = []
    
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.json'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r') as f:
                    json_data = json.load(f)
                    entry = {
                        "abstract": json_data["abstract"],
                        "paper_id": json_data["articleNumber"], # Assuming "articleNumber" is equivalent to "paper_id"
                        # You can include more attributes as needed
                        # "articleTitle": json_data["articleTitle"],
                        # "authors": json_data["authors"],
                        # "doi": json_data["doi"],
                        # "publicationTitle": json_data["publicationTitle"],
                        # "publicationYear": json_data["publicationYear"],
                        # "volume": json_data["volume"],
                        # "documentLink": json_data["documentLink"],
                        # Add more attributes as required
                    }
                    data.append(entry)
    
    return data

data_path = "/kaggle/input/data-set/data_set"
source_data = preprocess_data(data_path)


In [26]:
source_data

[{'abstract': 'The increased addition of DERs, their intermittent nature, and the high cost of ESSs are the key hurdles for interconnected multi-microgrids to operate economically. It is important to have a system that is simple to adopt, computationally inexpensive, does not share private data. In this paper, an energy management scheme is proposed for multiple interconnected renewable energy resources within t...',
  'paper_id': '9634005'},
 {'abstract': 'This study proposes an algorithm to save driving energy in an autonomous vehicle based on vehicle-to-vehicle technology. Saving the vehicular driving energy can be realized by reducing unnecessary deceleration and acceleration occurred in road congestion and by reducing the resistance caused by the internal factors of the vehicle. The algorithm proposed in this study defines cornering resistance, ...',
  'paper_id': '9906095'},
 {'abstract': 'To simplify the system structure of permanent magnet synchronous linear motor (PMSLM), opti

In [37]:
!pip -q install transformers
import torch
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer,  AutoModelForSequenceClassification

# Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity


In [38]:
model_path = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_path, 
                                          do_lower_case=True)

model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                          output_attentions=False,
                                                          output_hidden_states=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
def create_vector_from_text(tokenizer, model, text, MAX_LEN = 510):
    
    input_ids = tokenizer.encode(
                        text, 
                        add_special_tokens = True, 
                        max_length = MAX_LEN,                           
                   )    

    results = pad_sequences([input_ids], maxlen=MAX_LEN, dtype="long", 
                              truncating="post", padding="post")
    
    # Remove the outer list.
    input_ids = results[0]

    # Create attention masks    
    attention_mask = [int(i>0) for i in input_ids]
    
    # Convert to tensors.
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)

    # Add an extra dimension for the "batch" (even though there is only one 
    # input in this batch.)
    input_ids = input_ids.unsqueeze(0)
    attention_mask = attention_mask.unsqueeze(0)
    
    # Put the model in "evaluation" mode, meaning feed-forward operation.
    model.eval()
    
    # Run the text through BERT, and collect all of the hidden states produced
    # from all 12 layers. 
    with torch.no_grad():        
        logits, encoded_layers = model(
                                    input_ids = input_ids, 
                                    token_type_ids = None, 
                                    attention_mask = attention_mask,
                                    return_dict=False)

    layer_i = 12 # The last BERT layer before the classifier.
    batch_i = 0 # Only one input in the batch.
    token_i = 0 # The first token, corresponding to [CLS]
        
    # Extract the embedding.
    vector = encoded_layers[layer_i][batch_i][token_i]

    # Move to the CPU and convert to numpy ndarray.
    vector = vector.detach().cpu().numpy()

    return(vector)

In [52]:
import numpy as np
import json

def create_vector_database(data):
    
    # The list of all the vectors and 
    vectors = []
  

# Assuming your JSON file is named 'data.json'
with open('data.json', 'r') as json_file:
    json_data = json.load(json_file)

# Access the 'abstract' field from each item in the JSON data
source_data = [item['abstract'] for item in json_data]

    # Loop over all the comment and get the embeddings
for text in tqdm(source_data):
        
        # Get the embedding 
    vector = create_vector_from_text(tokenizer, model, text)
        
        #add it to the list
    vectors.append(vector)
    
    data["vectors"] = vectors
    data["vectors"] = data["vectors"].apply(lambda emb: np.array(emb))
    data["vectors"] = data["vectors"].apply(lambda emb: emb.eshape(1, -1))
    
    return data

FileNotFoundError: [Errno 2] No such file or directory: 'data.json'

In [49]:
vector_database = create_vector_database(source_data)
     

TypeError: create_vector_database() missing 2 required positional arguments: 'tokenizer' and 'model'

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)


Accuracy: 1.0

Classification Report:

              precision    recall  f1-score   support



           0       1.00      1.00      1.00         1

           1       1.00      1.00      1.00         1



    accuracy                           1.00         2

   macro avg       1.00      1.00      1.00         2

weighted avg       1.00      1.00      1.00         2




In [None]:
joblib.dump(model, 'plagiarism_model.pkl')

['plagiarism_model.pkl']

In [None]:
# Load the saved model
loaded_model = joblib.load('plagiarism_model.pkl')

# New text for plagiarism detection
new_text = "Rome dates back to 753 BC, so it has a long history."

# Preprocess the new text (e.g., apply the same preprocessing steps as during training)
new_text = preprocess_text(new_text)

# Convert the preprocessed text into TF-IDF vectors (assuming you have the vectorizer)
new_text_vector = tfidf_vectorizer.transform([new_text])

# Make predictions using the loaded model
prediction = loaded_model.predict(new_text_vector)

# Calculate cosine similarity between new text and training data
cosine_similarity_score = cosine_similarity(new_text_vector, X_train).max()

# Interpret the prediction and similarity score
if prediction[0] == 0:
    print("The text is not plagiarized.")
else:
    print(f"The text is plagiarized with a similarity score of {cosine_similarity_score*100:.2f}%.")


The text is plagiarized with a similarity score of 94.57%.
