In [None]:
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# The cells shown in this notebook only read the English stop-word list from NLTK data,
# so fetch just that corpus rather than the whole 'popular' collection. quiet=True keeps
# the downloader log out of the cell output. Add further packages here if later cells
# need them.
nltk.download('stopwords', quiet=True)

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

Load Dataset

In [None]:
# Read the CSV into a DataFrame and preview its first rows.
dataset_path = '/dataset.csv'
data = pd.read_csv(dataset_path)
data.head()

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
0,0,Researchers have discovered a new species of b...,Scientists have found a previously unknown but...,1
1,1,The moon orbits the Earth in approximately 27....,Our natural satellite takes around 27.3 days t...,1
2,2,Water is composed of two hydrogen atoms and on...,H2O consists of 2 hydrogen atoms and 1 oxygen ...,1
3,3,The history of Rome dates back to 753 BC.,Rome has a long history that can be traced bac...,1
4,4,Pluto was once considered the ninth planet in ...,"In the past, Pluto was classified as the ninth...",1


In [None]:
# Class-balance check: number of rows for each value of the 'label' column.
data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,187
1,183


In [None]:
# (rows, columns) of the loaded frame.
data.shape

(370, 4)

## **CLEAN TEXT**

In [None]:
# Built once when the cell runs instead of on every call: the function is applied to
# every row of two columns, and reloading the stop-word list each time is wasted work.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
_STOP_WORDS = frozenset(stopwords.words('english'))


def proprocess_text(text):
  """Clean a piece of text for vectorization.

  Steps, in order: delete every character in ``string.punctuation``, lowercase
  the result, split on whitespace and drop the NLTK English stop words.

  Args:
    text: the raw text. Anything that is not a ``str`` (for example a missing
      value read from the CSV) is treated as empty text.

  Returns:
    The remaining words joined by single spaces; "" when nothing remains.
  """
  if not isinstance(text, str):
    return ""
  text = text.translate(_PUNCTUATION_TABLE).lower()
  return " ".join(word for word in text.split() if word not in _STOP_WORDS)


proprocess_text(" this is my $%@!^&*(%$?>:| text to use for dummy text")

'text use dummy text'

In [None]:
# Run the cleaning function over both text columns, replacing the raw text in place.
for text_column in ('source_text', 'plagiarized_text'):
  data[text_column] = data[text_column].apply(proprocess_text)

In [None]:
# Display the frame after cleaning to eyeball the transformed text columns.
data

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label
0,0,researchers discovered new species butterfly a...,scientists found previously unknown butterfly ...,1
1,1,moon orbits earth approximately 273 days,natural satellite takes around 273 days comple...,1
2,2,water composed two hydrogen atoms one oxygen atom,h2o consists 2 hydrogen atoms 1 oxygen atom,1
3,3,history rome dates back 753 bc,rome long history traced back 753 bc,1
4,4,pluto considered ninth planet solar system,past pluto classified ninth planet suns planet...,1
...,...,...,...,...
365,397,playing musical instruments enhances creativity,creativity enhanced playing musical instruments,0
366,398,studying history helps understanding present,understanding present aided studying history,0
367,399,listening classical music improve focus,focus improved listening classical music,0
368,400,practicing yoga enhances physical flexibility,physical flexibility enhanced practicing yoga,0


**VECTORIZATION**

In [None]:
# Each sample is the cleaned source sentence and its paired sentence joined by a space.
# NOTE(review): the vectorizer is fitted on every row before the train/test split below,
# so its vocabulary and IDF weights also reflect the held-out rows.
paired_text = data['source_text'] + " " + data['plagiarized_text']
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(paired_text)

In [None]:
# Target vector: the 'label' column (the detection cell treats 1 as "plagiarism"; presumably 0 means none — TODO confirm against the dataset).
y = data['label']

**TRAIN TEST SPLIT**

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

**APPLYING LOGISTIC REGRESSION**

In [None]:
# Fit a logistic regression classifier on the TF-IDF features and score it on the held-out rows.
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# The report and the matrix are multi-line strings: printing the label on its own line
# keeps their header and first row aligned with the rows that follow.
print("accuracy", accuracy_score(y_test, y_pred))
print("classification")
print(classification_report(y_test, y_pred))
print("confusion")
print(confusion_matrix(y_test, y_pred))

accuracy 0.8243243243243243
classification               precision    recall  f1-score   support

           0       0.79      0.86      0.82        35
           1       0.86      0.79      0.83        39

    accuracy                           0.82        74
   macro avg       0.83      0.83      0.82        74
weighted avg       0.83      0.82      0.82        74

confusion [[30  5]
 [ 8 31]]


**RANDOM FOREST MODEL**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest (seeded for repeatability) and score it on the held-out rows.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# The report and the matrix are multi-line strings: printing the label on its own line
# keeps their header and first row aligned with the rows that follow.
print("accuracy", accuracy_score(y_test, y_pred))
print("classification")
print(classification_report(y_test, y_pred))
print("confusion")
print(confusion_matrix(y_test, y_pred))

accuracy 0.7972972972972973
classification               precision    recall  f1-score   support

           0       0.71      0.97      0.82        35
           1       0.96      0.64      0.77        39

    accuracy                           0.80        74
   macro avg       0.83      0.81      0.79        74
weighted avg       0.84      0.80      0.79        74

confusion [[34  1]
 [14 25]]


**NAIVE BAYES MODEL**

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier and score it on the held-out rows.
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# The report and the matrix are multi-line strings: printing the label on its own line
# keeps their header and first row aligned with the rows that follow.
print("accuracy", accuracy_score(y_test, y_pred))
print("classification")
print(classification_report(y_test, y_pred))
print("confusion")
print(confusion_matrix(y_test, y_pred))

accuracy 0.8648648648648649
classification               precision    recall  f1-score   support

           0       0.86      0.86      0.86        35
           1       0.87      0.87      0.87        39

    accuracy                           0.86        74
   macro avg       0.86      0.86      0.86        74
weighted avg       0.86      0.86      0.86        74

confusion [[30  5]
 [ 5 34]]


**SVM**

In [None]:
from sklearn.svm import SVC

# Fit a linear-kernel support vector classifier and score it on the held-out rows.
# `model` keeps this estimator afterwards, so it is the one the save cell below pickles.
model = SVC(kernel='linear', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# The report and the matrix are multi-line strings: printing the label on its own line
# keeps their header and first row aligned with the rows that follow.
print("accuracy", accuracy_score(y_test, y_pred))
print("classification")
print(classification_report(y_test, y_pred))
print("confusion")
print(confusion_matrix(y_test, y_pred))

accuracy 0.8783783783783784
classification               precision    recall  f1-score   support

           0       0.86      0.89      0.87        35
           1       0.89      0.87      0.88        39

    accuracy                           0.88        74
   macro avg       0.88      0.88      0.88        74
weighted avg       0.88      0.88      0.88        74

confusion [[31  4]
 [ 5 34]]


**SAVE SVM AND VECTORIZER**

In [None]:
import pickle

# `model` is whichever estimator was fitted last; run top to bottom, that is the linear SVC.
# Context managers make sure each file is flushed and closed once it has been written.
# The vectorizer file name must stay identical to the one the load cell opens.
with open('model.pkl', 'wb') as model_file:
  pickle.dump(model, model_file)
with open('tfdf_vectorizer.pkl', 'wb') as vectorizer_file:
  pickle.dump(tfidf_vectorizer, vectorizer_file)

**LOAD MODEL AND VECTORIZER**

In [None]:
# Reload the saved estimator and vectorizer; the context managers close each file after reading.
# NOTE: unpickling executes code stored in the file, so only load pickles this notebook wrote itself.
with open('model.pkl', 'rb') as model_file:
  model = pickle.load(model_file)
with open('tfdf_vectorizer.pkl', 'rb') as vectorizer_file:
  tfidf_vectorizer = pickle.load(vectorizer_file)

**DETECTION SYSTEM**

In [None]:
def detect(input_text):
  """Classify one piece of text with the loaded model.

  Args:
    input_text: raw text to check.

  Returns:
    "Plagiarism Detected" when the model predicts label 1, otherwise
    "No Plagiarism Detected".
  """
  # Clean the input with the same function that was applied to the training columns,
  # so the vectorizer sees text in the form it was fitted on.
  cleaned_text = proprocess_text(input_text)
  vectorized_text = tfidf_vectorizer.transform([cleaned_text])
  result = model.predict(vectorized_text)
  return "Plagiarism Detected" if result[0] == 1 else "No Plagiarism Detected"

In [None]:
# Example 1: expected to be flagged. The sentence appears to match the (truncated) source_text of row 0 in the dataset preview, which has label 1.
input_text = 'Researchers have discovered a new species of butterfly in the Amazon Rainforest.'
detect(input_text)

'plagiarism detected'

In [None]:
# Example 2: expected to come back as not plagiarised.
input_text = 'The quick brown fox jumps over the lazy dog.'
detect(input_text)

'No Plagiarism Detected'

In [None]:
# Example 3: expected to come back as not plagiarised; the cleaned form of this sentence appears in the frame display above with label 0.
input_text= 'Practicing Yoga enhances physical flexibility.'
detect(input_text)

'No Plagiarism Detected'

In [None]:
# Record the installed scikit-learn version; the pickled model and vectorizer should be loaded with a matching version.
import sklearn
sklearn.__version__

'1.6.1'

In [None]:
!pip install flask-ngrok


Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [None]:
from flask import Flask, request, jsonify
from flask_ngrok import run_with_ngrok
import difflib

# Create the Flask application object that the route decorators below register on.
app = Flask(__name__)
# NOTE(review): the recorded run shows a ConnectionRefusedError raised in a background thread right after startup — presumably the ngrok tunnel was never established; confirm before relying on a public URL.
run_with_ngrok(app)  # Automatically starts ngrok when you run the app

@app.route('/')
def home():
    """Root route: return a fixed status string showing the server is up."""
    return "Flask app is running on Colab!"

@app.route('/check', methods=['POST'])
def check_plagiarism():
    """Return the character-level similarity of two texts as a percentage.

    Expects a JSON object with string fields ``text1`` and ``text2``.

    Returns:
        ``{"similarity": <float>}``, where the value is the difflib
        ``SequenceMatcher`` ratio scaled to 0-100 and rounded to two decimals.
        A malformed request gets ``{"error": <message>}`` with HTTP status 400.
    """
    # silent=True yields None instead of raising when the body is missing or not
    # valid JSON, so every malformed request is answered with the same 400 response.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    text1 = payload.get('text1')
    text2 = payload.get('text2')
    if not isinstance(text1, str) or not isinstance(text2, str):
        return jsonify({'error': "'text1' and 'text2' must both be strings."}), 400

    # Use difflib to calculate similarity
    similarity = difflib.SequenceMatcher(None, text1, text2).ratio()
    return jsonify({'similarity': round(similarity * 100, 2)})

# Start the development server; the recorded output shows it serving on http://127.0.0.1:5000 until interrupted.
app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
Exception in thread Thread-9:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connection.py", line 198, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.11/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connectionpool.py", line 787, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/urllib3/connectionpool.py", line 493, in _make_reques