In [1]:
import os
import unicodedata
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Gowri
[nltk_data]     sri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
telugu_stopwords = set([
    "ఇది", "ఒక", "ఈ", "మరియు", "కాని", "అందుకే", "మీరు", "వారిని", "ఉన్న",
    "ఉంటే", "ఎలా", "ఎందుకు", "కాబట్టి", "ఎప్పుడు", "ఇంకా", "మాత్రమే", "మొత్తం",
    "నేను", "నీకు", "వారు", "ఆది", "మంచి", "తరువాత", "కూడా", "అక్కడ", "మీ", 
    "వెంట", "అంటే", "ఇంకా", "చెప్పారు", "అవి", "మరి", "అందరూ", "అప్పటికీ", 
    "దాని", "అంతా", "ఎవరూ", "లేదా", "ఏది", "ఎప్పుడూ", "ఎక్కడా", "అవును", 
    "కాబట్టి", "అప్పుడు", "అందువల్ల", "విషయం", "ఎందుకంటే", "వద్ద", "చేత", "పైన"
])

In [4]:
def preprocess_text(text):
    text = unicodedata.normalize('NFKC', text)  
    telugu_punctuation = '।॥,!?।'  
    text = re.sub(f"[{telugu_punctuation}]", "", text)  
    tokens = text.split() 
    tokens = [word for word in tokens if word not in telugu_stopwords] 
    return " ".join(tokens) 

In [5]:
folder_path = 'telugu_text'


In [6]:
file_texts = []
file_names = []


In [7]:
for filename in os.listdir(folder_path):
    if filename.endswith('.txt'): 
        file_path = os.path.join(folder_path, filename)
        
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        preprocessed_text = preprocess_text(text)
        
        file_texts.append(preprocessed_text)
        file_names.append(filename)

In [8]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(file_texts)

In [9]:
similarity_matrix = cosine_similarity(tfidf_matrix)


In [10]:
for i in range(len(file_names)):
    for j in range(i + 1, len(file_names)):  
        similarity_score = similarity_matrix[i][j]
        if similarity_score > 0.5: 
            print(f"Plagiarism detected between {file_names[i]} and {file_names[j]} with similarity: {similarity_score:.4f}")

Plagiarism detected between 11.txt and 43.txt with similarity: 0.5371
Plagiarism detected between 22.txt and seven.txt with similarity: 0.5476
Plagiarism detected between 34.txt and three.txt with similarity: 0.5703
