In [4]:
import itertools
import os
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
def preprocess_code(code):
    """Strip docstrings, comments and import lines from source text.

    The clean-up is purely regex based and runs in a fixed order:

    1. spans delimited by triple single quotes (may cross lines),
    2. spans delimited by triple double quotes (may cross lines),
    3. everything from a ``#`` to the end of its line,
    4. lines that begin (after optional whitespace) with ``import`` or
       ``from``.

    Because no real parsing is done, a ``#`` inside a string literal is
    treated like a comment as well.

    Args:
        code: Source text of one submission.

    Returns:
        The text with the spans above removed; line breaks that were not part
        of a removed span are left in place.
    """
    # (pattern, flags) pairs, applied one after another to the running text.
    removal_passes = (
        (r'\'\'\'.*?\'\'\'', re.DOTALL),
        (r'\"\"\".*?\"\"\"', re.DOTALL),
        (r'#.*', 0),
        (r'^\s*(import|from)\s+[^\n]+', re.MULTILINE),
    )

    cleaned = code
    for pattern, flags in removal_passes:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)
    return cleaned

In [None]:
# os.listdir returns entries in arbitrary order; sort the names so the
# file-to-vector pairing built later is the same on every run and machine.
student_files = sorted(doc for doc in os.listdir('code_data') if doc.endswith('.py'))

In [None]:
# Read every submission listed in student_files and normalise its text with
# preprocess_code. A context manager is used so each file handle is closed
# as soon as it has been read instead of being left to the garbage collector.
student_code = []
for _file in student_files:
    with open(os.path.join('code_data', _file), encoding='utf-8', errors='ignore') as source_file:
        student_code.append(preprocess_code(source_file.read()))


In [None]:
def vectorize(Text):
    """Turn a collection of documents into a dense TF-IDF matrix.

    A fresh ``TfidfVectorizer`` is fitted on ``Text`` using the token pattern
    ``\\b\\w+\\b`` (any run of word characters between word boundaries), and
    the resulting matrix is converted to a dense array.

    Args:
        Text: Iterable of document strings to fit and transform.

    Returns:
        The dense array produced by ``fit_transform(Text).toarray()``.
    """
    tfidf = TfidfVectorizer(token_pattern=r'\b\w+\b')
    weighted_terms = tfidf.fit_transform(Text)
    return weighted_terms.toarray()


In [None]:
def similarity(doc1, doc2):
    """Return the pairwise cosine-similarity matrix for two vectors.

    Args:
        doc1: Feature vector of the first document.
        doc2: Feature vector of the second document.

    Returns:
        The result of ``cosine_similarity`` applied to the two vectors stacked
        as rows; callers read the ``[0][1]`` entry as the score between
        ``doc1`` and ``doc2``.
    """
    stacked_pair = [doc1, doc2]
    return cosine_similarity(stacked_pair)


In [None]:
# Vectorise all preprocessed submissions in one pass.
vectors = vectorize(student_code)

# Pair each filename with the vector at the same position.
s_vectors = [(file_name, file_vector) for file_name, file_vector in zip(student_files, vectors)]

# Shared accumulator that check_plagiarism fills with (name, name, score) tuples.
plagiarism_results = set()

In [3]:








def check_plagiarism(threshold=0.5):
    """Record every pair of submissions whose similarity exceeds ``threshold``.

    Each unordered pair of entries in the module-level ``s_vectors`` list is
    visited exactly once (rather than once per ordering), scored with
    ``similarity``, and added to the module-level ``plagiarism_results`` set
    when the score is strictly greater than ``threshold``.

    Args:
        threshold: Exclusive lower bound on the similarity score for a pair
            to be recorded. Defaults to 0.5.

    Returns:
        The shared ``plagiarism_results`` set. Each element is a
        ``(name_a, name_b, score)`` tuple with the two names in sorted order.
        Because the set is module-level, entries from earlier calls remain
        in it.
    """
    for entry_a, entry_b in itertools.combinations(s_vectors, 2):
        student_a, text_vector_a = entry_a
        student_b, text_vector_b = entry_b

        # Entries carrying the same name are never compared with each other.
        if student_a == student_b:
            continue

        sim_score = similarity(text_vector_a, text_vector_b)[0][1]
        if sim_score > threshold:
            # Sort the names so a pair is stored the same way regardless of
            # the order in which its members appear in s_vectors.
            first_name, second_name = sorted((student_a, student_b))
            plagiarism_results.add((first_name, second_name, sim_score))
    return plagiarism_results

# check_plagiarism returns a set, whose iteration order is arbitrary; sort by
# descending score (then by names) so the report is reproducible and the most
# similar pairs are listed first.
for data in sorted(check_plagiarism(), key=lambda row: (-row[2], row[0], row[1])):
    print(f'{data[0]} vs {data[1]}: Similarity Score: {data[2]:.2f}')


a12.py vs a30.py: Similarity Score: 0.77
a10.py vs a5.py: Similarity Score: 0.71
a2.py vs a25.py: Similarity Score: 0.86
a3.py vs a5.py: Similarity Score: 0.56
a13.py vs a41.py: Similarity Score: 0.79
a14.py vs a26.py: Similarity Score: 0.68
a10.py vs a3.py: Similarity Score: 0.65
a34.py vs a45.py: Similarity Score: 0.98
a4.py vs a5.py: Similarity Score: 0.53
a43.py vs a47.py: Similarity Score: 0.73
a3.py vs a7.py: Similarity Score: 0.64
a5.py vs a7.py: Similarity Score: 0.73
a36.py vs a44.py: Similarity Score: 0.57
a10.py vs a7.py: Similarity Score: 0.88
