### Library Imports

In [1]:
import pandas as pd
import ast
import tokenize
import io
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
import re
import os
from transformers import AutoTokenizer, AutoModel
import torch

### Plagiarism Detection

In [3]:
class PythonPlagiarismDetector:
    def __init__(self):        
        self.model_name = "microsoft/codebert-base"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)
    
    def get_codebert_embedding(self, code):
        tokens = self.tokenizer(code, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = self.model(**tokens)
        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    def preprocess_code(self, code):
        code = re.sub(r'#.*', '', code)
        code = re.sub(r'""".*?"""', '', code, flags=re.DOTALL)
        code = re.sub(r"'''.*?'''", '', code, flags=re.DOTALL)
        code = re.sub(r'\s+', ' ', code).strip()
        return code

    def extract_tokens(self, code):
        try:
            tokens = []
            tokens_generator = tokenize.tokenize(io.BytesIO(code.encode('utf-8')).readline)
            for token in tokens_generator:
                if token.type not in [tokenize.COMMENT, tokenize.NEWLINE, tokenize.INDENT, 
                                      tokenize.DEDENT, tokenize.NL]:
                    tokens.append(token.string)
            return ' '.join(tokens)
        except:
            return self.preprocess_code(code)

    def extract_ast_features(self, code):
        try:
            tree = ast.parse(code)
            node_types = {}
            for node in ast.walk(tree):
                node_type = type(node).__name__
                node_types[node_type] = node_types.get(node_type, 0) + 1
            function_names = [node.name for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)]
            class_names = [node.name for node in ast.walk(tree) if isinstance(node, ast.ClassDef)]
            return {
                'node_types': node_types,
                'function_names': function_names,
                'class_names': class_names
            }
        except:
            return {'node_types': {}, 'function_names': [], 'class_names': []}

    def extract_style_metrics(self, code):
        lines = code.split('\n')
        metrics = {
            'avg_line_length': np.mean([len(line) for line in lines]) if lines else 0,
            'max_indentation': max([len(line) - len(line.lstrip()) for line in lines]) if lines else 0,
            'blank_line_ratio': sum(1 for line in lines if not line.strip()) / len(lines) if lines else 0,
        }
        return metrics

    def compare_codes(self, code1, code2):
        code1_clean = self.preprocess_code(code1)
        code2_clean = self.preprocess_code(code2)

        emb1 = self.get_codebert_embedding(code1_clean)
        emb2 = self.get_codebert_embedding(code2_clean)
        token_similarity = cosine_similarity([emb1], [emb2])[0][0]

        ast1 = self.extract_ast_features(code1_clean)
        ast2 = self.extract_ast_features(code2_clean)
        node_types1 = set(ast1['node_types'].keys())
        node_types2 = set(ast2['node_types'].keys())
        node_type_similarity = len(node_types1.intersection(node_types2)) / max(len(node_types1.union(node_types2)), 1)

        style1 = self.extract_style_metrics(code1)
        style2 = self.extract_style_metrics(code2)

        overall_similarity = 0.7 * token_similarity + 0.25 * node_type_similarity + 0.05

        return {
            'token_similarity': token_similarity,
            'ast_similarity': node_type_similarity,
            'overall_similarity': overall_similarity
        }

    def detect_plagiarism(self, submissions, threshold=0.8):
        suspicious_pairs = []
        similarity_matrix = np.zeros((len(submissions), len(submissions)))
        submission_ids = list(submissions.keys())

        for i in range(len(submission_ids)):
            for j in range(i + 1, len(submission_ids)):
                id1, id2 = submission_ids[i], submission_ids[j]
                code1, code2 = submissions[id1], submissions[id2]
                result = self.compare_codes(code1, code2)
                similarity = result['overall_similarity']

                similarity_matrix[i, j] = similarity
                similarity_matrix[j, i] = similarity

                if similarity > threshold:
                    suspicious_pairs.append((id1, id2, similarity))

        return suspicious_pairs, similarity_matrix, submission_ids

    def detect_plagiarism_from_files(self, directory, threshold=0.8):
        submissions = {}
        for filename in os.listdir(directory):
            if filename.endswith('.py'):
                file_path = os.path.join(directory, filename)
                with open(file_path, 'r', encoding='utf-8') as f:
                    try:
                        code = f.read()
                        submissions[filename] = code
                    except:
                        print(f"Error reading {filename}")
        return self.detect_plagiarism(submissions, threshold)


### Testing With Dataset

In [5]:
df = pd.read_csv('dataset_project.csv')
df

Unnamed: 0,original_code,submitted_code,label
0,"def multiply(a, b): return a * b","def product(x, y): return x * y",1
1,def prime_check(n): return all(n % i != 0 for ...,def is_prime(n): return all(n % i != 0 for i i...,1
2,def prime_check(n): return all(n % i != 0 for ...,"def multiply(a, b): return a * b",0
3,def even_or_odd(n): return 'Even' if n % 2 == ...,def check_parity(x): return 'Even' if x % 2 ==...,1
4,def count_vowels(s): return sum(1 for c in s i...,def vowel_count(text): return sum(1 for ch in ...,1
5,def prime_check(n): return all(n % i != 0 for ...,def is_prime(n): return all(n % i != 0 for i i...,1
6,def fibonacci(n): return n if n <= 1 else fibo...,def reverse_string(s): return s[::-1],0
7,"def multiply(a, b): return a * b","def product(x, y): return x * y",1
8,def even_or_odd(n): return 'Even' if n % 2 == ...,"def multiply(a, b): return a * b",0
9,def count_vowels(s): return sum(1 for c in s i...,def vowel_count(text): return sum(1 for ch in ...,1


In [6]:
df.shape

(25, 3)

In [7]:
df['label'].value_counts()

label
1    15
0    10
Name: count, dtype: int64

In [8]:
detector = PythonPlagiarismDetector()

results = []

for idx, row in df.iterrows():
    original = row['original_code']
    submitted = row['submitted_code']
    label = row['label']
    
    similarity_result = detector.compare_codes(original, submitted)
    
    results.append({
        'index': idx,
        'token_similarity': similarity_result['token_similarity'],
        'ast_similarity': similarity_result['ast_similarity'],
        'overall_similarity': similarity_result['overall_similarity'],
        'label': label
    })

results_df = pd.DataFrame(results)
results_df
results_df.to_csv("similarity_results.csv", index=False)

In [15]:
results_df


Unnamed: 0,index,token_similarity,ast_similarity,overall_similarity,label
0,0,0.99407,1.0,0.995849,1
1,1,0.999284,1.0,0.999499,1
2,2,0.953178,0.421053,0.822488,0
3,3,0.994958,1.0,0.99647,1
4,4,0.995397,1.0,0.996778,1
5,5,0.999284,1.0,0.999499,1
6,6,0.957507,0.421053,0.825518,0
7,7,0.99407,1.0,0.995849,1
8,8,0.971513,0.571429,0.872917,0
9,9,0.995397,1.0,0.996778,1
