# Similarity Test

Use this to test how similar selected texts are!


**How it works:**
1. Enter the number of files you want to compare (at least 2 are needed to produce a similarity score)
2. Upload the files
3. See the results!

**Required modules:**
- `pip install scikit-learn`

In [9]:
# Import necessary modules!
import os
import tkinter as tk
from tkinter import filedialog
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Prompt until the user supplies a whole number of files that is at least 1.
# `num_files` stays None while no acceptable answer has been given yet.
num_files = None
while num_files is None:
    try:
        requested = int(input("How many files would you like to upload? "))
    except ValueError:
        # Non-numeric input: explain and ask again.
        print("Please enter a valid number.")
        continue
    if requested >= 1:
        num_files = requested
    else:
        print("Please enter a number greater than 0.")

# Create root window and hide it: the file dialogs need a Tk instance, but the
# empty main window itself should never be shown.
root = tk.Tk()
root.withdraw()

selected_files = []
student_notes = []

# Both dialogs offer the same file-type filters.
dialog_filetypes = [("Text Files", "*.txt"), ("All Files", "*.*")]

try:
    if num_files <= 2:
        # Single file selection for 1-2 files: one dialog per slot, re-opened
        # until the user actually picks a file for that slot.
        for i in range(num_files):
            while True:
                source_path = filedialog.askopenfilename(
                    title=f"Select Text File {i+1} of {num_files}",
                    filetypes=dialog_filetypes,
                )
                if source_path:
                    selected_files.append(source_path)
                    print(f"Selected file {i+1}: {source_path}")
                    break
                print(f"No file selected for slot {i+1}. Please select a file.")
    else:
        # Batch upload for more than 2 files: a single multi-select dialog,
        # re-opened until exactly `num_files` paths were chosen.
        while True:
            source_paths = filedialog.askopenfilenames(
                title=f"Select EXACTLY {num_files} Text Files (Batch Upload)",
                filetypes=dialog_filetypes,
            )
            selected_files = list(source_paths)

            if len(selected_files) == num_files:
                print(f"Perfect! Selected {len(selected_files)} files:")
                for i, file_path in enumerate(selected_files, 1):
                    print(f"  {i}. {file_path}")
                break
            if not selected_files:
                print("No files selected. Please select files.")
            else:
                print(f"You selected {len(selected_files)} files, but you specified {num_files} files.")
                print(f"Please select exactly {num_files} files.")
finally:
    # Release the hidden Tk window once the dialogs are done (or the cell is
    # interrupted); otherwise every run leaves another Tk instance alive.
    root.destroy()

# Load the text of every selected file.
# `student_notes` receives the file contents and `student_files` the matching
# base names, so both lists stay aligned index by index. A file that cannot be
# read is reported and left out of both lists.
student_files = []
for chosen_path in selected_files:
    try:
        with open(chosen_path, encoding='utf-8') as handle:
            text = handle.read()
            student_notes.append(text)
            student_files.append(os.path.basename(chosen_path))
    except Exception as read_error:
        print(f"Error reading {chosen_path}: {read_error}")


def vectorize(Text):
    """Fit a fresh ``TfidfVectorizer`` on ``Text`` and return the dense result.

    Args:
        Text: the collection of document strings to vectorize together.

    Returns:
        The fitted TF-IDF matrix converted with ``.toarray()``.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform(Text)
    return tfidf_matrix.toarray()
def similarity(doc1, doc2):
    """Return the cosine-similarity matrix of two document vectors.

    The vectors are passed to ``cosine_similarity`` as the rows
    ``[doc1, doc2]``; callers read entry ``[0][1]`` for the score between
    the two documents.
    """
    stacked_rows = [doc1, doc2]
    return cosine_similarity(stacked_rows)


# Accumulates (file_a, file_b, score) tuples; filled in by similarity_check().
plagiarism_results = set()

# Vectorize all loaded texts together, then pair each file name with its vector.
vectors = vectorize(student_notes)
s_vectors = [(file_name, vector) for file_name, vector in zip(student_files, vectors)]


def similarity_check():
    """Score every distinct pair of documents in ``s_vectors``.

    Each unordered pair is visited exactly once, so a pair can never be
    reported twice and no element lookup in the list is needed. (Looking up
    ``(name, vector)`` tuples with ``list.index`` compares the vectors with
    ``==`` whenever two names are equal, which raises ``ValueError`` for
    arrays.)

    Returns:
        set: the module-level ``plagiarism_results`` set, extended with one
        ``(name_a, name_b, score)`` tuple per pair, where the two names are
        in sorted order and ``score`` is entry ``[0][1]`` of the matrix
        returned by ``similarity``.
    """
    # Function-scope import keeps this block self-contained.
    from itertools import combinations

    for first, second in combinations(s_vectors, 2):
        student_a, text_vector_a = first
        student_b, text_vector_b = second
        sim_score = similarity(text_vector_a, text_vector_b)[0][1]
        # Sort the names so the tuple does not depend on selection order.
        name_low, name_high = sorted((student_a, student_b))
        plagiarism_results.add((name_low, name_high, sim_score))
    return plagiarism_results
    

# Report each scored pair; the score is converted to a plain float for display.
for file1, file2, score in similarity_check():
    print(f"Similarity data:\n ({file1}, {file2}, {float(score)})")

You selected 4 files, but you specified 3 files.
Please select exactly 3 files.
Perfect! Selected 3 files:
  1. C:/Users/gerra/Documents/side projects/python bitesize projects/test1.txt
  2. C:/Users/gerra/Documents/side projects/python bitesize projects/test2.txt
  3. C:/Users/gerra/Documents/side projects/python bitesize projects/test3.txt
Similarity data:
 (test1.txt, test3.txt, 0.27345017765273255)
Similarity data:
 (test1.txt, test2.txt, 0.5979687361418285)
Similarity data:
 (test2.txt, test3.txt, 0.0)
