# 1. Setup: Libraries, Secrets, Config, Models, Clients


In [1]:
# --- Install dependencies ---
!pip install -q torch transformers openai chromadb pandas PyYAML sentence-transformers

# --- Imports ---
import os
import logging
import time
import json
from typing import List, Optional, Dict, Any, Tuple

import torch
from transformers import AutoTokenizer, AutoModel
from openai import OpenAI
import chromadb
from google.colab import userdata, drive # Import drive

# --- Basic Logging ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Configuration Constants ---
try:
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
    if not OPENAI_API_KEY:
        raise ValueError("OpenAI API Key not found in Colab Secrets.")
    logger.info("OpenAI API Key loaded from Colab Secrets.")
except Exception as e:
    logger.critical(f"Failed to get OpenAI API Key: {e}. Notebook cannot proceed without it.")

DRIVE_REPO_PATHS = [
    "/content/drive/MyDrive/severity-ai-website",
    "/content/drive/MyDrive/mini_os_with_react",
]
# file extensions to index
PROGRAMMING_LANGUAGES = [".js"]

# Other configurations
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL_NAME = "gpt-4o"
COLLECTION_NAME = "drive_code_db_v2"
DB_PATH = "/content/temp_chroma_db_drive"
MAX_CHUNK_LENGTH_TOKENS = 512 # Max tokens per chunk
CHUNK_OVERLAP_TOKENS = 100    # Overlap in tokens
RAG_NUM_RESULTS = 3
INDEXING_BATCH_SIZE = 32    # Number of chunks to embed and add at once

# --- Check GPU ---
if torch.cuda.is_available():
    device = torch.device("cuda")
    logger.info(f"GPU found: {torch.cuda.get_device_name(0)}. Using GPU.")
else:
    device = torch.device("cpu")
    logger.info("No GPU found. Using CPU (will be slower).")

# --- Load Embedding Model ---
logger.info(f"Loading embedding model & tokenizer: {EMBEDDING_MODEL_NAME}...")
try:
    tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
    embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)
    embedding_model.to(device)
    embedding_model.eval()

    model_max_len = getattr(tokenizer, 'model_max_length', MAX_CHUNK_LENGTH_TOKENS)
    if MAX_CHUNK_LENGTH_TOKENS > model_max_len:
        logger.warning(f"Configured max_chunk_length ({MAX_CHUNK_LENGTH_TOKENS}) exceeds tokenizer's max length ({model_max_len}). Using {model_max_len}.")
        MAX_CHUNK_LENGTH_TOKENS = model_max_len
    logger.info(f"Embedding model loaded. Using chunk length: {MAX_CHUNK_LENGTH_TOKENS}, overlap: {CHUNK_OVERLAP_TOKENS}")
except Exception as e:
    logger.critical(f"Failed to load embedding model/tokenizer: {e}", exc_info=True)
    embedding_model = None
    tokenizer = None

# --- Initialize OpenAI Client ---
logger.info(f"Initializing OpenAI client (Model: {LLM_MODEL_NAME})...")
if OPENAI_API_KEY: # Only proceed if key was loaded
    try:
        openai_client = OpenAI(api_key=OPENAI_API_KEY)
        logger.info("OpenAI client initialized.")
    except Exception as e:
        logger.critical(f"Failed to initialize OpenAI client: {e}", exc_info=True)
        openai_client = None
else:
    logger.warning("Skipping OpenAI client initialization as API key is missing.")
    openai_client = None

# --- Initialize ChromaDB Client ---
logger.info(f"Initializing ChromaDB client at: {DB_PATH}")
try:
    chroma_client = chromadb.PersistentClient(path=DB_PATH)
    collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME)
    logger.info(f"ChromaDB collection '{COLLECTION_NAME}' ready. Initial count: {collection.count()}")
except Exception as e:
    logger.critical(f"Failed to initialize ChromaDB: {e}", exc_info=True)
    collection = None

print("\n--- Setup and Initialization Complete ---")
# Verify Drive paths exist (basic check)
for path in DRIVE_REPO_PATHS:
    if not os.path.isdir(path):
        print(f"WARNING: Specified repository path does not exist or is not a directory: {path}")
        logger.warning(f"Specified repository path does not exist or is not a directory: {path}")

if not all([embedding_model, tokenizer, openai_client, collection]):
    print("\nERROR: One or more critical components (Model, Tokenizer, OpenAI, DB) failed to initialize. Check logs. Subsequent cells may fail.")
    logger.error("One or more critical components failed to initialize.")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m101.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m83.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m746.6 kB/s[0m eta [36

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]


--- Setup and Initialization Complete ---


# 2. Core Functions: Embedding, Chunking, File Finding, Indexing, Checking

In [2]:
# Ensure models/clients loaded before defining functions that use them
if not all([embedding_model, tokenizer, openai_client, collection]):
    logger.error("Cannot define core functions - critical components missing from setup.")
    # Define dummy functions or skip cell execution
    def get_embeddings_batch(texts): return None
    def find_code_files(repo_paths, extensions): return []
    def index_code_files(files_to_index): logger.error("Indexing skipped."); return 0
    def check_plagiarism(snippet): logger.error("Plagiarism check skipped."); return {"error": "Components missing."}

else:
    logger.info("Defining core helper and processing functions...")

    # --- Batch Embedding Function ---
    def get_embeddings_batch(texts: List[str]) -> Optional[List[Optional[List[float]]]]:
        """Generates embeddings for a batch of texts using the loaded model."""
        if not texts: return []
        results = [None] * len(texts) # Initialize results list
        valid_texts_indices = [i for i, t in enumerate(texts) if t and t.strip()]
        valid_texts = [texts[i] for i in valid_texts_indices]

        if not valid_texts:
            logger.warning("Received batch with only empty texts.")
            return results # Return list of Nones

        try:
            inputs = tokenizer(
                valid_texts,
                return_tensors="pt",
                truncation=True,
                padding=True, # Pad to longest in the valid batch
                max_length=MAX_CHUNK_LENGTH_TOKENS,
                return_attention_mask=True
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = embedding_model(**inputs)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

            # Place embeddings back into the original structure
            for i, embedding in enumerate(batch_embeddings):
                original_index = valid_texts_indices[i]
                results[original_index] = embedding.tolist()
            return results

        except Exception as e:
            logger.error(f"Error generating batch embeddings: {e}", exc_info=True)
            # Return list of Nones of the correct size if batch fails
            return [None] * len(texts)


    # --- Token-Based Chunking Function ---
    def chunk_code_file(file_path: str) -> List[Tuple[str, int, int]]:
        """Reads a file and chunks its content based on tokens."""
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                code = f.read()
            if not code or not code.strip():
                return []

            # Tokenize the entire file
            tokens = tokenizer(code, add_special_tokens=False, return_attention_mask=False).input_ids

            chunks_data = [] # Store (decoded_chunk_text, start_token_idx, end_token_idx)
            start_idx = 0
            while start_idx < len(tokens):
                end_idx = start_idx + MAX_CHUNK_LENGTH_TOKENS
                # Ensure we don't go past the end
                # end_idx = min(end_idx, len(tokens)) # Not needed if stride works

                chunk_token_ids = tokens[start_idx:end_idx]
                if not chunk_token_ids: break # Stop if no tokens left

                # Decode the chunk
                chunk_text = tokenizer.decode(chunk_token_ids, skip_special_tokens=True).strip()

                if chunk_text: # Only store non-empty chunks
                    chunks_data.append((chunk_text, start_idx, end_idx))

                # Move to the next chunk start position
                start_idx += MAX_CHUNK_LENGTH_TOKENS - CHUNK_OVERLAP_TOKENS
                if start_idx >= len(tokens): break # Safety check
                # Prevent infinite loop if overlap >= chunk_size
                if MAX_CHUNK_LENGTH_TOKENS - CHUNK_OVERLAP_TOKENS <= 0:
                    logger.warning("Chunk step size is non-positive, adjusting to 1 to prevent loop.")
                    start_idx = end_idx # Simple advance if overlap is too large

            return chunks_data

        except Exception as e:
            logger.error(f"Error chunking file {file_path}: {e}", exc_info=True)
            return []

    # --- Find Code Files Function ---
    def find_code_files(repo_paths: List[str], extensions: List[str]) -> List[Tuple[str, str, str]]:
        """Finds code files with given extensions in the specified directories."""
        found_files = [] # Store tuples of (absolute_path, relative_path, repo_name)
        normalized_extensions = [ext.lower() for ext in extensions]

        for repo_path in repo_paths:
            if not os.path.isdir(repo_path):
                logger.warning(f"Skipping invalid repository path: {repo_path}")
                continue

            repo_name = os.path.basename(repo_path) # Get repo name from path
            logger.info(f"Scanning for files in '{repo_name}' ({repo_path})...")

            for root, _, files in os.walk(repo_path):
                # Basic check to skip potential virtual envs or large hidden dirs
                if ".git" in root.split(os.sep) or "venv" in root.split(os.sep) or "node_modules" in root.split(os.sep):
                    continue

                for file in files:
                    if any(file.lower().endswith(ext) for ext in normalized_extensions):
                        absolute_path = os.path.join(root, file)
                        # Calculate relative path *within* the specific repo base path
                        relative_path = os.path.relpath(absolute_path, repo_path)
                        found_files.append((absolute_path, relative_path, repo_name))

        logger.info(f"Found {len(found_files)} code files matching extensions: {extensions}")
        return found_files


    # --- Indexing Function (Processes Files) ---
    def index_code_files(files_to_index: List[Tuple[str, str, str]]) -> int:
        """Chunks, embeds, and indexes the provided code files."""
        logger.info(f"Starting indexing for {len(files_to_index)} code files...")
        if collection is None:
            logger.error("ChromaDB collection not available. Skipping indexing.")
            return 0

        total_chunks_added = 0
        files_processed = 0
        chunks_to_embed_batch = [] # Accumulate chunks across files for batching

        for abs_path, rel_path, repo_name in files_to_index:
            logger.debug(f"Processing file: {rel_path} (from repo: {repo_name})")
            files_processed += 1

            # Chunk the code file using tokens
            # Returns list of (chunk_text, start_token_idx, end_token_idx)
            code_chunks_data = chunk_code_file(abs_path)

            if not code_chunks_data:
                 logger.warning(f"No valid chunks generated for file: {rel_path}")
                 continue

            # Prepare chunk data for embedding and DB storage
            for i, (chunk_text, start_tok, end_tok) in enumerate(code_chunks_data):
                 # Create a unique ID for the chunk
                 safe_rel_path = rel_path.replace(os.sep, "__").replace('.', '_')
                 chunk_id = f"{repo_name}__{safe_rel_path}_chunk_{i}"
                 metadata = {
                     "repo_name": repo_name,
                     "relative_path": rel_path,
                     "chunk_index": i,
                     "start_token": start_tok,
                     "end_token": end_tok
                 }
                 chunks_to_embed_batch.append({
                     "id": chunk_id,
                     "text": chunk_text,
                     "metadata": metadata
                 })

            # Process batches when accumulated enough or at the end
            if len(chunks_to_embed_batch) >= INDEXING_BATCH_SIZE or files_processed == len(files_to_index):
                if not chunks_to_embed_batch: continue # Skip if batch is empty

                logger.info(f"Preparing batch of {len(chunks_to_embed_batch)} chunks for embedding...")
                batch_ids = [item['id'] for item in chunks_to_embed_batch]
                batch_texts = [item['text'] for item in chunks_to_embed_batch]
                batch_metadatas = [item['metadata'] for item in chunks_to_embed_batch]

                # Get embeddings (returns list potentially containing Nones)
                batch_embeddings = get_embeddings_batch(batch_texts)

                # Filter out failed embeddings before adding to DB
                valid_ids = []
                valid_embeddings = []
                valid_metadatas = []
                valid_documents = []
                num_failed_in_batch = 0

                if batch_embeddings is None: # Whole batch failed
                    logger.error(f"Embedding failed for entire batch of {len(chunks_to_embed_batch)} chunks.")
                    num_failed_in_batch = len(chunks_to_embed_batch)
                else:
                    for i, emb in enumerate(batch_embeddings):
                        if emb:
                            valid_ids.append(batch_ids[i])
                            valid_embeddings.append(emb)
                            valid_metadatas.append(batch_metadatas[i])
                            valid_documents.append(batch_texts[i]) # Store text
                        else:
                            logger.warning(f"Embedding failed for chunk ID: {batch_ids[i]}, skipping.")
                            num_failed_in_batch += 1

                # Add valid items to ChromaDB
                if valid_ids:
                    try:
                        collection.add(
                            ids=valid_ids,
                            embeddings=valid_embeddings,
                            metadatas=valid_metadatas,
                            documents=valid_documents
                        )
                        total_chunks_added += len(valid_ids)
                        logger.info(f"Added batch of {len(valid_ids)} chunks to DB. ({num_failed_in_batch} failures in batch)")
                    except Exception as e:
                        logger.error(f"Failed to add batch to ChromaDB: {e}", exc_info=True)
                        # Optionally, track which files failed here
                elif num_failed_in_batch > 0:
                     logger.error(f"No valid embeddings generated for batch of size {len(chunks_to_embed_batch)}.")

                # Clear the batch accumulator
                chunks_to_embed_batch = []


        logger.info(f"--- Indexing Complete ---")
        logger.info(f"Processed {files_processed} files.")
        logger.info(f"Added {total_chunks_added} total chunks to collection '{COLLECTION_NAME}'.")
        try:
             logger.info(f"Final collection count: {collection.count()}")
        except Exception as e:
             logger.error(f"Failed to get final collection count: {e}")
        return total_chunks_added


    # --- Plagiarism Check Function ---
    PLAGIARISM_PROMPT_TEMPLATE = """
You are an AI assistant helping to detect potential code plagiarism.
Analyze the user's code snippet below and compare it against the provided reference code snippets retrieved from a database.

**User Code Snippet:**
```python
{user_code}
```

**Reference Code Snippets:**
{context}

**Comparison:**

Task:
Based ONLY on the provided snippets, determine if the user's code is likely plagiarized from the reference snippets. Consider logic, structure, comments, and variable names. Trivial similarities (like standard imports or basic syntax) should not be flagged unless the overall structure is identical.

Respond in JSON format with two keys:

"is_plagiarized": boolean (true if likely plagiarized, false otherwise).

"reasoning": string (a brief 1-2 sentence explanation for your decision).
"""

def check_plagiarism(user_snippet: str) -> Dict[str, Any]:
    """Checks a user snippet for plagiarism using RAG + LLM."""
    start_time = time.time()
    logger.info(f"Checking plagiarism for snippet (length {len(user_snippet)})...")

    if not user_snippet or not user_snippet.strip():
        return {"error": "Input snippet is empty."}
    if collection is None or openai_client is None or embedding_model is None:
         return {"error": "Required components (DB, LLM, Embedder) not available."}

    # 1. Embed the user snippet
    user_embedding = get_embeddings_batch([user_snippet])[0] # Use batch embed for single item
    if not user_embedding:
        return {"error": "Failed to generate embedding for user snippet."}

    # 2. Query ChromaDB
    try:
        results = collection.query(
            query_embeddings=[user_embedding],
            n_results=RAG_NUM_RESULTS,
            include=["documents", "metadatas", "distances"]
        )
    except Exception as e:
        logger.error(f"ChromaDB query failed: {e}", exc_info=True)
        return {"error": f"Database query failed: {e}"}

    # 3. Prepare Context for LLM
    similar_docs = results.get("documents", [[]])[0]
    similar_metadatas = results.get("metadatas", [[]])[0]
    distances = results.get("distances", [[]])[0]

    if not similar_docs:
        logger.info("No similar documents found in DB.")
        return {
            "is_plagiarized": False,
            "reasoning": "No similar code snippets found in the source database.",
            "references": [],
            "llm_called": False
        }

    context_str = ""
    references = []
    for i, (doc, meta, dist) in enumerate(zip(similar_docs, similar_metadatas, distances)):
        repo = meta.get('repo_name', 'Unknown Repo')
        source = meta.get('relative_path', 'Unknown Source') # Use relative path now
        chunk_idx = meta.get('chunk_index', 'N/A')
        context_str += f"--- Reference Snippet {i+1} ---\n"
        context_str += f"Source: {repo}/{source} (Chunk: {chunk_idx}, Distance: {dist:.4f})\n"
        context_str += f"```python\n{doc}\n```\n\n"
        references.append({"repo": repo, "file": source, "distance": f"{dist:.4f}", "chunk_index": chunk_idx})

    # 4. Call LLM
    prompt = PLAGIARISM_PROMPT_TEMPLATE.format(user_code=user_snippet, context=context_str)

    try:
        response = openai_client.chat.completions.create(
            model=LLM_MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a helpful AI assistant designed to output JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=150,
            response_format={"type": "json_object"}
        )
        llm_output_raw = response.choices[0].message.content.strip()
        logger.debug(f"LLM Raw Output: {llm_output_raw}")

        # 5. Parse LLM Response
        try:
            llm_result = json.loads(llm_output_raw)
            is_plagiarized = llm_result.get("is_plagiarized", False)
            reasoning = llm_result.get("reasoning", "LLM response missing reasoning.")
            if not isinstance(is_plagiarized, bool):
                 logger.warning("LLM 'is_plagiarized' key was not a boolean. Defaulting to false.")
                 is_plagiarized = False
                 reasoning += " (LLM response format warning: is_plagiarized not boolean)"
        except json.JSONDecodeError:
            logger.error(f"Failed to decode LLM JSON response: {llm_output_raw}")
            return {"error": "Failed to parse LLM JSON response.", "llm_raw_output": llm_output_raw}
    except Exception as e:
        logger.error(f"OpenAI API call failed: {e}", exc_info=True)
        return {"error": f"LLM API call failed: {e}"}

    total_time = time.time() - start_time
    logger.info(f"Plagiarism check finished in {total_time:.2f}s. Decision: {is_plagiarized}")
    return {
        "is_plagiarized": is_plagiarized,
        "reasoning": reasoning,
        "references": references if is_plagiarized else [],
        "llm_called": True,
        "duration_sec": round(total_time, 2)
    }
print("\n--- Core Functions Defined ---")


--- Core Functions Defined ---


In [3]:
# @title 3. Run Indexing Process
import time

logger.info("--- Starting Indexing Process ---")
print("--- Finding and Indexing Code Files from Google Drive ---")

# Check dependencies first
if collection is None or embedding_model is None or tokenizer is None:
    print("\nERROR: Cannot index code - critical components (DB, Model, Tokenizer) missing. Check Cell 1 logs.")
    logger.error("Cannot index code - critical components missing.")
else:
    # 1. Find files to index from specified Drive paths
    start_find_time = time.time()
    files_to_index = find_code_files(DRIVE_REPO_PATHS, PROGRAMMING_LANGUAGES)
    end_find_time = time.time()
    print(f"File scanning took {end_find_time - start_find_time:.2f} seconds.")

    if not files_to_index:
        print("\nWARNING: No code files found in the specified Google Drive paths matching the extensions.")
        print(f" > Searched paths: {DRIVE_REPO_PATHS}")
        print(f" > Looked for extensions: {PROGRAMMING_LANGUAGES}")
        logger.warning("No code files found to index.")
    else:
        # 2. Clear existing collection items for a clean run
        count_before = collection.count()
        if count_before > 0:
             logger.warning(f"Collection '{COLLECTION_NAME}' already contains {count_before} items. Clearing for fresh indexing...")
             print(f"Clearing {count_before} existing items from collection '{COLLECTION_NAME}'...")
             try:
                 chroma_client.delete_collection(name=COLLECTION_NAME) # Delete collection
                 collection = chroma_client.create_collection(name=COLLECTION_NAME) # Recreate empty
                 logger.info(f"Collection cleared and recreated. New count: {collection.count()}")
                 print("Collection cleared.")
             except Exception as e:
                  logger.error(f"Error clearing existing collection: {e}. Indexing may contain duplicates or fail.")
                  print(f"ERROR clearing collection: {e}")

        # 3. Run the indexing function
        print(f"\nStarting indexing of {len(files_to_index)} files...")
        start_index_time = time.time()
        num_added = index_code_files(files_to_index)
        end_index_time = time.time()

        print(f"\n--- Indexing Summary ---")
        print(f" > Indexing process took {end_index_time - start_index_time:.2f} seconds.")
        print(f" > Attempted to process {len(files_to_index)} files.")
        print(f" > Added {num_added} chunks to the database.")
        try:
            final_count = collection.count()
            print(f" > Final collection size ('{COLLECTION_NAME}'): {final_count} items.")
            logger.info(f"Final collection size: {final_count}")
        except Exception as e:
            print(f" > Error getting final collection count: {e}")
            logger.error(f"Error getting final collection count: {e}")


print("\n--- Indexing Step Complete ---")

--- Finding and Indexing Code Files from Google Drive ---
File scanning took 14.12 seconds.

Starting indexing of 32 files...


Token indices sequence length is longer than the specified maximum sequence length for this model (855 > 512). Running this sequence through the model will result in indexing errors



--- Indexing Summary ---
 > Indexing process took 28.02 seconds.
 > Attempted to process 32 files.
 > Added 50 chunks to the database.
 > Final collection size ('drive_code_db_v2'): 50 items.

--- Indexing Step Complete ---


In [4]:
# @title 4. Test Plagiarism Checker

logger.info("--- Testing Plagiarism Checker ---")
print("\n--- Testing Plagiarism Checker ---")

# --- Test Cases ---

test_snippets = [
    """
  const DesktopContainer = styled.div`
  width: 100vw;
  height: 100vh;
  background-image: url(${(props) => props.wallpaper});
  background-size: cover;
  background-position: center;
  overflow: hidden;
`;

const DesktopIcons = styled.div`
  padding: 20px;
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(80px, 1fr));
  grid-gap: 20px;
`;

function Desktop() {
  const { state } = useContext(AppContext);

""",
    """
function getUserData(userId) {
  console.log('Fetching data for user:', userId);
  // Placeholder for actual fetch logic
  return { id: userId, name: 'Test User' };
}
""",
    """
def simple_loop(count):
    for i in range(count):
        print(f"Iteration {i+1}")
    return count
"""
]

# --- Run Checks ---
if collection is not None and openai_client is not None and embedding_model is not None:
    # Verify the collection not empy
    if collection.count() == 0:
         print("\nWARNING: Collection is empty.")
         logger.warning("Collection is empty.")

    for i, snippet in enumerate(test_snippets):
        print(f"\n--- Checking Test Snippet {i+1} ---")
        print("Code:")
        print(snippet)
        print("-" * 20)

        # Perform the check
        result = check_plagiarism(snippet)

        print("Result:")
        print(json.dumps(result, indent=2))
        print("-" * 30)

else:
    logger.error("Cannot run plagiarism checks: Required components not available.")
    print("\nERROR: Cannot run checks, critical components missing. Check logs in Cell 1 & 2.")

print("\n--- Plagiarism Check Testing Done ---")


--- Testing Plagiarism Checker ---

--- Checking Test Snippet 1 ---
Code:

  const DesktopContainer = styled.div`
  width: 100vw;
  height: 100vh;
  background-image: url(${(props) => props.wallpaper});
  background-size: cover;
  background-position: center;
  overflow: hidden;
`;

const DesktopIcons = styled.div`
  padding: 20px;
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(80px, 1fr));
  grid-gap: 20px;
`;

function Desktop() {
  const { state } = useContext(AppContext);


--------------------
Result:
{
  "is_plagiarized": true,
  "reasoning": "The user's code snippet shares significant structural and syntactic similarities with Reference Snippet 1, including identical styled component definitions and function structure, indicating likely plagiarism.",
  "references": [
    {
      "repo": "mini_os_with_react",
      "file": "src/components/Desktop.js",
      "distance": "2.7305",
      "chunk_index": 0
    },
    {
      "repo": "mini_os_with_react",
      "fi