In [None]:

import os
import shutil
import asyncio
import aiohttp
import aiofiles
import time
from pathlib import Path
from typing import Set, List, Optional, Dict, Any, Tuple
import json
import re

# Text file (one absolute path per line) listing files that were already rewritten,
# so later runs can skip them.
PROCESSED_FILES_RECORD = "processed_files.txt"

# Available API providers. Every entry carries the same three fields:
#   base_url - prefix to which "/chat/completions" is appended when posting
#   model    - model identifier sent in the request payload
#   api_key  - bearer token, or None when the provider needs no authentication
# Keys are read from environment variables so that no credential is stored in the
# notebook; a variable that is not set yields None.
sources = {
    "OpenRouter": {
        "base_url": "https://openrouter.ai/api/v1",
        "model": "qwen/qwen-2.5-coder-32b-instruct:free",
        "api_key": os.environ.get("OPENROUTER_API_KEY"),
    },
    "Groq": {
        "base_url": "https://api.groq.com/openai/v1/",
        "model": "qwen-2.5-coder-32b",
        "api_key": os.environ.get("GROQ_API_KEY"),
    },
    # Second Groq account; used together with "Groq" when that source is selected.
    "Groq2": {
        "base_url": "https://api.groq.com/openai/v1/",
        "model": "qwen-2.5-coder-32b",
        "api_key": os.environ.get("GROQ_API_KEY_2"),
    },
    "X.ai": {
        "base_url": "https://api.x.ai/v1",
        "model": "grok-2-latest",
        "api_key": os.environ.get("XAI_API_KEY"),
    },
    "Ollama": {
        # Ollama serves its OpenAI-compatible routes under the /v1 prefix.
        "base_url": "http://localhost:11434/v1",
        "model": "qwen2.5-coder",
        "api_key": None,
    },
}

# Set the selected source
source = "X.ai"

# Base configuration
# Settings shared by every API account; the per-account base_url, api_key and model
# are merged on top of this dict when CONFIGS is built.
BASE_CONFIG = {
    # Root folders that are searched recursively for candidate files.
    "source_directories": [r"C:\Users\harold.noble\Desktop\RIC\app"],
    # Folders whose contents are excluded from the scan.
    "skip_directories": [],
    # Key into "prompt_templates" that selects the prompt sent to the model.
    "selected_pass": "adjust_comments",
    # Used to derive the minimum spacing between requests (60 / value seconds).
    "requests_per_minute": 5,
    # Size of the semaphore that bounds simultaneous API calls.
    "concurrent_requests": 3,
    # Chunk size budget; chunking estimates roughly 4 characters per token.
    "max_context_tokens": 6000,
    # When True, chunks still estimated above the budget are left unmodified.
    "skip_long_files": False,
    # Sampling parameters forwarded verbatim in the request payload.
    "temperature": 0.6,
    "top_p": 0.95,
    # File suffixes picked up by the recursive glob.
    "file_extensions": [".py", ".ts", ".svelte"],
    # Pause before retrying after an HTTP 429 or a failed attempt.
    "retry_sleep_seconds": 30,
    # Placeholder; main() overwrites this with the terms it is called with.
    "search_terms": [],
    # Each template is filled with str.format(file_type=..., content=...), which is
    # why literal braces inside the prompt text are doubled ({{ and }}).
    "prompt_templates": {
        "optimize": (
            "Optimize the {file_type} code while preserving core functionality:\n"
            "- Remove all i18n references (e.g., t(), $t, i18n.*) unless tied to core logic\n"
            "- Eliminate unused variables, imports, and dead code after verifying no external dependencies\n"
            "- Refactor repeated logic into reusable functions or components with descriptive names (e.g., calculateTotal, not calc)\n"
            "- Simplify code by:\n"
            "  - Using early returns to reduce nesting\n"
            "  - Replacing complex conditionals with clearer alternatives where possible\n"
            "- Improve performance by minimizing loops and redundant operations\n"
            "- Use {file_type}-specific best practices (e.g., async/await in .ts, reactive statements in .svelte)\n"
            "- Naming: variables/functions in camelCase, classes/components in PascalCase\n"
            "- Keep lines between 80-100 characters; break logically if longer\n"
            "- No empty lines between imports; one empty line before code\n"
            "- Preserve core logic, including onMount(() => {{) in .svelte files\n"
            "- Define core functionality as: primary operations the script performs, excluding optional features\n"
            "- Be decisive: apply the most effective optimization without alternatives\n"
            "Input code:\n{content}\n\n"
            "Return only the modified code without explanations or additional text."
        ),
        "comment_cleanup": (
            "Process the {file_type} code while preserving all functionality:\n"
            "- Remove all existing comments (e.g., //, #, /* */)\n"
            "- Add precise documentation as follows:\n"
            "  - For .py files: add a triple-quoted docstring ('''') to every function and top-level script block, including:\n"
            "    - Purpose: one sentence describing what it does\n"
            "    - Parameters: list each parameter with type and purpose\n"
            "    - Returns: describe the return value and type\n"
            "  - For .ts/.svelte files: add JSDoc comments (/** */) above every function and component, including:\n"
            "    - Purpose: one sentence describing what it does\n"
            "    - @param {{type}} name - purpose of each parameter\n"
            "    - @returns {{type}} - description of return value\n"
            "  - For HTML/Svelte markup: add <!-- Section: purpose --> comments to separate and describe major structural blocks (e.g., header, main content)\n"
            "- Keep comments concise: max 2 lines unless complex logic requires more\n"
            "- For complex logic, add brief inline comments (e.g., // Calculate total score)\n"
            "- No empty lines between imports; one empty line before code begins\n"
            "- Preserve core logic, including onMount(() => {{) in .svelte files\n"
            "Input code:\n{content}\n\n"
            "Return only the modified code without explanations or additional text."
        ),
        "combined": (
            "Combine the following two tasks for the {file_type} code while preserving core functionality:\n"
            "1. Optimize the code by removing all i18n references, unused variables, imports, dead code, and by refactoring repeated logic. "
            "Apply general optimizations such as eliminating unnecessary if statements when only one option exists, "
            "simplifying conditionals, and reducing redundant operations. "
            "Use descriptive naming, leverage {file_type}-specific best practices, and maintain clear control flow. "
            "Remove code relating to: ldap, updating version, oauth, haptic, mobile, channels, googledrive, onedrive, CallOverlay, transparentBackground, backgroundImageUrl.\n"
            "2. Clean up comments by removing all existing comments and adding clear, concise, and informative documentation. "
            "Add comments to every function: for .py files, use docstrings with purpose, parameters, and returns; "
            "for .ts/.svelte files, use JSDoc comments (purpose, @param, @returns). "
            "For HTML/markup, add structured comments to describe the layout and purpose of every section and function.\n\n"
            "Input code:\n{content}\n\n"
            "Return only the modified code without any explanations or additional text."
        ),
        "python_combined": (
            "Process the following Python code by combining these two tasks while preserving core functionality:\n"
            "1. Optimize the code by removing unused variables, imports, dead code, and refactoring repeated logic. "
            "Apply Python-specific optimizations such as using list comprehensions where appropriate, simplifying conditionals, "
            "eliminating unnecessary if statements with single options, and reducing redundant operations. "
            "Use PEP 8-compliant naming, leverage Python best practices (e.g., context managers, built-in functions), and ensure clear control flow. "
            "Ensure no blank lines between import statements and remove code related to: ldap, updating version, oauth, haptic, mobile, channels, "
            "googledrive, onedrive, CallOverlay, transparentBackground, backgroundImageUrl.\n"
            "2. Clean up comments by removing all existing comments and adding clear, concise, and informative documentation. "
            "Add docstrings to every function with purpose, parameters, and return values following Python docstring conventions (e.g., Google or NumPy style). "
            "Include brief inline comments only where complex logic requires clarification.\n\n"
            "Input code:\n{content}\n\n"
            "Return only the modified code without any explanations or additional text."
        ),
        "adjust_comments": (
            "Adjust comments in the {file_type} code while preserving all functionality:\n"
            "- Remove all existing comments\n"
            "- Add concise, meaningful single-line comments (unless a docstring) as follows:\n"
            "  - For .py files: add a triple-quoted docstring ('''') to every function describing its purpose in one sentence\n"
            "    - Add single-line comments (e.g., # Purpose) before significant code blocks\n"
            "  - For .ts/.svelte files: add single-line JSDoc-style comments (// Purpose) above functions\n"
            "    - Use // @param name - purpose and // @returns - purpose where applicable\n"
            "  - For HTML/Svelte markup: add single-line <!-- Purpose --> comments before major sections\n"
            "- Keep comments on one line, max 80 characters, unless it's a docstring\n"
            "- Focus comments on explaining 'why' rather than 'what', unless the logic is complex\n"
            "- Do not modify any functional code, only adjust comments\n"
            "Input code:\n{content}\n\n"
            "Return only the modified code without explanations or additional text."
        ),
    }
}

# Build one request configuration per API account in use. Selecting "Groq" fans out
# over both Groq accounts; any other source yields exactly one configuration.
_selected_sources = ["Groq", "Groq2"] if source == "Groq" else [source]
CONFIGS = [
    {
        **BASE_CONFIG,
        **{field: sources[name][field] for field in ("base_url", "api_key", "model")},
    }
    for name in _selected_sources
]


def load_processed_files() -> Set[str]:
    """Load previously processed file paths.

    Returns:
        The whitespace-stripped lines of ``PROCESSED_FILES_RECORD`` as a set,
        or an empty set when that file does not exist.
    """
    try:
        # A context manager guarantees the handle is closed, and catching the
        # missing-file error avoids a separate exists-then-open race.
        with open(PROCESSED_FILES_RECORD, "r", encoding="utf-8") as record:
            return {line.strip() for line in record}
    except FileNotFoundError:
        return set()

def save_processed_files(file_paths: List[str]) -> None:
    """Record newly processed files by appending one path per line.

    Args:
        file_paths: Paths to append to ``PROCESSED_FILES_RECORD``.
    """
    with open(PROCESSED_FILES_RECORD, "a", encoding="utf-8") as record:
        for path in file_paths:
            record.write(f"{path}\n")


class RateLimiter:
    """Spaces out requests so that consecutive grants are at least one interval apart.

    The interval is ``60 / requests_per_minute`` seconds. The timer starts when the
    limiter is constructed, so the first ``acquire`` may also wait up to one interval.
    """

    def __init__(self, requests_per_minute: int):
        """Create a limiter.

        Args:
            requests_per_minute: Maximum number of requests to grant per minute.

        Raises:
            ValueError: If ``requests_per_minute`` is not positive.
        """
        if requests_per_minute <= 0:
            raise ValueError("requests_per_minute must be a positive number")
        # Minimum number of seconds between two granted requests.
        self.interval = 60.0 / requests_per_minute
        # Monotonic timestamp of the latest grant. A monotonic clock is used because
        # wall-clock adjustments would otherwise produce wrong (even negative) waits.
        self.last_request_time = time.monotonic()
        # Serialises waiters so that each one measures from the previous grant.
        self.lock = asyncio.Lock()

    async def acquire(self):
        """Wait until one interval has passed since the previous grant, then claim a slot."""
        async with self.lock:
            elapsed = time.monotonic() - self.last_request_time
            remaining = self.interval - elapsed
            if remaining > 0:
                await asyncio.sleep(remaining)
            self.last_request_time = time.monotonic()


def _strip_code_fence(text: str) -> str:
    """Return ``text`` without a Markdown code fence that wraps the whole reply."""
    fence = "```"
    if len(text) < 2 * len(fence) or not (text.startswith(fence) and text.endswith(fence)):
        return text
    inner = text[len(fence):-len(fence)]
    # Whatever follows the opening fence on its line is the language tag
    # ("python", "ts", "svelte", ...). Drop that whole line, whatever the tag is,
    # but only when it really looks like a tag and not like code.
    first_line, newline, rest = inner.partition("\n")
    if newline and re.fullmatch(r"[\w+#.-]*", first_line.strip()):
        inner = rest
    return inner.strip()


async def refactor_code(session: aiohttp.ClientSession, file_path: str, prompt: str, rate_limiter: RateLimiter, sem: asyncio.Semaphore, config: Dict) -> str:
    """Send ``prompt`` to the chat-completions endpoint and return the model's code.

    Args:
        session: HTTP session used for the POST request.
        file_path: Path of the file being processed; used only in log messages.
        prompt: Complete prompt sent as a single user message.
        rate_limiter: Limiter awaited before every HTTP attempt.
        sem: Semaphore bounding the number of concurrent calls.
        config: Settings providing ``base_url``, ``api_key``, ``model``,
            ``temperature``, ``top_p`` and ``retry_sleep_seconds``.

    Returns:
        The returned code with any surrounding Markdown fence removed, or an empty
        string when the request failed, was rejected, or produced no content.
    """
    max_attempts = 3
    headers = {"Content-Type": "application/json"}
    # Providers without a key (api_key is None) must not receive "Bearer None".
    if config.get("api_key"):
        headers["Authorization"] = f"Bearer {config['api_key']}"
    payload = {
        "model": config["model"],
        "messages": [{"role": "user", "content": prompt}],
        "temperature": config["temperature"],
        "top_p": config["top_p"],
    }
    # Some base URLs end with "/"; normalise to avoid "//chat/completions".
    url = f"{config['base_url'].rstrip('/')}/chat/completions"

    async with sem:
        for attempt in range(max_attempts):
            is_last_attempt = attempt == max_attempts - 1
            # Every attempt is a real request, so retries are rate limited too.
            await rate_limiter.acquire()
            try:
                async with session.post(
                    url,
                    json=payload,
                    headers=headers,
                    timeout=aiohttp.ClientTimeout(total=180)
                ) as response:
                    if response.status == 429:
                        print(f"Rate limited (429) for {file_path} on attempt {attempt + 1}")
                        # Sleeping after the final attempt would only waste time.
                        if not is_last_attempt:
                            await asyncio.sleep(config["retry_sleep_seconds"])
                        continue
                    if response.status != 200:
                        print(f"API error {response.status} for {file_path}: {await response.text()}")
                        return ""

                    result = await response.json()
                    # Tolerate an empty "choices" list and a null "content" field
                    # instead of raising and retrying a request that did succeed.
                    choices = result.get("choices") or [{}]
                    message = choices[0].get("message") or {}
                    code = (message.get("content") or "").strip()
                    if not code:
                        print(f"Empty response for {file_path}")
                        return ""
                    return _strip_code_fence(code)

            except Exception as e:
                print(f"Attempt {attempt + 1} failed for {file_path}: {e}")
                if not is_last_attempt:
                    await asyncio.sleep(config["retry_sleep_seconds"])

    print(f"Giving up on {file_path} after {max_attempts} attempts")
    return ""


def split_into_chunks(content: str, max_tokens: int) -> List[str]:
    """Group the lines of ``content`` into chunks of roughly ``max_tokens`` tokens.

    Size is estimated at four characters per token. Lines are never split, so a
    single line longer than the budget ends up as an oversized chunk of its own.

    Args:
        content: Text to split.
        max_tokens: Approximate token budget per chunk.

    Returns:
        The chunks in order, each with its lines joined by a newline; an empty
        list when ``content`` has no lines.
    """
    char_budget = 4 * max_tokens  # rough estimate: 4 characters per token
    pieces: List[str] = []
    pending: List[str] = []
    pending_chars = 0

    for text_line in content.splitlines():
        size = len(text_line)
        if pending and pending_chars + size > char_budget:
            # Budget exceeded: close the running chunk and start a new one
            # with this line.
            pieces.append("\n".join(pending))
            pending, pending_chars = [text_line], size
            continue
        pending.append(text_line)
        pending_chars += size + 1  # +1 accounts for the joining newline

    if pending:
        pieces.append("\n".join(pending))
    return pieces


async def process_single_file(session: aiohttp.ClientSession, file_path: str, rate_limiter: RateLimiter, sem: asyncio.Semaphore, config: Dict) -> bool:
    """Rewrite one file through the API, chunk by chunk.

    The file is split into chunks, each chunk is sent with the selected prompt
    template, and the results are joined and written back. A chunk whose request
    fails (or that is skipped) keeps its original text.

    Args:
        session: HTTP session shared by all requests.
        file_path: Path of the file to rewrite in place.
        rate_limiter: Limiter passed through to the API call.
        sem: Semaphore passed through to the API call.
        config: Settings providing ``max_context_tokens``, ``skip_long_files``,
            ``selected_pass`` and ``prompt_templates``.

    Returns:
        True when new content was written to ``file_path``; False when the file
        could not be read or written, or when nothing changed.
    """
    file_ext = Path(file_path).suffix[1:]
    try:
        async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
            content = await f.read()
    except Exception as e:
        print(f"Failed to read {file_path}: {e}")
        return False

    template = config["prompt_templates"].get(config["selected_pass"], "")

    # Split into chunks if content is too long
    chunks = split_into_chunks(content, config["max_context_tokens"])
    modified_chunks = []
    any_chunk_rewritten = False

    for i, chunk in enumerate(chunks):
        if config["skip_long_files"] and (len(chunk) // 4) > config["max_context_tokens"]:
            print(f"Skipping chunk {i+1} of {file_path} (too long)")
            modified_chunks.append(chunk)  # Keep original chunk if too long
            continue

        prompt = template.format(file_type=file_ext, content=chunk)
        if not prompt:
            print(f"Invalid pass for chunk {i+1} of {file_path}")
            modified_chunks.append(chunk)
            continue

        rewritten = await refactor_code(session, file_path, prompt, rate_limiter, sem, config)
        if rewritten:
            modified_chunks.append(rewritten)
            any_chunk_rewritten = True
        else:
            modified_chunks.append(chunk)  # Keep original if refactoring fails

    # Without this guard a file whose requests all failed would still be rewritten
    # (chunking drops the trailing newline) and be reported as processed.
    if not any_chunk_rewritten:
        return False

    modified_content = "\n".join(modified_chunks)
    # Chunking discards the final newline; restore it so the file ending is kept.
    if content.endswith("\n"):
        modified_content += "\n"
    if modified_content == content:
        return False

    try:
        async with aiofiles.open(file_path, "w", encoding="utf-8") as f:
            await f.write(modified_content)
        print(f"Processed: {file_path}")
        return True
    except Exception as e:
        print(f"Failed to write {file_path}: {e}")
        return False
    
def should_skip_file(file_path: Path, skip_dirs: List[Path]) -> bool:
    """Tell whether ``file_path`` is one of ``skip_dirs`` or lies inside one.

    Whole path components are compared, so a skip directory ``/app/node`` does
    not match the sibling ``/app/node_modules``.

    Args:
        file_path: Path to test.
        skip_dirs: Directories whose contents must be skipped.

    Returns:
        True when the file should be skipped, otherwise False.
    """
    candidate = Path(file_path)
    ancestors = set(candidate.parents)
    return any(
        Path(skip_dir) == candidate or Path(skip_dir) in ancestors
        for skip_dir in skip_dirs
    )

def scan_directories(config: Dict) -> List[str]:
    """Collect candidate files below the configured source directories.

    A file qualifies when its name ends with one of ``file_extensions``, it is
    not listed in the processed-files record and it is not inside a skip
    directory. Content is not inspected here.

    Args:
        config: Settings providing ``source_directories``, ``skip_directories``
            and ``file_extensions``.

    Returns:
        Resolved paths as strings, without duplicates, in discovery order.
    """
    processed_files = load_processed_files()
    skip_dirs = [Path(d).resolve() for d in config["skip_directories"]]
    file_paths: List[str] = []
    # Overlapping source directories or extensions could yield a file twice.
    seen: Set[str] = set()

    for directory in config["source_directories"]:
        dir_path = Path(directory).resolve()
        if not dir_path.is_dir():
            print(f"Skipping missing source directory: {directory}")
            continue
        for ext in config["file_extensions"]:
            for file_path in dir_path.glob(f"**/*{ext}"):
                # The glob also matches directories whose name ends with ext.
                if not file_path.is_file():
                    continue
                resolved = file_path.resolve()
                file_path_str = str(resolved)
                if file_path_str in seen or file_path_str in processed_files:
                    continue
                if should_skip_file(resolved, skip_dirs):
                    continue
                seen.add(file_path_str)
                file_paths.append(file_path_str)

    print(f"Found {len(file_paths)} potential files")
    return file_paths

def backup_files(file_paths: List[str]) -> str:
    """Copy the given files into a fresh, timestamped backup directory.

    Each file keeps its full directory structure below the backup root, so files
    that share a name (for example several ``index.ts``) do not overwrite each
    other.

    Args:
        file_paths: Files to copy.

    Returns:
        The backup directory name (relative to the working directory), or an
        empty string when ``file_paths`` is empty.
    """
    if not file_paths:
        return ""
    backup_dir = f"backup_{int(time.time())}"
    os.makedirs(backup_dir, exist_ok=True)
    for fp in file_paths:
        original = Path(fp).resolve()
        # A drive such as "C:" becomes the plain folder "C" so that files coming
        # from different drives cannot collide; it is empty on POSIX systems.
        drive_folder = original.drive.replace(":", "").replace("\\", "_").replace("/", "_")
        relative = original.relative_to(original.anchor)
        target = Path(backup_dir, drive_folder, relative)
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(original, target)
    return backup_dir

async def process_file_batch(session: aiohttp.ClientSession, file_paths: List[str], config: Dict, rate_limiter: RateLimiter, sem: asyncio.Semaphore) -> List[bool]:
    """Run ``process_single_file`` for every path concurrently.

    Returns:
        One success flag per path, in the same order as ``file_paths``.
    """
    pending = []
    for path in file_paths:
        pending.append(process_single_file(session, path, rate_limiter, sem, config))
    return await asyncio.gather(*pending)


async def main(search_terms: Optional[List[str]] = None, max_files_to_process: Optional[int] = None, keep_backup: bool = False):
    """Find files mentioning the search terms and rewrite them through the API.

    Steps: scan the source directories, keep the files whose text contains any
    search term (case-insensitive), back them up, distribute them over the
    configured API accounts, rewrite them and record the successful ones.

    Args:
        search_terms: Literal terms to look for; when empty or None every
            readable file matches.
        max_files_to_process: Upper bound on the number of matching files handled
            in this run; None means no limit.
        keep_backup: When True the backup directory is left on disk after the
            run; by default it is deleted once processing has finished.
    """
    terms = list(search_terms or [])
    # Update config with search terms
    for config in CONFIGS:
        config["search_terms"] = terms

    if not os.path.exists(PROCESSED_FILES_RECORD):
        with open(PROCESSED_FILES_RECORD, "w", encoding="utf-8"):
            pass

    # Scan all files first
    all_file_paths = scan_directories(CONFIGS[0])

    # Filter files containing search terms. The pattern is compiled once from the
    # function argument rather than rebuilt per file.
    pattern = re.compile("|".join(re.escape(term) for term in terms), re.IGNORECASE) if terms else None
    matching_files = []
    for file_path in all_file_paths:
        try:
            async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
                content = await f.read()
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue
        if pattern is None or pattern.search(content):
            matching_files.append(file_path)

    print(f"Found {len(matching_files)} files containing search terms: {search_terms}")

    # Apply max_files_to_process limit after filtering
    file_paths = matching_files
    if max_files_to_process is not None:
        file_paths = file_paths[:max_files_to_process]

    if not file_paths:
        print("No matching files found.")
        return

    print(f"Processing {len(file_paths)} files")
    backup_dir = backup_files(file_paths)

    # Split the files into one contiguous slice per configured account, which keeps
    # the concatenated results aligned with file_paths.
    account_count = len(CONFIGS)
    bounds = [len(file_paths) * i // account_count for i in range(account_count + 1)]
    file_batches = [file_paths[start:end] for start, end in zip(bounds, bounds[1:])]

    rate_limiters = [RateLimiter(config["requests_per_minute"]) for config in CONFIGS]
    semaphores = [asyncio.Semaphore(config["concurrent_requests"]) for config in CONFIGS]

    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=sum(c["concurrent_requests"] for c in CONFIGS))) as session:
        # Each account has its own limiter and semaphore, so the batches can run
        # side by side instead of one after another.
        batch_results = await asyncio.gather(*[
            process_file_batch(session, files, config, rate_limiter, sem)
            for config, files, rate_limiter, sem in zip(CONFIGS, file_batches, rate_limiters, semaphores)
            if files
        ])
        all_results = [ok for results in batch_results for ok in results]

        processed_files = [fp for fp, success in zip(file_paths, all_results) if success]
        save_processed_files(processed_files)
        print(f"Processed {len(processed_files)} files")

    if backup_dir and os.path.exists(backup_dir):
        if keep_backup:
            print(f"Backup kept at: {backup_dir}")
        else:
            shutil.rmtree(backup_dir)

In [2]:
# Process at most five files that mention "mobile". The top-level `await` relies on
# the notebook (IPython) cell runner; a plain script would need asyncio.run(...).
await main(search_terms=["mobile"], max_files_to_process=5)

Found 427 potential files
Found 8 files containing search terms: ['mobile']
Processing 5 files
Processed: C:\Users\harold.noble\Desktop\RIC\app\frontend\src\lib\stores\index.ts
Processed: C:\Users\harold.noble\Desktop\RIC\app\frontend\src\routes\+layout.svelte
Processed: C:\Users\harold.noble\Desktop\RIC\app\frontend\src\lib\components\chat\MessageInput.svelte
Processed: C:\Users\harold.noble\Desktop\RIC\app\frontend\src\lib\components\common\Banner.svelte
Processed: C:\Users\harold.noble\Desktop\RIC\app\frontend\src\lib\components\chat\Chat.svelte
Processed 5 files
