In [None]:
#!/usr/bin/env python3
# ./refactor_script.py
"""
Script to refactor Python, Svelte, and TypeScript files using an LLM (Groq chat completions API).
Processes up to FILES_PER_RUN files per run, tracks progress in a file, and overwrites originals
with refactored code. Supports skipping specified subfolders and focuses on docstrings, formatting,
optimization, error handling, and cleanup. Logs problematic files to 'problematic_files.txt' for
manual review.
"""

import os
import re
import time
from typing import List, Set, Optional
import ast
import chardet
from groq import Groq
from tqdm import tqdm

# Configuration constants
# The API key is read from the environment so that no secret is committed with
# the script. fetch_llm_response() rejects an empty key with a clear error.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
DEFAULT_DIRECTORY = r"C:\Users\harold.noble\Desktop\open-webui - Copy\app\src"
# Directory names that are excluded from the file search.
SKIP_FOLDERS = frozenset({".github", "code_helper", "ollama-0"})
PROCESSED_FILES_TRACKER = "processed_files.txt"
PROBLEMATIC_FILES_LOG = "problematic_files.txt"  # File to log problematic files
FILES_PER_RUN = 10  # Maximum number of files submitted to the LLM per run
LLM_MODEL = "deepseek-r1-distill-llama-70b"
LLM_TEMPERATURE = 0.5
LLM_TOP_P = 0.9
TIMEOUT_SECONDS = 180  # Per-request timeout passed to the Groq client
MAX_RETRIES = 3
MAX_TOKENS = 6000  # Token limit based on error message

# LLM prompts
# SYSTEM_PROMPT is sent as the "system" message of every request and tells the
# model to answer with code only.
SYSTEM_PROMPT = """
You are an expert developer specializing in code refactoring, created to assist with optimizing code scripts. Your role is to refactor entire scripts while preserving their original functionality and function names.

IMPORTANT: Your response must contain ONLY the refactored code with no explanations, comments about changes, or formatting markers. Do not include markdown code blocks, explanations, or anything else that is not part of the actual code.
"""

# REFACTOR_PROMPT is the "user" message template. It is filled in with
# str.format(code_content=...), so `{code_content}` is the only placeholder and
# any literal brace added to this template would have to be doubled.
REFACTOR_PROMPT = """
Refactor this code (Python, Svelte, or TypeScript) with the following improvements:

1. Remove all code related to these components and their derivatives:
   - webhookUrl, LDAP, oauth, enable_community_sharing, ENABLE_CHANNELS, playground, nonLocalVoices, haptic, mobile
   - All i18n related code and references

2. Optimize performance while maintaining readability:
   - Simplify logic flows
   - Eliminate redundancies and inefficiencies
   - Streamline operations

3. Clean up the codebase:
   - Remove unused imports and dead code

4. Enhance error handling:
   - Use specific exception types (ValueError, IOError, etc.)
   - Include actionable error messages

5. Improve documentation:
   - Replace comments with appropriate docstrings for modules, functions, and classes
   - Retain inline comments for notes not suitable for docstrings

IMPORTANT: Do not rename any functions during refactoring. Return ONLY the refactored code without explanations, markdown formatting, or change summaries.

Code:
{code_content}
"""


def fetch_llm_response(
    prompt: str,
    system_prompt: str = SYSTEM_PROMPT,
    model: str = LLM_MODEL,
    temperature: float = LLM_TEMPERATURE,
    top_p: float = LLM_TOP_P,
    retries: int = MAX_RETRIES,
) -> str:
    """
    Fetch refactored code from the Groq API.

    The request is sent without streaming. Failed calls are retried with
    exponential back-off (1s, 2s, 4s, ...), except for HTTP 413 "request too
    large" responses, which cannot succeed on retry and fail immediately.

    Args:
        prompt: The input prompt with code to refactor.
        system_prompt: Instructions for the LLM.
        model: The LLM model to use.
        temperature: Sampling temperature for LLM output.
        top_p: Top-p sampling parameter for LLM output.
        retries: Maximum number of attempts; must be at least 1.

    Returns:
        The refactored code as a string, post-processed by extract_code().

    Raises:
        ValueError: If prompt is invalid or too large, the API key is unset,
            or retries is less than 1.
        RuntimeError: If the API call fails (chained to the underlying error).
    """
    if not prompt or not isinstance(prompt, str):
        raise ValueError("Prompt must be a non-empty string")
    # Both messages are part of the request, so both count toward the size
    # estimate. Rough estimate: 4 chars per token.
    if len(system_prompt or "") + len(prompt) > MAX_TOKENS * 4:
        raise ValueError("Prompt exceeds token limit")
    if not GROQ_API_KEY or GROQ_API_KEY == "type_your_api_key_here":
        raise ValueError("Valid GROQ_API_KEY environment variable or constant required")
    if retries < 1:
        # Without this guard the loop below would not run and the function
        # would silently return None.
        raise ValueError("retries must be at least 1")

    client = Groq(api_key=GROQ_API_KEY)

    for attempt in range(retries):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt},
                ],
                model=model,
                temperature=temperature,
                top_p=top_p,
                stream=False,
                timeout=TIMEOUT_SECONDS,
            )
            return extract_code(chat_completion.choices[0].message.content)
        except Exception as e:
            # A 413 means the request itself is too large; resending the same
            # payload cannot succeed, so do not waste time backing off.
            # getattr keeps this safe for exceptions without a status code.
            request_too_large = getattr(e, "status_code", None) == 413
            if request_too_large or attempt == retries - 1:
                raise RuntimeError(
                    f"Failed to fetch LLM response after {attempt + 1} attempts: {str(e)}"
                ) from e
            time.sleep(2 ** attempt)


def extract_code(response: str) -> str:
    """
    Extract pure code from LLM response, stripping non-code content.

    Processing steps:
      1. Remove any ``<think>...</think>`` reasoning sections.
      2. If fenced code blocks are present, return their contents (stripped
         and joined by a blank line). Any language tag on the opening fence
         line is discarded, whatever it is.
      3. Otherwise drop lines containing known chatter markers and return the
         rest; if nothing is left, return the response unchanged.

    Args:
        response: Raw LLM response string.

    Returns:
        The extracted code string.
    """
    # Reasoning text is never part of the code and may itself contain fences.
    response, removed = re.subn(r"<think>.*?</think>", "", response, flags=re.DOTALL)
    if removed:
        response = response.lstrip("\r\n")

    # The optional group consumes the whole info string up to the newline, so
    # tags such as "tsx" or "javascript" never leak into the captured code.
    # A fence with code on the same line (no newline after the tag) is captured
    # in full.
    code_block_pattern = r"```(?:[\w+#.-]*[ \t]*\r?\n)?(.*?)```"
    code_blocks = re.findall(code_block_pattern, response, re.DOTALL)
    if code_blocks:
        return "\n\n".join(block.strip() for block in code_blocks)

    markers = ("here's the refactored", "explanation:", "summary of changes:")
    code_lines = [
        line for line in response.split('\n')
        if not any(marker in line.lower() for marker in markers)
    ]
    return "\n".join(code_lines) or response


def load_processed_files(tracker_file: str) -> Set[str]:
    """
    Read the set of already-processed file paths from the tracker file.

    If the tracker does not exist yet it is created with a comment header and
    an empty set is returned. Blank lines and lines starting with '#' are
    ignored. I/O errors are reported on stdout and do not propagate.

    Args:
        tracker_file: Path to the processed files tracker.

    Returns:
        Set of processed file paths.
    """
    seen: Set[str] = set()
    try:
        if os.path.exists(tracker_file):
            with open(tracker_file, "r", encoding="utf-8") as handle:
                for raw_line in handle:
                    entry = raw_line.strip()
                    # The comment test is applied to the raw (unstripped) line.
                    if entry and not raw_line.startswith("#"):
                        seen.add(entry)
        else:
            with open(tracker_file, "w", encoding="utf-8") as handle:
                handle.write("# Processed Files\n# Tracks refactored files\n\n")
            print(f"Created new tracker file: {tracker_file}")
    except IOError as e:
        print(f"Error: Could not access {tracker_file}: {e}")
    return seen


def save_processed_files(tracker_file: str, processed: Set[str]) -> None:
    """
    Rewrite the tracker file with the given processed paths.

    The file is overwritten with a comment header followed by one path per
    line in sorted order. I/O errors are reported on stdout and do not
    propagate.

    Args:
        tracker_file: Path to the tracker file.
        processed: Set of processed file paths.
    """
    header = "# Processed Files\n# Tracks refactored files\n\n"
    try:
        with open(tracker_file, "w", encoding="utf-8") as out:
            out.write(header)
            for entry in sorted(processed):
                out.write(f"{entry}\n")
    except IOError as e:
        print(f"Error: Failed to save {tracker_file}: {e}")


def load_problematic_files(log_file: str) -> Set[str]:
    """
    Read the set of file paths previously logged as problematic.

    A missing log is created with a comment header, in which case the result
    is empty. Blank lines and lines starting with '#' are skipped. I/O errors
    are reported on stdout and do not propagate.

    Args:
        log_file: Path to the problematic files log.

    Returns:
        Set of problematic file paths.
    """
    entries: Set[str] = set()
    try:
        if not os.path.exists(log_file):
            with open(log_file, "w", encoding="utf-8") as fh:
                fh.write("# Problematic Files\n# Tracks files that failed refactoring\n\n")
            print(f"Created new problematic files log: {log_file}")
            return entries

        with open(log_file, "r", encoding="utf-8") as fh:
            for text in fh:
                # Comment detection looks at the raw line, not the stripped one.
                if text.startswith("#"):
                    continue
                path = text.strip()
                if path:
                    entries.add(path)
    except IOError as e:
        print(f"Error: Could not access {log_file}: {e}")
    return entries


def save_problematic_files(log_file: str, problematic: Set[str]) -> None:
    """
    Rewrite the problematic-files log with the given paths.

    The log is overwritten with a comment header followed by the sorted
    paths, one per line. I/O errors are reported on stdout and do not
    propagate.

    Args:
        log_file: Path to the problematic files log.
        problematic: Set of problematic file paths.
    """
    try:
        with open(log_file, "w", encoding="utf-8") as sink:
            sink.write("# Problematic Files\n# Tracks files that failed refactoring\n\n")
            ordered = sorted(problematic)
            sink.writelines([f"{item}\n" for item in ordered])
    except IOError as e:
        print(f"Error: Failed to save {log_file}: {e}")


def is_valid_code(file_path: str, code: str) -> bool:
    """
    Validate code syntax based on file type.

    Empty or whitespace-only content is rejected for every file type, so a
    blank LLM reply can never be accepted as a replacement for a source file.
    For ``.py`` files the content must additionally parse with ``ast.parse``.
    Other file types are only checked for being non-empty.

    Args:
        file_path: Path to the file.
        code: Code content to validate.

    Returns:
        True if code is non-empty and (for Python) syntactically valid,
        False otherwise.
    """
    if not code or not code.strip():
        return False
    if file_path.endswith(".py"):
        try:
            ast.parse(code)
        except (SyntaxError, ValueError):
            # ValueError covers source containing NUL bytes on Python
            # versions that do not report it as a SyntaxError.
            return False
    return True


def process_single_file(file_path: str, problematic_files: Set[str]) -> bool:
    """
    Refactor a single file and overwrite it if successful, log to problematic files if it fails.

    The file is read as UTF-8; if that fails, the encoding is guessed with
    chardet (falling back to latin1). The refactored result is always written
    back as UTF-8 with exactly one trailing newline.

    Args:
        file_path: Path to the file to process.
        problematic_files: Set of files that failed processing; the path is
            added to it (mutated in place) on any failure.

    Returns:
        True if the file is finished: either it was rewritten, or the LLM
        output was identical to the original so nothing needed writing.
        False if processing failed and the file was added to
        ``problematic_files``.
    """
    try:
        encoding = "utf-8"
        try:
            with open(file_path, "r", encoding=encoding) as f:
                original_content = f.read()
        except UnicodeDecodeError:
            with open(file_path, "rb") as f:
                raw = f.read()
            detected = chardet.detect(raw)
            encoding = detected["encoding"] or "latin1"
            with open(file_path, "r", encoding=encoding) as f:
                original_content = f.read()
            tqdm.write(f"Note: {file_path} read with encoding '{encoding}'")

        prompt = REFACTOR_PROMPT.format(code_content=original_content)
        try:
            refactored_content = fetch_llm_response(prompt)
        except (ValueError, RuntimeError) as e:
            tqdm.write(f"Error processing {file_path}: {str(e)}")
            problematic_files.add(file_path)
            return False

        if not is_valid_code(file_path, refactored_content):
            tqdm.write(f"Error: Invalid refactored code for {file_path}")
            problematic_files.add(file_path)
            return False

        if refactored_content.strip() == original_content.strip():
            # Nothing to write, but the file is done. Reporting success lets
            # the caller record it instead of resubmitting it on every run.
            tqdm.write(f"Note: No changes needed for {file_path}")
            return True

        refactored_content = refactored_content.rstrip() + "\n"
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(refactored_content)
        tqdm.write(f"Successfully refactored {file_path}")
        return True

    except (IOError, UnicodeError, LookupError) as e:
        # UnicodeError: the guessed encoding could not decode the file.
        # LookupError: chardet reported a codec name Python does not know.
        # Either way, log the file instead of aborting the whole batch.
        tqdm.write(f"Error processing {file_path}: {str(e)}")
        problematic_files.add(file_path)
        return False


def get_files_to_process(directory: str, skip_folders: Optional[frozenset] = None) -> List[str]:
    """
    Find all target files in the directory, excluding skipped folders.

    Target files are those ending in .py, .svelte, .ts or .tsx. A folder is
    skipped when its own name is in ``skip_folders`` (exact match, not a
    substring of the path). Skipped folders are pruned from the walk, so
    their contents are never visited. Names inside ``directory`` are traversed
    in sorted order to make the result reproducible between runs.

    Args:
        directory: Root directory to search.
        skip_folders: Directory names to exclude; any container supporting
            ``in`` works. Defaults to the module-level SKIP_FOLDERS.

    Returns:
        List of file paths to process.
    """
    skipped = SKIP_FOLDERS if skip_folders is None else skip_folders
    extensions = (".py", ".svelte", ".ts", ".tsx")
    matches: List[str] = []
    for root, dirs, files in os.walk(directory):
        # Assigning to dirs[:] edits the list os.walk iterates over, which
        # stops it from descending into the removed directories.
        dirs[:] = sorted(name for name in dirs if name not in skipped)
        matches.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.endswith(extensions)
        )
    return matches


def main() -> None:
    """
    Main function to orchestrate file refactoring with progress tracking.

    Processes at most FILES_PER_RUN files per invocation. Files already listed
    in the processed tracker or in the problematic-files log are skipped, so
    files that keep failing cannot fill every batch and stall progress. To
    retry a problematic file, remove its line from the log.
    """
    if not os.path.isdir(DEFAULT_DIRECTORY):
        print(f"Error: Directory '{DEFAULT_DIRECTORY}' not found")
        return

    processed_files = load_processed_files(PROCESSED_FILES_TRACKER)
    problematic_files = load_problematic_files(PROBLEMATIC_FILES_LOG)
    files_to_process = get_files_to_process(DEFAULT_DIRECTORY)

    if not files_to_process:
        print(f"No target files found in {DEFAULT_DIRECTORY}")
        return

    # Problematic files are waiting for manual review; re-queuing them would
    # consume batch slots on every run.
    already_handled = processed_files | problematic_files
    remaining_files = [f for f in files_to_process if f not in already_handled]
    if not remaining_files:
        print("All files already processed")
        if problematic_files:
            print(f"Check '{PROBLEMATIC_FILES_LOG}' for {len(problematic_files)} files that need manual review.")
        return

    files_this_run = remaining_files[:FILES_PER_RUN]
    print(f"Processing {len(files_this_run)} of {len(remaining_files)} remaining files")

    with tqdm(total=len(files_this_run), desc="Refactoring", unit="file") as pbar:
        for file_path in files_this_run:
            # Persist after every file so an interrupted run loses no progress.
            if process_single_file(file_path, problematic_files):
                processed_files.add(file_path)
                save_processed_files(PROCESSED_FILES_TRACKER, processed_files)
            if file_path in problematic_files:
                save_problematic_files(PROBLEMATIC_FILES_LOG, problematic_files)
            pbar.update(1)

    remaining = len(remaining_files) - len(files_this_run)
    print(f"Completed batch of {len(files_this_run)} files. {remaining} files remain.")
    if remaining > 0:
        print("Run the script again to process the next batch.")
    if problematic_files:
        print(f"Check '{PROBLEMATIC_FILES_LOG}' for {len(problematic_files)} files that need manual review.")


# Run one refactoring batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Processing 10 of 343 remaining files


Refactoring:  10%|█         | 1/10 [00:03<00:33,  3.73s/file]

Error processing C:\Users\harold.noble\Desktop\open-webui - Copy\app\src\lib\apis\index.ts: Failed to fetch LLM response after 3 attempts: Error code: 413 - {'error': {'message': 'Request too large for model `deepseek-r1-distill-llama-70b` in organization `org_01j1qez3f7fj98shsdn8gf3qsm` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 7071, please reduce your message size and try again. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


Refactoring:  20%|██        | 2/10 [00:10<00:45,  5.65s/file]

Successfully refactored C:\Users\harold.noble\Desktop\open-webui - Copy\app\src\lib\apis\audio\index.ts


Refactoring:  30%|███       | 3/10 [00:20<00:52,  7.43s/file]

Successfully refactored C:\Users\harold.noble\Desktop\open-webui - Copy\app\src\lib\apis\auths\index.ts


Refactoring:  40%|████      | 4/10 [00:24<00:35,  6.00s/file]

Error processing C:\Users\harold.noble\Desktop\open-webui - Copy\app\src\lib\apis\chats\index.ts: Failed to fetch LLM response after 3 attempts: Error code: 413 - {'error': {'message': 'Request too large for model `deepseek-r1-distill-llama-70b` in organization `org_01j1qez3f7fj98shsdn8gf3qsm` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 6488, please reduce your message size and try again. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


Refactoring:  50%|█████     | 5/10 [00:50<01:07, 13.45s/file]

Successfully refactored C:\Users\harold.noble\Desktop\open-webui - Copy\app\src\lib\apis\configs\index.ts


Refactoring:  60%|██████    | 6/10 [01:11<01:03, 15.77s/file]

Successfully refactored C:\Users\harold.noble\Desktop\open-webui - Copy\app\src\lib\apis\evaluations\index.ts


Refactoring:  70%|███████   | 7/10 [01:15<00:36, 12.14s/file]

Successfully refactored C:\Users\harold.noble\Desktop\open-webui - Copy\app\src\lib\apis\files\index.ts


Refactoring:  80%|████████  | 8/10 [01:20<00:19,  9.67s/file]

Successfully refactored C:\Users\harold.noble\Desktop\open-webui - Copy\app\src\lib\apis\folders\index.ts


Refactoring:  90%|█████████ | 9/10 [01:42<00:13, 13.66s/file]

Successfully refactored C:\Users\harold.noble\Desktop\open-webui - Copy\app\src\lib\apis\functions\index.ts


Refactoring: 100%|██████████| 10/10 [01:48<00:00, 10.83s/file]

Successfully refactored C:\Users\harold.noble\Desktop\open-webui - Copy\app\src\lib\apis\groups\index.ts
Completed batch of 10 files. 333 files remain.
Run the script again to process the next batch.
Check 'problematic_files.txt' for 2 files that need manual review.



