<a href="https://colab.research.google.com/github/lavanyaadapa05/CleanCode/blob/main/cleanCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pymupdf langchain tiktoken faiss-cpu sentence-transformers langchain-google-genai gradio


Collecting pymupdf
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.1.0-py3-none-any.whl.metadata (3.6 kB)
Collecting gradio
  Downloading gradio-5.21.0-py3-none-any.whl.metadata (16 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.16 (from langchain-google-genai)
  Downloading google_ai_generativelanguage-0.6.17-py3-none-any.whl.metadata (9.8 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from

In [None]:
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB

In [None]:
# @title Imports
import getpass
import os

import fitz  # PyMuPDF
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI


In [None]:
def extract_text_fallback(pdf_path, skip_pages=30):
    """Extract plain text from a PDF, leaving out its leading pages.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.
        skip_pages: Number of pages at the start of the document to ignore.
            Defaults to 30, the value that used to be hard-coded here.

    Returns:
        str: The whitespace-stripped text of every remaining page, in page
        order, with a blank line between pages. An empty string when no
        pages remain after skipping.
    """
    # The context manager closes the document even if extracting a page fails,
    # so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        page_texts = [
            page.get_text("text").strip()
            for page_number, page in enumerate(doc)
            if page_number >= skip_pages
        ]

    return "\n\n".join(page_texts)

# Location of the source PDF in the runtime's filesystem.
pdf_path = "/content/cleanCode.pdf"  # Update path if the PDF is stored elsewhere
# Extracted text of the PDF; this is the input to the chunking step below.
full_text = extract_text_fallback(pdf_path)

# Sanity check: report how many characters were extracted.
print(f"\nTotal extracted characters: {len(full_text)}")



Total extracted characters: 838123


In [None]:
def smart_chunking(text, chunk_size=1000, overlap=150):
    """Break a long text into overlapping chunks with a recursive character splitter.

    Args:
        text: The text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size`` (default 1000).
        overlap: Forwarded to the splitter as ``chunk_overlap`` (default 150).

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for ``text``.
    """
    # Paragraph gaps come first, then sentence-ending punctuation, so that
    # splits are prioritized at sentence breaks.
    boundary_markers = ["\n\n", ". ", "! ", "? "]

    chunker = RecursiveCharacterTextSplitter(
        separators=boundary_markers,
        chunk_overlap=overlap,
        chunk_size=chunk_size,
    )
    pieces = chunker.split_text(text)
    return pieces

# Split the extracted text using the default chunk size and overlap.
# `text_chunks` is what gets embedded into the vector store further down.
text_chunks = smart_chunking(full_text)

# Sanity check: report how many chunks the splitter produced.
print(f"Total chunks created: {len(text_chunks)}")


Total chunks created: 1002


In [None]:
# Read the Gemini API key from the environment instead of hard-coding it in the
# notebook; when it is not set, prompt for it with hidden input.
# NOTE(review): a literal key used to be committed in this cell — treat that key
# as leaked and revoke it.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")
api_key = os.environ["GOOGLE_API_KEY"]

# Initialize Google Gemini embeddings
embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)

# Convert text chunks into vectors
vector_db = FAISS.from_texts(text_chunks, embedding_model)

# Save the vector database
vector_db.save_local("clean_code_faiss")
print("✅ Vector database saved!")


✅ Vector database saved!


In [None]:
def retrieve_relevant_chunks(query, k=5):
    """Fetch the text of the k stored chunks most similar to the query.

    Args:
        query: Text to search the module-level ``vector_db`` with.
        k: How many results to request from the similarity search (default 5).

    Returns:
        list: The ``page_content`` of each returned document, in the order
        the search returned them.
    """
    matches = vector_db.similarity_search(query, k=k)

    contents = []
    for match in matches:
        contents.append(match.page_content)
    return contents


In [None]:
def generate_response(query):
    """Answer a query with Gemini, grounded on passages retrieved for that query.

    Passages are fetched with ``retrieve_relevant_chunks``; at most the first
    five are joined with blank lines and placed, together with the query, into
    an instruction prompt that is sent to the ``gemini-1.5-pro-latest`` chat model.

    Args:
        query: The user's question or code; inserted verbatim into the prompt.

    Returns:
        The ``content`` attribute of the model's reply.
    """
    passages = retrieve_relevant_chunks(query)[:5]  # cap the context at five passages
    context = "\n\n".join(passages)

    chat_model = ChatGoogleGenerativeAI(model="gemini-1.5-pro-latest")

    # The prompt text, including its indentation, is sent to the model as written.
    prompt = f"""
    You are an AI assistant that evaluates code cleanliness based on 'Clean Code' principles.

    - If the user provides code, analyze it and determine if it follows Clean Code principles.
    - If clean, say "Yes, your code follows Clean Code principles."
    - If not, explain why.
    - If the user asks a theoretical question, provide a detailed answer.
    - If the question is unrelated to Clean Code, politely decline.

    ----------------
    Context:
    {context}

    User Question: {query}
    """

    reply = chat_model.invoke(prompt)
    return reply.content


In [None]:
import gradio as gr

# Function to analyze uploaded file and return response in chatbot format
def analyze_uploaded_file(files, chat_history):
    """Review an uploaded source file and append the result to the chat history.

    Args:
        files: The uploaded file, either as a plain path string or as an object
            exposing the path in a ``name`` attribute. A falsy value means
            nothing was uploaded.
        chat_history: List of ``{"role", "content"}`` message dicts; it is
            extended in place. ``None`` is treated as an empty history.

    Returns:
        list: The chat history with the new message(s) appended. Failures are
        reported as an assistant message rather than raised.
    """
    if chat_history is None:
        chat_history = []

    if not files:
        chat_history.append({"role": "assistant", "content": "No file uploaded."})
        return chat_history

    try:
        # Accept both a bare path string and an object carrying the path in `.name`.
        file_path = getattr(files, "name", files)

        # Read the uploaded file
        with open(file_path, "r", encoding="utf-8") as f:
            code_content = f.read()
    except Exception as e:
        chat_history.append({"role": "assistant", "content": f"Error reading file: {str(e)}"})
        return chat_history

    try:
        # generate_response performs its own context retrieval, so no separate
        # retrieval call is needed here.
        result = generate_response(code_content)
    except Exception as e:
        # Keep read failures and analysis failures distinguishable for the user.
        chat_history.append({"role": "assistant", "content": f"Error analyzing file: {str(e)}"})
        return chat_history

    # Append messages in the format expected by a "messages"-type chatbot
    chat_history.append({"role": "user", "content": "📂 File Uploaded"})
    chat_history.append({"role": "assistant", "content": result})
    return chat_history


# Function to handle text input
def handle_text_input(user_input, chat_history):
    """Answer a typed question and append the exchange to the chat history.

    Args:
        user_input: Text typed by the user. ``None``, empty, or
            whitespace-only input is ignored.
        chat_history: List of ``{"role", "content"}`` message dicts; it is
            extended in place. ``None`` is treated as an empty history.

    Returns:
        list: The chat history. For non-empty input it gains the user message
        followed by the assistant reply; if generating the reply fails, the
        assistant message carries the error text instead of raising, matching
        how the file-upload handler reports failures.
    """
    if chat_history is None:
        chat_history = []

    if not user_input or not user_input.strip():
        return chat_history  # Ignore empty inputs

    chat_history.append({"role": "user", "content": user_input})

    try:
        response = generate_response(user_input)
    except Exception as e:
        response = f"Error generating response: {str(e)}"

    chat_history.append({"role": "assistant", "content": response})
    return chat_history


# Gradio Chatbot UI with File Upload Feature
with gr.Blocks() as demo:
    # "messages" type: the history is a list of {"role": ..., "content": ...} dicts,
    # which is the shape both handlers above append to it.
    chatbot = gr.Chatbot(type="messages")

    # First row: free-text question box next to its submit button.
    with gr.Row():
        txt = gr.Textbox(show_label=False, placeholder="Type your question here...", container=False)
        submit_btn = gr.Button("Submit")

    # Second row: upload widget for a source file to review.
    # NOTE(review): with type="filepath" the handler presumably receives a path-like
    # value — confirm against the Gradio File component documentation.
    with gr.Row():
        upload_btn = gr.File(label="📤", type="filepath")

    # Event wiring: each handler receives the current chat history as its second
    # input and its return value replaces the chatbot's contents.
    submit_btn.click(handle_text_input, inputs=[txt, chatbot], outputs=[chatbot])
    upload_btn.change(analyze_uploaded_file, inputs=[upload_btn, chatbot], outputs=[chatbot])

# debug=True is what Gradio's own launch message asks for to show errors in Colab.
demo.launch(debug=True)


  self.chatbot = Chatbot(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5c84c5a66ffead18ec.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


