Java Parser

In [40]:
# One-time setup commands, kept commented out so re-running the notebook does not
# reinstall or rebuild anything.
# !pip install javalang
# Compiles a parser source (src/parser.c) into the shared library kotlin.so. It needs
# gcc on PATH; the output recorded for this cell shows gcc was not found on the
# machine that last ran it.
# !gcc -shared -o kotlin.so -fPIC src/parser.c
# Root folder of the project to analyze. "Project Address" looks like a placeholder:
# set it to a real directory before running the cells below.
project_path = "Project Address"

'gcc' is not recognized as an internal or external command,
operable program or batch file.


In [8]:
import os
import javalang

class JavaCodeAnalyzer:
    """Locate Java/Kotlin sources under a project folder and extract their structure.

    Args:
        project_path: Root folder that is walked recursively.
        verbose: When True (the default) scan progress is printed.

    Attributes:
        project_path: The root folder given to the constructor.
        verbose: Whether progress messages are printed.
        files: Paths of the source files found by ``get_source_files()``.
    """

    # Directory *names* pruned from the walk. Names are compared exactly, so a
    # folder such as "latest" is not mistaken for "test".
    IGNORED_DIRS = frozenset({"target", ".mvn", "test", ".git", ".idea"})
    # File suffixes that count as source files.
    SOURCE_EXTENSIONS = (".java", ".kt")
    # Spring MVC annotations that mark a method as an HTTP endpoint.
    ENDPOINT_ANNOTATIONS = frozenset({
        "GetMapping",
        "PostMapping",
        "PutMapping",
        "DeleteMapping",
        "PatchMapping",
        "RequestMapping",
    })

    def __init__(self, project_path, verbose=True):
        self.project_path = project_path
        self.verbose = verbose
        self.files = self.get_source_files()

    def _log(self, message):
        """Print ``message`` unless the analyzer was created with ``verbose=False``."""
        if getattr(self, "verbose", True):
            print(message)

    def get_source_files(self):
        """Find all Java and Kotlin files in the project.

        Folders whose name is listed in ``IGNORED_DIRS`` are skipped together
        with everything below them. Only names *inside* the project are
        checked, so the location of the project itself never hides its files.

        Returns:
            list[str]: Source file paths in a deterministic (sorted) walk order.
        """
        source_files = []

        for root, dirs, files in os.walk(self.project_path):
            self._log(f"Checking folder: {root}")

            kept_dirs = []
            for name in dirs:
                if name in self.IGNORED_DIRS:
                    self._log(f"Skipping: {os.path.join(root, name)}")
                else:
                    kept_dirs.append(name)
            # Assigning in place tells os.walk not to descend into pruned folders.
            dirs[:] = sorted(kept_dirs)

            for file in sorted(files):
                if file.endswith(self.SOURCE_EXTENSIONS):
                    source_files.append(os.path.join(root, file))

        self._log(f"Found {len(source_files)} source files:")
        for file in source_files:
            self._log(file)

        return source_files

    @classmethod
    def _literal_strings(cls, value):
        """Flatten one annotation value into a list of plain strings."""
        nested = getattr(value, "values", None)  # array form: {"/a", "/b"}
        if nested is not None:
            flattened = []
            for item in nested:
                flattened.extend(cls._literal_strings(item))
            return flattened

        text = getattr(value, "value", None)  # literal: the token keeps its quotes
        if isinstance(text, str):
            return [text.strip('"')]

        member = getattr(value, "member", None)  # reference to a constant
        if isinstance(member, str):
            qualifier = getattr(value, "qualifier", None)
            return [f"{qualifier}.{member}" if qualifier else member]

        return [str(value)]

    @classmethod
    def _endpoint_paths(cls, annotation):
        """Return the path strings declared by a mapping annotation.

        An annotation without an explicit path is reported as ``"@<Name>"`` so
        the endpoint is still counted.
        """
        element = annotation.element
        if element is None:
            values = []
        elif isinstance(element, (list, tuple)):
            # Named arguments: only ``value``/``path`` carry the URL.
            values = [
                pair.value
                for pair in element
                if getattr(pair, "name", None) in ("value", "path")
            ]
        else:
            values = [element]

        paths = []
        for value in values:
            paths.extend(cls._literal_strings(value))
        return paths or [f"@{annotation.name}"]

    def extract_code_structure(self, file_path):
        """Extracts classes, methods, and API endpoints from Java code.

        Args:
            file_path: Path of a UTF-8 encoded Java source file.

        Returns:
            dict: ``{"file", "classes", "methods", "api_endpoints"}`` where the
            last three are lists of strings.

        Raises:
            javalang.parser.JavaSyntaxError, javalang.tokenizer.LexerError:
                If the file is not valid Java.
        """
        with open(file_path, "r", encoding="utf-8") as f:
            code = f.read()

        tree = javalang.parse.parse(code)
        classes = [node.name for _, node in tree.filter(javalang.tree.ClassDeclaration)]
        methods = []
        api_endpoints = []

        for _, node in tree.filter(javalang.tree.MethodDeclaration):
            methods.append(f"{node.name}()")

            # Detect API endpoints in Spring Boot applications
            for annotation in node.annotations or []:
                if annotation.name in self.ENDPOINT_ANNOTATIONS:
                    api_endpoints.extend(self._endpoint_paths(annotation))

        return {
            "file": file_path,
            "classes": classes,
            "methods": methods,
            "api_endpoints": api_endpoints
        }

    def analyze_project(self):
        """Run ``extract_code_structure`` on every collected file.

        Files that cannot be read as UTF-8 or parsed by javalang (a Java
        parser, so this includes Kotlin sources) do not abort the run: they
        are reported with empty lists plus an ``"error"`` entry.

        Returns:
            list[dict]: One result per file, in the order of ``self.files``.
        """
        analysis_results = []
        for file in self.files:
            try:
                analysis_results.append(self.extract_code_structure(file))
            except (
                javalang.parser.JavaSyntaxError,
                javalang.tokenizer.LexerError,
                UnicodeDecodeError,
            ) as exc:
                # JavaSyntaxError keeps its text in ``description``, not in str(exc).
                detail = getattr(exc, "description", None) or str(exc)
                message = f"{type(exc).__name__}: {detail}"
                self._log(f"Error processing {file}: {message}")
                analysis_results.append({
                    "file": file,
                    "classes": [],
                    "methods": [],
                    "api_endpoints": [],
                    "error": message,
                })
        return analysis_results

In [9]:
from tree_sitter import Language, Parser

# Load Kotlin grammar
# KOTLIN_LANGUAGE = Language('tree-sitter-kotlin/kotlin.so', 'kotlin')

def extract_code_structure(file_path):
    """Extracts classes and functions from a Kotlin file.

    Requires a module-level ``KOTLIN_LANGUAGE`` (the statement that would load
    it is commented out above this function) and the tree-sitter ``Parser``.

    Args:
        file_path: Path of a UTF-8 encoded Kotlin source file.

    Returns:
        dict: ``{"file", "classes", "functions"}`` on success, with names in
        source (pre-order) order, or ``{"file", "error"}`` when anything
        fails. Failures are also printed rather than raised.
    """
    # Child node types that may hold a declaration's identifier when the grammar
    # does not expose it under a "name" field.
    # NOTE(review): confirm these type names against the Kotlin grammar in use.
    identifier_types = ("simple_identifier", "type_identifier", "identifier")

    def declared_name(node):
        """Return the declaration's name, or None when no identifier is found."""
        name_node = node.child_by_field_name("name")
        if name_node is None:
            name_node = next(
                (child for child in node.children if child.type in identifier_types),
                None,
            )
        if name_node is None or name_node.text is None:
            return None
        return name_node.text.decode("utf8")

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            code = f.read()

        parser = Parser()
        parser.set_language(KOTLIN_LANGUAGE)
        tree = parser.parse(code.encode("utf8"))

        classes = []
        functions = []

        # Explicit stack instead of recursion, so a deeply nested file cannot hit
        # Python's recursion limit. Children are pushed reversed to keep the
        # original left-to-right pre-order.
        pending = [tree.root_node]
        while pending:
            node = pending.pop()

            if node.type == "class_declaration":
                bucket = classes
            elif node.type == "function_declaration":
                bucket = functions
            else:
                bucket = None

            if bucket is not None:
                name = declared_name(node)
                # A declaration without a resolvable name is skipped instead of
                # failing the whole file.
                if name is not None:
                    bucket.append(name)

            pending.extend(reversed(node.children))

        return {"file": file_path, "classes": classes, "functions": functions}

    except Exception as e:
        # Deliberately best-effort: one unreadable file is reported, not raised.
        print(f"Error processing {file_path}: {e}")
        return {"file": file_path, "error": str(e)}


In [12]:
from langchain_openai import ChatOpenAI

class CodeSummarizer:
    """Turns extracted code structure into natural-language summaries via a chat model.

    Attributes:
        llm: The ``ChatOpenAI`` model (gpt-3.5-turbo, temperature 0) used for
            every request.
    """

    def __init__(self):
        self.llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

    @staticmethod
    def _joined(code_structure, *keys):
        """Comma-join the first non-empty list found under ``keys``.

        Missing keys yield an empty string, ``None`` items are dropped and
        every other item is converted with ``str`` so the join cannot raise.
        """
        for key in keys:
            items = code_structure.get(key)
            if items:
                return ', '.join(str(item) for item in items if item is not None)
        return ''

    def _complete(self, prompt):
        """Send ``prompt`` to the model and return the reply text as ``str``."""
        reply = self.llm.invoke(prompt)
        # Chat models answer with a message object; the text is in ``content``.
        content = getattr(reply, "content", reply)
        return content if isinstance(content, str) else str(content)

    def summarize_code(self, code_structure):
        """Generate a summary explaining the purpose of a class or method.

        Args:
            code_structure: Dict describing one file. ``classes``,
                ``methods`` (or ``functions``, as produced for Kotlin files)
                and ``api_endpoints`` are read; absent keys are treated as
                empty lists.

        Returns:
            str: The model's overview of the file.
        """
        classes = self._joined(code_structure, 'classes')
        methods = self._joined(code_structure, 'methods', 'functions')
        api_endpoints = self._joined(code_structure, 'api_endpoints')
        prompt = f"""
        You are analyzing a Java project. Based on the following code structure, explain its purpose in simple terms:
        - Classes: {classes}
        - Methods: {methods}
        - API Endpoints: {api_endpoints}

        Give a high-level overview of what this file is doing.
        """
        return self._complete(prompt)

    def summarize_whole_project(self, project_structure):
        """Generate a high-level summary of the project.

        Args:
            project_structure: Any object; its ``str()`` form is embedded in
                the prompt as-is, so very large structures produce very large
                prompts.

        Returns:
            str: The model's summary of the project.
        """
        prompt = f"""
        Given the following extracted structure from a Java project, summarize its purpose:

        {project_structure}

        Try to infer if it's a web application, CLI tool, or something else.
        """
        return self._complete(prompt)


In [13]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

class ProjectKnowledgeBase:
    """Vector store of code and project summaries, searchable by similarity.

    Attributes:
        vector_db: A ``Chroma`` collection named ``"code_insights"`` that
            embeds texts with ``OpenAIEmbeddings``. No persist directory is
            passed here.
    """

    def __init__(self):
        self.vector_db = Chroma(collection_name="code_insights", embedding_function=OpenAIEmbeddings())

    def store_code_summary(self, file_summary, file_path):
        """Store one file's summary, tagged with its path and ``type="code_summary"``."""
        self.vector_db.add_texts([file_summary], metadatas=[{"source": file_path, "type": "code_summary"}])

    def store_project_summary(self, project_summary):
        """Store the project-level summary, tagged with ``type="project_summary"``."""
        self.vector_db.add_texts([project_summary], metadatas=[{"type": "project_summary"}])

    def query_summary(self, query, k=2):
        """Return the stored entries most similar to ``query``.

        Args:
            query: Free-text search string.
            k: Maximum number of entries to return (defaults to 2).

        Returns:
            Whatever ``vector_db.similarity_search`` returns for the query.
        """
        return self.vector_db.similarity_search(query, k=k)


In [15]:
from langchain.chains import RetrievalQA

class CodeQueryEngine:
    """Answers free-text questions using the summaries held in a knowledge base."""

    def __init__(self, knowledge_base):
        """Build a RetrievalQA chain on top of ``knowledge_base.vector_db``."""
        chat_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
        summary_retriever = knowledge_base.vector_db.as_retriever()
        self.qa_system = RetrievalQA.from_chain_type(
            llm=chat_model,
            retriever=summary_retriever,
        )

    def ask(self, query):
        """Run ``query`` through the QA chain and return its answer."""
        answer = self.qa_system.run(query)
        return answer


In [None]:
# execute project
# Pipeline: extract structure -> summarize each file -> summarize the project ->
# answer a question from the stored summaries. Needs project_path and the classes
# defined in the cells above, plus OpenAI credentials for the LLM/embedding calls.
if __name__ == "__main__":

    # Step 1: Extract Code Structure
    analyzer = JavaCodeAnalyzer(project_path)
    code_structure = analyzer.analyze_project()

    if not code_structure:
        # os.walk() silently yields nothing for a missing or empty folder, so stop
        # here instead of paying for LLM calls over an empty knowledge base.
        print(f"No source files were found under {project_path!r}; nothing to summarize.")
    else:
        # Step 2: Summarize Each File
        summarizer = CodeSummarizer()
        kb = ProjectKnowledgeBase()

        for file_data in code_structure:
            # Entries flagged with an "error" key carry no structure worth summarizing.
            if file_data.get("error"):
                continue

            summary = summarizer.summarize_code(file_data)

            if not isinstance(summary, str):
                summary = str(summary)  # Convert to string

            kb.store_code_summary(summary, file_data['file'])

        # Step 3: Generate a High-Level Summary
        project_summary = summarizer.summarize_whole_project(code_structure)
        if not isinstance(project_summary, str):
            project_summary = str(project_summary)  # The vector store expects text
        kb.store_project_summary(project_summary)

        # Step 4: Ask Questions
        query_engine = CodeQueryEngine(kb)
        print(query_engine.ask("What is the main goal of this project?"))
