In [None]:
# Step 1: Install necessary libraries
!pip install google-generativeai langchain-community transformers accelerate tiktoken faiss-cpu

import os
import json
import re
import faiss
import numpy as np
import google.generativeai as genai
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import git

# Step 1: Clone the SakilaProject GitHub repository
repo_url = "https://github.com/janjakovacevic/SakilaProject.git"
repo_dir = '/content/SakilaProject'

# An existing checkout is reused untouched; cloning only happens when the
# target directory is absent.
if os.path.exists(repo_dir):
    print("SakilaProject repository already exists!")
else:
    print("Cloning SakilaProject repository...")
    git.Repo.clone_from(repo_url, repo_dir)

# Step 2: Setup Gemini API
# The API key is read from the environment so that no credential is stored in
# the notebook. NOTE: any key that was previously committed in this file must
# be treated as compromised and revoked.
gemini_api_key = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
if not gemini_api_key:
    raise RuntimeError(
        "Gemini API key not found: set the GOOGLE_API_KEY (or GEMINI_API_KEY) "
        "environment variable before running this step."
    )
genai.configure(api_key=gemini_api_key)
model = genai.GenerativeModel('gemini-1.5-flash')

# Step 3: Read Java files from the cloned repository
print("Files detected:")

# Sections are collected in a list and joined once at the end; repeated `+=`
# on a growing string re-copies everything read so far for every file.
code_sections = []
for root, dirs, files in os.walk(repo_dir):
    # os.walk yields entries in arbitrary order; sorting in place makes the
    # traversal (and therefore the chunk numbering built from it) reproducible.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith(".java"):
            continue
        file_path = os.path.join(root, file)
        print(f"Found Java file: {file_path}")
        # errors='replace' keeps a single file with stray non-UTF-8 bytes from
        # aborting the whole scan with UnicodeDecodeError.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()
        # Whitespace-only files contribute nothing to the corpus.
        if content.strip():
            code_sections.append(f"\n\n// File: {file}\n\n" + content)

all_code = "".join(code_sections)

print(f"\nTotal characters of Java code read: {len(all_code)}")

# Step 4: Chunk the code into smaller pieces
# Fixed-width character windows with no overlap; the final chunk may be
# shorter than chunk_size.
chunk_size = 800
chunk_starts = range(0, len(all_code), chunk_size)
chunks = [all_code[start:start + chunk_size] for start in chunk_starts]
print(f"Total number of chunks: {len(chunks)}")

# Step 5: Load local HuggingFace model and embedding model
# GPT-2 text-generation pipeline, wrapped in LangChain's HuggingFacePipeline.
pipe = pipeline(
    task="text-generation",
    model="gpt2",
    max_length=1024,
)
llm = HuggingFacePipeline(pipeline=pipe)

# Sentence-embedding model used to vectorise both code chunks and queries.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Step 6: Generate embeddings
if not chunks:
    # Fail early with an actionable message instead of an opaque NumPy error
    # when there is nothing to stack.
    raise ValueError("No code chunks to embed; check that the repository contains .java files.")

# All chunks go through a single embed_documents call so the embedding backend
# can batch them, rather than paying per-call overhead once per chunk.
print(f"Generating embeddings for {len(chunks)} chunks...")
all_embeddings = embedding_model.embed_documents(chunks)

# FAISS operates on C-contiguous float32 matrices with one row per vector;
# converting here avoids handing it the float64 array NumPy builds by default
# from Python floats.
all_embeddings_np = np.ascontiguousarray(all_embeddings, dtype=np.float32)
embedding_dim = all_embeddings_np.shape[-1]  # This is the dimension of the embeddings

# Step 7: Create and train a FAISS index

num_vectors = all_embeddings_np.shape[0]

# Number of Voronoi cells (clusters).
# IVF training runs k-means with `nlist` centroids: it cannot train with fewer
# points than centroids and FAISS warns below 39 training points per centroid.
# The previous fixed value of 100 is kept as the upper bound and scaled down
# for small corpora so training always succeeds.
nlist = max(1, min(100, num_vectors // 39))

# Initialize the quantizer index (IndexFlatL2) that assigns vectors to cells
quantizer = faiss.IndexFlatL2(embedding_dim)

# Initialize the FAISS index with the quantizer
faiss_index = faiss.IndexIVFFlat(quantizer, embedding_dim, nlist, faiss.METRIC_L2)

# Train the FAISS index with the embeddings (learns the cell centroids)
faiss_index.train(all_embeddings_np)

# Add the embeddings to the FAISS index; row i of the matrix gets id i, which
# is the position of the corresponding entry in `chunks`
faiss_index.add(all_embeddings_np)

# Step 8: Save FAISS index
# The destination is named once so the write and the confirmation message
# cannot drift apart.
index_path = '/content/sakila_faiss.index'
faiss.write_index(faiss_index, index_path)
print("✅ FAISS index saved at " + index_path)

# Step 9: Query function
def query_faiss_index(query_text, top_k=3):
    """Return the code chunks whose embeddings are nearest to ``query_text``.

    Relies on the module-level ``embedding_model``, ``faiss_index`` and
    ``chunks`` built in the earlier steps.

    Args:
        query_text: Natural-language query to embed and search for.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` chunk strings, best match first. The list
        is shorter when the index yields fewer neighbours, and empty when
        ``top_k`` is less than 1.
    """
    if top_k < 1:
        return []

    query_embedding = embedding_model.embed_documents([query_text])
    # FAISS expects a float32 matrix with one row per query vector.
    query_embedding_np = np.asarray(query_embedding[0], dtype=np.float32).reshape(1, -1)
    _, indices = faiss_index.search(query_embedding_np, top_k)

    # FAISS pads the result with -1 when it finds fewer than top_k neighbours;
    # without the filter, chunks[-1] would silently return the last chunk.
    top_chunks = [chunks[int(idx)] for idx in indices[0] if idx >= 0]
    return top_chunks

# Example query: retrieve and preview the best-matching chunks
query = "How to create a new user in the database?"
top_chunks = query_faiss_index(query)
print("\nTop matching code chunks:")
for rank, snippet in enumerate(top_chunks, start=1):
    preview = snippet[:500]  # show first 500 chars
    print(f"\n--- Chunk {rank} ---\n{preview}...")

# Step 10: Analyze each chunk using Gemini model
# Accumulators filled chunk by chunk during the analysis below: overview
# sentences, function entries, and the complexity scores with their reasons.
overview_texts, functions_list = [], []
complexity_scores, complexity_descriptions = [], []

# Helper to extract JSON from Gemini output
def extract_json(text):
    """Return the JSON object embedded in ``text``, or ``None`` on failure.

    Parsing starts at the first ``{`` and stops at the end of that JSON
    object, so surrounding material such as Markdown code fences or trailing
    commentary (even commentary that itself contains braces) is ignored.

    Args:
        text: Raw model output expected to contain one JSON object.

    Returns:
        The decoded ``dict``, or ``None`` when ``text`` is not a string,
        contains no ``{``, or the object starting at the first ``{`` is not
        valid JSON. On failure the reason and the raw output are printed.
    """
    if not isinstance(text, str):
        failure = f"expected str, got {type(text).__name__}"
    else:
        start = text.find("{")
        if start == -1:
            failure = "no JSON object found"
        else:
            try:
                # raw_decode parses exactly one JSON value and tolerates any
                # text after it, unlike json.loads on a greedy brace match.
                parsed, _ = json.JSONDecoder().raw_decode(text, start)
            except json.JSONDecodeError as e:
                failure = e
            else:
                return parsed

    print(f"❌ Failed to extract JSON: {failure}")
    print("Model output was:\n", text)
    return None

def _parse_complexity_score(raw_score):
    """Convert a model-reported score ("7/10", "7.5", 7, 7.0) to an int, or None if unusable."""
    if isinstance(raw_score, bool):
        return None
    if isinstance(raw_score, str):
        # Keep only the numerator of "N/10"-style strings.
        raw_score = raw_score.split("/")[0].strip()
    elif not isinstance(raw_score, (int, float)):
        return None
    try:
        # Going through float accepts "7.5"; int() then truncates toward zero.
        return int(float(raw_score))
    except (ValueError, OverflowError):
        return None


# Send every chunk to Gemini and fold the structured answer into the
# accumulator lists. A failure on one chunk is reported and skipped so the
# remaining chunks are still analyzed.
for idx, chunk in enumerate(chunks):
    print(f"Analyzing chunk {idx + 1}/{len(chunks)}...")

    prompt = f"""
You are an expert codebase analyzer.
Analyze the following Java code and extract:
- A high-level overview (2-3 sentences)
- List of key functions (function signature + description)
- Code complexity (score out of 10 + reason)
Return ONLY a JSON like this:
{{
  "project overview": "#####",
  "functions": [
    {{"function name": "#####", "description": "#####"}},
    ...
  ],
  "complexity": {{"score": "#/10", "description": "#####"}}
}}

Code:
{chunk}
"""

    # Only the remote call sits inside the try: it is the step that can fail
    # for reasons outside this script's control, so a broad catch is
    # deliberate here.
    try:
        response = model.generate_content(prompt)
        output = response.text.strip()
    except Exception as e:
        # 1-based index, matching the progress message above.
        print(f"Error analyzing chunk {idx + 1}: {e}")
        continue

    parsed = extract_json(output)
    if not parsed:
        continue

    # Coerce to str so the later " ".join over the overviews cannot fail on a
    # non-string value returned by the model.
    overview = parsed.get("project overview", "")
    overview_texts.append(overview if isinstance(overview, str) else str(overview))

    functions = parsed.get("functions", [])
    if isinstance(functions, list):
        functions_list.extend(functions)

    complexity = parsed.get("complexity", {})
    if isinstance(complexity, dict) and complexity:
        complexity_description = complexity.get("description", "Not provided")
        if not isinstance(complexity_description, str):
            complexity_description = str(complexity_description)
        complexity_descriptions.append(complexity_description)

        # Only numeric scores are recorded, so the average computed afterwards
        # never has to sum a string.
        complexity_score = _parse_complexity_score(complexity.get("score", "5/10"))
        if complexity_score is None:
            print(f"Skipping unparseable complexity score for chunk {idx + 1}: {complexity.get('score')!r}")
        else:
            complexity_scores.append(complexity_score)

# Step 11: Merge results properly
# The divisor is clamped to 1 so an empty score list yields 0 rather than a
# ZeroDivisionError.
score_count = max(len(complexity_scores), 1)
average_score = round(sum(complexity_scores) / score_count)

# Per-chunk texts are concatenated with single spaces.
merged_overview = " ".join(overview_texts)
merged_complexity_notes = " ".join(complexity_descriptions)

final_json = {
    "project overview": merged_overview,
    "functions": functions_list,
    "complexity": {
        "score": f"{average_score}/10",
        "description": merged_complexity_notes,
    },
}

# Step 12: Save final JSON
output_path = '/content/sakila_final_analysis2.json'

# `final_json` is always a non-empty dict, so its truthiness cannot signal
# "nothing was analyzed"; the payload fields are inspected instead.
has_analysis = bool(
    final_json["project overview"]
    or final_json["functions"]
    or final_json["complexity"]["description"]
)

if has_analysis:
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(final_json, f, indent=4)
    print(f"✅ Final structured JSON saved at {output_path}")
else:
    print("❌ No final analysis generated!")

# Step 13: Display the final result
import pprint

# Read the file back only when this run wrote it; otherwise the path may be
# missing or hold stale output from an earlier run.
if has_analysis:
    with open(output_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print("\n📄 Final Project Analysis JSON:")
    pprint.pprint(data)
