In [None]:
# Step 1: Install necessary libraries
# NOTE: lines starting with "!" are notebook shell escapes (IPython/Colab);
# they are not valid syntax in a plain Python script.
!pip install langchain langchain-community transformers accelerate tiktoken

# Step 2: Clone the SakilaProject repository
# The clone is created in the current working directory. Step 3 looks for it
# at /content/SakilaProject, so this presumably relies on /content being the
# notebook's working directory (Colab) - confirm when running elsewhere.
!git clone https://github.com/janjakovacevic/SakilaProject

# Step 3: Check and read Java files from the cloned project
import os
import json

# Update the path to reflect where the GitHub repo is cloned
project_path = '/content/SakilaProject'


def collect_java_sources(source_root):
    """Return the concatenated text of every non-blank .java file under *source_root*.

    Each file's content is prefixed with a "# File: <basename>" header so the
    origin of a piece of code stays visible after everything is merged into a
    single string. Files that are empty or whitespace-only are skipped.

    Directories and files are visited in sorted order so that the resulting
    string (and therefore any chunk indices derived from it) is reproducible
    across runs and filesystems. Bytes that are not valid UTF-8 are replaced
    with U+FFFD instead of aborting the whole scan.

    Args:
        source_root: Directory to scan recursively. A path that does not
            exist simply yields an empty string.

    Returns:
        The merged source text, or "" when no Java code was found.
    """
    sections = []
    for root, dirs, files in os.walk(source_root):
        # Sorting in place steers the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".java"):
                continue
            java_path = os.path.join(root, file)
            print(f"Found Java file: {java_path}")
            with open(java_path, 'r', encoding='utf-8', errors='replace') as f:
                content = f.read()
            if content.strip():
                sections.append(f"\n\n# File: {file}\n\n" + content)
    # Join once at the end rather than growing a string inside the loop.
    return "".join(sections)


print("Files detected:")
if not os.path.isdir(project_path):
    # Without this hint a failed clone only shows up as "0 characters read".
    print(f"Warning: {project_path} is not a directory - did the clone step succeed?")

# Check and read all .java files from the project
all_code = collect_java_sources(project_path)

print(f"\nTotal characters of Java code read: {len(all_code)}")

# Step 4: Slice the merged source into consecutive windows of at most
# 1000 characters; only the final window may be shorter.
chunks = []
for start in range(0, len(all_code), 1000):
    stop = start + 1000
    chunks.append(all_code[start:stop])
print(f"Total number of chunks: {len(chunks)}")

# Step 5: Build a local text2text-generation pipeline around
# google/flan-t5-small and expose it through LangChain's LLM wrapper
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline

pipe = pipeline(
    task="text2text-generation",
    model="google/flan-t5-small",
    max_length=512,
)
# `llm` is the object the analysis step calls via .invoke(prompt)
llm = HuggingFacePipeline(pipeline=pipe)

# Step 6: Send every chunk to the model and keep the answers.
# A chunk whose call fails is reported and left out of `results`;
# the remaining chunks are still processed.
results = []
total = len(chunks)

for idx, chunk in enumerate(chunks):
    # Progress is shown 1-based; the stored chunk_index stays 0-based.
    print(f"Analyzing chunk {idx+1}/{total}...")
    prompt = f"""
    Analyze this Java code:
    - High-level purpose
    - Key methods
    - Complexity

    Code:
    {chunk}
    """

    try:
        response = llm.invoke(prompt)
    except Exception as err:
        print(f"Error analyzing chunk {idx}: {err}")
    else:
        results.append({"chunk_index": idx, "analysis": response})

# Step 7: Save the structured output to a JSON file
import pprint

if results:
    with open('/content/sakila_analysis.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=4)
    print("✅ JSON file saved at /content/sakila_analysis.json")

    # Step 8: Optionally, display the JSON content in Colab.
    # The read-back only happens after a successful save: when nothing was
    # written, opening the file would either raise FileNotFoundError or show
    # stale output left over from an earlier run.
    with open('/content/sakila_analysis.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Pretty print the JSON output
    pprint.pprint(data)
else:
    print("❌ No results generated! Something went wrong.")
    # Keep `data` defined so later cells can rely on it being a list.
    data = []
