# Read an analysis report and for each violation, generate a sample patch for an incident

## Workflow
* For each analysis report
    * For each violation
        * Form a prompt 
            * If there are 2 or more incidents, use one of them as the prior/solved example: we look up the latest state of its file in the solved repository and use that as the solved code
            * Use the extra contextual info we have in the prompt
        * Send the prompt to LLM to get a Result
        * Parse Result for:
            * Explanation
            * Code Patch
        * Save the Explanation and Code Patch as separate files
        * Later steps for verification
            * Attempt to apply the code patch to the original file
            * Use TreeSplitter to see if the patched result is parseable
            * If an error shows up, work with LLM to attempt to fix/apply/repeat

In [1]:
#| default_exp result

In [2]:
#| hide
from nbdev.showdoc import *

In [3]:
#| export
from langchain import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain

from kyma_poc.report import Report
from kyma_poc.scm import GitDiff

import os 

class LLMResult:
    """Build LLM prompts from analyzer violations and save the responses.

    The intent of this class is to help us form several prompt examples using
    a single application which we have already migrated.  We pick a few
    violations our analyzer finds, construct one prompt per violation, send it
    to an LLM, and write the full response plus the extracted diff under
    ``./results`` so the quality of the response can be assessed.
    """

    def __init__(self):
        """Initialise an unconfigured instance.

        We expect to have 2 directories that represent the same example
        application:
          * path_original_source is the original state of the application
          * path_solved_source is the solved state of the application
            (after it has been migrated)
        """
        self.path_original_source = None
        self.path_solved_source = None
        self.path_to_report = None
        self.report = None

    def set_path_original_source(self, example_initial_git_path):
        """Record the path of the original (pre-migration) source tree."""
        self.path_original_source = example_initial_git_path

    def set_path_solved_source(self, example_solved_git_path):
        """Record the path of the solved (post-migration) source tree."""
        self.path_solved_source = example_solved_git_path

    def parse_report(self, path_to_report):
        """Load the analysis report at ``path_to_report`` into ``self.report``.

        The path is also remembered in ``self.path_to_report``.
        """
        self.path_to_report = path_to_report
        self.report = Report(path_to_report).get_report()

    def get_prompt_template(self, template_path="./templates/template_01.txt"):
        """Read a template file and wrap it in a ``PromptTemplate``.

        Args:
            template_path: Location of the template text file.  Defaults to
                the template this class has always used.

        Returns:
            The result of ``PromptTemplate.from_template`` on the file text.
        """
        with open(template_path, 'r', encoding="utf-8") as f:
            template = f.read()
        return PromptTemplate.from_template(template)

    def _extract_diff(self, text: str):
        """Return the body of the first fenced diff block found in ``text``.

        The body is everything between the first "```diff" marker and the next
        "```" marker.  When ``text`` is not a string or has no diff block, a
        message is printed and the sentinel string
        "Error: Unable to extract diff" is returned.
        """
        if not isinstance(text, str):
            print(f"Error: expected str, got {type(text).__name__}")
            return "Error: Unable to extract diff"
        # partition() splits on the first marker only, so a response with
        # several diff blocks still yields the first one instead of failing.
        _, fence, after = text.partition("```diff")
        if not fence:
            print("Error: no ```diff block found")
            return "Error: Unable to extract diff"
        return after.split("```")[0]

    def create_prompt(self, description, incidents, template):
        """Print how many incidents a violation has, with its description.

        Placeholder: no prompt is built or returned yet, and ``template`` is
        accepted but currently unused.
        """
        print(f"{len(incidents)} incidents:  {description}\n")

    def _update_uri(self, uri):
        """Strip the "file:///opt/input/source/" prefix text from ``uri``."""
        return uri.replace("file:///opt/input/source/", "")

    def _ensure_output_dir_exists(self, output_dir):
        """Create ``output_dir`` (and parents) if needed.

        Raises:
            OSError: re-raised after printing when the directory cannot be
                created.
        """
        try:
            os.makedirs(output_dir, exist_ok=True)
        except OSError as error:
            print(f"Error creating directory {output_dir}: {error}")
            raise error

    def _write_output(self, filename, content):
        """Write ``content`` to ``filename``, replacing any previous content."""
        # Mode 'w' already truncates, so each run starts with a clean file.
        with open(filename, 'w', encoding="utf-8") as f:
            f.write(content)

    def _read_example_file(self, repo_path, filename):
        """Return the contents of ``filename`` from the repo at ``repo_path``.

        Best effort: any failure is printed and an empty string is returned so
        that a missing example does not abort the whole run.
        """
        try:
            return GitDiff(repo_path).get_file_contents(filename)
        except Exception as e:  # GitDiff's failure modes are not visible here
            print(f"Error: {e}")
            return ""

    def _build_template_args(self, items):
        """Assemble the prompt variables for one violation.

        The first incident is the issue to solve.  When a second incident
        exists, its file is read from both the original and the solved source
        trees and supplied as a worked example; otherwise the example fields
        are empty strings.
        """
        incidents = items['incidents']
        current = incidents[0]
        template_args = {
            "description": items['description'],
            "current_issue_filename": self._update_uri(current['uri']),
            "current_issue_message": current.get('message', None),
            "current_issue_original_code": current.get('codeSnip', None),
            "example_original_code": "",
            "example_updated_code": "",
            "example_updated_filename": "",
            "example_original_filename": "",
        }
        if len(incidents) > 1:
            example_filename = self._update_uri(incidents[1]['uri'])
            template_args["example_original_filename"] = example_filename
            template_args["example_updated_filename"] = example_filename
            template_args["example_original_code"] = self._read_example_file(
                self.path_original_source, example_filename)
            template_args["example_updated_code"] = self._read_example_file(
                self.path_solved_source, example_filename)
        return template_args

    def process(self, model_name="", limit_to_rulesets=None, limit_to_violations=None):
        """Send one prompt per violation to the LLM and save the results.

        For each selected violation two files are written under
        ``./results/<model_name>/``: ``<ruleset>_<violation>_<n>_full_run.md``
        holding the prompt and the raw response, and
        ``<ruleset>_<violation>_<n>.diff`` holding the extracted diff.

        Note this is only a POC, so we intentionally form a single prompt per
        violation and ignore any further incidents that would need solving.

        Args:
            model_name: Model name passed to ``ChatOpenAI``; also used as the
                results sub-directory name.
            limit_to_rulesets: If not None, only rulesets named here are
                processed.
            limit_to_violations: If not None, only violations named here are
                processed.

        Raises:
            ValueError: if no report has been parsed or either source path
                has not been set.
        """
        if self.report is None:
            raise ValueError("No report to process.  Please parse a report first")
        if self.path_original_source is None:
            raise ValueError("No 'path_original_source'.  Please use set_path_original_source()")
        if self.path_solved_source is None:
            raise ValueError("No 'path_solved_source'.  Please use set_path_solved_source()")

        # Create result directory
        self._ensure_output_dir_exists("./results")
        model_dir = f"./results/{model_name}"

        # The template and chain are identical for every violation, so build
        # them once -- but lazily, so a run in which everything is filtered
        # out never touches the template file or the LLM client.
        prompt = None
        chain = None

        for ruleset_name, ruleset in self.report.items():
            if limit_to_rulesets is not None and ruleset_name not in limit_to_rulesets:
                print(f"Skipping {ruleset_name} as it is not in {limit_to_rulesets}")
                continue
            ruleset_name_display = ruleset_name.replace('/', '_')
            print(f"Processing {ruleset_name} {ruleset_name_display}")

            # `count` is the position among ALL violations of the ruleset
            # (skipped ones included); it is part of the output file names.
            for count, (key, items) in enumerate(ruleset['violations'].items()):
                if limit_to_violations is not None and key not in limit_to_violations:
                    print(f"Skipping {key} as it is not in {limit_to_violations}")
                    continue

                if not items['incidents']:
                    # No incidents so skip this iteration
                    continue

                if chain is None:
                    prompt = self.get_prompt_template()
                    llm = ChatOpenAI(temperature=0.1, model_name=model_name)
                    chain = LLMChain(llm=llm, prompt=prompt)

                template_args = self._build_template_args(items)
                formatted_prompt = prompt.format(**template_args)

                result = chain.run(template_args)
                result_diff = self._extract_diff(result)

                self._ensure_output_dir_exists(model_dir)
                output_stem = f"{model_dir}/{ruleset_name_display}_{key}_{count}"
                full_run = (
                    "## Prompt:\n"
                    f"{formatted_prompt}\n"
                    "\n\n## Result:\n"
                    f"{result}\n\n"
                )
                self._write_output(f"{output_stem}_full_run.md", full_run)
                self._write_output(f"{output_stem}.diff", result_diff)

        print(f"Process complete")


In [4]:
# Inputs for this run: the analysis report to read, plus the two checkouts of
# the example application (original state and solved state).
path_to_report = '../data/example_reports/coolstuff-javaee/output.yaml'
example_initial_git_path = "../data/coolstuff-javaee"
example_solved_git_path = "../data/coolstuff-quarkus"
# NOTE: output_dir is defined here but not passed to any call in this cell.
output_dir = './example/reports/coolstuff-javaee'

# Model to query; swap in the commented alternative to compare results.
model_name = "gpt-3.5-turbo-16k"
#model_name="gpt-4-1106-preview"

# Restrict the run to one ruleset and two of its violations.
limit_to_rulesets = ['quarkus/springboot']
limit_to_violations = ['cdi-to-quarkus-00040', 'cdi-to-quarkus-00050']

llmResult = LLMResult()
llmResult.set_path_solved_source(example_solved_git_path)
llmResult.set_path_original_source(example_initial_git_path)
llmResult.parse_report(path_to_report)
llmResult.process(
    model_name=model_name,
    limit_to_rulesets=limit_to_rulesets,
    limit_to_violations=limit_to_violations,
)


Skipping eap7/websphere as it is not in ['quarkus/springboot']
Skipping eap8/eap7 as it is not in ['quarkus/springboot']
Skipping openshift as it is not in ['quarkus/springboot']
Processing quarkus/springboot quarkus_springboot
Skipping cdi-to-quarkus-00030 as it is not in ['cdi-to-quarkus-00040', 'cdi-to-quarkus-00050']
Skipping javaee-pom-to-quarkus-00000 as it is not in ['cdi-to-quarkus-00040', 'cdi-to-quarkus-00050']
Skipping javaee-pom-to-quarkus-00010 as it is not in ['cdi-to-quarkus-00040', 'cdi-to-quarkus-00050']
Skipping javaee-pom-to-quarkus-00020 as it is not in ['cdi-to-quarkus-00040', 'cdi-to-quarkus-00050']
Skipping javaee-pom-to-quarkus-00030 as it is not in ['cdi-to-quarkus-00040', 'cdi-to-quarkus-00050']
Skipping javaee-pom-to-quarkus-00040 as it is not in ['cdi-to-quarkus-00040', 'cdi-to-quarkus-00050']
Skipping javaee-pom-to-quarkus-00050 as it is not in ['cdi-to-quarkus-00040', 'cdi-to-quarkus-00050']
Skipping javaee-pom-to-quarkus-00060 as it is not in ['cdi-to-qua