In [12]:
import requests
import dotenv
import pandas as pd

from typing import List

from prettytable import PrettyTable
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel, Field

In [3]:
# Load variables from a local .env file into the process environment before any
# client is created (presumably the OpenAI API key used by ChatOpenAI — TODO confirm).
dotenv.load_dotenv()

True

In [4]:
# Base URL of the backend service that the request helpers in this notebook call.
DEFAULT_BACKEND_URL = "http://localhost:8000"
# Path of the threads endpoint, appended to the base URL; message URLs are
# built underneath it as "<thread_id>/messages/".
THREADS_API = "/api/v1/threads/"

In [35]:
# Notebook-wide pandas display settings so long text cells are shown in full.
pd.options.display.max_colwidth = None  # never truncate the text inside a cell
pd.options.display.max_columns = None  # render every column
pd.options.display.expand_frame_repr = False  # keep wide frames on one line, no wrapping

# Utils

In [6]:
def create_thread() -> str:
    """Create a new thread on the backend and return its id.

    Sends a body-less POST to the threads endpoint and reads the ``"id"``
    field of the JSON response.

    Returns:
        The value stored under ``"id"`` in the response body.

    Raises:
        requests.HTTPError: If the backend answers with a 4xx/5xx status.
    """
    url = DEFAULT_BACKEND_URL + THREADS_API
    response = requests.post(url)
    # Fail loudly on an error status instead of hitting an opaque KeyError
    # (or JSON decode error) while reading the error body below.
    response.raise_for_status()
    return response.json()["id"]


def create_message(thread_id: str, message: dict) -> dict:
    """Post a message to an existing thread and return the backend's JSON reply.

    Args:
        thread_id: Id of the thread, as returned by ``create_thread``.
        message: JSON-serialisable payload sent as the request body.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the backend answers with a 4xx/5xx status.
    """
    url = DEFAULT_BACKEND_URL + THREADS_API + f"{thread_id}/messages/"
    response = requests.post(url, json=message)
    # Without this check an error payload would be returned to the caller
    # as if it were the created message.
    response.raise_for_status()
    return response.json()

In [7]:
# Verdict for a single evaluation aspect: a pass/fail flag plus explanatory messages.
# Both fields are required (a bare annotation has no default).
class EvaluationCategory(BaseModel):
    value: bool  # True when the aspect passes
    details: List[str]  # messages explaining the verdict


# Full evaluation result: one required EvaluationCategory per aspect named in
# the evaluation prompt's JSON format.
class CodeEvaluation(BaseModel):
    code_correctness: EvaluationCategory
    edge_case_handling: EvaluationCategory
    readability: EvaluationCategory
    context_correctness: EvaluationCategory


# Output parser used as the last step of the evaluation chain, configured with
# CodeEvaluation as the expected schema. Note that downstream code consumes the
# parsed result as a mapping (``.items()`` / ``["value"]``), not as a
# CodeEvaluation instance.
parser = JsonOutputParser(pydantic_object=CodeEvaluation)

In [29]:
def print_evaluation_results(evaluation: "CodeEvaluation | dict") -> pd.DataFrame:
    """Turn an evaluation result into a three-column summary table.

    Despite its name, this function prints nothing; it returns the table so the
    notebook can render it.

    Args:
        evaluation: Either a mapping of category name to
            ``{"value": bool, "details": [str, ...]}`` (the shape the
            evaluation chain yields) or a pydantic model with the same fields,
            such as ``CodeEvaluation``.

    Returns:
        A DataFrame with the columns ``Category``, ``Passed`` and ``Details``,
        one row per category in input order.
    """
    if not isinstance(evaluation, dict):
        # Pydantic models have no ``.items()``; convert them to a plain dict
        # first (``model_dump`` in pydantic v2, ``dict`` in v1).
        dump = getattr(evaluation, "model_dump", None) or evaluation.dict
        evaluation = dump()

    rows = []
    for category_name, category in evaluation.items():
        details = category["details"]
        if isinstance(details, str):
            # Guard against a lone message given as a plain string: joining a
            # str would put every character on its own line.
            details = [details]
        rows.append([
            category_name.replace("_", " ").title(),
            "✅" if category["value"] else "❌",
            "\n".join(details),
        ])

    return pd.DataFrame(rows, columns=["Category", "Passed", "Details"])

# Eval

In [27]:
def evaluate(
    input: str,
    answer: str,
    context_prompt: str,
    type: str,
    model: str = "gpt-4",
    verbose: bool = True,
) -> pd.DataFrame:
    """Ask an LLM to review generated C# code and return the verdict as a table.

    The prompt asks for four verdicts (correctness, edge-case handling,
    readability, correct use of context) as JSON; the parsed answer is
    converted to a DataFrame by ``print_evaluation_results``.

    Args:
        input: The user request the code was generated for. (The name shadows
            the builtin but is kept because callers pass it by keyword.)
        answer: The generated code under review.
        context_prompt: Platform context shown to the evaluator.
        type: Kind of artefact being evaluated; only ``"rule"`` is supported.
            (Shadows the builtin; kept for the same reason as ``input``.)
        model: OpenAI chat model name used for the evaluation.
        verbose: When true, echo the input, context and answer before calling
            the model.

    Returns:
        A DataFrame with one row per evaluation category.

    Raises:
        ValueError: If ``type`` is not ``"rule"``.
    """
    if type != "rule":
        raise ValueError("Type must be 'rule'")

    # PromptTemplate uses f-string style formatting: doubled braces are literal
    # braces (the JSON example), single-brace names are filled in at invoke
    # time. The json code fence is closed explicitly so the context, user input
    # and code sections are not read as part of the JSON example.
    template = """
    You are an AI assistant tasked with evaluating C# code written for the Ferryt Low-Code Platform. The platform provides predefined process fields, screens, methods, and objects. Your job is to analyze the code's quality and provide detailed feedback on the following aspects:

    1. **Correctness** (`code_correctness`): 
        - Does the code work as intended, adhering to the functionality described in the user query?
        - Identify any logical issues, bugs, or incorrect operations in the code.

    2. **Edge Case Handling** (`edge_case_handling`):
        - Analyze whether the code properly handles edge cases, such as:
            - Null fields.
            - Invalid or unexpected user inputs.
            - Unavailable process fields or screens.
        - Suggest improvements for better error handling or robustness.

    3. **Readability** (`readability`):
        - Is the code clear, well-structured, and easy to understand?
        - Provide suggestions for improving readability, such as using more concise constructs, clear naming conventions, or comments where needed.

    4. **Correct Use of Context** (`context_correctness`):
        - Evaluate if the code makes **correct** use of the relevant context provided in the Ferryt Low-Code Platform:
            - Does the code use process fields, screens, or methods appropriately to fulfill the described functionality?
            - Are required safeguards in place (e.g., checking `.HasValue` before accessing `.Value`) to prevent runtime errors?
            - Are operations on fields (e.g., `SetVisible`, `SetEditable`, `SetRequired`) and screens (e.g., `Show`, `Hide`) used correctly and aligned with platform guidelines?
            - Is unnecessary or redundant use of context avoided?

        **Important**: The goal is not to use all available context but to ensure the **relevant context is used correctly**.

    **Return only valid JSON in the exact format below. Do not add explanations.**  
    Format:
    ```json
    {{
        "code_correctness": {{
            "value": true,
            "details": ["message goes here", "message goes here"]
        }},
        "edge_case_handling": {{
            "value": true,
            "details": ["message goes here", "message goes here"]
        }},
        "readability": {{
            "value": true,
            "details": ["message goes here", "message goes here"]
        }},
        "context_correctness": {{
            "value": true,
            "details": ["message goes here", "message goes here"]
        }}
    }}
    ```

    ---

    ### Context:
    {context}

    ---


    ### User Input:
    {user_input}

    ---

    ### Generated Code:
    {generated_code}
    """

    if verbose:
        print("Input: ", input)
        print("Context: ", context_prompt)
        print("Answer: ", answer)

    # temperature=0.0 keeps the evaluation as repeatable as the model allows.
    llm = ChatOpenAI(temperature=0.0, model=model)
    llm_chain = PromptTemplate.from_template(template) | llm | parser
    # Values are substituted here, so braces inside the C# code or the context
    # need no escaping.
    response = llm_chain.invoke(
        input={
            "user_input": input,
            "context": context_prompt,
            "generated_code": answer,
        }
    )
    return print_evaluation_results(response)

In [37]:
# Evaluation fixtures. Each entry supplies the keyword arguments that the run
# loop passes to evaluate(): "input" (the user request), "response" (the
# generated C# code under review), "context" (the platform description shown to
# the evaluator) and "type" (evaluate() accepts only "rule").
test_cases = [
    {
        "input": "Napisz regułę, która zwiększy składki o 35%, jeżeli dwóch członków rodziny ma conajmniej 50 lat i jeden członek rodziny jest niepełnoletni.",
        "response": """
int oldCount = 0;
int minorCount = 0;

for (int i = 0; i < PF.UR_DaneRodziny_T.Length; i++)
{
    if (PF.UR_DaneRodziny_T[i].Wiek.Value >= 50)
    {
        oldCount++;
    }
    else if (PF.UR_DaneRodziny_T[i].Wiek.Value < 18)
    {
        minorCount++;
    }
}

if (oldCount >= 2 && minorCount >= 1)
{
    PF.UR_Skladka.Value = PF.UR_Skladka.Value * 1.35m;
}
        """,
        "context": """
    You're an AI assistant on the Ferryt Low-Code Platform.

    # Universal Prompt for C# Code Generation for Client Data and Screens Management

    When writing C# code, use the available process fields and screens to manage client data and user interface. Follow these guidelines:

    ## 1. Available Process Fields:
    - `PF.UR_DaneKlienta_S.Imie` : String
    - `PF.UR_DaneKlienta_S.Nazwisko` : String
    - `PF.UR_DaneKlienta_S.Email` : String
    - `PF.UR_DaneKlienta_S.NrTelefonu` : String
    - `PF.UR_DaneKlienta_S.PESEL` : String
    
    - `PF.UR_DaneRodziny_T` : table
        * Imie
        * Nazwisko 
        * Wiek
        
    - `PF.UR_KodZnizkowy` : String
    - `PF.UR_OdrzuceniePrzezWiek` : Bool
    - `PF.UR_PoziomUbezpieczenia` : String
    - `PF.UR_RodzajUbezpieczenia` : String
    - `PF.UR_Skladka` : Decimal
    - `PF.UR_UbezpieczenieDodatkowe` : Bool
    - `PF.tech_Message` : String
    

    ### Properties:
    Fields have the following properties:
    - `HasValue`, `IsEditable`, `IsRequired`, `IsVisible`, `Value`.

    ### Actions:
    You can perform these operations on the fields:
    - `SetEditable(bool)`, `SetVisible(bool)`, `SetRequired(bool)`, `SetNull()`, `GetValueOrDefault(defaultValue)`.
    
    ### Arrays:
    - Arrays are complex datatypes that store fields or structures. Their name should end with `_T`.
        A specific element in the array can be accessed via `[index]` or `.Items(index)`.
    - Operations on arrays: Length, SetMinimumSize(int)

    ## 2. Available Screens:
    - `E010_Powitanie` (alias =`Powitanie`)
    - `E020_DaneRodziny` (alias =`Dane rodziny`)
    - `E030_WyborUbezpieczenia` (alias =`Wybór ubezpieczenia`)
    - `E040_Zgody` (alias =`Zgody`)
    - `E050_Podsumowanie` (alias =`Podsumowanie`)
    - `E060_Odrzucenie` (alias =`Odrzucenie`)
    - `E_techMessage` (alias =`E_techMessage`)
    - `Tech_SessionEndScreen` (alias =`Tech_SessionEndScreen`)
    - `Tech_TopScreen` (alias =`Tech_TopScreen`)
    - `Tech_BottomScreen` (alias =`Tech_BottomScreen`)
    - `Tech_TabsScreen` (alias =`Tech_TabsScreen`)

    ### Methods:
    Screens can be accessed by `G.`.
    Screens can be controlled using the following methods:
    - `Hide()`, `HideAll()`, `Show()`, `ShowAll()`.

    ## 3. Accessing User Data:
    - `USER.Current` provides access to properties like `IsAuthenticated`, `UserLogin`, `UserFullName`, `UserEmail`, 
        `UserID`, etc.
        
    ## 4. ENV Object:
    - ENV gives access to many environment variables like:
        * ActionIndices[0] - points to the element on which the action was last performed (in arrays)
        * ApplicationGuid
        * UserDevice.IsMobile
        * UserDevice.BrowserInfo
        * UserDevice.OperatingSystem etc.
        
    ## 5. ACTION object
    - ACTION object has methods like: `ClearDocumentRequirement()`, `ClearFieldRequirement()`, 
        `SetFieldsReadOnly()` - disables editable fields, `VisibilityFlagHide(flagName)` - hide all actions with the 
        flag set, `VisibilityFlagHideAll()`, `VisibilityFlagShow(flagName)` - show all actions with the flag set`, 
        `VisibilityFlagShowAll()`
        
    ## 6. LOOPS?
    
    ## 7. Enumerator?

    ## 8. Example Functionalities:
    - Setting field values based on conditions.
    - Controlling field visibility, editability and requirements.
    - Assigning user data to client data under specific conditions.
    - Displaying or hiding screens.
    
    ## 9. Notes:
    - Reference to the `.Value` property, may cause an error if the field is null. 
        To avoid this, first check that the value is via `.HasValue`.
    - If code is very simple, divide it to the 3 separate blocks: `if` , `then`, `else`. `If` statement must not be 
        empty. If something is to be executed every time, enter `true` there.
        """,
        "type": "rule",
    },
]

In [38]:
# Evaluate every test case and keep each result table in `result_dfs`, so that
# earlier results are not lost once more than one case is configured.
# `result_df` stays bound to the most recent table, as before.
result_dfs = []
for tc in test_cases:
    result_df = evaluate(input=tc["input"], answer=tc["response"], context_prompt=tc["context"], type=tc["type"])
    result_dfs.append(result_df)

Input:  Napisz regułę, która zwiększy składki o 35%, jeżeli dwóch członków rodziny ma conajmniej 50 lat i jeden członek rodziny jest niepełnoletni.
Context:  
    You're an AI assistant on the Ferryt Low-Code Platform.

    # Universal Prompt for C# Code Generation for Client Data and Screens Management

    When writing C# code, use the available process fields and screens to manage client data and user interface. Follow these guidelines:

    ## 1. Available Process Fields:
    - `PF.UR_DaneKlienta_S.Imie` : String
    - `PF.UR_DaneKlienta_S.Nazwisko` : String
    - `PF.UR_DaneKlienta_S.Email` : String
    - `PF.UR_DaneKlienta_S.NrTelefonu` : String
    - `PF.UR_DaneKlienta_S.PESEL` : String
    
    - `PF.UR_DaneRodziny_T` : table
        * Imie
        * Nazwisko 
        * Wiek
        
    - `PF.UR_KodZnizkowy` : String
    - `PF.UR_OdrzuceniePrzezWiek` : Bool
    - `PF.UR_PoziomUbezpieczenia` : String
    - `PF.UR_RodzajUbezpieczenia` : String
    - `PF.UR_Skladka` : Decimal
   

In [39]:
# Show the evaluation table produced by the last evaluate() call
# (head() displays at most the first five rows).
result_df.head()

Unnamed: 0,Category,Passed,Details
0,Code Correctness,✅,The code correctly checks the age of family members and increases the insurance premium by 35% if the conditions are met.
1,Edge Case Handling,❌,"The code does not handle the case where the 'Wiek' field is null. Before accessing the 'Value' property, it should check if the field 'HasValue'."
2,Readability,✅,The code is well-structured and easy to understand. The variable names 'oldCount' and 'minorCount' clearly indicate their purpose.
3,Context Correctness,❌,The code does not check if the 'Wiek' and 'UR_Skladka' fields 'HasValue' before accessing their 'Value' properties. This could lead to runtime errors if these fields are null.
