In [1]:
import os  
import json  
from pathlib import Path  
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt, stop_after_delay  
from openai import AzureOpenAI  
  
# Read the Azure OpenAI settings from ../secrets.env into the process environment
env_path = Path("..", "secrets.env")
load_dotenv(dotenv_path=env_path)

openaikey = os.getenv("AZURE_OPENAI_API_KEY")
openaiservice = os.getenv("AZURE_OPENAI_ENDPOINT")

# Module-level Azure OpenAI client used by the generation helpers in this file
client = AzureOpenAI(
    api_key=openaikey,
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=openaiservice,
)
  
@retry(
    wait=wait_random_exponential(multiplier=1, max=60),
    stop=(stop_after_attempt(10) | stop_after_delay(300)),
)
def generate_business_analytic_requests(business_question):
    """Ask the chat model to add a visualization requirement to a question.

    The question is embedded in the prompt as a JSON-encoded value, and the
    model is told to leave it unchanged when a visualization is unnecessary.
    The tenacity decorator retries failed calls with randomized exponential
    backoff (waits capped at 60), stopping after 10 attempts or a delay of 300.

    Args:
        business_question: The original business question; it must be
            serializable by ``json.dumps``.

    Returns:
        The model's reply with leading and trailing double quotes stripped.
    """
    prompt = f"""  
    Given the following business question, update it to include a data visualization requirement if it would enhance the understanding of the query results. Keep the question unchanged if visualization is unnecessary.  
    ## Business Question  
    {json.dumps(business_question, indent=4)}  
    ## Output format:  
    Provide the updated question without additional comments or explanations.  
    Your response:  
    """
    chat_messages = [
        {"role": "system", "content": "You are a smart AI data analytic assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT"),
        messages=chat_messages,
    )
    reply = completion.choices[0].message.content
    # The question goes in JSON-quoted, so the model tends to echo the quotes.
    return reply.strip('"')
  
@retry(
    wait=wait_random_exponential(multiplier=1, max=60),
    stop=(stop_after_attempt(10) | stop_after_delay(300)),
)
def generate_python_code(business_analytic_request, sql_query):
    """Ask the chat model for Python code that answers an analytic request.

    The prompt describes two utility functions the generated code may rely on
    (``execute_sql_query`` and ``show_to_user``) and asks for the answer inside
    a fenced ``python`` block. The tenacity decorator retries failed calls with
    randomized exponential backoff (waits capped at 60), stopping after 10
    attempts or a delay of 300.

    Args:
        business_analytic_request: The (possibly visualization-augmented)
            business request text.
        sql_query: The SQL query associated with the request.

    Returns:
        The raw message content of the model's first choice, unmodified.
    """
    prompt = f"""  
    Generate syntactically correct Python code based on the given business analytic request and SQL query. Ensure the code adheres to the specified Python environment and fulfills the business requirements. Use graphical visualization only if it enhances understanding or is explicitly requested.  
    # Business Analytic Request  
    {business_analytic_request}  
    ## SQL Query  
    {sql_query}  
    # Python Environment Constraints:  
    ## Utility Functions:  
    1. execute_sql_query(sql_query: str): Executes an SQL query and returns a pandas DataFrame for data analysis and visualization.  
    2. show_to_user(data): Displays data analysis or visualization results. Accepts a pandas DataFrame or Plotly figure. Use Plotly exclusively for graph visualization (e.g., fig=px.line(some_df); show_to_user(fig)).  
    # Output Format:  
    Provide the executable Python code in the format:  
    ```python  
    YOUR CODE HERE  
    ```  
    Your output:  
    """
    chat_messages = [
        {"role": "system", "content": "You are an expert data analyst with strong python programming background and business analysis skill"},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT"),
        messages=chat_messages,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
  
@retry(wait=wait_random_exponential(multiplier=1, max=60), stop=(stop_after_attempt(10) | stop_after_delay(300)))
def validate_python_code(business_analytic_request, sql_query, python_code):
    """Have the chat model review generated Python code and return its verdict.

    The model is asked (in JSON mode) for an object with two keys:
    ``reviewed_python_code`` and ``review_note``. A reply that is not valid
    JSON, is not a JSON object, or lacks either key raises an exception inside
    this function, so the tenacity decorator retries the request (randomized
    exponential backoff, waits capped at 60, stopping after 10 attempts or a
    delay of 300).

    Args:
        business_analytic_request: The business request the code must satisfy.
        sql_query: The SQL query the code is built around.
        python_code: The generated Python code to be reviewed.

    Returns:
        The parsed reply as a dict containing at least the keys
        ``reviewed_python_code`` and ``review_note``.

    Raises:
        ValueError: If the parsed reply is not a JSON object or is missing a
            required key (raised per attempt; subject to the retry policy).
    """
    user_message = f"""  
    # Evaluate the correctness of the Python code for each business analytic request and provide corrections if needed.  
    ## Business Analytic Request  
    {business_analytic_request}  
    ## SQL Query  
    {sql_query}  
    ## Generated Python Code  
    {python_code}  
    ## Python Environment Constraints:  
    # Utility functions available for use:  
    # 1. execute_sql_query(sql_query: str) - Executes an SQL query and returns a pandas DataFrame.  
    # 2. show_to_user(data): Displays data analysis or visualization results. Accepts a pandas DataFrame or Plotly figure. Use Plotly exclusively for graph visualization (e.g., fig=px.line(some_df); show_to_user(fig)).  
    ## Output Format:  
    # The response should be in JSON format with two key elements:  
    # 1. "reviewed_python_code": This should contain the corrected Python code if any issues are found in the original code. If the original code is correct, return it unchanged.  
    # 2. "review_note": This should include a detailed evaluation of the original Python code. Highlight any errors or inefficiencies and explain the corrections made. If the code is correct, confirm its accuracy and functionality.  
    """
    response = client.chat.completions.create(
        model=os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT"),
        messages=[
            {"role": "system", "content": "You are an expert data analyst with strong python programming background and business analysis skill"},
            {"role": "user", "content": user_message},
        ],
        response_format={"type": "json_object"},
    )
    response_message = json.loads(response.choices[0].message.content)

    # Validate with an explicit raise instead of `assert`: assertions are
    # removed under `python -O`, which would let a malformed reply skip the
    # retry and surface later as a KeyError in the caller.
    if not isinstance(response_message, dict):
        raise ValueError(
            f"Expected a JSON object from the model, got {type(response_message).__name__}"
        )
    missing_keys = [
        key
        for key in ("reviewed_python_code", "review_note")
        if key not in response_message
    ]
    if missing_keys:
        raise ValueError(f"Model reply is missing required keys: {missing_keys}")
    return response_message
  
def process_data(data):
    """Run every item through the question-update, code-gen and review steps.

    For each item the ``"input"`` value is rewritten by
    ``generate_business_analytic_requests``, Python code is generated from the
    rewritten request plus the item's ``"output"`` value (used as the SQL
    query), and the code is then reviewed by ``validate_python_code``.

    Args:
        data: Iterable of dicts, each providing the keys ``"input"``,
            ``"output"``, ``"scenario"`` and ``"difficulty"``.

    Returns:
        A list with one dict per item, holding ``scenario``, ``input`` (the
        rewritten request), ``output`` (the reviewed Python code),
        ``review_note`` and ``difficulty``.
    """
    records = []
    for entry in data:
        request = generate_business_analytic_requests(entry["input"])

        query = entry["output"]
        draft_code = generate_python_code(request, query)
        review = validate_python_code(request, query, draft_code)

        record = dict(
            scenario=entry["scenario"],
            input=request,
            output=review["reviewed_python_code"],
            review_note=review["review_note"],
            difficulty=entry["difficulty"],
        )
        records.append(record)

    return records
  
def create_message_format(item):
    """Build one chat-style record (system / user / assistant) from an item.

    Args:
        item: Mapping with an ``"input"`` string (the request) and an
            ``"output"`` value (used as the assistant's answer).

    Returns:
        A dict with a single ``"messages"`` key holding the three turns in
        system, user, assistant order.
    """
    system_turn = {
        "role": "system",
        "content": "You are an expert data analyst with strong Python programming and data analysis background.You are working on your company's MDDX datawarehouse. Write Python code that can provide answer for user request",
    }
    user_turn = {"role": "user", "content": "## Request: " + item["input"]}
    assistant_turn = {"role": "assistant", "content": item["output"]}
    return {"messages": [system_turn, user_turn, assistant_turn]}
  
def process_and_write_data(processed_data, output_file):
    """Write processed items to *output_file*, one JSON chat record per line.

    Each item is converted with ``create_message_format`` and serialized with
    ``json.dumps``; the target file is opened in ``"w"`` mode, so any existing
    content is replaced.

    Args:
        processed_data: Iterable of items accepted by ``create_message_format``.
        output_file: Path of the file to write.
    """
    with open(output_file, "w") as sink:
        sink.writelines(
            json.dumps(create_message_format(entry)) + "\n"
            for entry in processed_data
        )
  
# Load the train and test data
with open("../sql_gen/data/train_data_v6.json", "r") as f:
    train_data = json.load(f)

with open("../sql_gen/data/test_data_v6.json", "r") as f:
    test_data = json.load(f)

# Make sure the output directory exists before doing any work: without it the
# first write below raises FileNotFoundError, and only after every model call
# for both datasets has already been made.
os.makedirs("./data", exist_ok=True)

# Process the train and test data (one pass of model calls per item)
processed_train_data = process_data(train_data)
processed_test_data = process_data(test_data)

# Save the processed data
with open("./data/processed_train_data_v6.json", "w") as f:
    json.dump(processed_train_data, f, indent=4)

with open("./data/processed_test_data_v6.json", "w") as f:
    json.dump(processed_test_data, f, indent=4)

print("Processed train and test data saved to ./data/processed_train_data_v6.json and ./data/processed_test_data_v6.json")

# Load the processed train and test data
# NOTE(review): this reload immediately follows the save above; presumably it
# is kept so the export step can be re-run without regenerating — confirm.
with open("./data/processed_train_data_v6.json", "r") as f:
    processed_train_data = json.load(f)

with open("./data/processed_test_data_v6.json", "r") as f:
    processed_test_data = json.load(f)

# Process and write the train and test data to jsonl files
process_and_write_data(processed_train_data, "./data/python_train_0909.jsonl")
process_and_write_data(processed_test_data, "./data/python_test_0909.jsonl")

print("Data ready for training saved to ./data/python_train_0909.jsonl and ./data/python_test_0909.jsonl")
  


Processed train and test data saved to ./data/processed_train_data_v6.json and ./data/processed_test_data_v6.json
Data ready for training saved to ./data/python_train_0909.jsonl and ./data/python_test_0909.jsonl
