In [None]:
def execute_sql_script_from_file(file_path, dbname="omop", user="admin", password="adminpassword", host="localhost", port="5432"):
    """Execute the SQL script stored in ``file_path`` against a PostgreSQL database.

    The whole file is sent to the server in a single ``cursor.execute`` call on
    a connection with autocommit enabled.

    Args:
        file_path: Path of the SQL file to run.
        dbname: Name of the database to connect to.
        user: Database user.
        password: Database password.
        host: Database host.
        port: Database port.

    Returns:
        A two-element list ``[ok, message]``. ``ok`` is ``True`` when the
        script ran; ``message`` is then either the result set rendered with
        ``DataFrame.to_string()`` or a fixed "no results" sentence. On failure
        ``ok`` is ``False`` and ``message`` describes the error. Errors are
        reported through the return value, never raised.
    """
    if not os.path.isfile(file_path):
        return [False, f"SQL file not found: {file_path}"]

    # Bound up front so the ``finally`` clause knows what was actually opened.
    conn = None
    cursor = None
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            sql_script = f.read()
        conn = psycopg2.connect(
            dbname=dbname,
            user=user,
            password=password,
            host=host,
            port=port
        )
        conn.autocommit = True
        cursor = conn.cursor()
        print(f"Executing SQL script for database '{dbname}' from file: {file_path}")
        cursor.execute(sql_script)
        # ``description`` is only populated when the last statement produced rows.
        if cursor.description:
            colnames = [desc[0] for desc in cursor.description]
            rows = cursor.fetchall()
            df = pd.DataFrame(rows, columns=colnames)
            return [True, df.to_string()]

        return [True, "SQL executed successfully with no results to display."]

    # Database errors first, then everything else. A leading generic handler
    # used to shadow both of these, leaving them unreachable.
    except psycopg2.Error as e:
        error_msg = f"[{dbname}] SQL Execution Error: {e}"
        print(error_msg)
        return [False, error_msg]

    except Exception as e:
        error_msg = f"[{dbname}] Unexpected Error: {e}"
        print(error_msg)
        return [False, error_msg]

    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

def execute_sql_script_cross_schema(file_path, target_db="omop", user="admin", password="adminpassword", host="localhost", port="5432"):
    """Run a SQL script that may reference both the ``omop`` and ``synthea`` schemas.

    Connects to ``target_db`` with autocommit enabled, sets the session
    ``search_path`` to ``omop, synthea, public`` and then executes the whole
    file in one call.

    Args:
        file_path: Path of the SQL file to run.
        target_db: Database to connect to.
        user: Database user.
        password: Database password.
        host: Database host.
        port: Database port.

    Returns:
        ``[True, text]`` on success, where ``text`` is the result set rendered
        with ``DataFrame.to_string()`` or a fixed "no results" sentence;
        ``[False, error_message]`` otherwise. Errors are printed and returned,
        never raised.
    """
    script_missing = not os.path.isfile(file_path)
    if script_missing:
        return [False, f"SQL file not found: {file_path}"]

    conn = None
    cursor = None
    try:
        with open(file_path, "r", encoding="utf-8") as script_file:
            sql_script = script_file.read()

        # Connect straight to the database that holds every schema
        # (assumes synthea and omop are schemas of the same database).
        connection_settings = dict(
            dbname=target_db,
            user=user,
            password=password,
            host=host,
            port=port,
        )
        conn = psycopg2.connect(**connection_settings)
        conn.autocommit = True
        cursor = conn.cursor()

        # Make unqualified table names resolve in both schemas.
        cursor.execute("SET search_path TO omop, synthea, public;")

        print(f"Executing cross-schema SQL script on database '{target_db}' from file: {file_path}")
        cursor.execute(sql_script)

        if not cursor.description:
            return [True, "SQL executed successfully with no results to display."]

        header = [column[0] for column in cursor.description]
        fetched = cursor.fetchall()
        table = pd.DataFrame(fetched, columns=header)
        return [True, table.to_string()]

    except psycopg2.Error as db_error:
        error_msg = f"[{target_db}] SQL Execution Error: {db_error}"
        print(error_msg)
        return [False, error_msg]
    except Exception as other_error:
        error_msg = f"[{target_db}] Unexpected Error: {other_error}"
        print(error_msg)
        return [False, error_msg]
    finally:
        # Release whatever was successfully opened, cursor first.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

def groq_chat_completion_stream_clean(prompt, model="llama3-70b-8192"):
    """Send ``prompt`` to the Groq chat-completions endpoint and return the reply text.

    The request is made in streaming mode; the ``content`` fragment of every
    streamed ``data:`` line is collected and the concatenation is returned.

    Args:
        prompt: Text sent as a single ``user`` message.
        model: Model identifier passed to the API.

    Returns:
        The full response text with surrounding whitespace stripped.

    Raises:
        ValueError: If the ``GROQ_API_KEY`` environment variable is unset or empty.
        Exception: If the API answers with a status code other than 200.
    """
    import os  # local import keeps this cell independent of notebook cell order

    # Secrets must not live in source control: the key is taken from the
    # environment (for instance exported from a .env file before start-up).
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise ValueError("La clé API Groq n'est pas configurée dans le fichier .env")

    # Fixed pause before every request — presumably to stay under the API
    # rate limit; TODO confirm the required delay.
    time.sleep(4)
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    data = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": True
    }
    fragments = []
    with requests.post(url, headers=headers, json=data, stream=True) as response:
        if response.status_code != 200:
            raise Exception(f"Erreur API: {response.status_code} - {response.text}")

        for chunk in response.iter_lines():
            if not chunk:
                continue
            decoded_chunk = chunk.decode('utf-8')
            if not decoded_chunk.startswith("data:"):
                continue
            try:
                parsed = json.loads(decoded_chunk[5:].strip())
            except json.JSONDecodeError:
                # Payloads that are not JSON (e.g. a stream terminator) are skipped.
                continue
            if not isinstance(parsed, dict):
                continue
            # An empty or missing ``choices`` list must not raise IndexError.
            choices = parsed.get("choices") or [{}]
            content = choices[0].get("delta", {}).get("content")
            if content:
                fragments.append(content)

    return "".join(fragments).strip()

def extract_and_append_sql(text, json_data, sql_file_path):
    """Append every fenced SQL block found in ``text`` to ``sql_file_path``.

    Each block is written between ``-- BEGIN STEP: <id>`` and
    ``-- END STEP: <id>`` marker lines, where ``<id>`` is ``json_data["id"]``
    (``"unknown"`` when the key is absent). The file is opened in append mode,
    so existing content is preserved.

    Args:
        text: Text to scan, typically an LLM response containing ``` fences.
        json_data: Mapping describing the current step; only ``id`` is read.
        sql_file_path: Destination file. Its parent directory is created when
            the path has one.

    Returns:
        None. When no fenced block is found a diagnostic is printed and
        nothing is written.
    """
    step_id = json_data.get("id", "unknown")

    # 1) Fence tagged ```sql, with zero or more whitespace characters after the tag.
    sql_blocks = re.findall(r"```sql\s*(.*?)```", text, re.DOTALL | re.IGNORECASE)

    # 2) Same, but tolerating whitespace between the backticks and the tag.
    if not sql_blocks:
        sql_blocks = re.findall(r"```\s*sql\s*(.*?)```", text, re.DOTALL | re.IGNORECASE)

    # 3) Last resort: any fenced block, whatever its language tag (or none).
    if not sql_blocks:
        sql_blocks = re.findall(r"```[a-zA-Z]*\s*(.*?)```", text, re.DOTALL | re.IGNORECASE)

    if not sql_blocks:
        print(f"❌ Aucun bloc SQL trouvé pour l'étape {step_id}")
        print(f"Texte analysé (premiers 200 chars): {text[:200]}")
        return

    # A bare file name has an empty dirname, and os.makedirs("") raises.
    target_dir = os.path.dirname(sql_file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(sql_file_path, "a", encoding="utf-8") as f:
        for block in sql_blocks:
            f.write(f"-- BEGIN STEP: {step_id}\n")
            f.write(block.strip() + "\n")
            f.write(f"-- END STEP: {step_id}\n\n")

    print(f"✅ Ajouté {len(sql_blocks)} bloc(s) SQL pour l'étape {step_id} dans {sql_file_path}")

def edit_sql_file(id: int, sql_script: str, file_named: str) -> None:
    """Replace the SQL of one step inside a marker-delimited SQL file.

    The step is the region between the lines ``-- BEGIN STEP: <id>`` and
    ``-- END STEP: <id>`` (compared after stripping whitespace). The markers
    are kept and everything between them is replaced by ``sql_script``.

    Args:
        id: Step identifier as it appears in the markers. The name shadows the
            builtin but is kept because callers may pass it by keyword.
        sql_script: New SQL for the step; surrounding whitespace is stripped.
        file_named: Path of the SQL file to rewrite in place.

    Raises:
        ValueError: If no BEGIN marker followed by an END marker is found for
            ``id``.
    """
    begin_marker = f"-- BEGIN STEP: {id}"
    end_marker = f"-- END STEP: {id}"

    # The file is produced as UTF-8, so it must be read back as UTF-8 rather
    # than with the platform default encoding.
    with open(file_named, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    start_index = None
    end_index = None

    for i, line in enumerate(lines):
        stripped = line.strip()
        if stripped == begin_marker:
            start_index = i
        elif start_index is not None and stripped == end_marker:
            # Only an END marker that follows a BEGIN marker closes the step;
            # a stray END earlier in the file no longer aborts the search.
            end_index = i
            break

    if start_index is None or end_index is None:
        raise ValueError(f"STEP with id {id} not found in the file.")

    # Rebuild the step: marker, one line per SQL line, marker.
    new_script_lines = (
        [begin_marker + '\n']
        + [line + '\n' for line in sql_script.strip().split('\n')]
        + [end_marker + '\n']
    )
    lines = lines[:start_index] + new_script_lines + lines[end_index + 1:]

    with open(file_named, 'w', encoding='utf-8') as file:
        file.writelines(lines)

def contains_task_completed(text):
    """Look for the phrase "task completed" in ``text``, ignoring case.

    Any run of whitespace is accepted between the two words.

    Returns:
        The ``re.Match`` for the first occurrence, or ``None`` when the phrase
        is absent, so the result can be used directly in a condition.
    """
    completion_marker = re.compile(r"task\s+completed", re.IGNORECASE)
    return completion_marker.search(text)


In [32]:
import requests
import json
import re
import psycopg2
import pandas as pd
import os 
import time 

# Generation pass: feed every JSON mapping file of `folder` to the model, in
# file-name order, and collect the SQL it returns into SQL/location.sql.
folder = "json/location/"

with open("prompt/prompt1.txt", "r", encoding="utf-8") as f:
    prompt = f.read()

# The prompt file is used as the system message that opens the conversation.
system_message = {
    "role": "system",
    "content": prompt
}
conversation_history = [system_message]

for file_name in sorted(os.listdir(folder)):
    path = os.path.join(folder, file_name)
    if not os.path.isfile(path):
        continue  # skip sub-directories

    with open(path, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    json_str = json.dumps(json_data, indent=2)

    # Each mapping is added to the history exactly once, as a user message.
    conversation_history.append({
        "role": "user",
        "content": f"Here is the JSON:\n\n{json_str}"
    })

    # The model receives the whole history flattened into one text block.
    full_context = "\n\n".join([m["content"] for m in conversation_history])
    llm_response = groq_chat_completion_stream_clean(full_context)

    extract_and_append_sql(llm_response, json_data, "SQL/location.sql")

    conversation_history.append({
        "role": "assistant",
        "content": llm_response
    })

# Persist the transcript so it can be inspected or resumed later.
with open("conversation_history.json", "w", encoding="utf-8") as f:
    json.dump(conversation_history, f, indent=2)


✅ Ajouté 1 bloc(s) SQL pour l'étape 0 dans SQL/location.sql
✅ Ajouté 1 bloc(s) SQL pour l'étape 1 dans SQL/location.sql


In [33]:

# Repair pass: run the generated script, show the outcome to the model and
# apply the edits it requests, for at most three rounds.

# Tool calls are expected in the form <function=NAME>{json arguments}</function>.
pattern = r'<function=([a-zA-Z_]+)>({.*?})<\/function>'

for i in range(3):

    result = execute_sql_script_from_file("SQL/location.sql", dbname="omop")

    # result is [ok, message]; both outcomes are reported with the same role.
    if result[0]:
        execution_report = f"# ✅ SQL executed successfully.\n\n{result[1]}"
    else:
        execution_report = f"# ❌ SQL Execution Error:\n{result[1]}"
    conversation_history.append({
        "role": "ipython",
        "content": execution_report
    })

    full_context = "\n\n".join([m["content"] for m in conversation_history])
    llm_response = groq_chat_completion_stream_clean(full_context)

    conversation_history.append({
        "role": "assistant",
        "content": llm_response
    })

    for match in re.finditer(pattern, llm_response, re.DOTALL):
        function_name = match.group(1)
        function_args_raw = match.group(2)

        try:
            function_args = json.loads(function_args_raw)

            if function_name == "edit_sql_file":
                # `step_id`, not `id`: a top-level `id` would rebind the
                # builtin for the rest of the session.
                step_id = function_args.get("id")
                sql_script = function_args.get("sql_script")
                # NOTE(review): the target path comes straight from the model
                # output, so the model can overwrite any writable file.
                file_named = function_args.get("file_named")
                edit_sql_file(step_id, sql_script, file_named)
                print(f"Edited SQL for step {step_id}")
        except Exception as e:
            # A malformed tool call must not stop the remaining rounds.
            print(f"Error processing function call: {e}")

    if contains_task_completed(llm_response):
        print("✅ Task completed.")
        break

with open("conversation_history.json", "w") as f:
    json.dump(conversation_history, f, indent=2)

print(f"\nProcessed {len(conversation_history)} messages.")

Executing SQL script for database 'omop' from file: SQL/location.sql
Executing SQL script for database 'omop' from file: SQL/location.sql
Executing SQL script for database 'omop' from file: SQL/location.sql

Processed 11 messages.


In [30]:
# Show the text currently bound to `prompt` (the variable is assigned in another cell).
print(prompt)

**You are a highly capable AI specialized in generating optimized SQL code.**

## Objective

Generate SQL code that transforms data from a **source database (A)** into a **target database (B)** named `omop`.
Each input will include one or more JSON objects describing the transformation logic for a **specific target table**.

## Input Details

Each JSON input contains:

* The structure of the source table(s) involved
* Column-level transformation rules
* Optional metadata (e.g., view names, output types, primary-key logic)
* **A unique `id` identifying the JSON object**

## Constraints

* Generate SQL **only** for the target table defined in the JSON.
* Do **not** infer or generate code for other target tables.
* All SQL must be limited to transforming data for the **one specified target table**.

## Instructions

* Analyze the input JSON carefully to understand the transformation logic.
* Generate **clean, well-structured, and readable PostgreSQL SQL code**.
* Include **inline SQL comm

In [11]:
import pandas as pd
import json

# One-shot check: send the prompt plus a single mapping file to the model and
# print the raw answer.
with open("prompt/prompt1.txt", "r", encoding="utf-8") as f:
    prompt = f.read()

try:
    with open("json/location/location_test_1.json", "r", encoding="utf-8") as f:
        json_data = json.load(f)
except FileNotFoundError:
    # Fall back to an empty JSON object so the request can still be sent.
    print("JSON file not found. Please check the file path.")
    json_str = "{}"
except json.JSONDecodeError as e:
    print(f"Invalid JSON format: {e}")
    json_str = "{}"
else:
    # Re-serialize with indentation so the model sees a normalized layout.
    json_str = json.dumps(json_data, indent=2)

final_prompt = "\n\n".join((prompt, json_str))
result = groq_chat_completion_stream_clean(final_prompt)
print(result)


Here is the generated SQL code:

```
INSERT INTO omop.location (
    city,
    state,
    zip,
    location_source_value,
    address_1,
    address_2,
    county
)
SELECT 
    l.city,
    l.state_abbreviation AS state,
    l.zip,
    l.zip AS location_source_value,
    NULL AS address_1, -- since column-level transformation rule is to null
    NULL AS address_2, -- since column-level transformation rule is to null
    NULL AS county   -- since column-level transformation rule is to null
FROM 
    location_enriched_view AS l;
```

Note: I've included inline comments to clarify the logic of column-level transformation rules that result in `NULL` values. I've also assumed that the `cast_type` values for `address_1`, `address_2`, and `county` are not necessary since they don't seem to have a specific transformation rule. If there's an error in the execution, I'd be happy to review and correct!


Here is the generated SQL code:

```
CREATE OR REPLACE VIEW omop.location_enriched_view AS 
SELECT 
  s.city, 
  s.state, 
  s.zip, 
  sm.state_abbreviation
FROM 
  synthea.patients s 
  LEFT JOIN omop.states_map ON s.state = sm.state;
```
```
INSERT INTO omop.location (
  location_id,
  city,
  state,
  zip,
  location_source_value,
  address_1,
  address_2,
  county
)
SELECT 
  MD5HASH(city || state_abbreviation || zip)::uuid AS location_id,
  city,
  state_abbreviation AS state,
  zip,
  zip AS location_source_value,
  NULL::VARCHAR AS address_1,
  NULL::VARCHAR AS address_2,
  NULL::VARCHAR AS county
FROM 
  location_enriched_view;
```
Let me know if this generates any execution errors!

In [8]:
import json
from datetime import datetime

def parse_conversation(json_input):
    """Turn a JSON conversation into a human-readable report.

    Args:
        json_input: One of
            - a path to a JSON file,
            - a JSON string,
            - an already decoded dict or list of message dicts.

    Returns:
        The formatted conversation as a single string. When a file cannot be
        read or the JSON cannot be decoded, an error message string is
        returned instead; no exception is raised for those cases.
    """
    if isinstance(json_input, str):
        # JSON text routinely contains "/" or "\\", so those characters alone
        # do not make a string a path: text that opens like a JSON document
        # is always decoded as JSON.
        looks_like_json = json_input.lstrip().startswith(("{", "["))
        looks_like_path = (
            json_input.endswith('.json') or '/' in json_input or '\\' in json_input
        )

        if looks_like_path and not looks_like_json:
            try:
                with open(json_input, 'r', encoding='utf-8') as file:
                    data = json.load(file)
            except FileNotFoundError:
                return f"❌ Erreur: Fichier '{json_input}' non trouvé"
            except json.JSONDecodeError as e:
                return f"❌ Erreur: JSON invalide dans le fichier '{json_input}'\nDétail: {e}"
            except Exception as e:
                return f"❌ Erreur lors de la lecture du fichier '{json_input}': {e}"
        else:
            try:
                data = json.loads(json_input)
            except json.JSONDecodeError:
                return "❌ Erreur: JSON invalide"
    else:
        data = json_input

    # A single message is handled like a one-element conversation.
    if not isinstance(data, list):
        data = [data]

    # Header label shown for each known role.
    role_symbols = {
        'system': '🤖 SYSTÈME',
        'user': '👤 UTILISATEUR',
        'assistant': '🔧 ASSISTANT',
        'ipython': '⚡ EXÉCUTION'
    }

    conversation = []
    conversation.append("=" * 80)
    conversation.append("📋 ANALYSE DE CONVERSATION")
    conversation.append("=" * 80)
    conversation.append("")

    for i, message in enumerate(data, 1):
        if not isinstance(message, dict) or 'role' not in message:
            conversation.append(f"❌ Message {i}: Format invalide")
            continue

        role = str(message.get('role', 'unknown'))
        content = message.get('content')
        # The formatters use string methods: substitute the placeholder for a
        # missing/None content and stringify anything that is not text.
        if content is None:
            content = '[Pas de contenu]'
        elif not isinstance(content, str):
            content = str(content)

        # Message header
        symbol = role_symbols.get(role, f'❓ {role.upper()}')
        conversation.append(f"\n{'-' * 60}")
        conversation.append(f"{symbol} - Message {i}")
        conversation.append(f"{'-' * 60}")

        # Role-specific rendering
        if role == 'system':
            conversation.append("📋 Instructions système:")
            conversation.append(format_system_content(content))

        elif role == 'user':
            conversation.append("💬 Demande utilisateur:")
            conversation.append(format_user_content(content))

        elif role == 'assistant':
            conversation.append("🔧 Réponse assistant:")
            conversation.append(format_assistant_content(content))

        elif role == 'ipython':
            conversation.append("⚡ Résultat d'exécution:")
            conversation.append(format_ipython_content(content))

        else:
            # Unknown roles are shown raw, truncated to 500 characters.
            conversation.append(f"Contenu ({role}):")
            conversation.append(content[:500] + "..." if len(content) > 500 else content)

    conversation.append(f"\n{'=' * 80}")
    conversation.append(f"📊 RÉSUMÉ: {len(data)} messages analysés")
    conversation.append(f"{'=' * 80}")

    return "\n".join(conversation)

def format_system_content(content):
    """Render a Markdown-like system prompt as an indented outline.

    Every line is stripped, then:
    - lines starting with ``##`` become section titles,
    - lines starting with ``-`` become bullet points,
    - other non-empty lines that do not start with ``#`` are indented,
    - everything else is dropped.

    Returns:
        The rendered lines joined with newlines, or the first 300 characters
        of ``content`` followed by ``...`` when no line was kept.
    """
    rendered = []

    for raw_line in content.split('\n'):
        text = raw_line.strip()
        if text.startswith('##'):
            rendered.append(f"\n🎯 {text[2:].strip()}")
            continue
        if text.startswith('-'):
            rendered.append(f"  • {text[1:].strip()}")
            continue
        if text and not text.startswith('#'):
            rendered.append(f"   {text}")

    if not rendered:
        return content[:300] + "..."
    return "\n".join(rendered)

def format_user_content(content):
    """Format a user message, summarizing an embedded JSON payload when present.

    When ``content`` contains the marker ``JSON:`` exactly once, the text
    before it is shown as an introduction and the text after it is decoded as
    JSON and summarized. If decoding fails, the raw payload is shown,
    truncated to 200 characters. Any other content is returned as is,
    truncated to 300 characters.
    """
    parts = content.split("JSON:") if "JSON:" in content else []
    if len(parts) != 2:
        return content[:300] + "..." if len(content) > 300 else content

    intro = parts[0].strip()
    json_part = parts[1].strip()

    formatted = []
    if intro:
        formatted.append(f"📝 {intro}")

    formatted.append("\n📄 Données JSON fournies:")
    try:
        parsed_json = json.loads(json_part)
    except (ValueError, RecursionError):
        # Not decodable JSON (JSONDecodeError is a ValueError): show the raw
        # text. Unlike a bare ``except``, KeyboardInterrupt/SystemExit propagate.
        formatted.append(json_part[:200] + "..." if len(json_part) > 200 else json_part)
    else:
        formatted.append(format_json_nicely(parsed_json))

    return "\n".join(formatted)

def format_assistant_content(content):
    """Format an assistant message.

    Content mentioning "SQL" (in any letter case) is returned in full under a
    "generated SQL" heading; anything else is truncated to 400 characters.
    """
    mentions_sql = "SQL" in content.upper()
    if not mentions_sql:
        if len(content) > 400:
            return content[:400] + "..."
        return content
    return f"💾 Code SQL généré:\n{content}"

def format_ipython_content(content):
    """Prefix an execution result with a status icon.

    ``Error`` anywhere in the text marks a failure and takes precedence.
    Otherwise any casing of ``success`` marks a success, so messages such as
    "SQL executed successfully" are recognized. Anything else gets a neutral
    icon.
    """
    if "Error" in content:
        return f"❌ {content}"
    # Case-insensitive: the exact word "Success" never appears in the
    # "... executed successfully" messages this file generates.
    if "success" in content.lower():
        return f"✅ {content}"
    return f"📋 {content}"

def format_json_nicely(json_obj, indent=2):
    """Summarize a decoded JSON value for display.

    A dict is rendered one key per line: scalar values are shown as is, while
    nested dicts and lists are replaced by their type name and element count.
    Any other value is dumped as indented JSON with non-ASCII characters kept.

    Args:
        json_obj: The decoded JSON value.
        indent: Indentation passed to ``json.dumps`` for non-dict values.
    """
    if not isinstance(json_obj, dict):
        return json.dumps(json_obj, indent=indent, ensure_ascii=False)

    def describe(key, value):
        # Containers are summarized instead of expanded.
        if isinstance(value, (dict, list)):
            return f"  {key}: {type(value).__name__} avec {len(value)} éléments"
        return f"  {key}: {value}"

    return "\n".join(describe(key, value) for key, value in json_obj.items())

# Convenience wrapper to analyze a conversation file directly
def analyze_conversation_file(file_path, save_output=False, output_path=None):
    """Format a JSON conversation file and optionally save the report.

    Args:
        file_path: Path of the JSON conversation file.
        save_output: When true, the report is also written to a text file.
        output_path: Destination of the report. When omitted, the name is
            derived from the input file name with an ``_analyzed.txt`` suffix.

    Returns:
        The formatted conversation string produced by ``parse_conversation``.
    """
    result = parse_conversation(file_path)

    if not save_output:
        return result

    if output_path is None:
        # Derive the output name from the input: drop ".json", normalize the
        # separators and keep only the last path component.
        normalized = file_path.replace('.json', '').replace('\\', '/')
        base_name = normalized.split('/')[-1]
        output_path = f"{base_name}_analyzed.txt"

    try:
        with open(output_path, 'w', encoding='utf-8') as report_file:
            report_file.write(result)
        print(f"✅ Analyse sauvegardée dans: {output_path}")
    except Exception as save_error:
        # Saving is best effort: the report is still returned to the caller.
        print(f"❌ Erreur lors de la sauvegarde: {save_error}")

    return result

# Usage example: format the saved transcript and print the report when this
# code runs as the main module.
if __name__ == "__main__":
    result = parse_conversation('conversation_history.json')
    print(result)


📋 ANALYSE DE CONVERSATION


------------------------------------------------------------
🤖 SYSTÈME - Message 1
------------------------------------------------------------
📋 Instructions système:
   You are a highly capable AI specialized in generating optimized SQL code.

🎯 Objective:
   Generate SQL code that transforms data from a **source database (A)** into a **target database (B)** named `omop`.
   Each input will include one or more JSON objects describing the transformation logic for a **single specific target table**.

🎯 Input Details:
   Each JSON file will contain:
  • The structure of the source table(s) involved
  • Column-level transformation rules
  • Optional metadata (e.g. view names, output type, primary key logic)

🎯 Constraints:
  • Only generate SQL for the specific target table defined in the JSON.
  • Do not infer or process transformations for any other target tables.
  • All SQL must be limited to transforming data for the **one target table** described.

🎯 Ins