In [14]:
import json
import os
import re

import psycopg2
import requests

def execute_sql_code_from_string(input_string):
    """Run every fenced ``sql`` code block found in ``input_string`` on PostgreSQL.

    Blocks are located with a case-insensitive regex that matches
    three-backtick fences tagged ``sql``. Each block is sent to the server
    unchanged, in order of appearance, on an autocommit connection.

    Connection settings are read from the libpq-style environment variables
    PGDATABASE, PGUSER, PGPASSWORD, PGHOST and PGPORT. When a variable is not
    set, the value that used to be hard-coded here is used as a fallback.

    Args:
        input_string: Text that may contain one or more fenced SQL blocks.
            ``None`` is treated like an empty string.

    Returns:
        None. Progress and errors are reported with ``print``. A
        ``psycopg2.Error`` is caught and printed, not re-raised.
    """
    sql_blocks = re.findall(
        r'```sql\s*(.*?)\s*```',
        input_string or "",
        re.DOTALL | re.IGNORECASE,
    )

    if not sql_blocks:
        print("Aucun bloc SQL trouvé.")
        return

    # SECURITY NOTE(review): the statements below come from free text and are
    # executed verbatim with autocommit enabled, so nothing can be rolled back.
    # Only point this at a disposable database, using a low-privilege role.
    conn = None
    cursor = None
    try:
        conn = psycopg2.connect(
            dbname=os.environ.get("PGDATABASE", "postgres"),
            user=os.environ.get("PGUSER", "admin"),
            # Fallback kept only so existing local setups keep working;
            # set PGPASSWORD instead of relying on it.
            password=os.environ.get("PGPASSWORD", "adminpassword"),
            host=os.environ.get("PGHOST", "localhost"),
            port=os.environ.get("PGPORT", "5432"),
        )
        conn.autocommit = True
        cursor = conn.cursor()

        for sql_code in sql_blocks:
            print(f"Exécution du SQL :\n{sql_code}\n---")
            cursor.execute(sql_code)

        print("Exécution terminée avec succès.")

    except psycopg2.Error as e:
        print(f"Erreur lors de la connexion ou exécution SQL : {e}")

    finally:
        # Explicit sentinels: connect() may fail before either name is bound.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
            
def groq_chat_completion_stream_clean(prompt, model="llama3-70b-8192"):
    """Send ``prompt`` to the Groq chat-completions endpoint and return the answer.

    The request is made in streaming mode. Every ``data:`` line of the
    response is parsed as JSON and the ``choices[0].delta.content`` fragments
    are concatenated. Lines that are not valid JSON are skipped.

    Args:
        prompt: Text sent as the single ``user`` message.
        model: Model identifier placed in the request payload.

    Returns:
        The concatenated streamed content, stripped of surrounding whitespace.

    Raises:
        ValueError: If the ``GROQ_API_KEY`` environment variable is unset or empty.
        Exception: If the API answers with a status code other than 200.
        requests.exceptions.RequestException: Propagated from ``requests``
            on network failures, including timeouts.
    """
    # The key is read from the environment so that no secret is stored in the
    # notebook; any key previously committed here should be revoked.
    GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
    if not GROQ_API_KEY:
        raise ValueError("La clé API Groq n'est pas configurée dans le fichier .env")

    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {GROQ_API_KEY}"
    }
    data = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": True
    }

    fragments = []
    # (connect timeout, read timeout) in seconds: without it a stalled
    # connection would block the notebook cell forever.
    with requests.post(url, headers=headers, json=data, stream=True, timeout=(10, 120)) as response:
        if response.status_code != 200:
            raise Exception(f"Erreur API: {response.status_code} - {response.text}")

        for chunk in response.iter_lines():
            if not chunk:
                continue
            decoded_chunk = chunk.decode('utf-8')
            if not decoded_chunk.startswith("data:"):
                continue
            try:
                parsed = json.loads(decoded_chunk[5:].strip())
            except json.JSONDecodeError:
                # Non-JSON payloads on a "data:" line are ignored.
                continue
            if not isinstance(parsed, dict):
                continue
            # Guard against an empty "choices" list or a null "delta".
            choices = parsed.get("choices") or [{}]
            delta = choices[0].get("delta") or {}
            content = delta.get("content")
            if content:
                fragments.append(content)

    return "".join(fragments).strip()


# Prompt sent to the model as a single user message: a system-style brief
# followed by the JSON specification of one target table.
# The JSON part must stay parseable, so the regex backslashes are doubled for
# JSON and doubled again because this is a regular (non-raw) Python string.
prompt = """
SYSTEM PROMPT :

You are a highly capable AI specialized in generating optimized SQL code.

## Objective:
	Your task is to generate SQL code that transforms data from a **source database (A)** to a **target database (B)**.
	You will receive, for each step, a JSON that describes the transformation needed for **one specific target table only**, including:
	  - The structure of the source table(s) involved (columns, data types).
	  - The details of the transformation for each target column ("transformation_type" and "description").

## Important constraints:
	- The migration will be handled **table by table**, not globally.
	- You must focus **only on the target table provided in the JSON**, ignoring any other tables until explicitly given.
	- Do not attempt to infer transformations for other target tables.

## Instructions:
	- Carefully analyze the JSON to fully understand the data model and the transformation rules for the target table.
	- Generate clean, well-structured, and **highly optimized SQL code** that performs the described transformation **only for the specified target table**.

## Best practices to strictly follow.
To ensure the SQL code is efficient, readable, and maintainable, please follow these best practices:
	- Prioritize code efficiency and readability.
	- Use **views** for complex queries to improve modularity.
	- Use **stored procedures** if tasks are repetitive or part of a workflow.
	- Prefer **JOINs** over subqueries whenever possible.
	- Always **limit selected columns** explicitly (never use `SELECT *`).
	- Apply **indexing strategies** where relevant to improve performance.
	- Comment your SQL code where necessary to explain complex logic.

## Output:
	- Provide only the SQL code that builds the **target table transformation as described in the JSON**.
	- Include inline comments if needed to clarify complex operations.
	- Do not generate explanations or verbal outputs SQL code only.
	
## Errors Handling :

Your code will be tested in real time after you give the output. If there are any errors, they will be sent to you through the IPython role

USER INPUT 

{
"target_table": "clients",
"source_table_involve": "CLIENTS_SOURCE",
"columns": [
{
"target_column": "client_id",
"source_column": "ID_CLIENT",
"transformation_type": "direct_copy",
"description": "Client ID kept as is."
},
{
"target_column": "last_name",
"source_column": "NOM_PRENOM",
"transformation_type": "split_string",
"delimiter": [" ", "_" , "-" ],
"part_index": 0,
"description": "Extracting the last name from the composite field NOM_PRENOM."
},
{
"target_column": "first_name",
"source_column": "NOM_PRENOM",
"transformation_type": "split_string",
"delimiter": " ",
"part_index": 1,
"description": "Extracting the first name from the composite field NOM_PRENOM."
},
{
"target_column": "birth_date",
"source_column": "DATE_NAISSANCE",
"transformation_type": "date_format",
"source_format": "DD/MM/YYYY",
"target_format": "YYYY-MM-DD",
"description": "Converting the date to ISO format. Source format is DD/MM/YYYY."
},
{
"target_column": "postal_code",
"source_column": "ADRESSE_COMPLETE",
"transformation_type": "regex_extract",
"description": "Extracting the postal code using a regex using this pattern '\\\\b(\\\\d{5})\\\\b'"
},
{
"target_column": "email",
"source_column": "EMAIL",
"transformation_type": "lowercase",
"description": "Converting the email to lowercase."
}
]
}
"""

# Query the model with the prompt built above and show its raw answer.
result = groq_chat_completion_stream_clean(prompt=prompt)
print(result)

```sql
-- Create the target table with the required structure
CREATE TABLE clients (
    client_id INT,
    last_name VARCHAR(255),
    first_name VARCHAR(255),
    birth_date DATE,
    postal_code VARCHAR(10),
    email VARCHAR(255)
);

-- Create a view to handle complex transformations
CREATE VIEW clients_source_view AS
SELECT 
    ID_CLIENT,
    NOM_PRENOM,
    DATE_NAISSANCE,
    ADRESSE_COMPLETE,
    EMAIL
FROM CLIENTS_SOURCE;

-- Insert into target table with transformations
INSERT INTO clients (
    client_id,
    last_name,
    first_name,
    birth_date,
    postal_code,
    email
)
SELECT 
    CSV.ID_CLIENT AS client_id,
    -- Split NOM_PRENOM to extract last name
    SPLIT_PART(CSV.NOM_PRENOM, ' ', 1) AS last_name,
    -- Split NOM_PRENOM to extract first name (handle multiple delimiters)
    CASE 
        WHEN POSITION('_', CSV.NOM_PRENOM) > 0 THEN SPLIT_PART(REPLACE(CSV.NOM_PRENOM, '_', ' '), ' ', 2)
        WHEN POSITION('-', CSV.NOM_PRENOM) > 0 THEN SPLIT_PART(REPLACE(C

llama3.1

Here is the optimized SQL code that transforms the data from the source database to the target database for the target table "clients":

```sql
CREATE OR REPLACE VIEW clients_view AS
SELECT 
    c.ID_CLIENT AS client_id,
    SPLIT(NOM_PRENOM, ' ', 1) AS first_name,
    SPLIT(NOM_PRENOM, ' ', 2) AS last_name,
    TO_DATE(SUBSTRING(DATE_NAISSANCE FROM '^\d{2}/\d{2}/\d{4}$'), 'DD/MM/YYYY') AS birth_date,
    REGEXP_EXTRACT(ADRESSE_COMPLETE, r'\b(\d{5})\b') AS postal_code,
    LOWER(EMAIL) AS email
FROM 
    CLIENTS_SOURCE c;
```

This SQL code creates a view called `clients_view` that performs the described transformations for the target table "clients".

In [None]:
# Split two column subsets out of `df` and export each one as a CSV file.
location = df.loc[:, ['ADDRESS', 'CITY', 'STATE', 'ZIP']].copy()

# NOTE(review): the COUNTY column is relabelled STATE_ABBREVIATION here —
# confirm that this is the intended mapping.
state_map = df.loc[:, ['STATE', 'COUNTY']].rename(columns={'COUNTY': 'STATE_ABBREVIATION'})

location.to_csv("/home/petriscyril/Desktop/Agent_ETL/Synthea/location.csv", index=False)
state_map.to_csv("/home/petriscyril/Desktop/Agent_ETL/Synthea/state_map.csv", index=False)

In [28]:
# List the column names of `df` as a plain Python list.
print(list(df.columns))

['Id', 'BIRTHDATE', 'DEATHDATE', 'SSN', 'DRIVERS', 'PASSPORT', 'PREFIX', 'FIRST', 'MIDDLE', 'LAST', 'SUFFIX', 'MAIDEN', 'MARITAL', 'RACE', 'ETHNICITY', 'GENDER', 'BIRTHPLACE', 'ADDRESS', 'CITY', 'STATE', 'COUNTY', 'FIPS', 'ZIP', 'LAT', 'LON', 'HEALTHCARE_EXPENSES', 'HEALTHCARE_COVERAGE', 'INCOME']


In [32]:
# Display the address-related columns together with MAIDEN.
df.loc[:, ['ADDRESS', 'CITY', 'STATE', 'ZIP', 'MAIDEN']]

Unnamed: 0,ADDRESS,CITY,STATE,ZIP,MAIDEN
0,964 Johnson Throughway,GIRALDO,ANTIOQUIA,5306,
1,1041 Deckow Viaduct Suite 7,BETANIA,ANTIOQUIA,5091,
2,641 Orn Trafficway Unit 91,SABANETA,ANTIOQUIA,5631,
3,324 Cronin Street,LA ESTRELLA,ANTIOQUIA,5380,
4,380 Kuhn Key Unit 52,MEDELLIN,ANTIOQUIA,5001,
...,...,...,...,...,...
10632,207 Olson Gate,TUNJA,BOYACA,15001,
10633,563 Ortiz Trace,PAUNA,BOYACA,15531,
10634,393 Shields Route,SAMACA,BOYACA,15646,
10635,824 Gulgowski Hollow,DUITAMA,BOYACA,15238,
