In [19]:
import json
import os
import re

import pandas as pd
import psycopg2
import requests

def execute_sql_code_from_string(input_string, db_config=None):
    """Run the ```sql fenced blocks found in ``input_string`` against PostgreSQL.

    Blocks are executed in order on a single autocommit connection. Execution
    stops at the first block that produces a result set: that result is
    returned as a DataFrame and any later blocks are not executed.

    Args:
        input_string: Text containing zero or more Markdown code fences tagged
            ``sql`` (matched case-insensitively).
        db_config: Optional dict of ``psycopg2.connect`` keyword arguments
            (``dbname``, ``user``, ``password``, ``host``, ``port``, ...) that
            override the built-in defaults. ``None`` keeps the defaults.

    Returns:
        A ``pandas.DataFrame`` for the first block that returns rows, or
        ``None`` when no SQL block is found, when no block returns rows, or
        when a ``psycopg2.Error`` occurs (the error is printed, not re-raised).
    """
    sql_blocks = re.findall(r'```sql\s*(.*?)\s*```', input_string, re.DOTALL | re.IGNORECASE)

    if not sql_blocks:
        print("No SQL block found.")
        return None

    # Built-in defaults; callers should pass `db_config` rather than edit
    # these so credentials do not have to live in the notebook.
    connect_kwargs = {
        "dbname": "synthea",
        "user": "admin",
        "password": "adminpassword",
        "host": "localhost",
        "port": "5432",
    }
    if db_config:
        connect_kwargs.update(db_config)

    # Explicit sentinels so the cleanup below knows what was actually opened.
    conn = None
    cursor = None
    try:
        conn = psycopg2.connect(**connect_kwargs)
        conn.autocommit = True
        cursor = conn.cursor()

        for sql_code in sql_blocks:
            print(f"Executing SQL:\n{sql_code}\n---")
            # NOTE(review): the SQL text is executed verbatim with autocommit
            # on; it cannot be parameterized, so only pass reviewed input.
            cursor.execute(sql_code)

            # `description` is only set for statements that return rows.
            if cursor.description:
                colnames = [desc[0] for desc in cursor.description]
                rows = cursor.fetchall()
                return pd.DataFrame(rows, columns=colnames)

        return None

    except psycopg2.Error as e:
        print(f"SQL Execution Error: {e}")
        return None

    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
            
def groq_chat_completion_stream_clean(prompt, model="llama3-70b-8192", api_key=None):
    """Send ``prompt`` to the Groq chat-completions endpoint and return the reply text.

    The request is made with ``stream=True``; the ``delta.content`` fragments
    of each ``data:`` line are concatenated and the stripped result returned.

    Args:
        prompt: Text sent as a single ``user`` message.
        model: Model identifier placed in the request body.
        api_key: Groq API key. When ``None`` the ``GROQ_API_KEY`` environment
            variable is used.

    Returns:
        The concatenated streamed content with surrounding whitespace removed.

    Raises:
        ValueError: If no API key is supplied or configured.
        Exception: If the HTTP status code is not 200.
    """
    # SECURITY: the key must not be embedded in the notebook. The key that was
    # previously hard-coded here has been exposed and should be revoked.
    if api_key is None:
        api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise ValueError("La clé API Groq n'est pas configurée dans le fichier .env")

    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    data = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": True
    }
    fragments = []
    with requests.post(url, headers=headers, json=data, stream=True) as response:
        if response.status_code != 200:
            raise Exception(f"Erreur API: {response.status_code} - {response.text}")

        for chunk in response.iter_lines():
            if not chunk:
                continue
            decoded_chunk = chunk.decode('utf-8')
            if not decoded_chunk.startswith("data:"):
                continue

            payload = decoded_chunk[5:].strip()
            # OpenAI-style streams end with a literal "[DONE]" line; stop
            # explicitly instead of relying on it failing to parse as JSON.
            if payload == "[DONE]":
                break
            try:
                parsed = json.loads(payload)
            except json.JSONDecodeError:
                continue
            if not isinstance(parsed, dict):
                continue

            # `choices` may be missing or an empty list on some chunks;
            # indexing [0] on an empty list used to raise IndexError.
            choices = parsed.get("choices") or [{}]
            content = (choices[0].get("delta") or {}).get("content")
            if content:
                fragments.append(content)

    return "".join(fragments).strip()

In [14]:
# Prompt for the SQL-generation experiment: system instructions followed by a
# JSON description of one target table. The embedded JSON is kept well-formed
# (every key has a value, no unescaped quotes) so the model is not asked to
# interpret a broken payload.
prompt = """
SYSTEM PROMPT :

You are a highly capable AI specialized in generating optimized SQL code.

## Objective:
	Your task is to generate SQL code that transforms data from a **source database (A)** to a **target database (B)**.
	You will receive, for each step, a JSON that describes the transformation needed for **one specific target table only**, including:
	  - The structure of the source table(s) involved (columns, data types).
	  - The details of the transformation for each target column ("transformation_type" and "description").

## Important constraints:
	- The migration will be handled **table by table**, not globally.
	- You must focus **only on the target table provided in the JSON**, ignoring any other tables until explicitly given.
	- Do not attempt to infer transformations for other target tables.

## Instructions:
	- Carefully analyze the JSON to fully understand the data model and the transformation rules for the target table.
	- Generate clean, well-structured, and **highly optimized SQL code** that performs the described transformation **only for the specified target table**.

## Best practices to strictly follow.
To ensure the SQL code is efficient, readable, and maintainable, please follow these best practices:
	- Prioritize code efficiency and readability.
	- Use **views** for complex queries to improve modularity.
	- Use **stored procedures** if tasks are repetitive or part of a workflow.
	- Prefer **JOINs** over subqueries whenever possible.
	- Always **limit selected columns** explicitly (never use `SELECT *`).
	- Apply **indexing strategies** where relevant to improve performance.
	- Comment your SQL code where necessary to explain complex logic.

## Output:
	- Provide only the SQL code that builds the **target table transformation as described in the JSON**.
	- Include inline comments if needed to clarify complex operations.
	- Do not generate explanations or verbal outputs SQL code only.
	
## Errors Handling :

Your code will be tested in real time after you give the output. If there are any errors, they will be sent to you through the IPython role

USER INPUT 

{
"target_table": "clients",
"source_table_involve": "CLIENTS_SOURCE",
"columns": [
{
"target_column": "client_id",
"source_column": "ID_CLIENT",
"transformation_type": "direct_copy",
"description": "Client ID kept as is."
},
{
"target_column": "last_name",
"source_column": "NOM_PRENOM",
"transformation_type": "split_string",
"delimiter": [" ", "_" , "-" ],
"part_index": 0,
"description": "Extracting the last name from the composite field NOM_PRENOM."
},
{
"target_column": "first_name",
"source_column": "NOM_PRENOM",
"transformation_type": "split_string",
"delimiter": " ",
"part_index": 1,
"description": "Extracting the first name from the composite field NOM_PRENOM."
},
{
"target_column": "birth_date",
"source_column": "DATE_NAISSANCE",
"transformation_type": "date_format",
"source_format": "DD/MM/YYYY",
"target_format": "YYYY-MM-DD",
"description": "Converting the date to ISO format. Source format: DD/MM/YYYY."
},
{
"target_column": "postal_code",
"source_column": "ADRESSE_COMPLETE",
"transformation_type": "regex_extract",
"description": "Extracting the postal code using the regex pattern \\\\b(\\\\d{5})\\\\b"
},
{
"target_column": "email",
"source_column": "EMAIL",
"transformation_type": "lowercase",
"description": "Converting the email to lowercase."
}
]
}
"""

result = groq_chat_completion_stream_clean(prompt)
print(result)

```sql
-- Create the target table with the required structure
CREATE TABLE clients (
    client_id INT,
    last_name VARCHAR(255),
    first_name VARCHAR(255),
    birth_date DATE,
    postal_code VARCHAR(10),
    email VARCHAR(255)
);

-- Create a view to handle complex transformations
CREATE VIEW clients_source_view AS
SELECT 
    ID_CLIENT,
    NOM_PRENOM,
    DATE_NAISSANCE,
    ADRESSE_COMPLETE,
    EMAIL
FROM CLIENTS_SOURCE;

-- Insert into target table with transformations
INSERT INTO clients (
    client_id,
    last_name,
    first_name,
    birth_date,
    postal_code,
    email
)
SELECT 
    CSV.ID_CLIENT AS client_id,
    -- Split NOM_PRENOM to extract last name
    SPLIT_PART(CSV.NOM_PRENOM, ' ', 1) AS last_name,
    -- Split NOM_PRENOM to extract first name (handle multiple delimiters)
    CASE 
        WHEN POSITION('_', CSV.NOM_PRENOM) > 0 THEN SPLIT_PART(REPLACE(CSV.NOM_PRENOM, '_', ' '), ' ', 2)
        WHEN POSITION('-', CSV.NOM_PRENOM) > 0 THEN SPLIT_PART(REPLACE(C

In [11]:
# Sanity check: send a fenced row-count query on `patients` through the SQL helper.
check_query = "```sql\nSELECT COUNT(*) FROM patients;\n```"
print(execute_sql_code_from_string(check_query))

Exécution du SQL :
SELECT COUNT(*) FROM patients;
---
Exécution terminée avec succès.
None


In [None]:
import psycopg2
from datetime import datetime
import pytest

def test_table_person_structure(docker_db):
    """Validate the ``person`` table of the ``omop`` database.

    Three checks run in order, each with its own assertion:
      1. every expected column exists in ``public.person``;
      2. each expected column has the expected ``information_schema`` data type;
      3. none of the ID columns contains NULL values.

    Args:
        docker_db: Not referenced in the body; presumably a pytest fixture
            that provisions the database — TODO confirm against conftest.
    """
    expected_columns = {
        'person_id': 'integer',
        'gender_concept_id': 'integer',
        'year_of_birth': 'integer',
        'month_of_birth': 'integer',
        'day_of_birth': 'integer',
        'birth_datetime': 'timestamp without time zone',
        'race_concept_id': 'integer',
        'ethnicity_concept_id': 'integer',
        'location_id': 'integer',
        'provider_id': 'integer',
        'care_site_id': 'integer',
        'person_source_value': 'character varying',
        'gender_source_value': 'character varying',
        'gender_source_concept_id': 'integer',
        'race_source_value': 'character varying',
        'race_source_concept_id': 'integer',
        'ethnicity_source_value': 'character varying',
        'ethnicity_source_concept_id': 'integer'
    }

    id_columns = [
        'person_id',
        'gender_concept_id',
        'race_concept_id',
        'ethnicity_concept_id',
        'location_id',
        'provider_id',
        'care_site_id',
        'gender_source_concept_id',
        'race_source_concept_id',
        'ethnicity_source_concept_id'
    ]

    conn = psycopg2.connect(
        host="localhost",
        database="omop",
        user="admin",
        password="adminpassword",
        port="5432"
    )
    # `with conn` only commits or rolls back the transaction; psycopg2 does
    # not close the connection on exit, so it is closed explicitly here.
    try:
        with conn, conn.cursor() as cursor:
            cursor.execute("""
                SELECT column_name, data_type
                FROM information_schema.columns
                WHERE table_name = 'person'
                AND table_schema = 'public';  -- replace with your schema if not 'public'
            """)

            actual_columns = {row[0]: row[1] for row in cursor.fetchall()}

            missing_columns = set(expected_columns.keys()) - set(actual_columns.keys())
            assert not missing_columns, f"Missing columns: {missing_columns}"

            type_mismatches = []
            for column, expected_type in expected_columns.items():
                if column in actual_columns and actual_columns[column] != expected_type:
                    type_mismatches.append(
                        f"{column}: expected {expected_type}, got {actual_columns[column]}"
                    )

            assert not type_mismatches, "Type mismatches:\n" + "\n".join(type_mismatches)

            # Check for NULL values in ID columns. The column names come from
            # the fixed `id_columns` list above, never from external input.
            null_check_issues = []
            for id_column in id_columns:
                if id_column in actual_columns:
                    cursor.execute(f"""
                        SELECT COUNT(*) 
                        FROM person 
                        WHERE {id_column} IS NULL;
                    """)
                    null_count = cursor.fetchone()[0]

                    if null_count > 0:
                        null_check_issues.append(
                            f"{id_column} has {null_count} NULL values"
                        )

            assert not null_check_issues, "NULL value issues in ID columns:\n" + "\n".join(null_check_issues)
    finally:
        conn.close()
        

llama3.1

Here is the optimized SQL code that transforms the data from the source database to the target database for the target table "clients":

```sql
CREATE OR REPLACE VIEW clients_view AS
SELECT 
    c.ID_CLIENT AS client_id,
    SPLIT(NOM_PRENOM, ' ', 1) AS first_name,
    SPLIT(NOM_PRENOM, ' ', 2) AS last_name,
    TO_DATE(SUBSTRING(DATE_NAISSANCE FROM '^\d{2}/\d{2}/\d{4}$'), 'DD/MM/YYYY') AS birth_date,
    REGEXP_EXTRACT(ADRESSE_COMPLETE, r'\b(\d{5})\b') AS postal_code,
    LOWER(EMAIL) AS email
FROM 
    CLIENTS_SOURCE c;
```

This SQL code creates a view called `clients_view` that performs the described transformations for the target table "clients".

In [26]:
# Load the OMOP person extract from its local CSV and preview the first five rows.
person = pd.read_csv('/home/petriscyril/Desktop/OMOP/OMOP_dataset/person.csv')
person.head(5)

Unnamed: 0,person_id,gender_concept_id,year_of_birth,month_of_birth,day_of_birth,birth_datetime,race_concept_id,ethnicity_concept_id,location_id,provider_id,care_site_id,person_source_value,gender_source_value,gender_source_concept_id,race_source_value,race_source_concept_id,ethnicity_source_value,ethnicity_source_concept_id
0,1,8532,2005,7,6,2005-07-06,0,38003563,314,,,0002513e-8009-d8c4-9bf8-bdbb316deae8,F,0,native,0,hispanic,0
1,2,8532,2000,10,18,2000-10-18,8527,38003563,96,,,00035f01-cb9a-d253-eb67-7007a4e19ded,F,0,white,0,hispanic,0
2,3,8532,2001,9,25,2001-09-25,8527,38003563,103,,,000b8952-a1f1-e576-834c-d55c9d7b0941,F,0,white,0,hispanic,0
3,4,8532,1989,10,1,1989-10-01,8527,38003563,632,,,001046c2-98bd-1b63-14c4-ab8f9a7ddfdc,F,0,white,0,hispanic,0
4,5,8532,1992,11,10,1992-11-10,8527,38003563,456,,,0011424c-182d-59ac-5942-056a7f68d983,F,0,white,0,hispanic,0


In [33]:
import modal

# Hugging Face Hub repository to serve, and the commit hash meant to pin it.
MODEL_ID = "NousResearch/Meta-Llama-3-8B"
MODEL_REVISION = "315b20096dc791d381d514deb5f8bd9c8d6d3061"

# Container image with pinned inference dependencies.
image = modal.Image.debian_slim().pip_install(
    "transformers==4.49.0", "torch==2.6.0", "accelerate==1.4.0"
)
app = modal.App("example-base-Meta-Llama-3-8B", image=image)

# Modal GPU strings take the form "<TYPE>[-<MEMORY>][:<COUNT>]";
# the previous value "A100, 40 GB" is not a recognised GPU specification.
GPU_CONFIG = "A100-40GB"

# Persistent volume mounted at CACHE_DIR and used as the Hugging Face cache.
CACHE_DIR = "/cache"
cache_vol = modal.Volume.from_name("hf-hub-cache", create_if_missing=True)

@app.cls(
    gpu=GPU_CONFIG,
    volumes={CACHE_DIR: cache_vol},
    scaledown_window=10,
    timeout=60,
)
@modal.concurrent(max_inputs=15)
class Model:
    """Modal class that loads MODEL_ID once per container and serves text generation."""

    @modal.enter()
    def setup(self):
        """Download the pinned model into the cache and build the generation pipeline.

        Runs on container start (``@modal.enter``). Heavy imports are kept
        local because these packages are installed in the Modal image.
        """
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

        from huggingface_hub import snapshot_download

        # Pin every Hub access to MODEL_REVISION; it was previously defined
        # but never used, so the download silently tracked the default branch.
        model_path = snapshot_download(
            repo_id=MODEL_ID, revision=MODEL_REVISION, cache_dir=CACHE_DIR
        )

        print(f"Model downloaded to: {model_path}")

        # dtype and device placement must be given at load time: when
        # `pipeline()` receives an already-instantiated model, its
        # `model_kwargs` / `device_map` arguments are not applied to it.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            revision=MODEL_REVISION,
            cache_dir=CACHE_DIR,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID, revision=MODEL_REVISION, cache_dir=CACHE_DIR
        )

        self.pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
        )

    @modal.method()
    def generate(self, input: str):
        """Run the text-generation pipeline on ``input`` and return its raw output."""
        return self.pipeline(input)

@app.local_entrypoint()
def main(prompt: str = None):
    """Local entry point: send ``prompt`` (or a default one) to ``Model.generate`` and print the result."""
    fallback_prompt = "Please write a Python function to compute the Fibonacci numbers."
    chosen_prompt = fallback_prompt if prompt is None else prompt
    print(Model().generate.remote(chosen_prompt))

In [35]:
# Start Modal's browser-based token flow; on success the CLI reports the token
# being written to ~/.modal.toml.
# NOTE(review): in the recorded run the following cell still raised AuthError;
# the kernel may need a restart to pick up the new token — confirm.
!modal token new

[2K[31mWas not able to launch web browser[0mthe web browser
Please go to this URL manually and complete the flow:

[2K]8;id=641599;https://modal.com/token-flow/tf-mBh8gSwTqLYJ6799dtHE4m\[4;94mhttps://modal.com/token-flow/tf-mBh8gSwTqLYJ6799dtHE4m[0m]8;;\

[2K[32m⠋[0m Waiting for authentication in the web browser
[2K[32m⠙[0m Waiting for token flow to complete...omplete...
[1A[2K[32mWeb authentication finished successfully![0m
[32mToken is connected to the [0m[35mpetriscyril9[0m[32m workspace.[0m
Verifying token against [4;34mhttps://api.modal.com[0m
[32mToken verified successfully![0m
[?25l[32m⠋[0m Storing token
[1A[2K[32mToken written to [0m[35m/home/petriscyril/[0m[35m.modal.toml[0m[32m in profile [0m[35mpetriscyril9[0m[32m.[0m


In [36]:
# Call the remote method directly with a specific prompt.
# Modal Functions can only be invoked while the app is running, so the call
# is made inside an ephemeral `app.run()` context.
with app.run():
    result = Model().generate.remote("Please write a Python function to compute the Fibonacci numbers.")
print(result)

AuthError: Token missing. Could not authenticate client. If you have token credentials, see modal.com/docs/reference/modal.config for setup help. If you are a new user, register an account at modal.com, then run `modal token new`.

TypeError: _Client.__init__() missing 3 required positional arguments: 'server_url', 'client_type', and 'credentials'

In [None]:
Was not able to launch web browserthe web browser
Please go to this URL manually and complete the flow:

]8;id=641599;https://modal.com/token-flow/tf-mBh8gSwTqLYJ6799dtHE4m\https://modal.com/token-flow/tf-mBh8gSwTqLYJ6799dtHE4m]8;;\

⠋ Waiting for authentication in the web browser
⠧ Waiting for token flow to complete...omplete...