In [2]:
import requests
import json
import re
import psycopg2
import pandas as pd

def execute_sql_code_from_string(input_string):
    sql_blocks = re.findall(r'```sql\s*(.*?)\s*```', input_string, re.DOTALL | re.IGNORECASE)
    
    if not sql_blocks:
        print("No SQL block found.")
        return None
    
    try:
        conn = psycopg2.connect(
            dbname="synthea",
            user="admin",
            password="adminpassword",
            host="localhost",
            port="5432"
        )
        conn.autocommit = True
        cursor = conn.cursor()

        for sql_code in sql_blocks:
            print(f"Executing SQL:\n{sql_code}\n---")
            cursor.execute(sql_code)

            if cursor.description:
                # Get column names
                colnames = [desc[0] for desc in cursor.description]
                # Fetch all data
                rows = cursor.fetchall()
                # Convert to DataFrame
                df = pd.DataFrame(rows, columns=colnames)
                return df

        return None  

    except psycopg2.Error as e:
        print(f"SQL Execution Error: {e}")
        return None

    finally:
        if 'cursor' in locals():
            cursor.close()
        if 'conn' in locals():
            conn.close()
            
def groq_chat_completion_stream_clean(prompt, model="llama3-70b-8192"):
    GROQ_API_KEY = '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'
    if not GROQ_API_KEY:
        raise ValueError("La clé API Groq n'est pas configurée dans le fichier .env")
    
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {GROQ_API_KEY}"
    }
    data = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": True  
    }
    response_text = ""
    with requests.post(url, headers=headers, json=data, stream=True) as response:
        if response.status_code != 200:
            raise Exception(f"Erreur API: {response.status_code} - {response.text}")
        
        for chunk in response.iter_lines():
            if chunk:
                decoded_chunk = chunk.decode('utf-8')
                if decoded_chunk.startswith("data:"):
                    try:
                        parsed = json.loads(decoded_chunk[5:].strip())
                        content = parsed.get("choices", [{}])[0].get("delta", {}).get("content")
                        if content:
                            response_text += content
                    except json.JSONDecodeError:
                        continue  

    return response_text.strip()

In [6]:
with open("/prompt/prompt1.txt", "r", encoding="utf-8") as f:
    prompt = f.read()

json_format = pd.read_json("location.json")
json_str = json_format.to_json(orient="records", indent=2)

final_prompt = prompt + "\n\n" + json_str

result = groq_chat_completion_stream_clean(prompt)
print(result)

ValueError: Mixing dicts with non-Series may lead to ambiguous ordering.

In [11]:
check_query = """```sql
SELECT COUNT(*) FROM patients;
```"""
print(execute_sql_code_from_string(check_query))

Exécution du SQL :
SELECT COUNT(*) FROM patients;
---
Exécution terminée avec succès.
None


In [None]:
import psycopg2
from psycopg2.extras import RealDictCursor

def location_table_test(dbname, user, password, host='localhost', port=5432, cdm_schema='cdm') -> bool:
    query = f"""
        WITH duplicates AS (
            SELECT
                address_1,
                address_2,
                city,
                state,
                county,
                zip,
                location_source_value,
                COUNT(*) as count
            FROM {cdm_schema}.location
            GROUP BY
                address_1, address_2, city, state, county, zip, location_source_value
            HAVING COUNT(*) > 1
        )
        SELECT COUNT(*) as duplicate_count FROM duplicates;
    """
    query = f"""
            SELECT COUNT(*) AS total_rows,
               COUNT(DISTINCT location_id) AS unique_location_ids
            FROM omop.location;
    """
    try:
        with psycopg2.connect(
            dbname=dbname,
            user=user,
            password=password,
            host=host,
            port=port
        ) as conn:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                cur.execute(query)
                result = cur.fetchone()
                return result['duplicate_count'] == 0
    except Exception as e:
        print(f"Erreur lors de la vérification : {e}")
        return False



In [2]:
import unittest
import pandas as pd
from sqlalchemy import create_engine

def check_duplicate_locations(database_uri: str) -> pd.DataFrame:
    engine = create_engine(database_uri)
    query = """
    SELECT
        city,
        state,
        zip,
        address_1,
        address_2,
        county,
        location_source_value,
        COUNT(*) AS occurrence_count
    FROM omop.location
    GROUP BY
        city,
        state,
        zip,
        address_1,
        address_2,
        county,
        location_source_value
    HAVING COUNT(*) > 1;
    """
    with engine.connect() as conn:
        return pd.read_sql(query, conn)

class TestOMOPLocationTable(unittest.TestCase):

    def setUp(self):
        self.database_uri = "postgresql://user:password@localhost:5432/your_database"

    def test_no_duplicate_locations(self):
        duplicates = check_duplicate_locations(self.database_uri)
        self.assertTrue(
            duplicates.empty(
            f"❌ Duplicate location entries found in omop.location:\n{duplicates}"
        )

SyntaxError: incomplete input (1051297758.py, line 41)

prompt1 avec location 8

Here is the generated SQL code:

```sql
CREATE VIEW omop.location_view AS
WITH distinct_locations AS (
    SELECT 
      ROW_NUMBER() OVER (PARTITION BY city, state, zip ORDER BY city, state, zip) AS location_id,
      city,
      s.state_abbreviation AS state, 
      zip,
      zip AS location_source_value
    FROM 
      synthea.patients p
      LEFT JOIN omop.states_map s ON p.state = s.state
  )
SELECT 
  location_id,
  city,
  state,
  zip,
  location_source_value,
  CAST(NULL AS VARCHAR) AS address_1,
  CAST(NULL AS VARCHAR) AS address_2,
  CAST(NULL AS VARCHAR) AS county
FROM 
  distinct_locations;

CREATE TABLE omop.location AS 
SELECT * FROM omop.location_view;
```

This SQL code generates a `VIEW` that performs the desired transformation, and then creates the `omop.location` table based on the view. Note that I've used a CTE (Common Table Expression) to compute the `location_id` and perform the `state` lookup. I've also explicitly cast the `NULL` values to `VARCHAR` type as specified in the JSON.

prompt1 avec location llama8b mark = 8

Here is the optimized SQL code that transforms the data from the source table "synthea.patients" to the target table "omop.location" according to the provided JSON:
```sql
-- Create a view to perform the distinct grouping and transformations
CREATE VIEW location_transform AS
SELECT 
  -- Use ROW_NUMBER() to generate a unique location_id
  ROW_NUMBER() OVER (PARTITION BY p.city, p.state, p.zip) AS location_id,
  p.city,
  -- Perform the lookup on the omop.states_map table
  sm.state_abbreviation AS state,
  p.zip,
  p.zip AS location_source_value,
  -- Set address_1, address_2, and county to NULL
  CAST(NULL AS VARCHAR) AS address_1,
  CAST(NULL AS VARCHAR) AS address_2,
  CAST(NULL AS VARCHAR) AS county
FROM 
  synthea.patients p
  -- Join with the omop.states_map table for lookup
  LEFT JOIN omop.states_map ON p.state = sm.state;

-- Create the target table omop.location from the view
CREATE TABLE omop.location AS
SELECT 
  *
FROM 
  location_transform;
```
Note that I've followed the best practices to make the code efficient, readable, and maintainable. I've used a view to modularize the complex transformation and made sure to limit the selected columns. I've also used a join instead of a subquery for the lookup operation. Additionally, I've applied casting to ensure the correct data type for the address fields.