# Open-source local chatbot

Database connection

In [None]:
import os

import clickhouse_connect

# Connection settings are read from the environment so real credentials never
# have to be written into the notebook. The fallbacks are the placeholder
# values this cell used before, so it still runs unchanged when nothing is set.
client = clickhouse_connect.get_client(
    host=os.environ.get("CLICKHOUSE_HOST", "clickhouse"),
    port=int(os.environ.get("CLICKHOUSE_PORT", "8123")),  # HTTP interface
    username=os.environ.get("CLICKHOUSE_USER", "abc"),
    password=os.environ.get("CLICKHOUSE_PASSWORD", "xyz"),
)

In [2]:
# Table definition (qualified table name plus each column and its type) kept as
# plain text: nl_to_sql() interpolates this string verbatim into the prompt so
# the model knows which table and column names it may use.
schema_info = """superficie_bd.superficie_forestal (
    entidad_federativa String,
    poblacion UInt32,
    superficie_total Float64,
    superficie_forestal Float64,
    superficie_no_forestal Float64,
    superficie_con_arbolado Float64,
    area_cubierta_por_bosque Float64,
    area_cubierta_por_selva Float64,
    area_cubierta_por_manglar Float64,
    superficie_cubierta_por_otras_areas_arboladas Float64,
    area_cubierta_por_matorral_xerofilo Float64,
    area_cubierta_por_otras_areas_forestales Float64,
    superficie_destinada_a_actividades_agricolas_de_humedad Float64,
    superficie_destinada_a_actividades_agricolas_de_riego Float64,
    superficie_destinada_a_actividades_agricolas_de_temporal Float64,
    superficie_de_cuerpos_de_agua Float64,
    superficie_destinada_a_actividades_acuicolas Float64,
    area_de_pastizales_cultivados Float64,
    area_de_pastizales_inducidos Float64,
    superficie_sin_vegetacion_visible Float64,
    superficie_desprovista_de_vegetacion Float64,
    superficie_ocupada_por_asentamientos_humanos Float64
)
"""

In [3]:
# Values of the entidad_federativa column, written without accent marks.
# This is deliberately a string (not a list): nl_to_sql() pastes it verbatim
# into the prompt so the model can match state names in WHERE clauses.
entidad_federativa = "['Aguascalientes', 'Baja California Norte', 'Baja California Sur', 'Campeche', 'Coahuila', 'Colima', 'Chiapas', 'Chihuahua', 'Ciudad de Mexico', 'Durango', 'Guanajuato', 'Guerrero', 'Hidalgo', 'Jalisco', 'Estado de Mexico', 'Michoacan', 'Morelos', 'Nayarit', 'Nuevo Leon', 'Oaxaca', 'Puebla', 'Queretaro', 'Quintana Roo', 'San Luis Potosi', 'Sinaloa', 'Sonora', 'Tabasco', 'Tamaulipas', 'Tlaxcala', 'Veracruz', 'Yucatan', 'Zacatecas']"

In [18]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList

# Load the tokenizer and model once at module level so nl_to_sql() reuses them
# instead of reloading the weights on every call.
device = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when one is available, otherwise CPU
model_path = "merged_model"  # presumably a local directory with the merged fine-tuned weights — TODO confirm
tokenizer = AutoTokenizer.from_pretrained(model_path)


# The model is moved to `device`; nl_to_sql() moves its tokenized inputs to the same device.
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

# Process-wide PyTorch setting for float32 matrix multiplications; see the
# torch.set_float32_matmul_precision documentation for what 'high' permits.
torch.set_float32_matmul_precision('high')


class StopOnSemicolon(StoppingCriteria):
    """Stopping criterion that ends generation once a semicolon has been produced.

    Args:
        prompt_length: Number of leading tokens of ``input_ids[0]`` to ignore
            when looking for the semicolon. Pass the tokenized prompt length so
            that a ``;`` inside the prompt (for example in the user's request)
            does not stop generation immediately. The default of 0 inspects the
            whole sequence, which is the previous behaviour.
    """

    def __init__(self, prompt_length: int = 0):
        super().__init__()
        self.prompt_length = prompt_length

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # Only the first sequence is inspected; nl_to_sql() generates one at a time.
        generated_ids = input_ids[0][self.prompt_length:]
        decoded = tokenizer.decode(generated_ids, skip_special_tokens=True)
        return ";" in decoded

# Kept for any cell that still references the shared instance. nl_to_sql() builds
# its own criterion per call because it needs the prompt length of that call.
stopping_criteria = StoppingCriteriaList([StopOnSemicolon()])

def nl_to_sql(request: str) -> str:
    """
    Generate a SQL query from a natural language request using the fine-tuned model.

    Args:
        request: The user's question in natural language (the prompt tells the
            model to expect Spanish).

    Returns:
        The generated text with surrounding whitespace removed. When the model
        emitted a semicolon, the text is cut right after the first one; when it
        did not (for example the 128-token budget ran out), the text is
        returned as generated.
    """
    prompt = f'''You are a SQL generator for a ClickHouse database.
    Given a user request in natural language, you will respond with exactly one valid SQL query, nothing else, no explanations.
    Respond with exactly ONE SQL query, do not repeat it.
    End your response immediately after the first semicolon.
    Use proper table and column names from the schema.
    Handle aggregations, and filtering appropriately.
    User natural language questions will be in spanish.
    
    Schema info: {schema_info}

    Values for entidad_federativa column in superficie_bd.superficie_forestal, these names don't have accent marks: {entidad_federativa}

    México, Estado de México, Ciudad de México are different concepts.
    Superficie is the same as superficie total.
    Área forestal, área de bosque, otras áreas arboladas, otras áreas forestales are diffent concepts.

    Examples:
    Q: "¿Cuál es la entidad federativa con mayor superficie forestal?"
    A: SELECT
      entidad_federativa,
      superficie_forestal
    FROM 
      superficie_bd.superficie_forestal
    ORDER BY
      superficie_forestal
    DESC
    LIMIT 1



    User request: {request}
    
    SQL:'''
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[1]

    # Look for the terminating semicolon only in newly generated tokens, so a
    # ';' typed by the user cannot end generation after a single token.
    criteria = StoppingCriteriaList([StopOnSemicolon(prompt_length=prompt_length)])

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,  # greedy decoding: same request -> same SQL
            pad_token_id=tokenizer.eos_token_id,
            stopping_criteria=criteria
        )

    # generate() returns prompt + continuation for a causal LM. Slice the prompt
    # off by token count instead of string-replacing it: decoding does not
    # always reproduce the prompt text exactly, and a failed replace would leak
    # the whole prompt into the returned SQL.
    new_token_ids = output_ids[0][prompt_length:]
    sql = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

    # The final token may carry characters after the semicolon; keep one statement.
    statement, semicolon, _ = sql.partition(";")
    if semicolon:
        sql = statement + semicolon
    return sql

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [24]:
# Aggregation example: ask (in Spanish) for the sum of the forest area.
# `sql` is reused by the next cell, which runs it against ClickHouse.
sql = nl_to_sql("Dame la suma de la superficie forestal")
print("Generated SQL:", sql)

The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated SQL: SELECT
      SUM(superficie_forestal)
    FROM 
      superficie_bd.superficie_forestal;


In [25]:
# Run the generated statement and show the column names followed by the rows.
# NOTE(review): `sql` is unvalidated model output executed as-is; consider a
# read-only database user or a SELECT-only check before running it.
result = client.query(sql)

print(result.column_names)
result.result_rows

('SUM(superficie_forestal)',)


[(138695229.78034,)]

In [36]:
# Projection example: ask (in Spanish) for each state together with its non-forest area.
# `sql` is reused by the next cell, which runs it against ClickHouse.
sql = nl_to_sql("Dame la entidad federativa y la superficie no forestal")
print("Generated SQL:", sql)

The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated SQL: SELECT
      entidad_federativa,
      superficie_no_forestal
    FROM 
      superficie_bd.superficie_forestal;


In [37]:
# Run the generated statement and show the column names followed by the rows.
# The query has no LIMIT, so every row of the table is displayed.
# NOTE(review): `sql` is unvalidated model output executed as-is; consider a
# read-only database user or a SELECT-only check before running it.
result = client.query(sql)

print(result.column_names)
result.result_rows

('entidad_federativa', 'superficie_no_forestal')


[('Aguascalientes', 302000.0238),
 ('Baja California Norte', 1014011.539),
 ('Baja California Sur', 643093.549),
 ('Campeche', 1421252.35),
 ('Chiapas', 3745862.899),
 ('Chihuahua', 3134481.14),
 ('Ciudad de Mexico', 104798.0108),
 ('Coahuila', 1419361.19),
 ('Colima', 253809.9829),
 ('Durango', 1719431.78),
 ('Estado de Mexico', 1447249.881),
 ('Guanajuato', 1798867.707),
 ('Guerrero', 2020835.556),
 ('Hidalgo', 1120311.809),
 ('Jalisco', 3313903.235),
 ('Michoacan', 2341546.799),
 ('Morelos', 304197.0317),
 ('Nayarit', 892058.349),
 ('Nuevo Leon', 2220215.189),
 ('Oaxaca', 2683852.004),
 ('Puebla', 1858144.56),
 ('Queretaro', 472810.9523),
 ('Quintana Roo', 459235.702),
 ('San Luis Potosi', 1866171.886),
 ('Sinaloa', 2534255.666),
 ('Sonora', 3118964.68),
 ('Tabasco', 1656378.312),
 ('Tamaulipas', 4204340.169),
 ('Tlaxcala', 325534.9012),
 ('Veracruz', 5712305.168),
 ('Yucatan', 1259353.517),
 ('Zacatecas', 2303999.876)]

In [29]:
# Filter + arithmetic example: forest area divided by population for Mexico City.
# The request spells "México" with an accent, while the column values listed in
# the prompt have none, so this also exercises the accent-free name mapping.
sql = nl_to_sql("Dame el índice de superficie forestal entre población en la ciudad de México")
print("Generated SQL:", sql)

The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated SQL: SELECT
      superficie_forestal/poblacion
    FROM 
      superficie_bd.superficie_forestal
    WHERE 
      entidad_federativa = 'Ciudad de Mexico';


In [30]:
# Run the generated statement and show the column names followed by the rows.
# NOTE(review): `sql` is unvalidated model output executed as-is; consider a
# read-only database user or a SELECT-only check before running it.
result = client.query(sql)

print(result.column_names)
result.result_rows

('divide(superficie_forestal, poblacion)',)


[(0.004855097628172332,)]