In [404]:
import math
import os

import faiss
import numpy as np
import openai
import pandas as pd

In [450]:
# Read the API key from the environment so the secret never lands in the
# notebook source or its saved outputs. Raises KeyError right here if the
# OPENAI_API_KEY variable is not set, instead of failing later at request time.
openai.api_key = os.environ["OPENAI_API_KEY"]

def gerar_descricao(c):
    """Build the one-sentence textual description of a customer.

    Args:
        c: Mapping with the keys 'porte', 'ramo', 'cidade' and 'uf'.

    Returns:
        The description string (in Portuguese) assembled from those four fields.
    """
    porte, ramo = c['porte'], c['ramo']
    cidade, uf = c['cidade'], c['uf']
    return f"Cliente de porte {porte}, ramo {ramo}, localizado em {cidade}, {uf}"

def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Applies the haversine formula on a sphere of radius 6371 km.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.

    Returns:
        The distance along the sphere's surface, in kilometres. NaN inputs
        propagate to a NaN result.
    """
    R = 6371  # sphere radius in km

    phi1     = math.radians(lat1)
    phi2     = math.radians(lat2)
    d_phi    = math.radians(lat2 - lat1)
    d_lambda = math.radians(lon2 - lon1)

    a = math.sin(d_phi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(d_lambda / 2)**2
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    # A plain comparison is used (rather than min/max) so NaN is left untouched.
    if a > 1.0:
        a = 1.0
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return R * c

def score_geo_exp(dist_km):
    """Convert a distance in km into a proximity score by piecewise exponential decay.

    The curve is anchored at (0 km, 1.00), (5 km, 0.75), (30 km, 0.50),
    (100 km, 0.25) and (500 km, 0.01); inside each segment the score decays
    exponentially from the segment's start score to its end score. Any
    distance that falls in no segment (beyond 500 km) scores 0.
    """
    # (start_km, start_score, end_km, end_score) per segment, nearest first.
    segmentos = (
        (0,   1.00,   5, 0.75),
        (5,   0.75,  30, 0.50),
        (30,  0.50, 100, 0.25),
        (100, 0.25, 500, 0.01),
    )

    for ini_km, ini_score, fim_km, fim_score in segmentos:
        if dist_km <= fim_km:
            # Decay rate that takes ini_score down to fim_score across the segment.
            taxa = -np.log(ini_score / fim_score) / (fim_km - ini_km)
            return ini_score * np.exp(taxa * (dist_km - ini_km))

    return 0.00

In [452]:
# Load the reference vectors and their cluster labels.
# SECURITY: allow_pickle=True lets np.load unpickle arbitrary objects, which can
# execute code, so only use it with a trusted file. It is kept in case `clusters`
# is stored as an object array; TODO confirm and drop the flag if not needed.
# The context manager closes the underlying .npz archive once the arrays are read.
with np.load("clusters_base.npz", allow_pickle=True) as dados:
    # FAISS expects a C-contiguous float32 matrix; this is a no-op when the
    # stored array already has that layout and dtype.
    vetores = np.ascontiguousarray(dados["vetores"], dtype="float32")
    clusters = dados["clusters"]

# Build the FAISS index (exact L2 search over the reference vectors)
d = vetores.shape[1]
index = faiss.IndexFlatL2(d)
index.add(vetores)

In [454]:
# Load the existing customer base from the spreadsheet.
df_clientes = pd.read_excel('base_clientes.xlsx')

# Fail early if a column read by the ranking step is missing.
colunas_necessarias = {"id", "cluster", "latitude", "longitude"}
colunas_ausentes = colunas_necessarias - set(df_clientes.columns)
assert not colunas_ausentes, f"missing expected columns: {sorted(colunas_ausentes)}"

# Preview only the first rows instead of dumping the whole frame.
df_clientes.head(10)

Unnamed: 0,id,cluster,latitude,longitude
0,C001,0,-22.9,-47.06
1,C002,1,-25.43,-49.27
2,C003,2,-12.97,-38.51
3,C004,0,-22.9,-47.06
4,C005,1,-25.43,-49.27
5,C006,2,-12.97,-38.51
6,C007,0,-22.9,-47.06
7,C008,1,-25.43,-49.27
8,C009,2,-12.97,-38.51
9,C010,0,-22.9,-47.06


In [457]:
# Attributes of the new customer to be matched against the existing base.
# NOTE(review): these coordinates look closer to Campinas than to Sorocaba;
# confirm the latitude/longitude match the stated city.
novo_cliente = dict(
    porte="pequeno",
    ramo="alimentício",
    cidade="Sorocaba",
    uf="SP",
    latitude=-22.95,
    longitude=-47.06,
)

In [459]:
# Build the textual description of the new customer from its attributes
# and display it for inspection.
descricao = gerar_descricao(novo_cliente)
descricao

'Cliente de porte pequeno, ramo alimentício, localizado em Sorocaba, SP'

In [461]:
# Request an embedding for the description and arrange it as a single-row
# float32 matrix (shape (1, dim)).
response = openai.embeddings.create(model="text-embedding-ada-002", input=descricao)
vetor_descricao = response.data[0].embedding
embedding = np.array([vetor_descricao], dtype='float32')
embedding

array([[-0.00335742, -0.02344392,  0.00264534, ..., -0.01109688,
        -0.01540159, -0.02345681]], dtype=float32)

In [462]:
# Query the index for the 5 nearest reference vectors. D receives the
# distances and I the positions of the matches, one row per query vector
# (a single row here, since `embedding` has one row).
D, I = index.search(embedding, k=5)

In [465]:
# Rank existing customers for the new customer by blending the semantic
# similarity of their cluster (from the FAISS neighbours) with geographic
# proximity, and keep track of the single best match.
melhor_score = -1
melhor_id = None
peso_semantico = 0.7
peso_geo = 0.3

resultados = []
clusters_vistos = set()

for pos, idx in enumerate(I[0]):
    # FAISS pads the labels with -1 when fewer than k neighbours exist;
    # without this guard clusters[-1] would silently pick the last entry.
    if idx < 0:
        continue

    cluster = clusters[idx]

    # Neighbours come back sorted by increasing distance, so the first hit for
    # a cluster carries its best semantic score. Later hits for the same
    # cluster would only append duplicate, lower-scored rows for the same
    # customers, so they are skipped (the best match is unaffected).
    if cluster in clusters_vistos:
        continue
    clusters_vistos.add(cluster)

    # Map the distance returned by the index to a similarity in (0, 1].
    score_semantico = 1 / (1 + D[0][pos])

    df_cluster = df_clientes[df_clientes['cluster'] == cluster]

    for _, r in df_cluster.iterrows():
        distancia_geo = haversine(novo_cliente["latitude"], novo_cliente["longitude"],
                                  r["latitude"], r["longitude"])
        score_geo = score_geo_exp(distancia_geo)
        score_final = peso_semantico * score_semantico + peso_geo * score_geo

        resultados.append({
            "id_cliente": r["id"],
            "cluster": r["cluster"],
            "distancia_geo": round(distancia_geo, 4),
            "score_geo": round(score_geo, 4),
            "score_semantico": round(score_semantico, 4),
            "score_final": round(score_final, 4)
        })

        # Strict '>' keeps the first customer encountered when scores tie.
        if score_final > melhor_score:
            melhor_score = score_final
            melhor_id = r["id"]

pd.DataFrame(resultados)

Unnamed: 0,id_cliente,cluster,distancia_geo,score_geo,score_semantico,score_final
0,C002,1,355.36,0.032,0.8926,0.6345
1,C005,1,355.36,0.032,0.8926,0.6345
2,C008,1,355.36,0.032,0.8926,0.6345
3,C011,1,355.36,0.032,0.8926,0.6345
4,C014,1,355.36,0.032,0.8926,0.6345
5,C017,1,355.36,0.032,0.8926,0.6345
6,C020,1,355.36,0.032,0.8926,0.6345
7,C023,1,355.36,0.032,0.8926,0.6345
8,C026,1,355.36,0.032,0.8926,0.6345
9,C029,1,355.36,0.032,0.8926,0.6345


In [467]:
melhor_id

'C001'