In [None]:
!pip install qdrant-client pandas streamlit

Collecting qdrant-client
  Downloading qdrant_client-1.14.3-py3-none-any.whl.metadata (10 kB)
Collecting streamlit
  Downloading streamlit-1.46.1-py3-none-any.whl.metadata (9.0 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading qdrant_client-1.14.3-py3-none-any.whl (328 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.0/329.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading streamlit-1.46.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m80.2 MB/

In [None]:
import os

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from sklearn.preprocessing import normalize
import pandas as pd
import numpy as np

# Load the feature CSV: one row per image, 512 feature columns plus metadata.
file_path = "TC_Canon_CZC_RAD_MIN_HURJC_train_features.csv"
df = pd.read_csv(file_path)

# Reset the index so that row position == index label == Qdrant point id.
df = df.reset_index(drop=True)

# Columns holding the embedding and the metadata stored as payload.
vector_columns = [f"feature_{i}" for i in range(1, 513)]
data_columns = ['label', 'sex', 'birthdate', 'date_rx', 'station_name']

# Keep the L2 norm of each original vector: it is lost once the vectors are
# normalized, so it is stored alongside the metadata.
normas = np.linalg.norm(df[vector_columns].values, axis=1)
df["vector_norm"] = normas

# Normalize every vector to unit L2 length before uploading.
# NOTE: this overwrites the raw feature columns of `df` in place.
df[vector_columns] = normalize(df[vector_columns])

# Ship the original norm in the payload as well.
data_columns.append('vector_norm')

# Point ids are simply the row positions 0..len(df)-1.
ids = list(range(len(df)))

# Connect to Qdrant Cloud. The API key is a secret and must never live in the
# notebook: it is read from the environment. The key that was previously
# hardcoded in this cell should be considered leaked and rotated.
qdrant_api_key = os.environ.get("QDRANT_API_KEY")
if not qdrant_api_key:
    raise RuntimeError(
        "QDRANT_API_KEY is not set; export it before running this cell."
    )

client = QdrantClient(
    url=os.environ.get(
        "QDRANT_URL",
        "https://3b6368e4-5ce1-4125-a96d-d4c64eb8a7f3.europe-west3-0.gcp.cloud.qdrant.io:6333",
    ),
    api_key=qdrant_api_key,
)

# Create the collection only if it does not exist yet.
collection_name = "rx_vectorizadas_2"

if collection_name not in [c.name for c in client.get_collections().collections]:
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=512, distance=Distance.COSINE)
    )

# Upload in batches so a single request never exceeds the payload size limit.
batch_size = 256  # Adjust batch size as needed

# Extract vectors and payloads once instead of slicing the frame row by row
# inside the loop (per-row `df.iloc[j][...]` is very slow for wide frames).
vectors = df[vector_columns].to_numpy()
payloads = df[data_columns].to_dict(orient="records")

# Ceiling division: an exact multiple of batch_size must not report an extra batch.
total_batches = (len(df) + batch_size - 1) // batch_size

for batch_number, start in enumerate(range(0, len(df), batch_size), start=1):
    stop = min(start + batch_size, len(df))
    batch_points = [
        PointStruct(id=ids[j], vector=vectors[j].tolist(), payload=payloads[j])
        for j in range(start, stop)
    ]
    client.upsert(collection_name=collection_name, points=batch_points)
    print(f"✅ Uploaded batch {batch_number}/ {total_batches}")

print(f"✅ Datos normalizados y subidos correctamente a Qdrant Cloud. Total de filas subidas para entrenamiento: {len(df)}")

✅ Uploaded batch 1/ 15
✅ Uploaded batch 2/ 15
✅ Uploaded batch 3/ 15
✅ Uploaded batch 4/ 15
✅ Uploaded batch 5/ 15
✅ Uploaded batch 6/ 15
✅ Uploaded batch 7/ 15
✅ Uploaded batch 8/ 15
✅ Uploaded batch 9/ 15
✅ Uploaded batch 10/ 15
✅ Uploaded batch 11/ 15
✅ Uploaded batch 12/ 15
✅ Uploaded batch 13/ 15
✅ Uploaded batch 14/ 15
✅ Uploaded batch 15/ 15
✅ Datos normalizados y subidos correctamente a Qdrant Cloud. Total de filas subidas para entrenamiento: 3768


In [None]:
import numpy as np

# --- Step 1: reload the original CSV (raw, un-normalized features) ---
file_path = "TC_Canon_CZC_RAD_MIN_HURJC_train_features.csv"
df = pd.read_csv(file_path).reset_index(drop=True)

# Embedding columns
vector_columns = [f"feature_{i}" for i in range(1, 513)]

# --- Step 2: fetch the first N vectors from Qdrant ---
N = 5  # Number of vectors to compare
response = client.scroll(
    collection_name=collection_name,
    limit=N,
    with_vectors=True,
    with_payload=False  # only the vectors are needed
)

# --- Step 3: compare vectors one by one ---
# `scroll` returns a (points, next_offset) pair. Iterate over what actually
# came back (it may be fewer than N) and use each point's id as the CSV row
# position, since the upload used row positions as point ids.
for point in response[0]:
    i = point.id
    vector_qdrant = np.asarray(point.vector, dtype=float)
    vector_raw = df.iloc[i][vector_columns].values.astype(float)

    # The vectors were L2-normalized before being uploaded, so the raw CSV row
    # must be normalized the same way; comparing raw values would always differ
    # by a factor equal to the row's norm. All-zero rows are left untouched.
    raw_norm = np.linalg.norm(vector_raw)
    vector_csv = vector_raw / raw_norm if raw_norm > 0 else vector_raw

    # Comparison
    is_close = np.allclose(vector_qdrant, vector_csv, atol=1e-6)
    diff = np.abs(vector_qdrant - vector_csv)

    print(f"\n🔎 Comparando vector {i}: {'✅ IGUALES' if is_close else '❌ DIFERENTES'}")
    if not is_close:
        print("Primeras diferencias:")
        for j, d in enumerate(diff[:10]):
            print(f"  feature_{j+1}: CSV={vector_csv[j]:.6f}, Qdrant={vector_qdrant[j]:.6f}, Δ={d:.6f}")



🔎 Comparando vector 0: ❌ DIFERENTES
Primeras diferencias:
  feature_1: CSV=7.587328, Qdrant=0.034292, Δ=7.553037
  feature_2: CSV=7.851153, Qdrant=0.035484, Δ=7.815669
  feature_3: CSV=11.710371, Qdrant=0.052926, Δ=11.657445
  feature_4: CSV=0.849412, Qdrant=0.003839, Δ=0.845573
  feature_5: CSV=5.860831, Qdrant=0.026489, Δ=5.834342
  feature_6: CSV=3.995653, Qdrant=0.018059, Δ=3.977594
  feature_7: CSV=8.937948, Qdrant=0.040396, Δ=8.897552
  feature_8: CSV=3.017040, Qdrant=0.013636, Δ=3.003405
  feature_9: CSV=8.756101, Qdrant=0.039574, Δ=8.716527
  feature_10: CSV=10.583647, Qdrant=0.047834, Δ=10.535813

🔎 Comparando vector 1: ❌ DIFERENTES
Primeras diferencias:
  feature_1: CSV=2.996469, Qdrant=0.020138, Δ=2.976332
  feature_2: CSV=5.021137, Qdrant=0.033745, Δ=4.987392
  feature_3: CSV=8.950255, Qdrant=0.060151, Δ=8.890104
  feature_4: CSV=0.227062, Qdrant=0.001526, Δ=0.225536
  feature_5: CSV=5.862631, Qdrant=0.039400, Δ=5.823231
  feature_6: CSV=3.727322, Qdrant=0.025050, Δ=3.7022

In [None]:
import numpy as np

# Row-wise L2 norm of the feature vectors currently held in `df`,
# followed by the average norm across all rows.
normas = np.linalg.norm(df[vector_columns].to_numpy(), axis=1)
print("Norma promedio:", normas.mean())

Norma promedio: 169.04345999587682


In [None]:
# Example query: use the first row's feature vector. The collection uses
# cosine distance, which does not depend on vector length, so the scale of
# the query vector does not affect the ranking.
query_vector = df.loc[0, vector_columns].tolist()

# `QdrantClient.search` is deprecated; `query_points` is its replacement.
# Taking `.points` yields the same list of scored points `search` returned.
search_result = client.query_points(
    collection_name=collection_name,
    query=query_vector,
    limit=5
).points

# Show id, similarity score and stored metadata for each neighbour.
for point in search_result:
    print(f"ID: {point.id}, Score: {point.score}, Payload: {point.payload}")


ID: 0, Score: 1.0, Payload: {'label': 1, 'sex': 'F', 'birthdate': '1971-07-11', 'date_rx': '2023-07-26', 'station_name': 'MININT-VSAMOTD', 'vector_norm': 221.2591599738342}
ID: 655, Score: 0.96347475, Payload: {'label': 1, 'sex': 'F', 'birthdate': '1971-07-11', 'date_rx': '2023-04-27', 'station_name': 'RADIOLOGIA-HP', 'vector_norm': 203.82394393238076}
ID: 2606, Score: 0.9570943, Payload: {'label': 1, 'sex': 'F', 'birthdate': '1971-07-11', 'date_rx': '2023-07-26', 'station_name': 'MININT-VSAMOTD', 'vector_norm': 206.8813641234663}
ID: 1283, Score: 0.9479747, Payload: {'label': 1, 'sex': 'F', 'birthdate': '1970-03-23', 'date_rx': '2023-07-13', 'station_name': 'MININT-VSAMOTD', 'vector_norm': 180.5868771968027}
ID: 587, Score: 0.9467755, Payload: {'label': 1, 'sex': 'F', 'birthdate': '1982-10-23', 'date_rx': '2023-03-31', 'station_name': 'RADIOLOGIA-HP', 'vector_norm': 184.3066015695913}
