In [2]:
import os
import sys
from datetime import datetime

In [3]:
# -------------------------------------------------------------
# Environment setup
# -------------------------------------------------------------
# BASE_DIR is the parent of the current working directory, resolved to an
# absolute path. It is added to sys.path so the `src` package can be imported.
BASE_DIR = os.path.abspath("../")
if BASE_DIR not in sys.path:  # keep the cell idempotent: no duplicate entries on re-run
    sys.path.append(BASE_DIR)

# -------------------------------------------------------------
# Configuration
# -------------------------------------------------------------
# This import has to come after the sys.path tweak above, otherwise `src`
# is not importable from the notebook's working directory.
from src.utils.config import load_config

config = load_config(base_dir=BASE_DIR)

In [1]:
import time
import dask.dataframe as dd
from dask_ml.preprocessing import StandardScaler
from dask_ml.decomposition import TruncatedSVD



# Embedding matemático con TruncatedSVD

In [None]:
start_time = time.time()

# === Read CSV ===
# The input location comes from the project configuration loaded earlier.
individual_tensors = config["paths"]["tensors_convolution"]
data = dd.read_csv(individual_tensors)

row_id = data.iloc[:, 0]                     # first column, carried through as the row identifier
df_num = data.iloc[:, 1:].astype(float)      # all remaining columns, cast to float

# lengths=True makes Dask compute the length of every partition up front so
# the resulting array has known chunk sizes along the row axis.
print(" to_dask_array ")
X = df_num.to_dask_array(lengths=True)

# === Scaling (Dask-ML) ===
print(" Escalado (Dask-ML) ")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# === Truncated SVD on the scaled Dask array ===
print(" SVD ")
n_components = 20
tsvd = TruncatedSVD(n_components=n_components)
X_reduced = tsvd.fit_transform(X_scaled)

# Wrap the reduced array in a Dask DataFrame, one named column per component.
print(" to dask ")
svd_cols = [f"svd_{i}" for i in range(n_components)]
df_svd = dd.from_dask_array(X_reduced, columns=svd_cols)

# Put the ids back next to their components.
# NOTE(review): after reset_index(drop=True) the divisions are unknown, so
# dd.concat(axis=1) pairs partitions by position. This presumably relies on
# row_id and df_svd having identical partitioning - confirm.
print(" concat ")
result = dd.concat([
    row_id.reset_index(drop=True),
    df_svd.reset_index(drop=True)
], axis=1)

# === Write the result to Parquet ===
print(" Write ")
tensors_svd = config["paths"]["tensors_svd"]
tensors_svd_dir = os.path.dirname(tensors_svd)
if tensors_svd_dir:  # dirname is "" for a bare name, and os.makedirs("") raises
    os.makedirs(tensors_svd_dir, exist_ok=True)
result.to_parquet(tensors_svd, write_index=False)

end_time = time.time()
elapsed = end_time - start_time
print(f"⏰ Start: {time.ctime(start_time)}")
print(f"🏁 End:   {time.ctime(end_time)}")
print(f"⏱️ Total: {elapsed:.2f} sec ({elapsed / 60:.2f} min)")


In [None]:
# Preview the first two rows of `result`, the ids + SVD components frame
# built with dd.concat in the previous cell.
result.head(2)

Unnamed: 0,id,client_id,mcc,amount_sol,client_age,client_gender,agency_ubigeo,debit_type,timestamp,mccg,windows_time,day_of_week,hour,turn_of_day
0,85912210,+cSS8MDLKuU=,0,183.5,40,F,120101.0,TC,2016-10-01 23:19:17-05:00,0,2016-09-26,5,23,3
1,85912211,+cSS8MDLKuU=,0,6.15,40,F,120101.0,TC,2016-10-17 23:08:58-05:00,0,2016-10-17,0,23,3


# end

| Método de escalado       | Qué hace                                  | Preserva dirección  | Ideal para Cosine | Ideal para Euclidean | Ideal para Manhattan (L1) | Ideal para Minkowski | Adecuado para K-Means | Comentarios clave |
|--------------------------|-------------------------------------------|---------------------|--------------------|------------------------|---------------------------|------------------------|------------------------|--------------------|
| **L2 Normalization**     | Normaliza cada vector a norma 1 (‖x‖₂=1)  | ✔ Sí                | ⭐ **Sí** (equivalencia exacta) | ✔ Sí (si magnitud no importa) | ❌ No (distorsiona L1) | ❌ No | ⭐ **Sí (cuando quieres imitar Cosine-KMeans)** | Convierte Euclidean en Cosine; ideal para embeddings y perfiles. |
| **StandardScaler**       | Centra y escala cada feature (z-score)    | ❌ No               | ❌ **No** | ⭐ **Sí** (distancia euclidiana clásica) | ⭐ **Sí** | ⭐ **Sí** | ⭐ **Sí** | Mantiene varianza comparable entre features; estándar para ML tradicional. |
| **MinMaxScaler**         | Escala cada feature a [0,1]               | ❌ No               | ❌ No | ⭐ Sí (cuando las escalas importan) | ⭐ Sí | ⭐ Sí | ⭐ Sí | Preserva relaciones relativas; útil en clustering basado en distancias mixtas. |
| **RobustScaler**         | Escala usando medianas e IQR              | ❌ No               | ❌ No | ⭐ Sí (robusto a outliers) | ⭐ Sí | ⭐ Sí | ⭐ Sí | Excelente cuando hay outliers fuertes; no apto para coseno. |
| **None (sin escala)**    | Deja los datos crudos                     | A veces             | ❌ No | ❌ No (si las escalas varían entre features) | ❌ No | ❌ No | ❌ No | Solo útil si todas las features ya están en la misma escala y sin outliers. |


In [None]:
import time
import dask.dataframe as dd
import dask.array as da

In [None]:
start_time = time.time()

# Read back the SVD-reduced tensors from the configured `tensors_svd` path.
tensors_svd = config["paths"]["tensors_svd"]
input_path = tensors_svd
data = dd.read_parquet(input_path)

id_col = "row_id"
if id_col not in data.columns:
    # Fail fast with a readable message, before any partition lengths are computed.
    raise KeyError(
        f"Column {id_col!r} not found in {input_path!r}; "
        f"available columns: {list(data.columns)}"
    )
value_cols = [c for c in data.columns if c != id_col]

# lengths=True computes the partition lengths so both arrays have known chunks.
X = data[value_cols].astype(float).to_dask_array(lengths=True)
row_ids = data[id_col].to_dask_array(lengths=True).reshape(-1, 1)

# Row-wise L1 norm: sum of absolute values across each row's features.
L1_norms = da.sum(da.abs(X), axis=1, keepdims=True)
# A zero norm is replaced by 1 so an all-zero row divides to zeros, not 0/0.
L1_norms = da.where(L1_norms == 0, 1, L1_norms)
X_norm_L1 = X / L1_norms

# Re-attach the ids as the first column.
# NOTE(review): hstack puts ids and float features in one array, so the ids
# presumably take on a common dtype (integer ids would become floats) - confirm
# this is acceptable for downstream readers.
full_array = da.hstack([row_ids, X_norm_L1])
df_norm_L1 = dd.from_dask_array(full_array, columns=[id_col] + value_cols)

output_path_L1 = "outputs/tensors_normalized_L1.parquet"
# Create the directory of the *output* file (the original created the input's directory).
os.makedirs(os.path.dirname(output_path_L1), exist_ok=True)
df_norm_L1.to_parquet(output_path_L1, write_index=False)

end_time = time.time()
elapsed = end_time - start_time
print(f"⏰ Start: {time.ctime(start_time)}")
print(f"🏁 End:   {time.ctime(end_time)}")
print(f"⏱️ Total: {elapsed:.2f} sec ({elapsed / 60:.2f} min)")

In [None]:
start_time = time.time()

# Read back the SVD-reduced tensors from the configured `tensors_svd` path.
tensors_svd = config["paths"]["tensors_svd"]
input_path = tensors_svd
data = dd.read_parquet(input_path)

id_col = "row_id"
if id_col not in data.columns:
    # Fail fast with a readable message, before any partition lengths are computed.
    raise KeyError(
        f"Column {id_col!r} not found in {input_path!r}; "
        f"available columns: {list(data.columns)}"
    )
value_cols = [c for c in data.columns if c != id_col]

# lengths=True computes the partition lengths so both arrays have known chunks.
X = data[value_cols].astype(float).to_dask_array(lengths=True)
row_ids = data[id_col].to_dask_array(lengths=True).reshape(-1, 1)

# Row-wise L2 (Euclidean) norm across each row's features.
L2_norms = da.linalg.norm(X, axis=1, keepdims=True)
# A zero norm is replaced by 1 so an all-zero row divides to zeros, not 0/0.
L2_norms = da.where(L2_norms == 0, 1, L2_norms)
X_norm_L2 = X / L2_norms

# Re-attach the ids as the first column.
# NOTE(review): hstack puts ids and float features in one array, so the ids
# presumably take on a common dtype (integer ids would become floats) - confirm
# this is acceptable for downstream readers.
full_array = da.hstack([row_ids, X_norm_L2])
df_norm_L2 = dd.from_dask_array(full_array, columns=[id_col] + value_cols)

output_path_L2 = "outputs/tensors_normalized_L2.parquet"
# Create the directory of the *output* file (the original created the input's directory).
os.makedirs(os.path.dirname(output_path_L2), exist_ok=True)
df_norm_L2.to_parquet(output_path_L2, write_index=False)

end_time = time.time()
elapsed = end_time - start_time
print(f"⏰ Start: {time.ctime(start_time)}")
print(f"🏁 End:   {time.ctime(end_time)}")
print(f"⏱️ Total: {elapsed:.2f} sec ({elapsed / 60:.2f} min)")