# Implementación de la estrategia Random Forest Federado para la base de datos Adidas US Sales

## Instalaciones y detalles

Este notebook adapta la estrategia de **Federated Random Forest** desarrollada por Cotorobai et al. (2025) al proyecto de Adidas, permitiendo particionar datos, levantar servidores federados y entrenar modelos de forma distribuida y segura.

Para empezar, instala las dependencias necesarias:



In [1]:
#pip install -r requirements.txt


In [2]:
#%pip install -e ..


A continuación se muestran los detalles de la publicación original y la licencia bajo la que se distribuye el código:


@inproceedings{cotorobai2025federated,
  title     = {A Federated Random Forest Solution for Secure Distributed Machine Learning},
  author    = {Cotorobai, Alexandre and Silva, Jorge Miguel and Oliveira, José Luis},
  booktitle = {Proceedings of the IEEE International Conference on Bioinformatics and Biomedicine (CBMS)},
  year      = {2025},
  pages     = {–},      % actualizar cuando esté disponible
  doi       = {–}       % actualizar cuando esté disponible
}

Licencia

MIT License

Copyright (c) 2024 Alexandre Cotorobai

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

## Imports y carga/limpieza

En esta sección vamos a cargar el fichero original de ventas de Adidas, aplicar limpieza y generar automáticamente un conjunto de particiones (nodos federados) según varios esquemas (`retailer_region`, `retailer_city`, `region`).  

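La libreta está en `src_rf/ImplementacioADatasetAdidas/`. Las celdas escriben en `Path.cwd() / "nodos"`, por lo que los CSV resultantes se guardarán en una carpeta `nodos/` dentro del directorio desde el que se ejecuta la libreta (normalmente la propia carpeta de la libreta), con una subcarpeta por esquema de particionado.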

In [3]:
import random
import os
import pandas as pd
from pathlib import Path
from ucimlrepo import fetch_ucirepo
from sklearn.utils import resample
from pathlib import Path

In [4]:
# --------------------------------------------------------------------------
# 1.  Cargamos y limpiamos el Excel  (tu función se queda tal cual)
# --------------------------------------------------------------------------

# Path to the sales Excel workbook: <cwd>/../../datos/Adidas US Sales Datasets.xlsx
DATA_FILE = Path.cwd().parent.parent / "datos" / "Adidas US Sales Datasets.xlsx"

#  Función actualizada de carga y limpieza
def load_clean_adidas_data(ruta_excel: Path) -> pd.DataFrame:
    """Load the Adidas sales workbook and return a cleaned DataFrame.

    Steps performed:
      1. Read the Excel file and drop the leading rows that precede the
         real header (rows 0-2).
      2. Promote row 3 to column names, drop the first (empty) column and
         remove the header row from the data.
      3. Keep only rows whose 'Invoice Date' parses as a date and store the
         parsed datetimes.
      4. Cast the known categorical columns to ``category`` and the numeric
         columns (plus the target) to numeric, coercing failures to NaN.

    Args:
        ruta_excel: Path of the Excel workbook to read.

    Returns:
        The cleaned DataFrame. Progress information is printed and shown
        with ``display`` (available in the notebook kernel).
    """
    df = pd.read_excel(ruta_excel)

    # Drop the first three rows, which hold irrelevant data
    df = df.drop([0, 1, 2])

    # Row 3 carries the real column names: use it as the header
    header_row = df.loc[3]
    df = df.rename(columns=header_row)

    # Drop the first (empty) column
    df = df.drop(df.columns[0], axis=1)

    # Drop the row that was used as header and rebuild a 0-based index
    df = df.drop(index=3).reset_index(drop=True)

    # Parse 'Invoice Date' once, keep only the rows that parsed correctly.
    # The explicit .copy() makes the filtered frame independent, so the
    # column assignment below does not write into a slice of the original
    # (which raises SettingWithCopyWarning).
    invoice_dates = pd.to_datetime(df['Invoice Date'], errors='coerce')
    valid_mask = invoice_dates.notna()
    df = df[valid_mask].copy()
    df['Invoice Date'] = invoice_dates[valid_mask]

    print(" Primeras filas tras conversión 'Invoice Date':")
    display(df.head())

    # Column groups used for type conversion
    variables_categoricas = ['Retailer', 'Region', 'State', 'City', 'Product', 'Sales Method']
    variables_numericas = ['Price per Unit', 'Total Sales', 'Operating Profit', 'Operating Margin']
    variable_objetivo = ['Units Sold']

    print("• Variables categóricas:", variables_categoricas)
    print("• Variables numéricas:", variables_numericas)
    print("• Variable objetivo:", variable_objetivo)

    # Type conversion; columns missing from the sheet are silently skipped
    for col in variables_categoricas:
        if col in df:
            df[col] = df[col].astype('category')
    for col in variables_numericas + variable_objetivo:
        if col in df:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Show the resulting dtypes
    print("\n Tipos de datos final:")
    display(df.dtypes)

    return df
# Run the loader on the configured workbook; `df` is the cleaned dataset used by the following cells
df = load_clean_adidas_data(DATA_FILE)
print("\n DataFrame limpio preparado.")

 Primeras filas tras conversión 'Invoice Date':


Unnamed: 0,Retailer,Retailer ID,Invoice Date,Region,State,City,Product,Price per Unit,Units Sold,Total Sales,Operating Profit,Operating Margin,Sales Method
0,Foot Locker,1185732,2020-01-01,Northeast,New York,New York,Men's Street Footwear,50,1200,600000,300000,0.5,In-store
1,Foot Locker,1185732,2020-01-02,Northeast,New York,New York,Men's Athletic Footwear,50,1000,500000,150000,0.3,In-store
2,Foot Locker,1185732,2020-01-03,Northeast,New York,New York,Women's Street Footwear,40,1000,400000,140000,0.35,In-store
3,Foot Locker,1185732,2020-01-04,Northeast,New York,New York,Women's Athletic Footwear,45,850,382500,133875,0.35,In-store
4,Foot Locker,1185732,2020-01-05,Northeast,New York,New York,Men's Apparel,60,900,540000,162000,0.3,In-store


• Variables categóricas: ['Retailer', 'Region', 'State', 'City', 'Product', 'Sales Method']
• Variables numéricas: ['Price per Unit', 'Total Sales', 'Operating Profit', 'Operating Margin']
• Variable objetivo: ['Units Sold']

 Tipos de datos final:


Retailer                  category
Retailer ID                 object
Invoice Date        datetime64[ns]
Region                    category
State                     category
City                      category
Product                   category
Price per Unit             float64
Units Sold                   int64
Total Sales                float64
Operating Profit           float64
Operating Margin           float64
Sales Method              category
dtype: object


 DataFrame limpio preparado.


In [5]:
# Build a small table with the number of distinct categories per categorical column
from IPython.display import display

categorical_cols = ['Retailer', 'Region', 'State', 'City', 'Product', 'Sales Method']
category_counts = pd.DataFrame(
    [(name, df[name].nunique()) for name in categorical_cols],
    columns=['Column', 'Unique Categories'],
)

# Show the category counts
print(" Conteo de categorías por variable:")
display(category_counts)



 Conteo de categorías por variable:


Unnamed: 0,Column,Unique Categories
0,Retailer,6
1,Region,5
2,State,50
3,City,52
4,Product,6
5,Sales Method,3


In [6]:
# --------------------------------------------------------------------------
# 2.  Parámetros básicos de modelado
# --------------------------------------------------------------------------

# Modelling parameters consumed by preprocess_features / export_partition:
# the target column and the columns dropped before building the features.
data_params = {
    "target": "Units Sold",
    "ignored_columns": [
        "Invoice Date",         # metadata, irrelevant for the model
        "Total Sales",          # = Units Sold * Price per Unit
        "Operating Profit",     # depends on Units Sold
        "Operating Margin",     # same as above
    ],
}

In [7]:
# --------------------------------------------------------------------------
# 3.  Función de *pre-procesado único*
# --------------------------------------------------------------------------

def preprocess_features(raw: pd.DataFrame, dp: dict) -> tuple[pd.DataFrame, pd.Series]:
    """Build the numeric feature matrix and the target (no train/test split).

    Args:
        raw: Input frame; must contain the target, the ignored columns and
            the columns 'Retailer', 'Region', 'Product', 'Sales Method',
            'State' and 'City'.
        dp: Dict with the keys "target" (column name) and
            "ignored_columns" (list of column names to drop).

    Returns:
        ``(X, y)`` where ``X`` is the float64 feature frame and ``y`` is the
        target column taken from ``raw``.
    """
    target_name = dp["target"]
    dropped = dp["ignored_columns"] + [target_name]
    features = raw.drop(columns=dropped).copy()

    # a) One-hot encoding (first level dropped) for the low-cardinality columns
    features = pd.get_dummies(
        features,
        columns=["Retailer", "Region", "Product", "Sales Method"],
        drop_first=True,
    )

    # b) Integer category codes for the high-cardinality columns
    encoded = {
        name: features[name].astype("category").cat.codes
        for name in ("State", "City")
    }
    features = features.assign(**encoded)

    # Homogeneous numeric dtype for every feature
    return features.astype("float64"), raw[target_name]


In [8]:
# --------------------------------------------------------------------------
# 4.  Plantilla global de columnas (se calcula **una vez**)
# --------------------------------------------------------------------------
# Pre-process the full dataset once to obtain the complete set of feature columns
X_full, _        = preprocess_features(df, data_params)
COL_TEMPLATE     = sorted(X_full.columns)              # sorted list of feature names
print(f" Plantilla de columnas: {len(COL_TEMPLATE)} features")

# Persist the template as JSON in case another notebook needs it
import json, os
os.makedirs("artefactos", exist_ok=True)
Path("artefactos/template_cols.json").write_text(json.dumps(COL_TEMPLATE))

 Plantilla de columnas: 20 features


445

In [9]:
# --------------------------------------------------------------------------
# 5.  Función para exportar cada partición ya re-indexada
# --------------------------------------------------------------------------
def export_partition(df_part: pd.DataFrame, path: Path) -> None:
    """Pre-process one partition and write it to ``path`` as CSV.

    The partition goes through ``preprocess_features`` with the module-level
    ``data_params``; its feature columns are then re-indexed to
    ``COL_TEMPLATE`` (columns absent from the partition are filled with 0.0)
    and the target column is appended as the last column. The CSV is written
    without the index.
    """
    features, target = preprocess_features(df_part, data_params)
    aligned = features.reindex(columns=COL_TEMPLATE, fill_value=0.0)
    pd.concat([aligned, target], axis=1).to_csv(path, index=False)


In [10]:
# --------------------------------------------------------------------------
# 6.  Particionado + exportación homogénea
# --------------------------------------------------------------------------
def partition_by_retailer_region(df):
    """Split ``df`` into one node per observed (Retailer, Region) pair.

    Returns a dict mapping "<retailer> - <region>" to the matching rows with
    a fresh 0-based index; the size of every node is printed.
    """
    print("🔀 Particionando por Retailer + Region…")
    nodes = {}
    for pair, rows in df.groupby(['Retailer', 'Region'], observed=True):
        node_name = "{} - {}".format(*pair)
        nodes[node_name] = rows.reset_index(drop=True)
        print(f"   • Nodo '{node_name}': {len(rows)} registros")
    return nodes

def partition_by_retailer_city(df):
    """Split ``df`` into one node per observed (Retailer, City) pair.

    Returns a dict mapping "<retailer> - <city>" to the matching rows with
    a fresh 0-based index; the size of every node is printed.
    """
    print("🔀 Particionando por Retailer + City…")
    nodes = {}
    for pair, rows in df.groupby(['Retailer', 'City'], observed=True):
        node_name = "{} - {}".format(*pair)
        nodes[node_name] = rows.reset_index(drop=True)
        print(f"   • Nodo '{node_name}': {len(rows)} registros")
    return nodes

def partition_by_region(df):
    """Split ``df`` into one node per observed Region value.

    Returns a dict mapping each region value to the matching rows with a
    fresh 0-based index; the size of every node is printed.
    """
    print("🔀 Particionando por Region…")
    nodes = {}
    for region_name, rows in df.groupby('Region', observed=True):
        nodes[region_name] = rows.reset_index(drop=True)
        print(f"   • Nodo '{region_name}': {len(rows)} registros")
    return nodes


In [11]:
# --------------------------------------------------------------------------
# 7.  Creación de nodos y del hold-out procesado
# --------------------------------------------------------------------------

# Evaluation parameters
from sklearn.model_selection import train_test_split
from pathlib import Path

EVAL_FRAC = 0.2  # fraction of rows held out for evaluation (20%)
SCHEMES = {
    "retailer_region": partition_by_retailer_region,
    "retailer_city":   partition_by_retailer_city,
    "region":          partition_by_region
}

# Global hold-out for evaluation, stratified by Region.
# The split has a fixed random_state and does not depend on the scheme, so it
# is computed once here instead of being recomputed on every loop iteration.
df_train, df_eval = train_test_split(
    df,
    test_size=EVAL_FRAC,
    stratify=df["Region"],
    random_state=42
)

# For every scheme: export the hold-out, then partition and export the rest
for scheme, partition_fn in SCHEMES.items():
    print(f"\n=== Esquema: {scheme} ===")
    out_dir = Path.cwd() / "nodos" / scheme
    out_dir.mkdir(parents=True, exist_ok=True)

    # Hold-out export (each scheme folder gets its own copy)
    eval_path = out_dir / "eval_holdout.csv"
    export_partition(df_eval.reset_index(drop=True), eval_path)
    print(f"➤ Hold-out procesado → {eval_path} ({len(df_eval)} filas)")

    # Federated partitions built from the training rows only
    parts = partition_fn(df_train)
    for name, df_part in parts.items():
        # Node name -> file name: spaces and slashes become underscores
        fn        = name.replace(" ", "_").replace("/", "_") + ".csv"
        part_path = out_dir / fn
        export_partition(df_part, part_path)

        kb = part_path.stat().st_size / 1024
        print(f"   • Nodo '{name}' → {len(df_part)} filas, {kb:.2f} KB")

    print(f"✅ Esquema '{scheme}' completo: 1 eval + {len(parts)} nodos guardados.")





=== Esquema: retailer_region ===
➤ Hold-out procesado → c:\Users\PC\Desktop\s.o.e\Estudios\U-4\Segundo Cuatri\TFG\Codigo-Aplicacion de Aprendizaje Federado\src_rf\ImplementacioADatasetAdidas\nodos\retailer_region\eval_holdout.csv (1930 filas)
🔀 Particionando por Retailer + Region…
   • Nodo 'Amazon - Midwest': 102 registros
   • Nodo 'Amazon - Northeast': 444 registros
   • Nodo 'Amazon - South': 11 registros
   • Nodo 'Amazon - Southeast': 111 registros
   • Nodo 'Amazon - West': 107 registros
   • Nodo 'Foot Locker - Midwest': 675 registros
   • Nodo 'Foot Locker - Northeast': 665 registros
   • Nodo 'Foot Locker - South': 110 registros
   • Nodo 'Foot Locker - Southeast': 340 registros
   • Nodo 'Foot Locker - West': 322 registros
   • Nodo 'Kohl's - Midwest': 194 registros
   • Nodo 'Kohl's - Northeast': 132 registros
   • Nodo 'Kohl's - South': 54 registros
   • Nodo 'Kohl's - West': 435 registros
   • Nodo 'Sports Direct - Midwest': 231 registros
   • Nodo 'Sports Direct - North

In [12]:
# --------------------------------------------------------------------------
# 8.  Comprobación de columnas homogéneas
# --------------------------------------------------------------------------
# Esta sección comprueba que todos los nodos tienen las mismas columnas y en el mismo orden.
# Si hay discrepancias, informa de cuántos nodos tienen columnas diferentes.

from pathlib import Path
import pandas as pd

# Root folder where the node CSVs were saved
NODOS_DIR = Path.cwd() / "nodos"

# Walk every scheme folder found under NODOS_DIR
for scheme_dir in NODOS_DIR.iterdir():
    if not scheme_dir.is_dir():
        continue

    print(f"\n=== {scheme_dir.name.upper()} ===")
    template_cols = None      # set from the first CSV found in the scheme
    mismatches    = 0

    for csv_path in scheme_dir.glob("*.csv"):
        # Only the header is needed. A dedicated name is used on purpose:
        # assigning to `df` here would overwrite the cleaned dataset that
        # the earlier cells of the notebook rely on.
        header_df = pd.read_csv(csv_path, nrows=0)
        cols = list(header_df.columns)

        # The first file defines the reference column list
        if template_cols is None:
            template_cols = cols

        # List equality covers both length and order
        matches = cols == template_cols
        status = "OK" if matches else "⚠️ DIFERENTE"
        if not matches:
            mismatches += 1

        print(f"• {csv_path.name:<35} → {len(cols):3d} cols  | {status}")

    if mismatches == 0:
        print("✅  Todas las columnas coinciden (longitud y orden)")
    else:
        print(f"⚠️  {mismatches} archivo(s) con columnas distintas")




=== REGION ===
• eval_holdout.csv                    →  21 cols  | OK
• Midwest.csv                         →  21 cols  | OK
• Northeast.csv                       →  21 cols  | OK
• South.csv                           →  21 cols  | OK
• Southeast.csv                       →  21 cols  | OK
• West.csv                            →  21 cols  | OK
✅  Todas las columnas coinciden (longitud y orden)

=== RETAILER_CITY ===
• Amazon_-_Anchorage.csv              →  21 cols  | OK
• Amazon_-_Birmingham.csv             →  21 cols  | OK
• Amazon_-_Boston.csv                 →  21 cols  | OK
• Amazon_-_Burlington.csv             →  21 cols  | OK
• Amazon_-_Charlotte.csv              →  21 cols  | OK
• Amazon_-_Columbus.csv               →  21 cols  | OK
• Amazon_-_Louisville.csv             →  21 cols  | OK
• Amazon_-_Manchester.csv             →  21 cols  | OK
• Amazon_-_Portland.csv               →  21 cols  | OK
• eval_holdout.csv                    →  21 cols  | OK
• Foot_Locker_-_Anchorage.csv 

# Lanzamiento de servidores

In [13]:
# Imports comunes
import threading, time
from pathlib import Path
from fed_rf_mk.server import FLServer

# Parameters to change depending on the experiment being run
SCHEME = "retailer_region"   # ◀ switch to "retailer_city" or "region"
BASE = Path.cwd() / "nodos" / SCHEME   # folder holding the CSVs of the chosen scheme
PORT0 = 8081                 # first port; train silo i is assigned PORT0 + i
SKIP_PORTS = {8090}          # this port does not work (presumably already in use by something else)

In [14]:
# Start one server per training silo (every CSV in BASE except the hold-out)
train_csvs = sorted(p for p in BASE.glob("*.csv") if "eval_holdout" not in p.name)
live_ports  = []             # ports for which a server thread was launched

for i, csv_path in enumerate(train_csvs):
    port = PORT0 + i
    # NOTE(review): a silo whose port is in SKIP_PORTS is not moved to another
    # port; it is simply not started, so its data takes no part in training.
    if port in SKIP_PORTS:
        print(f"⏩  No arranco nodo '{csv_path.stem}' (puerto {port} vetado)")
        continue
    # Start the server for this silo in a daemon thread
    server = FLServer(csv_path.stem, port, str(csv_path), auto_accept=True)
    threading.Thread(target=server.start, daemon=True).start()
    live_ports.append(port)
    print(f" Silo de train '{csv_path.stem}' en puerto {port}")

# Start the evaluation silo on the first port after the training range
eval_port = PORT0 + len(train_csvs)
while eval_port in SKIP_PORTS or eval_port in live_ports:
    eval_port += 1            # advance until a free port is found

eval_path = BASE / "eval_holdout.csv"
eval_server = FLServer("eval_silo", eval_port, str(eval_path), auto_accept=True)
threading.Thread(target=eval_server.start, daemon=True).start()
print(f" Silo de eval en puerto {eval_port}")

# Fixed wait so that every server has time to come up
time.sleep(240)  # 4 minutes; 60 s is enough for "region", but "retailer_region" and "retailer_city" take longer
print(" Todos los DataSites están arriba")


Starting DataSite Amazon_-_Midwest on port 8081 with data at c:\Users\PC\Desktop\s.o.e\Estudios\U-4\Segundo Cuatri\TFG\Codigo-Aplicacion de Aprendizaje Federado\src_rf\ImplementacioADatasetAdidas\nodos\retailer_region\Amazon_-_Midwest.csv and mock at c:\Users\PC\Desktop\s.o.e\Estudios\U-4\Segundo Cuatri\TFG\Codigo-Aplicacion de Aprendizaje Federado\src_rf\ImplementacioADatasetAdidas\nodos\retailer_region\Amazon_-_Midwest.csv Silo de train 'Amazon_-_Midwest' en puerto 8081

Starting Amazon_-_Midwest server on 0.0.0.0:8081
Starting DataSite Amazon_-_Northeast on port 8082 with data at c:\Users\PC\Desktop\s.o.e\Estudios\U-4\Segundo Cuatri\TFG\Codigo-Aplicacion de Aprendizaje Federado\src_rf\ImplementacioADatasetAdidas\nodos\retailer_region\Amazon_-_Northeast.csv and mock at c:\Users\PC\Desktop\s.o.e\Estudios\U-4\Segundo Cuatri\TFG\Codigo-Aplicacion de Aprendizaje Federado\src_rf\ImplementacioADatasetAdidas\nodos\retailer_region\Amazon_-_Northeast.csv
Starting Amazon_-_Northeast server on 

.................................................................................

................

......

.......................

...

.....................Logged into <Amazon_-_Midwest: High side Datasite> as <info@openmined.org>


........

.............

..................

..

.Logged into <Kohl's_-_South: High side Datasite> as <info@openmined.org>


..............

.Logged into <Sports_Direct_-_Midwest: High side Datasite> as <info@openmined.org>


.

.....

.......Logged into <Kohl's_-_West: High side Datasite> as <info@openmined.org>
Logged into <Foot_Locker_-_Southeast: High side Datasite> as <info@openmined.org>


.........Logged into <Amazon_-_South: High side Datasite> as <info@openmined.org>
Logged into <Foot_Locker_-_Midwest: High side Datasite> as <info@openmined.org>
.

.Logged into <Kohl's_-_Northeast: High side Datasite> as <info@openmined.org>


........

..Logged into <Amazon_-_Southeast: High side Datasite> as <info@openmined.org>
.

......Logged into <Sports_Direct_-_Southeast: High side Datasite> as <info@openmined.org>
..

Logged into <Amazon_-_West: High side Datasite> as <info@openmined.org>...
Logged into <Sports_Direct_-_Northeast: High side Datasite> as <info@openmined.org>
.

...Logged into <Sports_Direct_-_West: High side Datasite> as <info@openmined.org>


.

..

.

.

.Logged into <Amazon_-_Northeast: High side Datasite> as <info@openmined.org>


.Logged into <Walmart_-_West: High side Datasite> as <info@openmined.org>


Logged into <Foot_Locker_-_Northeast: High side Datasite> as <info@openmined.org>


Logged into <Foot_Locker_-_South: High side Datasite> as <info@openmined.org>


Uploading:   0%|[32m          [0m| 0/1 [00:00<?, ?it/s]

.Logged into <Kohl's_-_Midwest: High side Datasite> as <info@openmined.org>


.Logged into <West_Gear_-_Midwest: High side Datasite> as <info@openmined.org>
Logged into <Sports_Direct_-_South: High side Datasite> as <info@openmined.org>


.Logged into <West_Gear_-_South: High side Datasite> as <info@openmined.org>


Logged into <eval_silo: High side Datasite> as <info@openmined.org>




Logged into <Walmart_-_South: High side Datasite> as <info@openmined.org>


Logged into <Walmart_-_Northeast: High side Datasite> as <info@openmined.org>


Logged into <West_Gear_-_Southeast: High side Datasite> as <info@openmined.org>


Logged into <Walmart_-_Southeast: High side Datasite> as <info@openmined.org>



[A

[A[A


[A[A[A

Logged into <West_Gear_-_Northeast: High side Datasite> as <info@openmined.org>






[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:09<00:00,  9.39s/it]

Logged into <West_Gear_-_West: High side Datasite> as <info@openmined.org>














Uploading:   0%|[32m          [0m| 0/1 [00:00<?, ?it/s]









[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A













Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.62s/it]

Datasite Amazon_-_Midwest is up and running: http://localhost:8081




[A
Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.05s/it]


[A[A

Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:06<00:00,  6.98s/it]



[A[A[A


Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:06<00:00,  6.97s/it]




[A[A[A[A



Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.13s/it]

[A




[A[A[A[A[A





[A[A[A[A[A[A




[A[A[A[A[A





Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.09s/it]
Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.22s/it]


Datasite Kohl's_-_South is up and running: http://localhost:8093




[A[A






[A[A[A[A[A[A[A






Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.01s/it]



[A[A[A

Datasite Sports_Direct_-_Midwest is up and running: http://localhost:8095
Datasite Kohl's_-_West is up and running: http://localhost:8094
Datasite Foot_Locker_-_Southeast is up and running: http://localhost:8089






[A[A[A[A







[A[A[A[A[A[A[A[A







Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:06<00:00,  6.97s/it]


Datasite Foot_Locker_-_Midwest is up and running: http://localhost:8086
Datasite Kohl's_-_Northeast is up and running: http://localhost:8092
Datasite Amazon_-_South is up and running: http://localhost:8083







[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A

Datasite Amazon_-_Southeast is up and running: http://localhost:8084










Uploading: Asset:   0%|[32m          [0m| 0/1 [00:07<?, ?it/s]








Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.22s/it]








Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.31s/it]
Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.25s/it]
Uploading:   0%|[32m          [0m| 0/1 [00:00<?, ?it/s]









[A[A[A[A[A[A[A[A[A[A









Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.32s/it]











[A[A[A[A[A[A[A[A[A[A[A










Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.27s/it]


Datasite Sports_Direct_-_Southeast is up and running: http://localhost:8098














[A[A[A[A[A[A[A[A[A[A[A[A











Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:06<00:00,  6.94s/it]













[A[A[A[A[A[A[A[A[A[A[A[A[A












Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.45s/it]














[A[A[A[A[A[A[A[A[A[A[A[A[A[A













Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.10s/it]
Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:06<00:00,  6.90s/it]


Datasite Amazon_-_West is up and running: http://localhost:8085
Datasite Sports_Direct_-_Northeast is up and running: http://localhost:8096
Datasite Sports_Direct_-_West is up and running: http://localhost:8099
Datasite Amazon_-_Northeast is up and running: http://localhost:8082
Datasite Foot_Locker_-_Northeast is up and running: http://localhost:8087



[A
Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:06<00:00,  6.96s/it]


Datasite Walmart_-_West is up and running: http://localhost:8103
Datasite Foot_Locker_-_South is up and running: http://localhost:8088




[A[A

Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:06<00:00,  6.92s/it]


Datasite Kohl's_-_Midwest is up and running: http://localhost:8091





[A[A[A


Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.07s/it]




[A[A[A[A



Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:06<00:00,  6.87s/it]


Datasite West_Gear_-_Midwest is up and running: http://localhost:8104







[A[A[A[A[A




Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.05s/it]

Datasite Sports_Direct_-_South is up and running: http://localhost:8097










[A[A[A[A[A[A[A






Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:06<00:00,  7.00s/it]








[A[A[A[A[A[A[A[A







Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.01s/it]






[A[A[A[A[A[A





Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.69s/it]

Datasite West_Gear_-_South is up and running: http://localhost:8106



Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:07<00:00,  7.08s/it]


Datasite eval_silo is up and running: http://localhost:8109
Datasite Walmart_-_South is up and running: http://localhost:8101
Datasite West_Gear_-_Southeast is up and running: http://localhost:8107
Datasite Walmart_-_Southeast is up and running: http://localhost:8102
Datasite Walmart_-_Northeast is up and running: http://localhost:8100
Datasite West_Gear_-_Northeast is up and running: http://localhost:8105


Uploading: Asset: 100%|[32m██████████[0m| 1/1 [00:06<00:00,  6.92s/it]


Datasite West_Gear_-_West is up and running: http://localhost:8108
 Todos los DataSites están arriba


Approving request on change ml_experiment for datasite Amazon_-_Midwest
Approving request on change ml_experiment for datasite Amazon_-_Northeast
Approving request on change ml_experiment for datasite Amazon_-_South
Approving request on change ml_experiment for datasite Amazon_-_Southeast
Approving request on change ml_experiment for datasite Amazon_-_West
Approving request on change ml_experiment for datasite Foot_Locker_-_Midwest
Approving request on change ml_experiment for datasite Foot_Locker_-_Northeast
Approving request on change ml_experiment for datasite Foot_Locker_-_South
Approving request on change ml_experiment for datasite Foot_Locker_-_Southeast
Approving request on change ml_experiment for datasite Kohl's_-_Midwest
Approving request on change ml_experiment for datasite Kohl's_-_Northeast
Approving request on change ml_experiment for datasite Kohl's_-_South
Approving request on change ml_experiment for datasite Kohl's_-_West
Approving request on change ml_experiment for 

# Configuración y ejecución de los clientes

In [15]:
from fed_rf_mk.client import FLClient

# 2.1 Initialise the client for the selected partition scheme
fl_client = FLClient(partition_scheme=SCHEME)

# 2.2 Register every training silo that was actually launched.
# Only the ports recorded in `live_ports` have a server behind them; ports in
# SKIP_PORTS were never started, so connecting to PORT0 + i for every CSV
# would target a dead port. The silo index is recovered from the port so the
# "train_<i>" names keep matching the position of the CSV in `train_csvs`.
for port in live_ports:
    fl_client.add_train_client(
        name=f"train_{port - PORT0}",
        url=f"http://localhost:{port}",
        email="fedlearning@rf.com",
        password="****"
    )

# 2.3 Register the evaluation silo on the port it was really started on
# (`eval_port` may differ from PORT0 + len(train_csvs) when that port is
# skipped or already taken).
fl_client.add_eval_client(
    name="eval_silo",
    url=f"http://localhost:{eval_port}",
    email="fedlearning@rf.com",
    password="****"
)

print("✅ Cliente configurado con silos de entrenamiento y evaluación")


Logged into <Amazon_-_Midwest: High side Datasite> as <fedlearning@rf.com>
Successfully connected to train_0 at http://localhost:8081
Logged into <Amazon_-_Northeast: High side Datasite> as <fedlearning@rf.com>
Successfully connected to train_1 at http://localhost:8082
Logged into <Amazon_-_South: High side Datasite> as <fedlearning@rf.com>
Successfully connected to train_2 at http://localhost:8083
Logged into <Amazon_-_Southeast: High side Datasite> as <fedlearning@rf.com>
Successfully connected to train_3 at http://localhost:8084
Logged into <Amazon_-_West: High side Datasite> as <fedlearning@rf.com>
Successfully connected to train_4 at http://localhost:8085
Logged into <Foot_Locker_-_Midwest: High side Datasite> as <fedlearning@rf.com>
Successfully connected to train_5 at http://localhost:8086
Logged into <Foot_Locker_-_Northeast: High side Datasite> as <fedlearning@rf.com>
Successfully connected to train_6 at http://localhost:8087
Logged into <Foot_Locker_-_South: High side Datasit

## Definir Parámetros de Datos y Modelo

In [16]:
# 3.1 Data parameters sent to the federated client.
# The columns derived from Units Sold were already removed when the node CSVs
# were exported, so nothing is left to ignore here.
# NOTE(review): this rebinds the notebook-level `data_params` that
# export_partition reads at call time; re-running the export cells after this
# cell would use the empty ignored_columns list — confirm this is intended.
data_params = {
    "target": "Units Sold",
    "ignored_columns": [     # already dropped by the pre-processing function
    ]
}
fl_client.set_data_params(data_params)
print(" Data params:", data_params)


# 3.2 Model parameters
model_params = {
    "model": None,                  # first round starts without a model
    "n_base_estimators": 100,
    "n_incremental_estimators": 10,
    "train_size": 0.8,
    "test_size": 0.2,
    "fl_epochs": 2,

    # additional RandomForest hyper-parameters
    "max_depth": 15,
    "max_features": "sqrt",
    "min_samples_leaf": 1,
}


fl_client.set_model_params(model_params)

print(" ✅ Parámetros de datos y modelo establecidos")


 Data params: {'target': 'Units Sold', 'ignored_columns': []}
 ✅ Parámetros de datos y modelo establecidos


## Entrenamiento y evaluación.

In [17]:
# Send the training request through the configured client
# (presumably submits the `ml_experiment` code request to every registered datasite — verify in fed_rf_mk)
fl_client.send_request()

In [18]:
#fl_client.check_status_last_code_requests()

In [19]:
#server.list_pending_requests()

In [20]:
# After fl_client.send_request(), wait a couple of seconds and refresh every client
# so that the datasites have received the request and can process it.
# Not strictly necessary, but it helps to avoid synchronisation problems.
import time
time.sleep(2)

for client in fl_client.datasites.values():
    client.refresh()       # reload the list of endpoints
for client in fl_client.eval_datasites.values():
    client.refresh()

# List the public attributes of the first train datasite's `code` object to check that 'ml_experiment' exists
print("Endpoints en train_0:", [m for m in dir(list(fl_client.datasites.values())[0].code) if not m.startswith("_")])

Endpoints en train_0: ['call', 'get_all', 'get_all_for_user', 'get_by_id', 'get_by_service_func_name', 'ml_experiment', 'path', 'request_code_execution', 'store_execution_output', 'submit']


In [21]:
# Launch the federated training
fl_client.run_model()

print("Entrenamiento federado completado")


All weights were None. Assigning equal weight: 0.037037037037037035
Plantilla global cargado: 20 columnas

Epoch 1/2
Lanzando epoch inicial en paralelo…
   ✔ train_20: n=240, mae=39.169, rmse=55.911, time=29.21s
   ✔ train_27: n=764, mae=18.587, rmse=28.375, time=29.83s
   ✔ train_17: n=273, mae=26.651, rmse=34.944, time=33.52s
   ✔ train_14: n=184, mae=21.087, rmse=31.995, time=39.44s
   ✔ train_16: n=468, mae=25.545, rmse=43.563, time=40.19s
   ✔ train_8: n=272, mae=36.079, rmse=48.432, time=45.35s
   ✔ train_11: n=105, mae=33.236, rmse=63.232, time=47.70s
   ✔ train_21: n=92, mae=39.433, rmse=53.667, time=49.39s
   ✔ train_7: n=88, mae=41.398, rmse=59.416, time=51.48s
   ✔ train_26: n=56, mae=59.504, rmse=73.290, time=53.79s
   ✔ train_4: n=85, mae=13.963, rmse=18.823, time=55.25s
   ✔ train_2: n=8, mae=27.833, rmse=33.549, time=56.69s
   ✔ train_15: n=297, mae=12.893, rmse=17.390, time=57.03s
   ✔ train_22: n=26, mae=20.917, rmse=24.232, time=57.11s
   ✔ train_3: n=88, mae=39.677, 

In [22]:
# Evaluate the trained model on the registered evaluation silo(s) and print what the client returns
# (in the recorded run: a dict with 'mae' and 'rmse')
results = fl_client.run_evaluate()
print(" Resultados de evaluación global:", results)


Number of evaluation sites: 1

Evaluating model at eval_silo
 Resultados de evaluación global: {'mae': 64.77994487287624, 'rmse': 102.13392207657787}


# Diseño de experimentos

In [5]:
# ------------------------------------------------------------------
# Generador de grids de hiper-parámetros  
# ------------------------------------------------------------------
from itertools import product

# Hyper-parameter search space: each key maps to the list of candidate values
# that build_param_grid() combines via a Cartesian product.
RF_PARAM_GRID = {
    "n_base_estimators":        [100, 200],
    "n_incremental_estimators": [10],           # fixed (single value)
    "train_size":               [0.70, 0.80],
    "max_depth":                [None, 8, 10],
    "max_features":             [None],
    "min_samples_leaf":         [1, 4],
}


def build_param_grid(grid: dict) -> list[dict]:
    """Expand a hyper-parameter grid into one config dict per combination.

    Args:
        grid: Mapping of parameter name to the list of candidate values.
            It must contain a ``"train_size"`` entry, from which
            ``"test_size"`` is derived as its complement.

    Returns:
        A list with one dict per element of the Cartesian product of the
        grid values. Each dict holds the fixed defaults (``model``,
        ``test_size``, ``sample_size``, ``fl_epochs``) followed by the grid
        parameters; a grid key that collides with a default overrides it.

    Raises:
        ValueError: If ``grid`` has no ``"train_size"`` entry.
    """
    out = []
    for values in product(*grid.values()):
        combo = dict(zip(grid, values))
        try:
            train_size = combo["train_size"]
        except KeyError:
            raise ValueError(
                "grid must contain a 'train_size' entry to derive 'test_size'"
            ) from None

        d = {
            "model": None,
            # Round away binary floating-point noise so that 1 - 0.7 is
            # stored as 0.3 rather than 0.30000000000000004.
            "test_size": round(1 - train_size, 10),
            "sample_size": None,
            "fl_epochs": 2,               # number of federated epochs is fixed
        }
        d.update(combo)
        out.append(d)
    return out

# Expand the search space into the list of run configurations and report
# how many there are.
param_grid = build_param_grid(RF_PARAM_GRID)
print(f"Total configs: {len(param_grid)}")



Total configs: 24


In [None]:
# ------------------------------------------------------------------
# Lanzador de experimentos en batch
# ------------------------------------------------------------------
from pathlib import Path
import pandas as pd, json, time

GRID_RESULTS  = []                        # filled with one dict per run
OUT_ROOT      = Path("results/grid_runs") # common folder; run N writes to run_NN/
RESULTS_CSV   = Path("artefactos/grid_results.csv")

for idx, cfg in enumerate(param_grid, start=1):
    print(f"\n🧪  Experimento {idx}/{len(param_grid)}  —  {cfg}")

    # Build the full hyper-parameter dictionary for this run.
    run_params = {
        "model"                 : None,   # always start from scratch
        "n_base_estimators"     : cfg["n_base_estimators"],
        "n_incremental_estimators": cfg["n_incremental_estimators"],
        "train_size"            : cfg["train_size"],
        "test_size"             : cfg["test_size"],
        "fl_epochs"             : cfg["fl_epochs"],
        # additional hyper-parameters (None when absent from the config)
        "max_depth"             : cfg.get("max_depth"),
        "max_features"          : cfg.get("max_features"),
        "min_samples_leaf"      : cfg.get("min_samples_leaf"),
    }

    # Push the parameters to the federated client.
    fl_client.set_model_params(run_params)

    # Train, writing this run's artefacts under its own run_NN folder,
    # then evaluate.
    fl_client.run_model(output_root=OUT_ROOT / f"run_{idx:02d}")
    metrics = fl_client.run_evaluate()        # {'mae': …, 'rmse': …}

    # Record the results together with the run number, so that a row can
    # always be traced back to its run_NN output folder.
    GRID_RESULTS.append({"run_id": idx, **cfg, **metrics})
    print(f"   → MAE={metrics['mae']:.3f} | RMSE={metrics['rmse']:.3f}")

# ------------------------------------------------------------------
# Final summary
# ------------------------------------------------------------------
df_results = pd.DataFrame(GRID_RESULTS)

# Persist the rows in execution order BEFORE sorting. The analysis cell
# rebuilds the run folder from the row position (row i -> run_{i+1}); writing
# the RMSE-sorted frame with index=False made the best row always map to
# run_01, whichever run actually won.
RESULTS_CSV.parent.mkdir(parents=True, exist_ok=True)
df_results.to_csv(RESULTS_CSV, index=False)

# Sort only the in-memory frame, for display.
df_results.sort_values("rmse", inplace=True)

print("\n Experimentos terminados — Top-5 por RMSE:")
display(df_results.head())



🧪  Experimento 1/108  —  {'model': None, 'test_size': 0.30000000000000004, 'sample_size': None, 'fl_epochs': 2, 'n_base_estimators': 100, 'n_incremental_estimators': 10, 'train_size': 0.7, 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1}
Plantilla global cargado: 20 columnas

Epoch 1/2
Lanzando epoch inicial en paralelo…


In [None]:
# ------------------------------------------------------------------
# 4.  Análisis de los resultados (versión extendida)
# ------------------------------------------------------------------
import pandas as pd, matplotlib.pyplot as plt, seaborn as sns, json, pickle
from pathlib import Path
from sklearn.tree import plot_tree
from textwrap import indent

plt.style.use("ggplot")

# -------------------------------------------------- 4.0  load data
# Read the grid-search results written by the batch launcher.
CSV = Path("artefactos/grid_results.csv")
df  = pd.read_csv(CSV)
print(f"{len(df)} ejecuciones leídas de {CSV}")

# -------------------------------------------------- 4.1  summary table
# min / mean / std of both error metrics, transposed so each metric is a row.
agg = df.agg({"mae": ["min","mean","std"],
              "rmse":["min","mean","std"]}).T
display(agg)

# -------------------------------------------------- 4.2  pair-plot of numeric hyper-parameters
# Candidate columns; only those actually present in the CSV are kept, so
# entries such as "min_samples_split" are skipped when they were not part
# of the grid.
num_cols = ["n_base_estimators", "train_size",
            "max_depth", "min_samples_leaf",
            "min_samples_split"]
num_cols = [c for c in num_cols if c in df.columns]
if num_cols:
    # Points are coloured by the run's rmse value.
    sns.pairplot(df, vars=num_cols, hue="rmse",
                 diag_kind="hist", corner=True,
                 palette="viridis", height=1.6)
    plt.suptitle("Pair-plot hiper-parámetros", y=1.02)
    plt.show()

# -------------------------------------------------- 4.3  correlation
# Correlation between the selected hyper-parameters and both metrics.
corr = df[num_cols+["mae","rmse"]].corr()
plt.figure(figsize=(8,6))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Matriz de correlación")
plt.show()

# -------------------------------------------------- 4.4  Top / Worst-5
def show_top(metric, k=5):
    """Display the k best and k worst rows of the global ``df`` by ``metric``.

    Lower values count as better: the "top" table holds the k smallest
    values of the column and the "worst" table the k largest.

    Args:
        metric: Name of the ``df`` column to rank by.
        k: Number of rows shown in each table.
    """
    # Both selections are computed before anything is printed.
    sections = (
        (f"\n  Top-{k} por {metric}", df.nsmallest(k, metric)),
        (f"\n  Peor-{k} por {metric}", df.nlargest(k, metric)),
    )
    for heading, rows in sections:
        print(heading)
        display(rows)

show_top("rmse")
show_top("mae")

# -------------------------------------------------- 4.5  2-hyper-parameter heat-map (example)
if {"max_depth","min_samples_leaf"}.issubset(df.columns):
    # max_depth=None (unlimited depth) is read back from the CSV as NaN, and
    # pivot_table drops rows whose index key is NaN, which would silently
    # remove every unlimited-depth run from the heat-map. Replace NaN with a
    # numeric sentinel for the pivot, then relabel that row as "None".
    pivot = (df
             .assign(max_depth=df["max_depth"].fillna(-1))
             .pivot_table(index="max_depth",
                          columns="min_samples_leaf",
                          values="rmse",
                          aggfunc="min")
             .rename(index={-1: "None"}))
    plt.figure(figsize=(6,4))
    sns.heatmap(pivot, annot=True, fmt=".1f", cmap="YlGnBu")
    plt.title("RMSE  |  max_depth × min_samples_leaf")
    plt.ylabel("max_depth"); plt.xlabel("min_samples_leaf")
    plt.show()

# -------------------------------------------------- 4.6  bosque ganador – importancias
best = df.loc[df["rmse"].idxmin()]

# Resolve the output folder of the winning run. Prefer an explicit run
# identifier column when the CSV provides one; the row position is only a
# valid run number when the CSV rows are stored in execution order (it is
# NOT valid for a CSV that was sorted by RMSE before being written).
if "run_id" in df.columns:
    run_number = int(best["run_id"])
else:
    run_number = best.name + 1
    print("Aviso: el CSV no tiene columna 'run_id'; se asume que las filas "
          "están en orden de ejecución para localizar el modelo.")

best_dir   = Path(f"results/grid_runs/run_{run_number:02d}")
model_pkl  = best_dir / "model.pkl"
print(f"\n🏆  Mejor configuración:\n{indent(best.to_string(), '   ')}")
print(f"→ Modelo: {model_pkl}")

# NOTE(review): pickle.load can execute arbitrary code; only load model
# files produced by your own runs.
with open(model_pkl,"rb") as f:
    rf = pickle.load(f)

# Feature names come from the saved column template; the 15 largest
# importances are kept.
feat_names = json.loads(Path("artefactos/template_cols.json").read_text())
importances = pd.Series(rf.feature_importances_, index=feat_names)\
                .sort_values(ascending=False).head(15)

plt.figure(figsize=(6,4))
# Reverse so the most important feature ends up at the top of the bar chart.
importances[::-1].plot(kind="barh")
plt.title("Top-15 importancias (bosque ganador)")
plt.xlabel("importance")
plt.tight_layout(); plt.show()

# -------------------------------------------------- 4.7  draw one specific tree
tree_idx = 0
plt.figure(figsize=(20,10))
plot_tree(rf.estimators_[tree_idx],
          feature_names=feat_names,
          filled=True, rounded=True,
          precision=2, fontsize=7)
plt.title(f"Árbol #{tree_idx} (bosque ganador)")
plt.show()
