In [1]:
import sys
import os
import pandas as pd
from pathlib import Path


In [2]:
# Make the project's ``src`` directory (a sibling of the notebook's working
# directory) importable; the guard keeps sys.path free of duplicates on re-runs.
src_path = str(Path.cwd().parent / 'src')
if src_path not in sys.path:
    sys.path.append(src_path)


In [3]:
from calculadora_margen.encoder import Encoder
from calculadora_margen.cleaning import Cleaner
from calculadora_margen.cleaning.params import Parameters
from calculadora_margen.cleaning.validador import Validator
from calculadora_margen.cleaning.outliers_manager import OutliersManager

In [4]:
# Derive the data folders from the project root, i.e. the parent of ``src``.
project_root_path = Path(src_path).parent
data_path = project_root_path.joinpath('data')
raw_path, clean_path = data_path / 'raw', data_path / 'clean'

ETL master_lotes

In [5]:
# Load the raw export with every column read as text (dtype=str).
# NOTE(review): this section reads 'costes.csv', the same raw file the costes
# section loads — confirm it is the intended source for the lot master.
master_lotes = pd.read_csv(
    raw_path.joinpath('costes.csv'),
    sep=';',
    encoding='UTF-8',
    dtype=str,
)

In [6]:
# Run the Cleaner pipeline configured by Parameters.master_lotes: select/rename
# columns, drop duplicate rows, then drop rows with no 'lote_interno' value.
# The cleaned frame replaces the raw one under the same name.
params = Parameters.master_lotes
cleaner = Cleaner(master_lotes)

master_lotes = (
    cleaner
    .keep_and_rename(params.cols_to_keep, params.rename_map)
    .drop_duplicates()
    .drop_na(subset=['lote_interno'])
    .get_df()
)

In [7]:
# Build a unique key column ('clave_unica') so this frame can be merged with
# other DataFrames later on.
encoder = Encoder(master_lotes)
master_lotes = encoder.create_key(
    col2='lote_interno',
    new_col_name='clave_unica',
)

In [8]:
# Sanity check: column names, dtypes and non-null counts of the cleaned lot master.
master_lotes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19252 entries, 4 to 19325
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   articulo        19252 non-null  object
 1   lote_proveedor  19252 non-null  object
 2   lote_interno    19252 non-null  object
 3   clave_unica     19252 non-null  object
dtypes: object(4)
memory usage: 752.0+ KB


In [9]:
# Persist the cleaned lot master. The output folder is created first because
# DataFrame.to_csv does not create missing parent directories, so a fresh
# checkout without data/clean would otherwise fail here.
clean_path.mkdir(parents=True, exist_ok=True)
master_lotes.to_csv(clean_path / 'master_lotes_clean.csv', index=False)

ETL costes

In [10]:
# Read the raw cost export with every column loaded as text (dtype=str).
costes = pd.read_csv(
    raw_path.joinpath('costes.csv'),
    sep=';',
    encoding='UTF-8',
    dtype=str,
)

In [11]:
# Run the Cleaner pipeline for costes using the settings in Parameters.costes.
# The PRCMONEDA filter runs before keep_and_rename because it refers to the raw
# column name (presumably renamed or dropped by that step — confirm against
# Parameters.costes).
params = Parameters.costes
cleaner = Cleaner(costes)

costes = (
    cleaner
    .drop_na(subset=['PRCMONEDA'])
    .drop_duplicates()
    .keep_and_rename(params.cols_to_keep, params.rename_map)
    .fix_numeric_format(params.cols_to_float)
    .drop_duplicates_batch('lote_interno')
    .get_df()
)

In [12]:
# Validate costes against Parameters.costes.validation_map and keep the frame
# returned by the validator (in the recorded run: 8000 rows in, 7978 rows out).
params = Parameters.costes
validator = Validator(costes)

costes = validator.validate_with_map(params.validation_map).get_df()

Tamaño inicial del DataFrame: 8000

Validando columna: componente
Filas inválidas encontradas: 1

Validando columna: lote_interno
Filas inválidas encontradas: 21

Tamaño final del DataFrame: 7978


In [13]:
# Optional debugging aid (left disabled): inspect the invalid rows for a specific column
#invalid_rows = validator.get_invalid('lote_interno')
#print(invalid_rows.head(10))

In [None]:
# Apply the OutliersManager steps to costes and keep the resulting frame.
# In the recorded run the printed summary reports 71 outliers detected,
# 66 replaced by the mean and 5 remaining.
outliers_manager = OutliersManager(costes)
costes = outliers_manager.process_outliers().clean_columns().get_df()


=== RESUMEN DE OUTLIERS ===
Outliers detectados inicialmente: 71
Outliers reemplazados por la media: 66
Outliers restantes: 5

Detalle de outliers restantes:
- MAT101: 1 outliers (desviación media: 0.0%)
- MAT183: 3 outliers (desviación media: 0.0%)
- MAT265: 1 outliers (desviación media: 0.0%)


In [None]:
# Persist the cleaned costs. The output folder is created first because
# DataFrame.to_csv does not create missing parent directories, so a fresh
# checkout without data/clean would otherwise fail here.
clean_path.mkdir(parents=True, exist_ok=True)
costes.to_csv(clean_path / 'costes_clean.csv', index=False)

In [15]:
# Sanity check: column names, dtypes and non-null counts of the final costes frame.
costes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7978 entries, 0 to 7977
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   coste_componente_unitario  7978 non-null   float64
 1   lote_interno               7978 non-null   object 
 2   componente                 7978 non-null   object 
dtypes: float64(1), object(2)
memory usage: 187.1+ KB
