In [1]:
# System and file-handling modules
import os                     # Operating-system interaction (paths, environment)
import glob                   # File lookup by glob pattern (e.g. *.csv)

# Core data-science libraries
import numpy as np            # Numerical operations on arrays
import pandas as pd           # Tabular data structures (DataFrames)

# Data visualization
import matplotlib.pyplot as plt  # Plotting

# Data preprocessing (normalization / standardization)
from sklearn.preprocessing import (
    MinMaxScaler,    # Scales features into a fixed range (e.g. [0, 1])
    StandardScaler,  # Standardizes features (mean 0, unit variance)
    RobustScaler     # Quantile-based scaling that is robust to outliers
)

# Object persistence
import joblib                 # Save / load Python objects such as fitted models

# Jupyter-specific configuration
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"  # Display every top-level expression of a cell, not only the last one

In [2]:
# 1. Load data ------------------------------------------------------------------
# InteractiveShell.ast_node_interactivity = "all" is already configured in the
# imports cell, so the import/assignment is not repeated here.
#
# NOTE(review): the directory below is an absolute, machine-specific path; the
# notebook only runs unchanged on a machine that has this exact folder.
archivo_excel = os.path.join(
    'C:/Users/Milovan/Desktop/notebook para repositorio/reboiler',
    'Reboiler_Raw_Data_With_Timestamps.xlsx'
)

# header=0: the first spreadsheet row holds the column names.
df1 = pd.read_excel(archivo_excel, header=0)

# 2. Quick metadata summary -----------------------------------------------------
info_df = {
    'variable': 'df1',                     # Name the DataFrame is bound to
    'tipo_dato': type(df1).__name__,       # Object type name ('DataFrame')
    'dimensiones': df1.shape,              # (rows, columns)
    'columnas': list(df1.columns),         # Column names
    'muestra': df1.head(2).to_dict()       # First 2 rows as a nested dict
}

# 3. Display (both expressions render because ast_node_interactivity == "all") ---
info_df  # Metadata dictionary
df1      # DataFrame repr (pandas truncates long frames to head/tail rows)

{'variable': 'df1',
 'tipo_dato': 'DataFrame',
 'dimensiones': (42206, 10),
 'columnas': ['Fecha',
  'VAL356CI8017-Conductividad',
  'VAL356M003-Carga Motor',
  'VAL356M014-Carga Motor',
  'VAL356M015-Carga Motor',
  'VAL356PI8026-Ind.Presión',
  'VAL356PIC8025-Ind.Presión',
  'VAL356TI8015-Ind.Temperatura',
  'VAL356TIC8014-Ind.Temperatura',
  'Estado'],
 'muestra': {'Fecha': {0: Timestamp('2006-10-24 23:38:00'),
   1: Timestamp('2006-10-25 01:38:00')},
  'VAL356CI8017-Conductividad': {0: 12.3, 1: 18.9},
  'VAL356M003-Carga Motor': {0: 68.0264591, 1: 68.827098},
  'VAL356M014-Carga Motor': {0: 45.6700563, 1: 45.9245689},
  'VAL356M015-Carga Motor': {0: 30.3431731, 1: 30.1266536},
  'VAL356PI8026-Ind.Presión': {0: 21.4925528, 1: 20.9549559},
  'VAL356PIC8025-Ind.Presión': {0: 20.9, 1: 20.6},
  'VAL356TI8015-Ind.Temperatura': {0: 71.883617, 1: 72.3014535},
  'VAL356TIC8014-Ind.Temperatura': {0: 103.4148458, 1: 103.487449},
  'Estado': {0: 0, 1: 0}}}

Unnamed: 0,Fecha,VAL356CI8017-Conductividad,VAL356M003-Carga Motor,VAL356M014-Carga Motor,VAL356M015-Carga Motor,VAL356PI8026-Ind.Presión,VAL356PIC8025-Ind.Presión,VAL356TI8015-Ind.Temperatura,VAL356TIC8014-Ind.Temperatura,Estado
0,2006-10-24 23:38:00,12.3,68.026459,45.670056,30.343173,21.492553,20.9,71.883617,103.414846,0
1,2006-10-25 01:38:00,18.9,68.827098,45.924569,30.126654,20.954956,20.6,72.301453,103.487449,0
2,2006-10-25 03:38:00,25.9,68.871664,49.477196,29.701442,21.193045,20.3,71.775536,103.582936,0
3,2006-10-25 05:38:00,27.2,65.658285,51.649763,28.820497,21.021897,20.6,70.589861,103.697800,0
4,2006-10-25 07:38:00,22.0,64.166553,46.851182,27.760388,20.975880,20.4,71.762377,103.680362,0
...,...,...,...,...,...,...,...,...,...,...
42201,2018-09-06 01:38:00,7.4,,72.696510,43.464317,19.695450,19.5,81.372489,99.598170,0
42202,2018-09-06 03:38:00,7.2,,72.905098,44.058929,19.826046,19.5,81.911505,99.633708,0
42203,2018-09-06 05:38:00,6.9,,72.968914,44.584087,19.523726,19.5,80.810454,99.775918,0
42204,2018-09-06 07:38:00,6.9,,72.905579,44.417335,19.308117,19.5,81.587220,100.136055,0


In [3]:
# Remove the 'VAL356M003-Carga Motor' column and rebind df1 to the result.
# Re-running this cell once the column is gone raises KeyError, because
# DataFrame.drop rejects missing labels by default.
df1 = df1.drop('VAL356M003-Carga Motor', axis='columns')

# Small metadata record describing the frame after the drop.
attrs = dict(
    var='df1',                       # Name of the variable
    type=type(df1).__name__,         # Object type name ('DataFrame')
    columns=df1.columns.tolist(),    # Remaining column names
)

# Display the metadata and the updated DataFrame.
[attrs]
df1

[{'var': 'df1',
  'type': 'DataFrame',
  'columns': ['Fecha',
   'VAL356CI8017-Conductividad',
   'VAL356M014-Carga Motor',
   'VAL356M015-Carga Motor',
   'VAL356PI8026-Ind.Presión',
   'VAL356PIC8025-Ind.Presión',
   'VAL356TI8015-Ind.Temperatura',
   'VAL356TIC8014-Ind.Temperatura',
   'Estado']}]

Unnamed: 0,Fecha,VAL356CI8017-Conductividad,VAL356M014-Carga Motor,VAL356M015-Carga Motor,VAL356PI8026-Ind.Presión,VAL356PIC8025-Ind.Presión,VAL356TI8015-Ind.Temperatura,VAL356TIC8014-Ind.Temperatura,Estado
0,2006-10-24 23:38:00,12.3,45.670056,30.343173,21.492553,20.9,71.883617,103.414846,0
1,2006-10-25 01:38:00,18.9,45.924569,30.126654,20.954956,20.6,72.301453,103.487449,0
2,2006-10-25 03:38:00,25.9,49.477196,29.701442,21.193045,20.3,71.775536,103.582936,0
3,2006-10-25 05:38:00,27.2,51.649763,28.820497,21.021897,20.6,70.589861,103.697800,0
4,2006-10-25 07:38:00,22.0,46.851182,27.760388,20.975880,20.4,71.762377,103.680362,0
...,...,...,...,...,...,...,...,...,...
42201,2018-09-06 01:38:00,7.4,72.696510,43.464317,19.695450,19.5,81.372489,99.598170,0
42202,2018-09-06 03:38:00,7.2,72.905098,44.058929,19.826046,19.5,81.911505,99.633708,0
42203,2018-09-06 05:38:00,6.9,72.968914,44.584087,19.523726,19.5,80.810454,99.775918,0
42204,2018-09-06 07:38:00,6.9,72.905579,44.417335,19.308117,19.5,81.587220,100.136055,0


In [4]:
# Replace the raw tag-prefixed column names with shorter, readable labels.
# Labels that are not keys of the mapping (e.g. 'Fecha', 'Estado') are kept.
df1 = df1.rename(
    {
        'VAL356CI8017-Conductividad': 'Conductividad',
        'VAL356M014-Carga Motor': 'Carga motor (M014)',
        'VAL356M015-Carga Motor': 'Carga motor (M015)',
        'VAL356PI8026-Ind.Presión': 'Ind.Presión (PI8026)',
        'VAL356PIC8025-Ind.Presión': 'Ind.Presión (PIC8025)',
        'VAL356TI8015-Ind.Temperatura': 'Ind.Temperatura (TI8015)',
        'VAL356TIC8014-Ind.Temperatura': 'Ind.Temperatura (TIC8014)',
    },
    axis='columns',
)

# Basic metadata record for the renamed frame.
attrs = dict(
    var='df1',                 # Name of the variable
    type=type(df1).__name__,   # Object type name ('DataFrame')
)

# Display the metadata and the resulting DataFrame.
[attrs]
df1

[{'var': 'df1', 'type': 'DataFrame'}]

Unnamed: 0,Fecha,Conductividad,Carga motor (M014),Carga motor (M015),Ind.Presión (PI8026),Ind.Presión (PIC8025),Ind.Temperatura (TI8015),Ind.Temperatura (TIC8014),Estado
0,2006-10-24 23:38:00,12.3,45.670056,30.343173,21.492553,20.9,71.883617,103.414846,0
1,2006-10-25 01:38:00,18.9,45.924569,30.126654,20.954956,20.6,72.301453,103.487449,0
2,2006-10-25 03:38:00,25.9,49.477196,29.701442,21.193045,20.3,71.775536,103.582936,0
3,2006-10-25 05:38:00,27.2,51.649763,28.820497,21.021897,20.6,70.589861,103.697800,0
4,2006-10-25 07:38:00,22.0,46.851182,27.760388,20.975880,20.4,71.762377,103.680362,0
...,...,...,...,...,...,...,...,...,...
42201,2018-09-06 01:38:00,7.4,72.696510,43.464317,19.695450,19.5,81.372489,99.598170,0
42202,2018-09-06 03:38:00,7.2,72.905098,44.058929,19.826046,19.5,81.911505,99.633708,0
42203,2018-09-06 05:38:00,6.9,72.968914,44.584087,19.523726,19.5,80.810454,99.775918,0
42204,2018-09-06 07:38:00,6.9,72.905579,44.417335,19.308117,19.5,81.587220,100.136055,0


In [5]:
# Parse the 'Fecha' column as datetimes using an explicit format.
# errors='coerce' turns any value that does not match the format into NaT
# instead of raising.
# NOTE(review): the load cell's output already shows Timestamp values in
# 'Fecha', so this is presumably a no-op safeguard -- confirm.
df1['Fecha'] = pd.to_datetime(
    df1['Fecha'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce',
)

# Basic metadata record.
attrs = dict(
    var='df1',
    type=type(df1).__name__,
)

# Display the metadata and the DataFrame.
[attrs]
df1

[{'var': 'df1', 'type': 'DataFrame'}]

Unnamed: 0,Fecha,Conductividad,Carga motor (M014),Carga motor (M015),Ind.Presión (PI8026),Ind.Presión (PIC8025),Ind.Temperatura (TI8015),Ind.Temperatura (TIC8014),Estado
0,2006-10-24 23:38:00,12.3,45.670056,30.343173,21.492553,20.9,71.883617,103.414846,0
1,2006-10-25 01:38:00,18.9,45.924569,30.126654,20.954956,20.6,72.301453,103.487449,0
2,2006-10-25 03:38:00,25.9,49.477196,29.701442,21.193045,20.3,71.775536,103.582936,0
3,2006-10-25 05:38:00,27.2,51.649763,28.820497,21.021897,20.6,70.589861,103.697800,0
4,2006-10-25 07:38:00,22.0,46.851182,27.760388,20.975880,20.4,71.762377,103.680362,0
...,...,...,...,...,...,...,...,...,...
42201,2018-09-06 01:38:00,7.4,72.696510,43.464317,19.695450,19.5,81.372489,99.598170,0
42202,2018-09-06 03:38:00,7.2,72.905098,44.058929,19.826046,19.5,81.911505,99.633708,0
42203,2018-09-06 05:38:00,6.9,72.968914,44.584087,19.523726,19.5,80.810454,99.775918,0
42204,2018-09-06 07:38:00,6.9,72.905579,44.417335,19.308117,19.5,81.587220,100.136055,0


In [6]:
# Forward-fill missing values: every NaN/NaT is replaced by the last valid
# value that precedes it in the same column (leading NaNs stay as they are).
# DataFrame.ffill() is used instead of fillna(method='ffill'), whose `method`
# argument is deprecated in recent pandas and emits a FutureWarning.
df1 = df1.ffill()

# Basic metadata record for the DataFrame
attrs = {
    'var': 'df1',                # Name of the variable
    'type': type(df1).__name__,  # Object type name ('DataFrame')
}

# Display the metadata and the resulting DataFrame
[attrs]
df1

  df1 = df1.fillna(method='ffill')


[{'var': 'df1', 'type': 'DataFrame'}]

Unnamed: 0,Fecha,Conductividad,Carga motor (M014),Carga motor (M015),Ind.Presión (PI8026),Ind.Presión (PIC8025),Ind.Temperatura (TI8015),Ind.Temperatura (TIC8014),Estado
0,2006-10-24 23:38:00,12.3,45.670056,30.343173,21.492553,20.9,71.883617,103.414846,0
1,2006-10-25 01:38:00,18.9,45.924569,30.126654,20.954956,20.6,72.301453,103.487449,0
2,2006-10-25 03:38:00,25.9,49.477196,29.701442,21.193045,20.3,71.775536,103.582936,0
3,2006-10-25 05:38:00,27.2,51.649763,28.820497,21.021897,20.6,70.589861,103.697800,0
4,2006-10-25 07:38:00,22.0,46.851182,27.760388,20.975880,20.4,71.762377,103.680362,0
...,...,...,...,...,...,...,...,...,...
42201,2018-09-06 01:38:00,7.4,72.696510,43.464317,19.695450,19.5,81.372489,99.598170,0
42202,2018-09-06 03:38:00,7.2,72.905098,44.058929,19.826046,19.5,81.911505,99.633708,0
42203,2018-09-06 05:38:00,6.9,72.968914,44.584087,19.523726,19.5,80.810454,99.775918,0
42204,2018-09-06 07:38:00,6.9,72.905579,44.417335,19.308117,19.5,81.587220,100.136055,0


In [7]:
# Keep only the rows whose 'Fecha' is on or after 2016-01-01 00:00:00.
# `cond` is a boolean Series aligned with df1's index.
cond = df1['Fecha'].ge(
    pd.to_datetime('2016-01-01 00:00:00', format='%Y-%m-%d %H:%M:%S', errors='coerce')
)
df1 = df1[cond]

# Basic metadata record.
attrs = dict(
    var='df1',
    type=type(df1).__name__,
)

# Display the metadata and the filtered DataFrame.
[attrs]
df1

[{'var': 'df1', 'type': 'DataFrame'}]

Unnamed: 0,Fecha,Conductividad,Carga motor (M014),Carga motor (M015),Ind.Presión (PI8026),Ind.Presión (PIC8025),Ind.Temperatura (TI8015),Ind.Temperatura (TIC8014),Estado
30676,2016-01-08 11:38:00,4.5,69.658713,42.280192,18.769251,19.0,79.399877,99.160674,0
30677,2016-01-08 13:38:00,4.4,67.954487,42.381221,18.411020,19.0,78.184952,98.775669,0
30678,2016-01-08 15:38:00,4.2,69.688606,41.967134,18.601041,19.0,80.599028,101.165308,0
30679,2016-01-08 17:38:00,4.2,69.274182,42.124866,18.582166,19.0,79.301819,99.934365,0
30680,2016-01-08 19:38:00,4.2,69.095421,41.588124,18.593953,19.0,80.170472,100.804536,0
...,...,...,...,...,...,...,...,...,...
42201,2018-09-06 01:38:00,7.4,72.696510,43.464317,19.695450,19.5,81.372489,99.598170,0
42202,2018-09-06 03:38:00,7.2,72.905098,44.058929,19.826046,19.5,81.911505,99.633708,0
42203,2018-09-06 05:38:00,6.9,72.968914,44.584087,19.523726,19.5,80.810454,99.775918,0
42204,2018-09-06 07:38:00,6.9,72.905579,44.417335,19.308117,19.5,81.587220,100.136055,0


In [8]:
# Use the 'Fecha' column as the DataFrame index.
# The result is reassigned instead of using inplace=True: df1 was produced by a
# boolean-mask selection, and building a new frame avoids mutating that
# selection in place.
# Re-running this cell raises KeyError because 'Fecha' is no longer a column.
df1 = df1.set_index('Fecha')

# Basic metadata record
attrs = {
    'var': 'df1',
    'type': type(df1).__name__,
}

# Display the metadata and the DataFrame with its new index
[attrs]
df1

[{'var': 'df1', 'type': 'DataFrame'}]

Unnamed: 0_level_0,Conductividad,Carga motor (M014),Carga motor (M015),Ind.Presión (PI8026),Ind.Presión (PIC8025),Ind.Temperatura (TI8015),Ind.Temperatura (TIC8014),Estado
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-08 11:38:00,4.5,69.658713,42.280192,18.769251,19.0,79.399877,99.160674,0
2016-01-08 13:38:00,4.4,67.954487,42.381221,18.411020,19.0,78.184952,98.775669,0
2016-01-08 15:38:00,4.2,69.688606,41.967134,18.601041,19.0,80.599028,101.165308,0
2016-01-08 17:38:00,4.2,69.274182,42.124866,18.582166,19.0,79.301819,99.934365,0
2016-01-08 19:38:00,4.2,69.095421,41.588124,18.593953,19.0,80.170472,100.804536,0
...,...,...,...,...,...,...,...,...
2018-09-06 01:38:00,7.4,72.696510,43.464317,19.695450,19.5,81.372489,99.598170,0
2018-09-06 03:38:00,7.2,72.905098,44.058929,19.826046,19.5,81.911505,99.633708,0
2018-09-06 05:38:00,6.9,72.968914,44.584087,19.523726,19.5,80.810454,99.775918,0
2018-09-06 07:38:00,6.9,72.905579,44.417335,19.308117,19.5,81.587220,100.136055,0


In [9]:
# Keep only the rows whose 'Conductividad' value is at most 12.
# NOTE(review): the threshold 12 is a bare literal; its origin is not stated
# in the notebook.
cond = df1['Conductividad'].le(12)
df1 = df1[cond]

# Basic metadata record.
attrs = dict(
    var='df1',
    type=type(df1).__name__,
)

# Display the metadata and the filtered DataFrame.
[attrs]
df1

[{'var': 'df1', 'type': 'DataFrame'}]

Unnamed: 0_level_0,Conductividad,Carga motor (M014),Carga motor (M015),Ind.Presión (PI8026),Ind.Presión (PIC8025),Ind.Temperatura (TI8015),Ind.Temperatura (TIC8014),Estado
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-08 11:38:00,4.5,69.658713,42.280192,18.769251,19.0,79.399877,99.160674,0
2016-01-08 13:38:00,4.4,67.954487,42.381221,18.411020,19.0,78.184952,98.775669,0
2016-01-08 15:38:00,4.2,69.688606,41.967134,18.601041,19.0,80.599028,101.165308,0
2016-01-08 17:38:00,4.2,69.274182,42.124866,18.582166,19.0,79.301819,99.934365,0
2016-01-08 19:38:00,4.2,69.095421,41.588124,18.593953,19.0,80.170472,100.804536,0
...,...,...,...,...,...,...,...,...
2018-09-06 01:38:00,7.4,72.696510,43.464317,19.695450,19.5,81.372489,99.598170,0
2018-09-06 03:38:00,7.2,72.905098,44.058929,19.826046,19.5,81.911505,99.633708,0
2018-09-06 05:38:00,6.9,72.968914,44.584087,19.523726,19.5,80.810454,99.775918,0
2018-09-06 07:38:00,6.9,72.905579,44.417335,19.308117,19.5,81.587220,100.136055,0


In [10]:
from functools import partial
from pandas.tseries.frequencies import to_offset

# Floors a timestamp down to the start of its fixed-width time bucket
def roundfunc(t, freq):
    """Floor ``t`` to a multiple of ``freq`` counted from the Unix epoch.

    Parameters
    ----------
    t : pandas.Timestamp
        Timestamp to floor; its integer nanosecond value (``t.value``) is used.
    freq : str or pandas offset
        Fixed-duration frequency (e.g. ``'120min'``) accepted by ``to_offset``.

    Returns
    -------
    pandas.Timestamp
        Timestamp built from the floored nanosecond count.
    """
    # pd.Timedelta(offset) replaces the deprecated ``offset.delta`` attribute.
    step_ns = pd.Timedelta(to_offset(freq)).value
    # Integer floor-division drops the remainder inside the bucket.
    return pd.Timestamp((t.value // step_ns) * step_ns)

# Group the rows into 120-minute (2-hour) buckets: roundfunc is applied to each
# index label and rows that map to the same floored timestamp share a group.
# '120min' is used instead of '120T' because the 'T' alias is deprecated in
# recent pandas and emits a FutureWarning.
# NOTE: df1 is rebound to a DataFrameGroupBy here; the following cell calls
# .aggregate() on it, so this cell cannot be re-run on its own afterwards.
df1 = df1.groupby(partial(roundfunc, freq='120min'))

# Metadata record for the GroupBy object
attrs = {
    'var': 'df1',
    'type': type(df1).__name__,  # 'DataFrameGroupBy'
}

# Display the metadata and the GroupBy object
[attrs]
df1

# Peek at the first two groups.
# df1.groups maps each bucket timestamp to its row labels; enumerate yields
# (position, bucket timestamp) pairs, so nextel[1] is the group key.
enum = enumerate(df1.groups)
nextel = next(enum)           # First (position, bucket timestamp) pair
df1.get_group(nextel[1])      # Rows of the first group

nextel = next(enum)           # Second pair
df1.get_group(nextel[1])      # Rows of the second group

  freq = to_offset(freq)  # Convierte el string de frecuencia (ej: '120T') a objeto pandas
  return pd.Timestamp((t.value // freq.delta.value) * freq.delta.value)  # Redondeo matemático


[{'var': 'df1', 'type': 'DataFrameGroupBy'}]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000284CFDC69F0>

Unnamed: 0_level_0,Conductividad,Carga motor (M014),Carga motor (M015),Ind.Presión (PI8026),Ind.Presión (PIC8025),Ind.Temperatura (TI8015),Ind.Temperatura (TIC8014),Estado
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-08 11:38:00,4.5,69.658713,42.280192,18.769251,19.0,79.399877,99.160674,0


Unnamed: 0_level_0,Conductividad,Carga motor (M014),Carga motor (M015),Ind.Presión (PI8026),Ind.Presión (PIC8025),Ind.Temperatura (TI8015),Ind.Temperatura (TIC8014),Estado
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-08 13:38:00,4.4,67.954487,42.381221,18.41102,19.0,78.184952,98.775669,0


In [11]:
# Collapse each time bucket to the per-column median.
# df1 is the DataFrameGroupBy built in the previous cell; aggregating it yields
# a DataFrame with one row per bucket timestamp (see the recorded output).
df1 = df1.agg('median')

# Basic metadata record.
attrs = dict(
    var='df1',
    type=type(df1).__name__,  # 'DataFrame' according to the recorded output
)

# Display the metadata and the aggregated result.
[attrs]
df1

[{'var': 'df1', 'type': 'DataFrame'}]

Unnamed: 0_level_0,Conductividad,Carga motor (M014),Carga motor (M015),Ind.Presión (PI8026),Ind.Presión (PIC8025),Ind.Temperatura (TI8015),Ind.Temperatura (TIC8014),Estado
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-08 10:00:00,4.5,69.658713,42.280192,18.769251,19.0,79.399877,99.160674,0.0
2016-01-08 12:00:00,4.4,67.954487,42.381221,18.411020,19.0,78.184952,98.775669,0.0
2016-01-08 14:00:00,4.2,69.688606,41.967134,18.601041,19.0,80.599028,101.165308,0.0
2016-01-08 16:00:00,4.2,69.274182,42.124866,18.582166,19.0,79.301819,99.934365,0.0
2016-01-08 18:00:00,4.2,69.095421,41.588124,18.593953,19.0,80.170472,100.804536,0.0
...,...,...,...,...,...,...,...,...
2018-09-06 00:00:00,7.4,72.696510,43.464317,19.695450,19.5,81.372489,99.598170,0.0
2018-09-06 02:00:00,7.2,72.905098,44.058929,19.826046,19.5,81.911505,99.633708,0.0
2018-09-06 04:00:00,6.9,72.968914,44.584087,19.523726,19.5,80.810454,99.775918,0.0
2018-09-06 06:00:00,6.9,72.905579,44.417335,19.308117,19.5,81.587220,100.136055,0.0


In [12]:
# Write df1 to a CSV file at a fixed location.
# to_csv writes the index by default, so the bucket timestamps ('Fecha') are
# saved as the first column of the file.
file = os.path.join(
    'C:/Users/Milovan/Desktop/notebook para repositorio/reboiler',
    'Reboiler_Depurado.csv',
)
df1.to_csv(path_or_buf=file)

# Placeholder metadata record: this cell produces no new variable.
attrs = dict(
    var='empty',
    type='empty',
)

# Display the metadata and a textual confirmation.
[attrs]
'csv file saved'

[{'var': 'empty', 'type': 'empty'}]

'csv file saved'

In [13]:
# Read the CSV written by the previous cell back into a new DataFrame.
# header=0: the first line of the file holds the column names.
file = os.path.join(
    'C:/Users/Milovan/Desktop/notebook para repositorio/reboiler',
    'Reboiler_Depurado.csv',
)
df14 = pd.read_csv(filepath_or_buffer=file, header=0)

# Metadata record for the loaded frame.
attrs = dict(
    var='df14',                  # Name of the variable
    type=type(df14).__name__,    # Object type name ('DataFrame')
)

# Display the metadata and the loaded DataFrame.
[attrs]
df14

[{'var': 'df14', 'type': 'DataFrame'}]

Unnamed: 0,Fecha,Conductividad,Carga motor (M014),Carga motor (M015),Ind.Presión (PI8026),Ind.Presión (PIC8025),Ind.Temperatura (TI8015),Ind.Temperatura (TIC8014),Estado
0,2016-01-08 10:00:00,4.5,69.658713,42.280192,18.769251,19.0,79.399877,99.160674,0.0
1,2016-01-08 12:00:00,4.4,67.954487,42.381221,18.411020,19.0,78.184952,98.775669,0.0
2,2016-01-08 14:00:00,4.2,69.688606,41.967134,18.601041,19.0,80.599028,101.165308,0.0
3,2016-01-08 16:00:00,4.2,69.274182,42.124866,18.582166,19.0,79.301819,99.934365,0.0
4,2016-01-08 18:00:00,4.2,69.095421,41.588124,18.593953,19.0,80.170472,100.804536,0.0
...,...,...,...,...,...,...,...,...,...
11161,2018-09-06 00:00:00,7.4,72.696510,43.464317,19.695450,19.5,81.372489,99.598170,0.0
11162,2018-09-06 02:00:00,7.2,72.905098,44.058929,19.826046,19.5,81.911505,99.633708,0.0
11163,2018-09-06 04:00:00,6.9,72.968914,44.584087,19.523726,19.5,80.810454,99.775918,0.0
11164,2018-09-06 06:00:00,6.9,72.905579,44.417335,19.308117,19.5,81.587220,100.136055,0.0


In [14]:
# Column-by-column dtype conversion after the CSV round-trip.
# errors='coerce' replaces unparseable entries with NaT / NaN instead of raising.

# Columns that must hold numeric values.
columnas_numericas = [
    'Conductividad',
    'Carga motor (M014)',
    'Carga motor (M015)',
    'Ind.Presión (PI8026)',
    'Ind.Presión (PIC8025)',
    'Ind.Temperatura (TI8015)',
    'Ind.Temperatura (TIC8014)',
    'Estado',
]

# Timestamp column -> datetime.
df14['Fecha'] = pd.to_datetime(df14['Fecha'], errors='coerce')

# Measurement columns -> numeric.
for col in columnas_numericas:
    df14[col] = df14[col].pipe(pd.to_numeric, errors='coerce')

# Metadata record for the processed frame.
attrs = dict(
    var='df14',
    type=type(df14).__name__,
)

# Display the results.
[attrs]
df14

[{'var': 'df14', 'type': 'DataFrame'}]

Unnamed: 0,Fecha,Conductividad,Carga motor (M014),Carga motor (M015),Ind.Presión (PI8026),Ind.Presión (PIC8025),Ind.Temperatura (TI8015),Ind.Temperatura (TIC8014),Estado
0,2016-01-08 10:00:00,4.5,69.658713,42.280192,18.769251,19.0,79.399877,99.160674,0.0
1,2016-01-08 12:00:00,4.4,67.954487,42.381221,18.411020,19.0,78.184952,98.775669,0.0
2,2016-01-08 14:00:00,4.2,69.688606,41.967134,18.601041,19.0,80.599028,101.165308,0.0
3,2016-01-08 16:00:00,4.2,69.274182,42.124866,18.582166,19.0,79.301819,99.934365,0.0
4,2016-01-08 18:00:00,4.2,69.095421,41.588124,18.593953,19.0,80.170472,100.804536,0.0
...,...,...,...,...,...,...,...,...,...
11161,2018-09-06 00:00:00,7.4,72.696510,43.464317,19.695450,19.5,81.372489,99.598170,0.0
11162,2018-09-06 02:00:00,7.2,72.905098,44.058929,19.826046,19.5,81.911505,99.633708,0.0
11163,2018-09-06 04:00:00,6.9,72.968914,44.584087,19.523726,19.5,80.810454,99.775918,0.0
11164,2018-09-06 06:00:00,6.9,72.905579,44.417335,19.308117,19.5,81.587220,100.136055,0.0


In [15]:
# Use the 'Fecha' column as the DataFrame index.
# The result is reassigned instead of using inplace=True, so the frame is not
# mutated in place and the statement reads as an explicit transformation.
# Re-running this cell raises KeyError because 'Fecha' is no longer a column.
df14 = df14.set_index('Fecha')

# Basic metadata record
attrs = {
    'var': 'df14',
    'type': type(df14).__name__,  # 'DataFrame', now indexed by 'Fecha'
}

# Display the metadata and the DataFrame with its new index
[attrs]
df14

[{'var': 'df14', 'type': 'DataFrame'}]

Unnamed: 0_level_0,Conductividad,Carga motor (M014),Carga motor (M015),Ind.Presión (PI8026),Ind.Presión (PIC8025),Ind.Temperatura (TI8015),Ind.Temperatura (TIC8014),Estado
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-08 10:00:00,4.5,69.658713,42.280192,18.769251,19.0,79.399877,99.160674,0.0
2016-01-08 12:00:00,4.4,67.954487,42.381221,18.411020,19.0,78.184952,98.775669,0.0
2016-01-08 14:00:00,4.2,69.688606,41.967134,18.601041,19.0,80.599028,101.165308,0.0
2016-01-08 16:00:00,4.2,69.274182,42.124866,18.582166,19.0,79.301819,99.934365,0.0
2016-01-08 18:00:00,4.2,69.095421,41.588124,18.593953,19.0,80.170472,100.804536,0.0
...,...,...,...,...,...,...,...,...
2018-09-06 00:00:00,7.4,72.696510,43.464317,19.695450,19.5,81.372489,99.598170,0.0
2018-09-06 02:00:00,7.2,72.905098,44.058929,19.826046,19.5,81.911505,99.633708,0.0
2018-09-06 04:00:00,6.9,72.968914,44.584087,19.523726,19.5,80.810454,99.775918,0.0
2018-09-06 06:00:00,6.9,72.905579,44.417335,19.308117,19.5,81.587220,100.136055,0.0


In [16]:
# Build df19 from df14 without the 'Estado' and 'Conductividad' columns.
# df14 itself is left unchanged.
df19 = df14.drop(['Estado', 'Conductividad'], axis='columns')

# Basic metadata record.
attrs = dict(
    var='df19',
    type=type(df19).__name__,  # 'DataFrame'
)

# Display the metadata and the resulting DataFrame.
[attrs]
df19

[{'var': 'df19', 'type': 'DataFrame'}]

Unnamed: 0_level_0,Carga motor (M014),Carga motor (M015),Ind.Presión (PI8026),Ind.Presión (PIC8025),Ind.Temperatura (TI8015),Ind.Temperatura (TIC8014)
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-08 10:00:00,69.658713,42.280192,18.769251,19.0,79.399877,99.160674
2016-01-08 12:00:00,67.954487,42.381221,18.411020,19.0,78.184952,98.775669
2016-01-08 14:00:00,69.688606,41.967134,18.601041,19.0,80.599028,101.165308
2016-01-08 16:00:00,69.274182,42.124866,18.582166,19.0,79.301819,99.934365
2016-01-08 18:00:00,69.095421,41.588124,18.593953,19.0,80.170472,100.804536
...,...,...,...,...,...,...
2018-09-06 00:00:00,72.696510,43.464317,19.695450,19.5,81.372489,99.598170
2018-09-06 02:00:00,72.905098,44.058929,19.826046,19.5,81.911505,99.633708
2018-09-06 04:00:00,72.968914,44.584087,19.523726,19.5,80.810454,99.775918
2018-09-06 06:00:00,72.905579,44.417335,19.308117,19.5,81.587220,100.136055


In [17]:
# Build df22 with only the 'Estado' column of df14.
# Indexing with a list of labels (double brackets) returns a one-column
# DataFrame rather than a Series.
df22 = df14[['Estado']]

# Basic metadata record.
attrs = dict(
    var='df22',
    type=type(df22).__name__,  # 'DataFrame' (not 'Series')
)

# Display the metadata and the resulting DataFrame.
[attrs]
df22

[{'var': 'df22', 'type': 'DataFrame'}]

Unnamed: 0_level_0,Estado
Fecha,Unnamed: 1_level_1
2016-01-08 10:00:00,0.0
2016-01-08 12:00:00,0.0
2016-01-08 14:00:00,0.0
2016-01-08 16:00:00,0.0
2016-01-08 18:00:00,0.0
...,...
2018-09-06 00:00:00,0.0
2018-09-06 02:00:00,0.0
2018-09-06 04:00:00,0.0
2018-09-06 06:00:00,0.0


In [21]:
# =====================================================================
# FUNCTION: datetime_window
# Purpose: Builds rolling windows over time-indexed frames
# =====================================================================
def datetime_window(dfx, dfy, dfy_total, scaler, size, periods, y_choose='last', ahead=''):
    """Slice ``dfx``/``dfy`` into rolling windows and collect them as lists.

    A rolling window of width ``size`` is slid over the first column of
    ``dfx``. Every window that contains exactly ``periods`` rows contributes
    one feature frame, one target and one start label; other windows are
    ignored.

    Parameters
    ----------
    dfx : pandas.DataFrame
        Feature frame. Its first column is used to drive the rolling window.
    dfy : pandas.DataFrame
        Target frame, looked up with the index labels of each window.
    dfy_total : pandas.DataFrame
        Frame used to look up the target when ``y_choose == 'ahead'``.
    scaler : object
        Accepted for interface compatibility; not used by this function.
    size : int or offset string
        Window specification forwarded to ``Series.rolling``.
    periods : int
        Exact number of rows a window must contain to be kept.
    y_choose : str, default 'last'
        ``'last'``  -> target is the last row of the ``dfy`` window.
        ``'first'`` -> target is the first row of the ``dfy`` window.
        ``'ahead'`` -> target is the ``dfy_total`` row located ``ahead`` after
        the last label of the window; windows whose target label is missing
        are skipped.
        Any other value -> the whole ``dfy`` window is used as target.
    ahead : str, default ''
        Offset parsed with ``pd.Timedelta``; only used when
        ``y_choose == 'ahead'``.

    Returns
    -------
    tuple of (list, list, list)
        ``x`` (feature windows), ``y`` (targets) and ``ind`` (first index
        label of each kept window), all in window order.

    Raises
    ------
    ValueError
        If ``y_choose == 'ahead'`` and ``ahead`` cannot be parsed as a
        timedelta (previously this was silently swallowed and produced
        empty results).

    Notes
    -----
    Category columns are converted to their integer codes on copies; the
    frames passed by the caller are not modified.
    """

    def _encode_categoricals(frame):
        """Return ``frame`` with category columns replaced by integer codes."""
        cat_cols = frame.select_dtypes(['category']).columns
        if len(cat_cols) == 0:
            return frame
        # Work on a copy so the caller's frame (possibly a slice) is untouched.
        encoded = frame.copy()
        encoded[cat_cols] = encoded[cat_cols].apply(lambda col: col.cat.codes)
        return encoded

    dfx = _encode_categoricals(dfx)
    dfy = _encode_categoricals(dfy)
    dfy_total = _encode_categoricals(dfy_total)

    # Parse the look-ahead offset once, so an invalid value fails loudly
    # instead of being swallowed on every window.
    ahead_delta = pd.Timedelta(ahead) if y_choose == 'ahead' else None

    # Result accumulators
    ind = []
    x = []
    y = []

    def get_windows(win):
        """Collect one window; always returns NaN (the rolling result is unused)."""
        if win.shape[0] != periods:
            return np.nan

        if y_choose == 'ahead':
            try:
                wy = dfy.loc[win.index, :]
                target = dfy_total.loc[wy.index[-1] + ahead_delta]
            except KeyError:
                # Window labels or the look-ahead label are not present:
                # skip this window (best-effort, as before).
                return np.nan

            y.append(target)
            x.append(dfx.loc[win.index, :])
            ind.append(win.index[0])
        else:
            x.append(dfx.loc[win.index, :])

            wy = dfy.loc[win.index, :]
            if y_choose == 'last':
                wy = wy.iloc[-1]
            elif y_choose == 'first':
                wy = wy.iloc[0]
            y.append(wy)

            ind.append(win.index[0])

        return np.nan

    # rolling().apply() is used only for its side effect of calling
    # get_windows once per window with an index-carrying Series.
    dfx.iloc[:, 0].rolling(size).apply(get_windows)

    return x, y, ind


# =====================================================================
# FUNCTION: print_list_shape
# Purpose: Describes the size of a list of array-like items as text
# =====================================================================
def print_list_shape(lst):
    """Return ``'<len> x <shape of first item>'`` or ``'empty'``.

    Despite its name the function prints nothing; it returns the string.
    Only the first element is inspected (converted with ``np.array``).
    """
    if len(lst) == 0:
        return 'empty'

    first_shape = np.array(lst[0]).shape
    return f'{len(lst)} x {first_shape}'


# =====================================================================
# FUNCIÓN: normal_anomal_split
# Propósito: Divide datos en normales/anómalos y prepara para modelado
# =====================================================================
def normal_anomal_split(dfx, dfy, size, periods, y_choose='last', ahead='', custom_normal_split=False, normal_test_ratio=0.1, scaler='MinMaxScaler', random_state=None):
    """Window the data, split it into normal/anomalous sets and scale it.

    The windows come from ``datetime_window``. A window is "normal" when the
    first value of its target equals 0, otherwise "anomalous". The scaler is
    fitted only on the de-duplicated rows of the normal training windows and
    then applied to every window.

    Parameters
    ----------
    dfx : pandas.DataFrame
        Feature frame, forwarded to ``datetime_window``.
    dfy : pandas.DataFrame
        Label frame, forwarded to ``datetime_window`` as both ``dfy`` and
        ``dfy_total``.
    size : str
        Rolling window size, forwarded to ``datetime_window``.
    periods : int
        Number of rows a window must contain, forwarded to ``datetime_window``.
    y_choose : str, default 'last'
        How the window target is chosen, forwarded to ``datetime_window``.
    ahead : str, default ''
        Look-ahead offset, forwarded to ``datetime_window``.
    custom_normal_split : bool, default False
        If True, ``round(n_normal * normal_test_ratio)`` normal windows go to
        the test set. If False, as many normal windows as there are anomalous
        windows go to the test set.
    normal_test_ratio : float, default 0.1
        Fraction of normal windows used for testing when
        ``custom_normal_split`` is True.
    scaler : {'MinMaxScaler', 'StandardScaler', 'RobustScaler'}
        Name of the scaler to build.
    random_state : int or None, default None
        Seed for the shuffle of normal windows. ``None`` keeps the previous
        behaviour (numpy's global random state); an int makes the split
        reproducible.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_test, x_mix, y_mix, i_mix, myscaler)`` where
        ``x_train`` holds scaled normal training windows, ``x_test``/``y_test``
        hold scaled normal (label 0) followed by anomalous (label 1) windows,
        ``x_mix``/``y_mix``/``i_mix`` hold every window (scaled), its first
        target value and its index entry, and ``myscaler`` is the fitted scaler.

    Raises
    ------
    ValueError
        If ``scaler`` is not a supported name, or if no normal window is left
        for training.
    """
    # Build the scaler; fail early on an unknown name instead of hitting an
    # unbound local variable further down.
    if scaler == 'MinMaxScaler':
        myscaler = MinMaxScaler(feature_range=(0, 1))
    elif scaler == 'StandardScaler':
        myscaler = StandardScaler()
    elif scaler == 'RobustScaler':
        myscaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25, 75))
    else:
        raise ValueError(
            "Unknown scaler {!r}; expected 'MinMaxScaler', 'StandardScaler' "
            "or 'RobustScaler'".format(scaler)
        )

    # Generate the time windows.
    x_windowed, y_windowed, windows_ind = datetime_window(dfx=dfx, dfy=dfy, dfy_total=dfy, scaler=myscaler, size=size, periods=periods, y_choose=y_choose, ahead=ahead)

    # Separate normal (target == 0) from anomalous windows.
    x_normal = []
    x_anomal = []
    for wx, wy in zip(x_windowed, y_windowed):
        if wy.iloc[0] == 0:
            x_normal.append(wx)
        else:
            x_anomal.append(wx)

    # Shuffle the normal windows; a seed makes the split reproducible.
    normal_indices = list(range(len(x_normal)))
    if random_state is None:
        normal_indices_permuted = np.random.permutation(normal_indices)
    else:
        normal_indices_permuted = np.random.RandomState(random_state).permutation(normal_indices)

    # Number of normal windows that go to the test set.
    if custom_normal_split:
        n_normal_test = round(len(x_normal) * normal_test_ratio)
    else:
        n_normal_test = len(x_anomal)

    normal_test_indices = normal_indices_permuted[:n_normal_test]
    normal_train_indices = normal_indices_permuted[n_normal_test:]

    x_normal_train = [x_normal[i] for i in normal_train_indices]
    x_normal_test = [x_normal[i] for i in normal_test_indices]

    if not x_normal_train:
        raise ValueError(
            "No normal windows left for training "
            "({} normal, {} anomalous, {} reserved for testing)".format(
                len(x_normal), len(x_anomal), n_normal_test
            )
        )

    # Fit the scaler on normal training rows only. Overlapping windows repeat
    # rows, so duplicates are dropped so each row is counted once.
    x_normal_train_total = pd.concat(x_normal_train)
    x_normal_train_total = x_normal_train_total[~x_normal_train_total.index.duplicated(keep='first')]
    x_normal_train_total = x_normal_train_total.sort_index()

    myscaler.fit(x_normal_train_total.to_numpy())

    # Scaled training set (normal windows only).
    x_train = _scale_windows(x_normal_train, myscaler)

    # Scaled test set: held-out normal windows (0) followed by anomalous windows (1).
    x_test = _scale_windows(x_normal_test, myscaler) + _scale_windows(x_anomal, myscaler)
    y_test = [0] * len(x_normal_test) + [1] * len(x_anomal)

    # Every window in original order (for visualisation).
    x_mix = _scale_windows(x_windowed, myscaler)
    y_mix = [wy.to_numpy()[0] for wy in y_windowed]
    i_mix = windows_ind

    return x_train, x_test, y_test, x_mix, y_mix, i_mix, myscaler


def _scale_windows(windows, fitted_scaler):
    """Transform each window with an already-fitted scaler; return a list of numpy arrays."""
    return [np.squeeze(fitted_scaler.transform(wx.to_numpy())) for wx in windows]


# =====================================================================
# EJECUCIÓN PRINCIPAL
# =====================================================================
# Data processing: window df19 (features) / df22 (labels) and build the
# train/test sets. Windows are selected with size='84h' and kept only when they
# contain exactly 42 rows; each window is labelled with its last dfy row
# (y_choose='last'). With custom_normal_split=True, normal_test_ratio (10%) of
# the normal windows are held out and tested together with all anomalous ones.
# NOTE(review): the split shuffles with np.random.permutation and no seed is set
# in this cell, so the exact train/test membership changes between runs.
x_train, x_test, y_test, x_mix, y_mix, i_mix, myscaler = normal_anomal_split(
    dfx=df19, 
    dfy=df22,
    size='84h', 
    periods=42, 
    y_choose='last', 
    ahead='',
    custom_normal_split=True, 
    normal_test_ratio=0.1, 
    scaler='MinMaxScaler'
)

# Result structure: lists of windows converted to numpy arrays, plus the
# window index list and the fitted scaler.
df25 = {
    'x_train': np.array(x_train),
    'x_test': np.array(x_test),
    'y_test': np.array(y_test),
    'x_mix': np.array(x_mix),
    'y_mix': np.array(y_mix),
    'i_mix': i_mix,
    'myscaler': myscaler,
}

# Metadata (displayed as a one-element list)
attrs = {
    'var': 'df25',
    'type': type(df25).__name__,
}
[attrs]

# Summary of dimensions: "<number of windows> x <shape of one window>"
{
    'x_train': print_list_shape(x_train),
    'x_test': print_list_shape(x_test),
    'y_test': print_list_shape(y_test),
}

# Final results (unchanged). Each bare expression is displayed because the
# notebook sets ast_node_interactivity = "all".
np.squeeze(df25['x_train'])
np.squeeze(df25['x_test'])
np.squeeze(df25['y_test'])

[{'var': 'df25', 'type': 'dict'}]

{'x_train': '6475 x (42, 6)',
 'x_test': '2926 x (42, 6)',
 'y_test': '2926 x ()'}

array([[[0.95510705, 0.83730761, 0.05483926, 0.05855339, 0.66921702,
         0.59156417],
        [0.94955847, 0.79847663, 0.05322017, 0.05855339, 0.65164122,
         0.5934052 ],
        [0.90696014, 0.77384101, 0.05194466, 0.05855339, 0.63212706,
         0.59424108],
        ...,
        [0.95866511, 0.86758796, 0.05155734, 0.05855339, 0.70574565,
         0.60302968],
        [0.95850276, 0.86907852, 0.0534855 , 0.05855339, 0.70500115,
         0.60272486],
        [0.9591094 , 0.87070974, 0.05559913, 0.05855339, 0.70779685,
         0.60233754]],

       [[0.85908857, 0.6143952 , 0.05348714, 0.05855339, 0.5931118 ,
         0.57635241],
        [0.37906536, 0.78559679, 0.05314099, 0.05855339, 0.57915958,
         0.64451148],
        [0.93787727, 0.76434396, 0.05255292, 0.05855339, 0.62080288,
         0.59528667],
        ...,
        [0.95288947, 0.8385015 , 0.05626721, 0.05855339, 0.64176793,
         0.58845109],
        [0.95513611, 0.84911495, 0.05470411, 0.05855339, 0.653

array([[[0.90048032, 0.77152655, 0.04977712, 0.05855339, 0.64227525,
         0.601159  ],
        [0.90048603, 0.77069141, 0.05087616, 0.05855339, 0.64332482,
         0.60045021],
        [0.90199423, 0.77169744, 0.05144153, 0.05855339, 0.64347325,
         0.5996299 ],
        ...,
        [0.89567574, 0.66461048, 0.05189512, 0.05855339, 0.61020496,
         0.59384315],
        [0.91395166, 0.68960757, 0.04820308, 0.05855339, 0.67999669,
         0.59310568],
        [0.90718598, 0.70168187, 0.04380079, 0.05855339, 0.67139586,
         0.59584744]],

       [[0.96586685, 0.75283562, 0.06035687, 0.06773823, 0.62330575,
         0.59396473],
        [0.96314089, 0.75238028, 0.06590542, 0.06773823, 0.62631306,
         0.5898695 ],
        [0.96680121, 0.74863553, 0.0628143 , 0.06773823, 0.6252755 ,
         0.59648473],
        ...,
        [0.98249482, 0.80772541, 0.06237544, 0.07003444, 0.66842531,
         0.59909002],
        [0.98314844, 0.80917128, 0.06195896, 0.07003444, 0.671

array([0, 0, 0, ..., 1, 1, 1])

In [22]:
# Save the numpy arrays to an .npz archive (per the NumPy docs, np.savez
# appends the ".npz" extension when the file name does not already end with it).
# NOTE(review): the output directory is a hard-coded absolute path and must
# already exist; consider a configurable base directory.
file = os.path.join('C:/Users/Milovan/Desktop/notebook para repositorio/reboiler/archnpz', 'ventana84hrs')
np.savez(
    file, 
    x_train=df25['x_train'],  # Training data
    x_test=df25['x_test'],    # Test data
    y_test=df25['y_test'],    # Test labels (0=normal, 1=anomalous)
    x_mix=df25['x_mix'],      # All windows, normal and anomalous together
    y_mix=df25['y_mix'],      # Labels for all windows
    # NOTE(review): i_mix is a plain list of window index entries; if these are
    # pandas Timestamps it is presumably stored as an object array, which
    # np.load only reads back with allow_pickle=True -- confirm.
    i_mix=df25['i_mix']       # Window (time) indices
)

# Save the fitted scaler as a .joblib file
scaler_file = os.path.join('C:/Users/Milovan/Desktop/notebook para repositorio/reboiler/archnpz', 'scaler.joblib')
joblib.dump(df25['myscaler'], scaler_file)  # Serialises the scaler object

# Metadata (set to empty placeholders)
attrs = {
    'var': 'empty',
    'type': 'empty'
}

# Save confirmation
[attrs]
'npz file saved'

['C:/Users/Milovan/Desktop/notebook para repositorio/reboiler/archnpz\\scaler.joblib']

[{'var': 'empty', 'type': 'empty'}]

'npz file saved'