# Limpieza data raw "correcta"

- Se toma la data raw "correcta" (la data preprocessed), que, como dice el nombre, es la data raw con correcciones ya aplicadas: valores nulos por fallas de conexión con PI corregidos, segundos seteados a cero, etc.

- **Limpiezas hechas**:
    - Eliminar puntos atípicos - Limpieza por límites operacionales - ej producciones bajas - errores sensores, etc
    - Delete null values (generated in the previous steps)

-------
**DATA**:
- INPUT: "data_raw_processed.pkl"
- OUTPUT: "data.pkl"

## Root folder and read env variables

In [1]:
import os
# Make the parent of the current working directory the working directory, so
# that relative paths such as 'config/...' and 'artifacts/...' resolve from it.
# NOTE: every execution of this cell moves one more level up; run it only once
# per kernel session.
actual_path = os.path.abspath(os.getcwd())
# os.path.dirname is separator-agnostic; splitting on a literal '\\' only
# worked on Windows and produced an empty path elsewhere.
root_path = os.path.dirname(actual_path)
# Kept for backward compatibility with code that reads the path components.
list_root_path = root_path.split(os.sep)
os.chdir(root_path)
print('root path: ', root_path)

root path:  D:\github-mi-repo\Optimization-Industrial-Process


In [2]:
import os
from dotenv import load_dotenv, find_dotenv  # reads the variables declared in a .env file

# Locate the .env file and load its variables into the process environment.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Expose the environment variable as a Python variable (empty string when unset).
PROJECT_GCP = os.getenv("PROJECT_GCP", "")

## RUN

In [3]:
import pandas as pd
import numpy as np
import datetime as dt
import json
import pickle
from sklearn.pipeline import Pipeline
import sys
import os
import matplotlib.pyplot as plt
import gcsfs

import warnings
warnings.filterwarnings("ignore")

from sklearn.base import BaseEstimator, TransformerMixin

### 0. Funciones Auxiliares

In [4]:
# funciones auxiliares
def load_all_parameterstags_tagclassification(model_name, path_json='config/params.json'):
    """
    Return the parameters stored under ``model_name`` in the JSON config file.

    Args:
        model_name: top-level key of the JSON document whose value is returned
            (e.g. d0eop, d1d2, d2, etc).
        path_json: path of the JSON file to read. A relative path is resolved
            against the current working directory. Defaults to
            'config/params.json'.

    Returns:
        The value stored under ``model_name`` in the parsed JSON document.

    Raises:
        OSError: if the file cannot be opened (e.g. FileNotFoundError).
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if ``model_name`` is not a key of the JSON document.
    """
    # Explicit UTF-8 so the result does not depend on the platform's default
    # text encoding.
    with open(path_json, encoding="utf-8") as json_file:
        tag_classification_pars = json.load(json_file)

    return tag_classification_pars[model_name]

In [5]:
# Name of the model used in this example (described by the author as a
# "transversal" model).
# NOTE(review): presumably the key to pass to
# load_all_parameterstags_tagclassification; it is not referenced in the
# visible cells of this notebook - confirm before relying on it.
general_params_models = 'blanqueo_santafe_all'

### 1. Leer data raw - datalake
### 1. Read data raw datalake - preprocessed
- Data obtained in the previous notebook
- Data without nulls - nulls were filled in the previous step (caused by data upload problems, no PI-datalake connection, etc.)

In [6]:
# Load the pre-processed raw data and preview its first rows.
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# come from a trusted source.
path_raw_processed_data = 'artifacts/data/data_raw_processed.pkl'
processed_data = pd.read_pickle(path_raw_processed_data)
processed_data.head(3)

Unnamed: 0_level_0,230AIT446.PNT,240AIC022.MEAS,240AIC126.MEAS,240AIC224.MEAS,240AIC286.MEAS,240AIC324.MEAS,240AIC433.MEAS,240AIT063A.PNT,240AIT063B.PNT,240AIT225A.PNT,...,S240ALDP022,S240ALDP031,S240ALDP032,S276PER002,S2MAQUINAT07,S76ALE017,SSTRIPPING015,calc_prod_d0,calc_prod_d1,calc_prod_p
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-01 00:05:00,11.55504,2.983948,11.346645,4.413519,4.352375,10.441675,4.292521,5.86932,62.37495,1.837519,...,91.49,1.8,11.4,11.77,1.5712,173.6,964.0,3240.8635,3313.6215,3259.3745
2021-01-01 00:10:00,11.55232,3.015669,11.353215,4.413179,4.347186,10.43217,4.289684,5.86932,62.37495,1.81402,...,91.49,1.8,11.4,11.77,1.5712,173.6,964.0,3260.7475,3301.692,3208.6785
2021-01-01 00:15:00,11.549955,3.018903,11.355525,4.408321,4.355828,10.410115,4.284427,5.86932,62.37495,1.81402,...,91.49,1.8,11.4,11.77,1.5712,173.6,964.0,3265.5765,3284.133,3210.779


## FINAL STEP: DELETE OUTLIERS
Filters:
- Operational Limits of the industrial process
- other techniques to delete outliers

### 9. Cleaning data according to operational ranges - delete outliers

In [7]:
class OperationalRange(BaseEstimator, TransformerMixin):
    '''
    Replace readings that fall outside the plant's operational limits with NaN.

    Every tag (column) listed in ``RangeDataFrame`` is checked independently
    against its own ``lim_inf``/``lim_sup`` pair. Out-of-range values become
    NaN, and a boolean column named ``outputCol`` is added that is True only
    for rows in which every checked tag was inside its range.

    Parameters
    ----------
    RangeDataFrame : pandas.DataFrame
        Table with the columns "Tag", "lim_inf" and "lim_sup". If a tag is
        listed more than once, its first row is used.
    outputCol : str
        Name of the boolean flag column written by ``transform``.
    '''

    def __init__(self, RangeDataFrame, outputCol):
        super().__init__()
        self.RangeDataFrame = RangeDataFrame
        self.tags = RangeDataFrame["Tag"].tolist()
        self.outputCol = outputCol

    def fit(self, DataFrame, y=None):
        '''
        Do nothing and return ``self``; nothing is learned from the data.

        ``y`` is accepted (and ignored) so the transformer can also be fitted
        by callers that pass a target, as scikit-learn pipelines do.
        '''
        return self

    def intersection(self, lst1, lst2):
        '''
        Auxiliary function.
        Return the elements of ``lst1`` that also appear in ``lst2``, keeping
        the order (and any repetitions) of ``lst1``.
        '''
        # Build the lookup once so each membership test is O(1).
        lookup = set(lst2)
        return [value for value in lst1 if value in lookup]

    def transform(self, DataFrame):
        '''
        Set out-of-range values to NaN and add the ``outputCol`` flag column.

        The passed ``DataFrame`` is modified in place and also returned.
        Only columns that appear in the "Tag" column of ``RangeDataFrame``
        are checked; other columns are left untouched. The flag column is
        always created, even when no column matches a configured tag (it is
        then True for every row).
        '''
        print('\naplicando limpieza de acuerdo a los rangos operacionales')

        tagsList = self.intersection(lst1=DataFrame.columns.to_list(),
                                     lst2=self.tags)

        # One row of limits per tag; the first occurrence of a tag wins.
        limits = (self.RangeDataFrame[["Tag", "lim_inf", "lim_sup"]]
                  .drop_duplicates(subset="Tag")
                  .set_index("Tag"))

        # Accumulate the flag across ALL tags; assigning it inside the loop
        # would keep only the result of the last tag processed.
        in_range = np.ones(DataFrame.shape[0], dtype=bool)
        for tag in tagsList:
            lim_inf = limits.at[tag, "lim_inf"]
            lim_sup = limits.at[tag, "lim_sup"]
            cond = np.logical_or(DataFrame[tag] < lim_inf, DataFrame[tag] > lim_sup)
            DataFrame[tag] = np.where(cond, np.nan, DataFrame[tag])
            in_range &= np.logical_not(np.asarray(cond, dtype=bool))
        DataFrame[self.outputCol] = in_range

        # info
        print('tamaño data: ', DataFrame.shape)
        print('\n% de nulos hasta el momento: ', 100 * (DataFrame.isnull().sum() / DataFrame.shape[0]))

        return DataFrame

In [8]:
############ operational-limits cleaning - define the limits ############

# One Excel workbook of operational limits is read for each of: d0eop, d1, p.

# d0eop
path_operational_limits_df_d0eop = 'config/config_ml_models_development/Limites_operacionales-d0eop-general.xlsx'
operational_limits_df_d0eop = pd.read_excel(path_operational_limits_df_d0eop)

# d1
path_operational_limits_df_d1 = 'config/config_ml_models_development/Limites_operacionales-d1-general.xlsx'
operational_limits_df_d1 = pd.read_excel(path_operational_limits_df_d1)

# p
path_operational_limits_df_p = 'config/config_ml_models_development/Limites_operacionales-p-general.xlsx'
operational_limits_df_p = pd.read_excel(path_operational_limits_df_p)

# Stack the three tables, keep the first row found for each tag (a tag may be
# listed in more than one workbook) and renumber the rows from 0.
operational_limits_df = pd.concat(
    [operational_limits_df_d0eop, operational_limits_df_d1, operational_limits_df_p],
    axis=0,
)
operational_limits_df = operational_limits_df.drop_duplicates(subset=['Tag'])
operational_limits_df = operational_limits_df.reset_index(drop=True)

In [9]:
############ operational-limits cleaning - run the cleaning ############

# Keyword arguments used to build the OperationalRange cleaner: the table of
# limits and the name of the flag column it writes.
pars_operational_range = dict(
    RangeDataFrame=operational_limits_df,
    outputCol="OperationalRange",
)

In [10]:
# Instantiate the operational-range cleaner with the parameters defined in
# pars_operational_range.
droper_out_operational_range = OperationalRange(**pars_operational_range)

# Run the cleaning: out-of-range readings become NaN and the 'OperationalRange'
# flag column is added. transform() modifies the DataFrame it receives and
# returns that same object.
processed_data = droper_out_operational_range.transform(processed_data)


aplicando limpieza de acuerdo a los rangos operacionales
tamaño data:  (197568, 65)

% de nulos hasta el momento:  230AIT446.PNT       0.000000
240AIC022.MEAS      2.800049
240AIC126.MEAS      0.405936
240AIC224.MEAS      2.484714
240AIC286.MEAS      0.024295
                      ...   
SSTRIPPING015       0.009111
calc_prod_d0        7.477425
calc_prod_d1        7.483499
calc_prod_p         7.492610
OperationalRange    0.000000
Length: 65, dtype: float64


In [11]:
# Remove the helper flag column added by the operational-range cleaning and
# preview the result.
processed_data = processed_data.drop(columns=['OperationalRange'])
processed_data.head(3)

Unnamed: 0_level_0,230AIT446.PNT,240AIC022.MEAS,240AIC126.MEAS,240AIC224.MEAS,240AIC286.MEAS,240AIC324.MEAS,240AIC433.MEAS,240AIT063A.PNT,240AIT063B.PNT,240AIT225A.PNT,...,S240ALDP022,S240ALDP031,S240ALDP032,S276PER002,S2MAQUINAT07,S76ALE017,SSTRIPPING015,calc_prod_d0,calc_prod_d1,calc_prod_p
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-01 00:05:00,11.55504,2.983948,11.346645,4.413519,4.352375,10.441675,4.292521,5.86932,62.37495,1.837519,...,91.49,1.8,11.4,11.77,1.5712,173.6,964.0,3240.8635,3313.6215,3259.3745
2021-01-01 00:10:00,11.55232,3.015669,11.353215,4.413179,4.347186,10.43217,4.289684,5.86932,62.37495,1.81402,...,91.49,1.8,11.4,11.77,1.5712,173.6,964.0,3260.7475,3301.692,3208.6785
2021-01-01 00:15:00,11.549955,3.018903,11.355525,4.408321,4.355828,10.410115,4.284427,5.86932,62.37495,1.81402,...,91.49,1.8,11.4,11.77,1.5712,173.6,964.0,3265.5765,3284.133,3210.779


In [12]:
# Number of rows after the operational-range cleaning.
print('tamaño data processed: ', len(processed_data))

tamaño data processed:  197568


### 10. Eliminar nulos
Drop null values por:
- timegap
- fuera de rango operacional

In [13]:
# Percentage of null values in each tag (column); display the largest one.
null_counts = processed_data.isna().sum()
percent_null = 100 * (null_counts / processed_data.shape[0])
percent_null.max()

8.234126984126984

In [14]:
# Drop every row that contains at least one null value, reporting the dataset
# shape before and after.
print('tamaño dataset antes de borrar nulos: ', processed_data.shape)
processed_data = processed_data.dropna(axis=0, how='any')
print('tamaño dataset luego de borrar nulos: ', processed_data.shape)

tamaño dataset antes de borrar nulos:  (197568, 64)
tamaño dataset luego de borrar nulos:  (167457, 64)


### 11. Limpieza puntos atípicos por IQR
- IMPORTANTE: solo se hace esta limpieza para tener mejores datos para el ejemplo y obtener mejores modelos. Analizar si aplicar esta limpieza de outliers u otras limpiezas o ninguna

- IMPORTANTE 2: el factor fue modificado para que no se fueran tantos datos por IQR, porque se iban demasiados

In [15]:
# Row count before the IQR cleaning; used as the baseline for the percentages
# computed further down.
len_processed_data = len(processed_data)
print('len data: ', len_processed_data)

len data:  167457


In [16]:
# First and third quartile of every feature (column).
Q1, Q3 = (processed_data.quantile(q) for q in (0.25, 0.75))

In [17]:
# Interquartile range (IQR) of every feature: third quartile minus first.
IQR = Q3.sub(Q1)

In [18]:
# Per-feature limits beyond which a value counts as an outlier: the quartiles
# widened by factor_iqr times the IQR.
factor_iqr = 5
iqr_margin = factor_iqr * IQR
lower_bound = Q1 - iqr_margin
upper_bound = Q3 + iqr_margin

In [19]:
# Boolean mask, same shape as the data: True where a value lies below the
# lower bound or above the upper bound of its feature.
outliers = processed_data.lt(lower_bound) | processed_data.gt(upper_bound)

In [20]:
# Percentage of outliers in each feature, largest first.
outlier_counts = outliers.sum()
percent_outliers = (100 * (outlier_counts / len_processed_data)).sort_values(ascending=False)
print('percent outliers using IQR: ', percent_outliers)

percent outliers using IQR:  240FY212.RO01     6.999409
240AIC324.MEAS    4.528327
S2MAQUINAT07      1.418872
240FY430.RO01     0.632401
240FIC236.MEAS    0.567907
                    ...   
240FY050.RO02     0.000000
240FY039.RO01     0.000000
240FY024A.RO01    0.000000
240FIC440.MEAS    0.000000
calc_prod_p       0.000000
Length: 64, dtype: float64


In [21]:
# Keep only the rows in which no feature was flagged as an outlier.
rows_with_outlier = outliers.any(axis=1)
df_processed_cleaned = processed_data.loc[~rows_with_outlier]

In [22]:
# Size of the dataframe once the IQR outlier rows are removed, and the
# percentage of rows lost relative to the size before the IQR cleaning.
len_processed_cleaned = len(df_processed_cleaned)

print('len data: ', len_processed_data)
print('len data cleaned: ', len_processed_cleaned)
relative_change = (len_processed_cleaned - len_processed_data) / len_processed_data
percent_change_iqr = -100 * relative_change
print('porcentaje disminución data por limpieza IQR: ', round(percent_change_iqr, 3))

len data:  167457
len data cleaned:  143200
porcentaje disminución data por limpieza IQR:  14.486


### 12. GUARDAR PKL PROCESSED

In [23]:
# Preview the first three rows of the cleaned dataset before it is saved.
df_processed_cleaned.head(3)

Unnamed: 0_level_0,230AIT446.PNT,240AIC022.MEAS,240AIC126.MEAS,240AIC224.MEAS,240AIC286.MEAS,240AIC324.MEAS,240AIC433.MEAS,240AIT063A.PNT,240AIT063B.PNT,240AIT225A.PNT,...,S240ALDP022,S240ALDP031,S240ALDP032,S276PER002,S2MAQUINAT07,S76ALE017,SSTRIPPING015,calc_prod_d0,calc_prod_d1,calc_prod_p
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-01 00:05:00,11.55504,2.983948,11.346645,4.413519,4.352375,10.441675,4.292521,5.86932,62.37495,1.837519,...,91.49,1.8,11.4,11.77,1.5712,173.6,964.0,3240.8635,3313.6215,3259.3745
2021-01-01 00:10:00,11.55232,3.015669,11.353215,4.413179,4.347186,10.43217,4.289684,5.86932,62.37495,1.81402,...,91.49,1.8,11.4,11.77,1.5712,173.6,964.0,3260.7475,3301.692,3208.6785
2021-01-01 00:15:00,11.549955,3.018903,11.355525,4.408321,4.355828,10.410115,4.284427,5.86932,62.37495,1.81402,...,91.49,1.8,11.4,11.77,1.5712,173.6,964.0,3265.5765,3284.133,3210.779


In [24]:
# Save the cleaned dataset as a pickle file. The path is a local one, relative
# to the current working directory.
path_raw_data_processed = 'artifacts/data/data.pkl'
with open(path_raw_data_processed, "wb") as output:
    # The with-block closes the file on exit, so no explicit close() is needed.
    pickle.dump(df_processed_cleaned, output)