In [None]:
import pandas as pd
import numpy as np
import datetime

# Fix the global NumPy seed so the synthetic dataset is reproducible
np.random.seed(42)

# Categories to draw from
categorias = ['A', 'B', 'C', 'D', 'E']

# Random 2023 timestamps; an entry is left missing (None) whenever the
# uniform draw is <= 0.2
fechas = [datetime.datetime(2023, np.random.randint(1, 13), np.random.randint(1, 29),
                            np.random.randint(0, 24), np.random.randint(0, 60), np.random.randint(0, 60))
          if np.random.random() > 0.2 else None
          for _ in range(1000)]

# Build the DataFrame
df = pd.DataFrame({'CATEGORÍA': [np.random.choice(categorias) for _ in range(1000)],
                   'FECHA': fechas})

# Inject missing values into the category column
n_valores_faltantes = 100  # number of missing values to introduce
indices_a_faltar = np.random.choice(len(df), n_valores_faltantes, replace=False)  # random row positions
df.loc[indices_a_faltar, 'CATEGORÍA'] = None

# Store the dates as generic Python objects instead of datetime64
df['FECHA'] = df['FECHA'].astype('object')

# Number of duplicated rows to inject (5% of the final row count)
porcentaje_duplicados = 0.05
n_filas = len(df)
num_duplicados = int(n_filas * porcentaje_duplicados)

# Reserve room for the duplicates *before* appending them. Appending first and
# then slicing the first `n_filas` rows would cut off exactly the appended
# duplicates, leaving none in the final frame.
df_base = df.iloc[:n_filas - num_duplicados]

# Randomly pick rows (with replacement) among the kept ones and append them
filas_a_duplicar = df_base.sample(n=num_duplicados, replace=True)
df = pd.concat([df_base, filas_a_duplicar], ignore_index=True)

# Shuffle so the duplicates are mixed in with the original rows
df = df.sample(frac=1).reset_index(drop=True)

# Column that is entirely missing
df['AW'] = None
print(df.shape)
df

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin


class ColumnsRenameTransformer(BaseEstimator, TransformerMixin):
    """
    Rename the columns of a DataFrame through a user-supplied callable.

    Parameters:
    -----------
    transformation: function
        Callable passed to ``DataFrame.rename(columns=...)``; it receives each
        column label and returns the new label.

    Attributes:
    -----------
    transformation: function
        The callable given at construction time, stored unchanged.

    Methods:
    --------
    fit(X, y=None):
        No-op; returns the transformer itself.

    transform(X):
        Return X with its column labels renamed via ``transformation``.

    Examples:
    ---------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
    >>> def custom_transform(col_name):
    ...     return col_name.lower()
    >>> transformer = ColumnsRenameTransformer(transformation=custom_transform)
    >>> df_transformed = transformer.transform(df)
    >>> df_transformed
       a  b
    0  1  3
    1  2  4
    """

    def __init__(self, transformation):
        """
        Store the renaming callable.

        Parameters:
        -----------
        transformation : function
            Callable that maps a column label to its new label.
        """
        self.transformation = transformation

    def fit(self, X: pd.DataFrame, y=None):
        """
        Do nothing and return the transformer; nothing is learned from X.

        Parameters:
        -----------
        X: pandas.DataFrame
            The input DataFrame (unused).

        y: None
            Unused; accepted so the signature matches scikit-learn transformers.

        Returns:
        --------
        self : ColumnsRenameTransformer
            This same instance.
        """
        return self

    def transform(self, X: pd.DataFrame):
        """
        Return X with its columns renamed by ``self.transformation``.

        Parameters:
        -----------
        X: pandas.DataFrame
            The DataFrame whose column labels are to be renamed.

        Returns:
        --------
        pandas.DataFrame
            The result of ``X.rename(columns=self.transformation)``.
        """
        return X.rename(columns=self.transformation)

    
class DropDuplicatedTransformer(BaseEstimator, TransformerMixin):
    """
    Remove duplicated rows from a DataFrame.

    Parameters:
    -----------
    None

    Attributes:
    -----------
    None

    Methods:
    --------
    fit(X, y=None):
        No-op; returns the transformer itself.

    transform(X):
        Return ``X.drop_duplicates()``.

    Examples:
    ---------
    >>> from sklearn.datasets import load_iris
    >>> iris = load_iris(as_frame=True)
    >>> df = iris.data
    >>> transformer = DropDuplicatedTransformer()
    >>> df_no_duplicates = transformer.transform(df)
    """

    def __init__(self):
        """
        Create the transformer; it takes no configuration.
        """

    def fit(self, X: pd.DataFrame, y=None):
        """
        Do nothing and return the transformer; nothing is learned from X.

        Parameters:
        -----------
        X: pandas.DataFrame
            The input DataFrame (unused).

        y: None
            Unused; accepted so the signature matches scikit-learn transformers.

        Returns:
        --------
        self : DropDuplicatedTransformer
            This same instance.
        """
        return self

    def transform(self, X: pd.DataFrame):
        """
        Return X without its duplicated rows.

        Parameters:
        -----------
        X: pandas.DataFrame
            The DataFrame to de-duplicate.

        Returns:
        --------
        pandas.DataFrame
            The result of ``X.drop_duplicates()`` with its default arguments.
        """
        return X.drop_duplicates()


class FillMissingValuesTransformer(BaseEstimator, TransformerMixin):
    """
    Normalise missing values in a DataFrame to np.nan.

    Existing nulls are filled with np.nan, and a fixed set of placeholder
    strings ('ERROR', '', 'None', 'n/a', 'N/A', 'NULL', 'NA', 'NAN') is
    replaced by np.nan as well.

    Parameters:
    -----------
    None

    Attributes:
    -----------
    None

    Methods:
    --------
    fit(X, y=None):
        No-op; returns the transformer itself.

    transform(X):
        Return a copy of X with nulls and placeholder strings set to np.nan.

    Examples:
    --------
    >>> data = pd.DataFrame({'col1': ['A', 'B', '', 'C'], 'col2': [1, np.nan, 'None', 'N/A']})
    >>> transformer = FillMissingValuesTransformer()
    >>> data_no_missing = transformer.transform(data)
    """

    def __init__(self):
        """
        Create the transformer; it takes no configuration.
        """

    def fit(self, X:pd.DataFrame, y=None):
        """
        Do nothing and return the transformer; nothing is learned from X.

        Parameters:
        -----------
        X: pandas.DataFrame
            The input DataFrame (unused).

        y: None
            Unused; accepted so the signature matches scikit-learn transformers.

        Returns:
        --------
        self: FillMissingValuesTransformer
            This same instance.
        """
        return self

    def transform(self, X:pd.DataFrame):
        """
        Replace nulls and placeholder strings in X with np.nan.

        Parameters:
        -----------
        X: pandas.DataFrame
            The DataFrame to clean; it is copied, not modified.

        Returns:
        --------
        pandas.DataFrame
            A new DataFrame where nulls and the placeholder strings listed
            below are np.nan.
        """
        # Exact (case-sensitive) strings treated as "missing"
        placeholders = ('ERROR', '', 'None', 'n/a', 'N/A', 'NULL', 'NA', 'NAN')
        replacements = dict.fromkeys(placeholders, np.nan)

        return X.copy().fillna(np.nan).replace(replacements)


class DateColumnTransformer(BaseEstimator, TransformerMixin):
    """
    Convert date-like columns of a DataFrame to datetime with ``pd.to_datetime``.

    Parameters:
    -----------
    date_columns: list of str or str, optional (default=None)
        Column name(s) to convert. If None, every column whose name contains
        'fecha', 'date', 'tiempo' or 'time' (case-insensitive) is converted.
        Names that are not present in X are skipped.

    format: str, optional (default='%Y-%m-%d')
        The strftime-style format handed to ``pd.to_datetime`` to parse the
        values (it is the *input* format, not an output format).

    Attributes:
    -----------
    date_columns: list of str, str or None
        The value given at construction time, stored unchanged.

    format: str
        The parsing format given at construction time.

    Methods:
    --------
    fit(X, y=None):
        No-op; returns the transformer itself.

    transform(X):
        Return a copy of X with the selected columns converted to datetime.

    Examples:
    ---------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'fecha_inicio': ['2023-01-01', '2023-02-01'],
    ...                    'date_ending': ['2023-03-01', '2023-04-01'],
    ...                    'other_column': [1, 2]})
    >>> transformer = DateColumnTransformer()
    >>> df_transformed = transformer.transform(df)
    >>> df_transformed
      fecha_inicio date_ending  other_column
    0   2023-01-01  2023-03-01             1
    1   2023-02-01  2023-04-01             2
    """

    # Case-insensitive pattern used to auto-detect date-like column names
    _DATE_NAME_REGEX = r'(?i)fecha|date|tiempo|time'

    def __init__(self, date_columns=None, format='%Y-%m-%d'):
        """
        Store the column selection and the parsing format.

        Parameters:
        -----------
        date_columns : list of str or str, optional (default=None)
            Column name(s) to convert. If None, columns are auto-detected by
            name ('fecha', 'date', 'tiempo' or 'time', case-insensitive).

        format : str, optional (default='%Y-%m-%d')
            The format passed to ``pd.to_datetime`` to parse the values.
        """
        self.date_columns = date_columns
        self.format = format

    def fit(self, X: pd.DataFrame, y=None):
        """
        Do nothing and return the transformer; nothing is learned from X.

        Parameters:
        -----------
        X : pandas.DataFrame
            The input DataFrame (unused).

        y : None
            Unused; accepted so the signature matches scikit-learn transformers.

        Returns:
        --------
        self : DateColumnTransformer
            This same instance.
        """
        return self

    def transform(self, X: pd.DataFrame):
        """
        Convert the selected columns of X to datetime.

        Parameters:
        ----------
        X : pandas.DataFrame
            The DataFrame holding the columns to convert; it is copied, not modified.

        Returns:
        -------
        X_transformed : pandas.DataFrame
            A copy of X where each selected column that exists in X has been
            passed through ``pd.to_datetime(..., format=self.format)``.
        """
        X_transformed = X.copy()

        if self.date_columns is None:
            # Auto-detect by column name, as documented. Selecting by
            # datetime64 dtype would only pick already-converted columns.
            date_columns = X_transformed.filter(regex=self._DATE_NAME_REGEX).columns
        elif isinstance(self.date_columns, str):
            # A lone name must not be iterated character by character
            date_columns = [self.date_columns]
        else:
            date_columns = self.date_columns

        for col in date_columns:
            if col in X_transformed.columns:
                X_transformed[col] = pd.to_datetime(X_transformed[col], format=self.format)

        return X_transformed


class CategoricalColumnsTransformer(BaseEstimator, TransformerMixin):
    """
    Normalise the string values of object/bool columns in a DataFrame.

    Parameters:
    -----------
    strip_and_lower: bool, optional (default=True)
        If True, string values in columns of dtype ``object`` or ``bool`` are
        stripped of leading/trailing whitespace and lower-cased. Non-string
        values (numbers, booleans, NaN, ...) are left untouched.

    Attributes:
    -----------
    strip_and_lower: bool
        Whether the strip-and-lowercase normalisation is applied.

    Methods:
    --------
    fit(X, y=None):
        No-op; returns the transformer itself.

    transform(X):
        Return a copy of X with string values normalised.

    Examples:
    ---------
    >>> import pandas as pd
    >>> data = pd.DataFrame({'A': [' Foo', 'Bar  ', 'Baz'], 'B': ['True', ' False ', True]})
    >>> transformer = CategoricalColumnsTransformer()
    >>> data_transformed = transformer.transform(data)
    >>> data_transformed['A'].tolist()
    ['foo', 'bar', 'baz']
    >>> data_transformed['B'].tolist()  # the boolean True is not a string, so it is kept
    ['true', 'false', True]
    """

    def __init__(self, strip_and_lower=True):
        """
        Store the normalisation switch.

        Parameters:
        -----------
        strip_and_lower : bool, optional (default=True)
            If True, strip whitespace and lower-case string values.
        """
        self.strip_and_lower = strip_and_lower

    def fit(self, X:pd.DataFrame, y=None):
        """
        Do nothing and return the transformer; nothing is learned from X.

        Parameters:
        -----------
        X: pandas.DataFrame
            The input DataFrame (unused).

        y: None
            Unused; accepted so the signature matches scikit-learn transformers.

        Returns:
        --------
        self: CategoricalColumnsTransformer
            This same instance.
        """
        return self

    def transform(self, X:pd.DataFrame):
        """
        Strip and lower-case string values in the object/bool columns of X.

        Parameters:
        ----------
        X: pandas.DataFrame
            The DataFrame to normalise; it is copied, not modified.

        Returns:
        -------
        X_transformed: pandas.DataFrame
            A copy of X; when ``strip_and_lower`` is True, every ``str`` value
            in its object/bool columns is stripped and lower-cased.
        """
        X = X.copy()

        if self.strip_and_lower:
            categoricals = X.select_dtypes(include=['object', 'bool']).columns

            if len(categoricals) > 0:
                # Column-wise Series.map instead of DataFrame.applymap, which
                # is deprecated in recent pandas releases.
                X[categoricals] = X[categoricals].apply(
                    lambda column: column.map(
                        lambda value: value.strip().lower() if isinstance(value, str) else value
                    )
                )

        return X
    
    
class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    """
    Drop the columns of a DataFrame whose fraction of missing values exceeds a threshold.

    Parameters:
    -----------
    threshold: float
        Columns whose fraction of null values is strictly greater than this
        value are dropped.

    Attributes:
    -----------
    threshold: float
        The threshold used for column removal.
    features_to_drop_: list
        Names of the columns selected for removal. Set by ``fit``; when
        ``transform`` is called without a previous ``fit`` it is computed from
        (and updated with) the DataFrame being transformed.

    Methods:
    --------
    fit(X, y=None):
        Learn which columns of X exceed the missing-value threshold.

    transform(X):
        Drop the learned columns from X (or, if ``fit`` was never called,
        the columns of X that exceed the threshold).

    Examples:
    ---------
    >>> import pandas as pd
    >>> data = pd.DataFrame({'A': [1, 2], 'B': [None, 4], 'C': [5, None]})
    >>> transformer = DropColumnsTransformer(threshold=1/3)
    >>> transformed_data = transformer.transform(data)
    >>> transformer.features_to_drop_
    ['B', 'C']
    """

    def __init__(self, threshold=1/3):
        """
        Store the missing-value threshold.

        Parameters:
        -----------
        threshold : float, default 1/3
            Columns whose fraction of null values is strictly greater than
            this value are dropped.
        """
        self.threshold = threshold
        self.features_to_drop_ = list()

    def _columns_above_threshold(self, X: pd.DataFrame):
        """Return the names of the columns of X whose null fraction exceeds the threshold."""
        return X.columns[X.isnull().mean() > self.threshold].tolist()

    def fit(self, X:pd.DataFrame, y=None):
        """
        Learn from X which columns exceed the missing-value threshold.

        Learning here, rather than in ``transform``, means later calls to
        ``transform`` drop the same columns regardless of the data they receive.

        Parameters:
        -----------
        X: pandas.DataFrame
            The DataFrame used to decide which columns to drop.

        y: None
            Unused; accepted so the signature matches scikit-learn transformers.

        Returns:
        --------
        self: DropColumnsTransformer
            This same instance, with ``features_to_drop_`` set.
        """
        self.features_to_drop_ = self._columns_above_threshold(X)
        # Marks that features_to_drop_ comes from fit and must be reused as-is
        self._learned_in_fit = True

        return self

    def transform(self, X:pd.DataFrame):
        """
        Drop the columns selected by the missing-value threshold.

        Parameters:
        ----------
        X: pandas.DataFrame
            The DataFrame to filter; it is copied, not modified.

        Returns:
        -------
        X_transformed: pandas.DataFrame
            A copy of X without the dropped columns.
        """
        if getattr(self, '_learned_in_fit', False):
            # Reuse the columns learned in fit; ignore any that X does not have
            columns_to_drop = [col for col in self.features_to_drop_ if col in X.columns]
        else:
            # Backward-compatible stateless use: transform() without fit()
            columns_to_drop = self._columns_above_threshold(X)
            self.features_to_drop_ = columns_to_drop  # Store the names of dropped columns

        X_transformed = X.copy().drop(columns=columns_to_drop)

        return X_transformed

In [None]:
from sklearn.pipeline import Pipeline

def normalize_column_name(col):
    """Lower-case and strip a column label (renaming applied by the first pipeline step)."""
    return str(col).lower().strip()


# Detect the date-like columns on the *renamed* labels. The date step runs
# after the renaming step, so matching against the original labels
# (e.g. 'FECHA') would find nothing and the conversion would be skipped.
date_columns = (
    df.rename(columns=normalize_column_name)
      .filter(regex='fecha|date|tiempo|time')
      .columns
      .tolist()
)

try:
    pipe = Pipeline([
        ('ColumnsRenameTransformer', ColumnsRenameTransformer(normalize_column_name)),
        ('DropDuplicatedTransformer', DropDuplicatedTransformer()),
        ('FillMissingValuesTransformer', FillMissingValuesTransformer()),
        ('DateColumnTransformer', DateColumnTransformer(date_columns)),
        ('CategoricalColumnsTransformer', CategoricalColumnsTransformer()),
        ('DropColumnsTransformer', DropColumnsTransformer())
    ])

except Exception as e:
    # Report the failure and stop. Continuing would hit an undefined `pipe`.
    print(type(e).__name__)
    raise

else:
    # Runs only when the pipeline was built, so the success message is
    # printed only after the transformation actually finished.
    df = pd.DataFrame(pipe.transform(df))
    print('¡Pre-procesamiento realizado exitosamente!')

In [None]:
# Display the column names stored in `features_to_drop_` by the pipeline step named 'DropColumnsTransformer'
pipe.named_steps['DropColumnsTransformer'].features_to_drop_