# Base Preprocessing

## Import Libraries and Loading Data

In [52]:
# Explore dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Unicode normalization
import unicodedata

# Handling encoding and imputation
from feature_engine.encoding import CountFrequencyEncoder
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin # Custom preprocessing steps

# Creating a pipeline for more reliable preprocessing process
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
# Read the training split from the local data folder and peek at the first two rows
DATA_DIR = "datasets"
train_csv_path = f"{DATA_DIR}/train.csv"

df_full = pd.read_csv(train_csv_path)
df_full.head(n=2)

Unnamed: 0,ID,PERIODO,ESTU_PRGM_ACADEMICO,ESTU_PRGM_DEPARTAMENTO,ESTU_VALORMATRICULAUNIVERSIDAD,ESTU_HORASSEMANATRABAJA,FAMI_ESTRATOVIVIENDA,FAMI_TIENEINTERNET,FAMI_EDUCACIONPADRE,FAMI_TIENELAVADORA,FAMI_TIENEAUTOMOVIL,ESTU_PRIVADO_LIBERTAD,ESTU_PAGOMATRICULAPROPIO,FAMI_TIENECOMPUTADOR,FAMI_TIENEINTERNET.1,FAMI_EDUCACIONMADRE,RENDIMIENTO_GLOBAL
0,904256,20212,ENFERMERIA,BOGOTÁ,Entre 5.5 millones y menos de 7 millones,Menos de 10 horas,Estrato 3,Si,Técnica o tecnológica incompleta,Si,Si,N,No,Si,Si,Postgrado,medio-alto
1,645256,20212,DERECHO,ATLANTICO,Entre 2.5 millones y menos de 4 millones,0,Estrato 3,No,Técnica o tecnológica completa,Si,No,N,No,Si,No,Técnica o tecnológica incompleta,bajo


In [4]:
# Summary (count / unique / top / freq) of the object-typed columns, one row per column
df_full.describe(include="object").transpose()

Unnamed: 0,count,unique,top,freq
ESTU_PRGM_ACADEMICO,692500,948,DERECHO,53244
ESTU_PRGM_DEPARTAMENTO,692500,31,BOGOTÁ,282159
ESTU_VALORMATRICULAUNIVERSIDAD,686213,8,Entre 1 millón y menos de 2.5 millones,204048
ESTU_HORASSEMANATRABAJA,661643,5,Más de 30 horas,249352
FAMI_ESTRATOVIVIENDA,660363,7,Estrato 2,232671
FAMI_TIENEINTERNET,665871,2,Si,592514
FAMI_EDUCACIONPADRE,669322,12,Secundaria (Bachillerato) completa,128289
FAMI_TIENELAVADORA,652727,2,Si,563390
FAMI_TIENEAUTOMOVIL,648877,2,No,412606
ESTU_PRIVADO_LIBERTAD,692500,2,N,692466


In [5]:
# Columns that the preprocessing steps below do not need
unused_columns = ["ID", "RENDIMIENTO_GLOBAL", "FAMI_TIENEINTERNET.1"]

# drop() returns a new frame, so df_full itself stays untouched
df_train = df_full.drop(columns=unused_columns)
df_train.head().transpose()

Unnamed: 0,0,1,2,3,4
PERIODO,20212,20212,20203,20195,20212
ESTU_PRGM_ACADEMICO,ENFERMERIA,DERECHO,MERCADEO Y PUBLICIDAD,ADMINISTRACION DE EMPRESAS,PSICOLOGIA
ESTU_PRGM_DEPARTAMENTO,BOGOTÁ,ATLANTICO,BOGOTÁ,SANTANDER,ANTIOQUIA
ESTU_VALORMATRICULAUNIVERSIDAD,Entre 5.5 millones y menos de 7 millones,Entre 2.5 millones y menos de 4 millones,Entre 2.5 millones y menos de 4 millones,Entre 4 millones y menos de 5.5 millones,Entre 2.5 millones y menos de 4 millones
ESTU_HORASSEMANATRABAJA,Menos de 10 horas,0,Más de 30 horas,0,Entre 21 y 30 horas
FAMI_ESTRATOVIVIENDA,Estrato 3,Estrato 3,Estrato 3,Estrato 4,Estrato 3
FAMI_TIENEINTERNET,Si,No,Si,Si,Si
FAMI_EDUCACIONPADRE,Técnica o tecnológica incompleta,Técnica o tecnológica completa,Secundaria (Bachillerato) completa,No sabe,Primaria completa
FAMI_TIENELAVADORA,Si,Si,Si,Si,Si
FAMI_TIENEAUTOMOVIL,Si,No,No,No,Si


In [64]:
# Keep only the first four characters of PERIODO, stored as text (e.g. 20212 -> "2021")
df_train["PERIODO"] = df_train["PERIODO"].astype(str).str[:4]
df_train.dtypes

PERIODO                           object
ESTU_PRGM_ACADEMICO               object
ESTU_PRGM_DEPARTAMENTO            object
ESTU_VALORMATRICULAUNIVERSIDAD    object
ESTU_HORASSEMANATRABAJA           object
FAMI_ESTRATOVIVIENDA              object
FAMI_TIENEINTERNET                object
FAMI_EDUCACIONPADRE               object
FAMI_TIENELAVADORA                object
FAMI_TIENEAUTOMOVIL               object
ESTU_PRIVADO_LIBERTAD             object
ESTU_PAGOMATRICULAPROPIO          object
FAMI_TIENECOMPUTADOR              object
FAMI_EDUCACIONMADRE               object
dtype: object

In [134]:
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Replace each categorical value with how often it occurred in the data seen by ``fit``.

    Only columns whose dtype is ``object`` or ``category`` at fit time are encoded;
    every other column is passed through unchanged.

    Parameters
    ----------
    normalize : bool, default=False
        Forwarded to ``Series.value_counts``: ``False`` encodes with absolute
        counts, ``True`` with relative frequencies.
    unseen_value : scalar or None, default=None
        Replacement for values that have no entry in the learned mapping
        (categories absent at fit time, and missing values). ``None`` keeps the
        ``NaN`` that ``Series.map`` produces for them. When set, the encoded
        columns are returned as floats.

    Attributes
    ----------
    freq_map : dict
        ``{column label: {category: frequency}}`` learned by the most recent ``fit``.
    """

    def __init__(self, *, normalize=False, unseen_value=None):
        # Kept here so the attribute also exists on unfitted instances
        # (backward compatible); fit() always replaces it with a fresh dict.
        self.freq_map = {}
        self.normalize = normalize
        self.unseen_value = unseen_value

    def _check_X(self, X: np.generic | np.ndarray | pd.DataFrame) -> pd.DataFrame:
        """Return ``X`` as a DataFrame owned by this transformer.

        DataFrames are copied so the caller's frame is never modified. Any other
        input (NumPy arrays such as the output of a previous pipeline step,
        nested lists, Series, ...) is wrapped in a new DataFrame, which gets
        pandas' default column labels when the input carries none.
        """
        if isinstance(X, pd.DataFrame):
            return X.copy()
        return pd.DataFrame(X)

    def fit(self, X: pd.DataFrame, y=None):
        """Learn the value frequencies of every object/category column of ``X``.

        Parameters
        ----------
        X : DataFrame or array-like
            Training data; non-DataFrame input is converted by ``_check_X``.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        X = self._check_X(X)
        categorical_columns = X.select_dtypes(include=["object", "category"]).columns
        # Build a brand-new mapping on every call so that refitting never keeps
        # columns learned from a previous dataset.
        self.freq_map = {
            col: X[col].value_counts(normalize=self.normalize).to_dict()
            for col in categorical_columns
        }
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the fitted columns replaced by their frequencies.

        Values without an entry in the learned mapping become ``NaN`` unless
        ``unseen_value`` was provided.

        Raises
        ------
        KeyError
            If a column learned during ``fit`` is not present in ``X``.
        """
        X = self._check_X(X)  # always work on a copy of the data being changed
        for col, freq_values in self.freq_map.items():
            encoded = X[col].map(freq_values)
            if self.unseen_value is not None:
                # Cast first: mapping a categorical column may yield a categorical
                # result, which cannot be filled with a value outside its categories.
                encoded = encoded.astype(float).fillna(self.unseen_value)
            X[col] = encoded
        return X

In [124]:
def has_high_cardinality(col, threshold=11):
    """Tell whether ``col`` is a categorical-like column with many distinct values.

    Parameters
    ----------
    col : pd.Series
        Column to inspect.
    threshold : int, default=11
        The column counts as high-cardinality when it has strictly more than
        this many distinct non-null values.

    Returns
    -------
    bool
        ``True`` only for ``object`` or ``category`` columns whose number of
        unique values exceeds ``threshold``.
    """
    # Check the dtype first so other columns skip the more expensive nunique() call.
    is_categorical_like = pd.api.types.is_object_dtype(col.dtype) or isinstance(
        col.dtype, pd.CategoricalDtype
    )
    if not is_categorical_like:
        return False
    return bool(col.nunique() > threshold)
# Collect, in column order, the df_train columns flagged as high-cardinality
high_card_cols = []
for cname in df_train.columns:
    if has_high_cardinality(df_train[cname]):
        high_card_cols.append(cname)
high_card_cols

['ESTU_PRGM_ACADEMICO',
 'ESTU_PRGM_DEPARTAMENTO',
 'FAMI_EDUCACIONPADRE',
 'FAMI_EDUCACIONMADRE']

In [135]:
# High-cardinality branch: fill missing entries with the "Unknown" label,
# then replace every category with its frequency.
high_card_steps = [
    ("imputer", SimpleImputer(fill_value="Unknown", strategy="constant")),
    ("freq_encoder", FrequencyEncoder()),
]
high_card_transformer = Pipeline(steps=high_card_steps)

In [141]:
# Route only the high-cardinality columns through the frequency-encoding pipeline
freq_encoding_branch = ("freq_encoding", high_card_transformer, high_card_cols)
preprocessor = ColumnTransformer(transformers=[freq_encoding_branch])

freq_data = preprocessor.fit_transform(df_train)
# Column names the transformer saw during fit
preprocessor.feature_names_in_

array(['PERIODO', 'ESTU_PRGM_ACADEMICO', 'ESTU_PRGM_DEPARTAMENTO',
       'ESTU_VALORMATRICULAUNIVERSIDAD', 'ESTU_HORASSEMANATRABAJA',
       'FAMI_ESTRATOVIVIENDA', 'FAMI_TIENEINTERNET',
       'FAMI_EDUCACIONPADRE', 'FAMI_TIENELAVADORA', 'FAMI_TIENEAUTOMOVIL',
       'ESTU_PRIVADO_LIBERTAD', 'ESTU_PAGOMATRICULAPROPIO',
       'FAMI_TIENECOMPUTADOR', 'FAMI_EDUCACIONMADRE'], dtype=object)

In [74]:
# Every column that is neither high-cardinality nor the housing-stratum column
# goes through one-hot encoding.
excluded_col = "FAMI_ESTRATOVIVIENDA"
categorical_cols = []
for cname in df_train.columns:
    if cname == excluded_col or has_high_cardinality(df_train[cname]):
        continue
    categorical_cols.append(cname)

# Fill missing entries with "Unknown", then one-hot encode into a dense array
onehot_steps = [
    ("imputer", SimpleImputer(fill_value="Unknown", strategy="constant")),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
]
cat_transformer = Pipeline(steps=onehot_steps)

In [79]:
# Fit the one-hot pipeline on the selected columns and label the result
# with the feature names the pipeline generates.
categorical_frame = df_train[categorical_cols]
encoded_data = cat_transformer.fit_transform(categorical_frame)
feature_names = cat_transformer.get_feature_names_out()

df_onehot_encoded = pd.DataFrame(data=encoded_data, columns=feature_names)
df_onehot_encoded.head()

Unnamed: 0,PERIODO_2018,PERIODO_2019,PERIODO_2020,PERIODO_2021,ESTU_VALORMATRICULAUNIVERSIDAD_Entre 1 millón y menos de 2.5 millones,ESTU_VALORMATRICULAUNIVERSIDAD_Entre 2.5 millones y menos de 4 millones,ESTU_VALORMATRICULAUNIVERSIDAD_Entre 4 millones y menos de 5.5 millones,ESTU_VALORMATRICULAUNIVERSIDAD_Entre 5.5 millones y menos de 7 millones,ESTU_VALORMATRICULAUNIVERSIDAD_Entre 500 mil y menos de 1 millón,ESTU_VALORMATRICULAUNIVERSIDAD_Menos de 500 mil,...,FAMI_TIENEAUTOMOVIL_Si,FAMI_TIENEAUTOMOVIL_Unknown,ESTU_PRIVADO_LIBERTAD_N,ESTU_PRIVADO_LIBERTAD_S,ESTU_PAGOMATRICULAPROPIO_No,ESTU_PAGOMATRICULAPROPIO_Si,ESTU_PAGOMATRICULAPROPIO_Unknown,FAMI_TIENECOMPUTADOR_No,FAMI_TIENECOMPUTADOR_Si,FAMI_TIENECOMPUTADOR_Unknown
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [119]:
# Stratum labels in ascending order; the outer list holds one category list per encoded column
status_class_order = [["Sin Estrato", *(f"Estrato {level}" for level in range(1, 7))]]

# Ordinal branch: missing entries become "Sin Estrato", then each label is
# encoded by its position in status_class_order.
ord_steps = [
    ("imputer", SimpleImputer(fill_value="Sin Estrato", strategy="constant")),
    ("ord_encoder", OrdinalEncoder(categories=status_class_order)),
]
ord_transformer = Pipeline(steps=ord_steps)

status_class_order

[['Sin Estrato',
  'Estrato 1',
  'Estrato 2',
  'Estrato 3',
  'Estrato 4',
  'Estrato 5',
  'Estrato 6']]

In [120]:
# Double brackets select a one-column DataFrame rather than a Series
estrato_frame = df_train[["FAMI_ESTRATOVIVIENDA"]]
data_ord_encoded = ord_transformer.fit_transform(estrato_frame)

# Category order actually used by the fitted encoder
ord_transformer.named_steps["ord_encoder"].categories_

[array(['Sin Estrato', 'Estrato 1', 'Estrato 2', 'Estrato 3', 'Estrato 4',
        'Estrato 5', 'Estrato 6'], dtype=object)]

In [121]:
# Wrap the encoded array in a single-column frame named after the source column
ordinal_col_names = ["FAMI_ESTRATOVIVIENDA"]
df_ord = pd.DataFrame(data=data_ord_encoded, columns=ordinal_col_names)
df_ord

Unnamed: 0,FAMI_ESTRATOVIVIENDA
0,3.0
1,3.0
2,3.0
3,4.0
4,3.0
...,...
692495,2.0
692496,3.0
692497,3.0
692498,1.0


## Missing Values and Duplicated Columns

In [6]:
# Missing-value count per column, largest first
df_train.isna().sum().sort_values(ascending=False)

FAMI_TIENEAUTOMOVIL               43623
FAMI_TIENELAVADORA                39773
FAMI_TIENECOMPUTADOR              38103
FAMI_ESTRATOVIVIENDA              32137
ESTU_HORASSEMANATRABAJA           30857
FAMI_TIENEINTERNET                26629
FAMI_EDUCACIONMADRE               23664
FAMI_EDUCACIONPADRE               23178
ESTU_PAGOMATRICULAPROPIO           6498
ESTU_VALORMATRICULAUNIVERSIDAD     6287
PERIODO                               0
ESTU_PRGM_ACADEMICO                   0
ESTU_PRGM_DEPARTAMENTO                0
ESTU_PRIVADO_LIBERTAD                 0
dtype: int64

In [7]:
# Share of missing values per feature, expressed as a percentage of the rows
missing_values = df_train.isna().sum()
total_cells = len(df_train)  # number of rows, i.e. cells available in each column
missing_percentage = missing_values.div(total_cells).mul(100)
missing_percentage.sort_values(ascending=False)

FAMI_TIENEAUTOMOVIL               6.299350
FAMI_TIENELAVADORA                5.743394
FAMI_TIENECOMPUTADOR              5.502238
FAMI_ESTRATOVIVIENDA              4.640722
ESTU_HORASSEMANATRABAJA           4.455884
FAMI_TIENEINTERNET                3.845343
FAMI_EDUCACIONMADRE               3.417184
FAMI_EDUCACIONPADRE               3.347004
ESTU_PAGOMATRICULAPROPIO          0.938339
ESTU_VALORMATRICULAUNIVERSIDAD    0.907870
PERIODO                           0.000000
ESTU_PRGM_ACADEMICO               0.000000
ESTU_PRGM_DEPARTAMENTO            0.000000
ESTU_PRIVADO_LIBERTAD             0.000000
dtype: float64

In [8]:
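# Count the duplicated values
# NOTE(review): "FAMI_TIENEINTERNET.1" is dropped from df_train in an earlier cell, so this check would have to run on df_full.
# df_train[["FAMI_TIENEINTERNET", "FAMI_TIENEINTERNET.1"]].duplicated().value_counts()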

Since we have a duplicated feature (`FAMI_TIENEINTERNET.1`), we drop it to avoid noise in future models.

## Imputing and Encoding

Since the percentage of missing values in each feature (column) of our dataset is low, imputation is a better option for handling missing values.

For imputation, given that our variables are mostly categorical, we have two strategies:
- Replacing all missing values with `Unknown` (We are going to use this one for the base)
- Replacing all missing values with the `most frequent` value

For encoding, we use:
- Frequency encoding for high-cardinality features
- One-hot encoding for binary and nominal features
- Ordinal encoding for `FAMI_ESTRATOVIVIENDA`, since it describes an ordered status class
