## Experimento No 6 - Different Preprocessing

### Load data and import libraries

In [1]:
# In case you're running this in colab
# !pip install pycaret
# !pip install dython
# !pip install optuna

In [2]:
# Data exploration
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Modeling
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, TargetEncoder
from xgboost import XGBClassifier

# Evaluation
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score

# Hyper-parameter tuning
import optuna

# Configurations
RANDOM_SEED = 42 # For reproducibility

sns.set_palette('pastel')
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 50)

In [3]:
df_train_full = pd.read_csv('data/train.csv')
df_test_full = pd.read_csv('data/test.csv')
df_train_full.head().T

Unnamed: 0,0,1,2,3,4
ID,904256,645256,308367,470353,989032
PERIODO,20212,20212,20203,20195,20212
ESTU_PRGM_ACADEMICO,ENFERMERIA,DERECHO,MERCADEO Y PUBLICIDAD,ADMINISTRACION DE EMPRESAS,PSICOLOGIA
ESTU_PRGM_DEPARTAMENTO,BOGOTÁ,ATLANTICO,BOGOTÁ,SANTANDER,ANTIOQUIA
ESTU_VALORMATRICULAUNIVERSIDAD,Entre 5.5 millones y menos de 7 millones,Entre 2.5 millones y menos de 4 millones,Entre 2.5 millones y menos de 4 millones,Entre 4 millones y menos de 5.5 millones,Entre 2.5 millones y menos de 4 millones
ESTU_HORASSEMANATRABAJA,Menos de 10 horas,0,Más de 30 horas,0,Entre 21 y 30 horas
FAMI_ESTRATOVIVIENDA,Estrato 3,Estrato 3,Estrato 3,Estrato 4,Estrato 3
FAMI_TIENEINTERNET,Si,No,Si,Si,Si
FAMI_EDUCACIONPADRE,Técnica o tecnológica incompleta,Técnica o tecnológica completa,Secundaria (Bachillerato) completa,No sabe,Primaria completa
FAMI_TIENELAVADORA,Si,Si,Si,Si,Si


In [4]:
df_analysis = df_train_full.copy()

In [5]:
df_analysis.isna().sum().sort_values(ascending=False)

FAMI_TIENEAUTOMOVIL               43623
FAMI_TIENELAVADORA                39773
FAMI_TIENECOMPUTADOR              38103
FAMI_ESTRATOVIVIENDA              32137
ESTU_HORASSEMANATRABAJA           30857
FAMI_TIENEINTERNET.1              26629
FAMI_TIENEINTERNET                26629
FAMI_EDUCACIONMADRE               23664
FAMI_EDUCACIONPADRE               23178
ESTU_PAGOMATRICULAPROPIO           6498
ESTU_VALORMATRICULAUNIVERSIDAD     6287
ID                                    0
ESTU_PRIVADO_LIBERTAD                 0
PERIODO                               0
ESTU_PRGM_DEPARTAMENTO                0
ESTU_PRGM_ACADEMICO                   0
RENDIMIENTO_GLOBAL                    0
dtype: int64

### Preprocessing

In [6]:
def normalize_text(text: str):
    """Removes any weird character like accents from a string"""
    import unicodedata

    if not pd.isna(text):
        return (
            unicodedata.normalize('NFKD', text)
            .encode('ASCII', 'ignore')
            .decode('utf-8')
        )
    return text

def clean_data(df: pd.DataFrame) -> pd.DataFrame: 
    new_df = df.copy() # Create a copy

    # Drop and clean columns
    new_df.drop(['ID', 'FAMI_TIENEINTERNET.1', 'ESTU_PRIVADO_LIBERTAD'], axis=1, inplace=True)

    # Change data types
    new_df['PERIODO'] = new_df['PERIODO'].astype(str)
    
    # Remove weird characters from values
    for col in ['ESTU_PRGM_ACADEMICO', 'ESTU_PRGM_DEPARTAMENTO', 'FAMI_EDUCACIONPADRE', 'FAMI_EDUCACIONMADRE']:
        new_df[col] = new_df[col].apply(normalize_text)

    # Replace values to make more sense
    new_df['ESTU_HORASSEMANATRABAJA'].replace('0', 'No trabaja', inplace=True)
    new_df['FAMI_EDUCACIONPADRE'].replace(['No Aplica', 'No sabe'], 'Ninguno', inplace=True)
    new_df['FAMI_EDUCACIONMADRE'].replace(['No Aplica', 'No sabe'], 'Ninguno', inplace=True)

    return new_df

def impute_cols(df: pd.DataFrame):
    new_df = df.copy() # Make a copy

    # Impute with own values
    new_df['FAMI_ESTRATOVIVIENDA'] = new_df['FAMI_ESTRATOVIVIENDA'].fillna('Unknown') # 'Sin Estrato' or mode or 'Unknown'
    new_df['ESTU_HORASSEMANATRABAJA'] = new_df['ESTU_HORASSEMANATRABAJA'].fillna('No trabaja') # No trabaja or Unknown
    new_df['FAMI_EDUCACIONPADRE'] = new_df['FAMI_EDUCACIONPADRE'].fillna('Ninguno') # Same as with MADRE
    new_df['FAMI_EDUCACIONMADRE'] = new_df['FAMI_EDUCACIONMADRE'].fillna('Ninguno') # Otra opcion: Ninguno or Unknown
 
    # Select binary columns
    # For VALORMATRICULA values could be 'No pago matricula' or Mode or Unknown (We'll go we the mode first)
    binary_cols = [col for col in new_df.columns if new_df[col].nunique() == 2]
    for col in binary_cols + ['ESTU_VALORMATRICULAUNIVERSIDAD']:
        col_mode = new_df[col].mode()[0]
        new_df[col] = new_df[col].fillna(col_mode)

    return new_df

In [7]:
# Clean the training and testing datasets
df_train = clean_data(df_train_full)
df_test = clean_data(df_test_full)

In [8]:
# Impute values in training and testing datasets
df_train = impute_cols(df_train)
df_test = impute_cols(df_test)

In [9]:
df_train.shape

(692500, 14)

In [10]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 692500 entries, 0 to 692499
Data columns (total 14 columns):
 #   Column                          Non-Null Count   Dtype 
---  ------                          --------------   ----- 
 0   PERIODO                         692500 non-null  object
 1   ESTU_PRGM_ACADEMICO             692500 non-null  object
 2   ESTU_PRGM_DEPARTAMENTO          692500 non-null  object
 3   ESTU_VALORMATRICULAUNIVERSIDAD  692500 non-null  object
 4   ESTU_HORASSEMANATRABAJA         692500 non-null  object
 5   FAMI_ESTRATOVIVIENDA            692500 non-null  object
 6   FAMI_TIENEINTERNET              692500 non-null  object
 7   FAMI_EDUCACIONPADRE             692500 non-null  object
 8   FAMI_TIENELAVADORA              692500 non-null  object
 9   FAMI_TIENEAUTOMOVIL             692500 non-null  object
 10  ESTU_PAGOMATRICULAPROPIO        692500 non-null  object
 11  FAMI_TIENECOMPUTADOR            692500 non-null  object
 12  FAMI_EDUCACIONMADRE           

#### Encoding

In [11]:
# Define encoders in a global scope to use them
# after they were fitted with the data.
label_encoder = LabelEncoder()
target_encoder = TargetEncoder(target_type='multiclass', random_state=RANDOM_SEED)

def preprocess_data(X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame():
    """
    Turns features and target into numeric values applying an encoding for each
    column.
    """
    X = X.copy() # Make copy of the features
    y_encoded = None
    
    high_card_cols = ['ESTU_PRGM_ACADEMICO']
    # Check if target was provided
    if y is not None:
        # Label encoding
        y_encoded = pd.Series(label_encoder.fit_transform(y))
        
        # Target encoding for training using the encoded labels
        # We need to fit and transform the data here.
        target_encoder.fit(X[high_card_cols], y_encoded)
        new_features_names = target_encoder.get_feature_names_out()
        df_target_encoded = pd.DataFrame(target_encoder.transform(X[high_card_cols]), columns=new_features_names)
        
        X = pd.concat([X.drop('ESTU_PRGM_ACADEMICO', axis=1), df_target_encoded], axis=1)
    else:
        # Target encoding for testing dataset just using transform
        new_features_names = target_encoder.get_feature_names_out()
        df_target_encoded = pd.DataFrame(target_encoder.transform(X[high_card_cols]), columns=new_features_names)
        X = pd.concat([X.drop('ESTU_PRGM_ACADEMICO', axis=1), df_target_encoded], axis=1)
        

    # Ordinal encoding
    ord_encoder = OrdinalEncoder() # Ordinal Encoder
    ord_cols = ['FAMI_ESTRATOVIVIENDA','ESTU_PRGM_DEPARTAMENTO']    
    df_ordinal = pd.DataFrame(ord_encoder.fit_transform(X[ord_cols]), columns=ord_cols)
    
    for col in ord_cols:
        max_val = df_ordinal[col].max()
        df_ordinal[col] = df_ordinal[col].map(lambda x: x / max_val) # Normalize values 0 - 1

    X = pd.concat([X.drop(ord_cols, axis=1), df_ordinal], axis=1)

    # Binary encoding
    bin_cols = [col for col in X.columns if X[col].nunique() == 2]

    for col in bin_cols:
        X[col] = X[col].replace({'Si': 1, 'No': 0})
    
    # One-hot encoding
    X = pd.get_dummies(X, dtype=int)

    return X, y_encoded

In [12]:
# Split train data into features and target
X = df_train.drop('RENDIMIENTO_GLOBAL', axis=1)
y = df_train.RENDIMIENTO_GLOBAL

In [13]:
# Encode features and target (for training dataset)
X, y_encoded = preprocess_data(X, y)
X_test, _ = preprocess_data(X=df_test)

In [14]:
X.shape, X_test.shape

((692500, 53), (296786, 53))

In [15]:
type(y_encoded)

pandas.core.series.Series

## Model Creation

At this point we are going to try a **XGBClassifier** to see how the preprocessing improve the accuracy



## Hyper-parameters Tuning

In [16]:
def xgb_objective(trial):
    
    # Create params
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1500),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-2, 0.1)
    }
    # Set params to the model and train the new model
    model = XGBClassifier(**params)

    # Split data
    X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=RANDOM_SEED)

    # Fit the model
    model.fit(X_train, y_train)

    # Evaluate
    preds = model.predict(X_val)
    acc = accuracy_score(y_val, preds)

    return acc

In [17]:
study = optuna.create_study(direction='maximize')
study.optimize(xgb_objective, n_trials=30)

[I 2024-06-01 00:09:45,030] A new study created in memory with name: no-name-1dabe894-38a0-4935-9a98-5c8907bd7264
[I 2024-06-01 00:10:18,290] Trial 0 finished with value: 0.4347364620938628 and parameters: {'n_estimators': 820, 'learning_rate': 0.014971915185781455}. Best is trial 0 with value: 0.4347364620938628.
[I 2024-06-01 00:10:26,988] Trial 1 finished with value: 0.42691696750902525 and parameters: {'n_estimators': 196, 'learning_rate': 0.026382990281506116}. Best is trial 0 with value: 0.4347364620938628.
[I 2024-06-01 00:10:52,714] Trial 2 finished with value: 0.4408375451263538 and parameters: {'n_estimators': 671, 'learning_rate': 0.05107725421938646}. Best is trial 2 with value: 0.4408375451263538.
[I 2024-06-01 00:11:06,773] Trial 3 finished with value: 0.4398916967509025 and parameters: {'n_estimators': 341, 'learning_rate': 0.07097333672431641}. Best is trial 2 with value: 0.4408375451263538.
[I 2024-06-01 00:11:58,545] Trial 4 finished with value: 0.4405920577617328 and

In [18]:
best_params = study.best_params
best_params

{'n_estimators': 1272, 'learning_rate': 0.04404757058676809}

In [19]:
# Create a model with best parameters and fit with all data
model = XGBClassifier(**best_params)
model.fit(X, y_encoded)

## Making Predictions

In [20]:
preds = model.predict(X_test)
preds

array([3, 2, 0, ..., 2, 0, 0])

In [21]:
real_predictions = label_encoder.inverse_transform(preds)
real_predictions

array(['medio-bajo', 'medio-alto', 'alto', ..., 'medio-alto', 'alto',
       'alto'], dtype=object)

In [22]:
df_submission = pd.DataFrame({
    'ID': df_test_full.ID,
    'RENDIMIENTO_GLOBAL': real_predictions
})

df_submission.head()

Unnamed: 0,ID,RENDIMIENTO_GLOBAL
0,550236,medio-bajo
1,98545,medio-alto
2,499179,alto
3,782980,bajo
4,785185,bajo


In [26]:
df_submission.to_csv('xgb_diff_prep_tuned_v3.csv', index=False)