# 1. General Code


   ## 1.1 Libraries


In [150]:
# Import libraries to exploratory data analysis
import numpy as np
import pandas as pd
from packages.common_functions import load_csv_into_dataframe, load_dataframe_to_csv

In [151]:
# Import libraries to preprocessing and enrichment data
#Used to scale data
from sklearn.preprocessing import StandardScaler
#Used to reduction dimension of data set
from sklearn.decomposition import PCA
#used to creates pipelines from grid search
from sklearn.pipeline import Pipeline

In [152]:
#Import libraries to machine learning
#Used on regression models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import LogisticRegression
# Used on classification models
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Used to get metrics of ml models
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score

# Used to get bes params for ml models
from sklearn.model_selection import GridSearchCV, train_test_split

# Used to export bests ml models
import joblib

   ## 1.2 Machine learning functions


In [153]:
def grid_search_function(model, param_grid, X_train, y_train, pca_components=[0.50, 0.99], cv=2):
        """
        Perandom_forestorm a grid search with cross-validation to find the best
        parameters for a given machine learning model.

        Parameters:
        - model: Machine learning model.
        - param_grid (dict): Dictionary of parameters to try in the grid search.
        - X (array-like): Training data.
        - y (array-like): Training labels.
        - pca_components (list, optional): List of principal components to try in PCA.
        Default: [0.50, 0.99]
        - cv (int, optional): Number of splits for cross-validation. Default: 2

        Returns:
        tuple: A tuple containing the best parameters, best score, best estimator, and
        cross-validation results.
        """
        # Set uo the pipeline with function of PCA and scaling and ml model
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA()),
            ('classify', model)
        ])

        # used grid search with cross validation using previous pipeline and grid params of specific model
        grid_search = GridSearchCV(pipeline, param_grid, cv=cv, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        
        # Obtain bes params from specific model
        best_params = grid_search.best_params_

        # delete 'classify__' prefix from all keys 
        best_params = {key.replace('classify__', '', 1): value for key, value in best_params.items()}
        
        return best_params

In [154]:
def train(model, best_params = None):
        # grid search for obtain best params to specific model
        if best_params != None:
            # establish best params into specific model
            model.set_params(**best_params)
        # train the model

        return model.fit(X_train, y_train)

In [155]:
def test(model, is_classification=False):
        # evaluation classification or regression model
        y_pred = model.predict(X_test)
        if is_classification:
            acc = accuracy_score(y_test, y_pred)
            print(f"Precisión del modelo: {acc}")
        else:
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            print(f'Mean Squared Error en el conjunto de prueba: {mse:.2f}')
            print(f'Coeficiente de Determinación (R²): {r2:.2f}')
        
        return y_pred

   ## 1.3 Load Data Frames


In [156]:
# Path of codec df
CODEC_DF_PATH = '../data/processed_csv/2014_census_codec.csv'

In [157]:
# load codec df in variable 'df'
df = load_csv_into_dataframe(CODEC_DF_PATH)

CSV file successfully load in DataFrame.


In [158]:
# filter by category in 'CIUDAD' column is different of NULL value and your equivalent convention used before codec column
df = df[(df['CIUDAD'].notnull()) & (df['CIUDAD'] != 4)].copy()

# 2. Machine Learning Stage

## 2.1 Target Column: City

### 2.1.1 Data Separation

In [159]:
# Separate the columns on X and y (target column)
target_column = 'CIUDAD'
X = df.copy().drop(target_column, axis='columns')
y = df[target_column].copy()

In [160]:
# Separate data on train and test per X and y dfs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

### 2.1.2 Regression Models

#### 2.1.2.1 Linear Regression

In [161]:
# create linear regression model object from ModeloML class
linear_regression_model = LinearRegression()

In [162]:
# params from linear regression model
linear_regression_model_params_grid = {
        'classify__fit_intercept': [True, False],  # Indica si debe ajustarse o no el intercepto
}

In [163]:
# obtains best params from linear regression model using grid search
linear_regression_model_best_params_grid = grid_search_function(linear_model, param_grid=linear_regression_model_params_grid, X_train=X_train, y_train=y_train)

##### 2.1.2.1.1 Training

In [164]:
# Not specify best params from linear regression model
linear_model = train(linear_model, best_params=linear_regression_model_best_params_grid)

##### 2.1.2.1.2 Testing

In [165]:
# Evaluation of the lineal regression model
y_pred = test(linear_model, is_classification=False)

Mean Squared Error en el conjunto de prueba: 0.00
Coeficiente de Determinación (R²): 1.00


#### 2.1.2.2 Logistic Regression

In [166]:
# Create logistic regression model object
logistic_reg_model = LogisticRegression()

In [167]:
# Hyper params from linear regression model
logistic_reg_params_grid = {
    'classify__penalty': ['l1', 'l2'],
    'classify__C': [0.1, 1, 10],
    'classify__fit_intercept': [True, False],
}

In [168]:
# obtains best params from linear regression model using grid search
logistic_reg_best_params_grid = grid_search_function(logistic_reg_model, param_grid=logistic_reg_params_grid, X_train=X_train, y_train=y_train)

12 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/pipeline.py", line 420, in fit
    self._final_estimato

##### 2.1.2.2.1 Training

In [169]:
# train lineal regression model with best params previous obtains with grid search 
logistic_reg_model = train(logistic_reg_model, best_params=logistic_reg_best_params_grid)

##### 2.1.2.2.2 Testing

In [170]:
# evaluation lineal regression model
y_pred = test(logistic_reg_model, is_classification=False)

Mean Squared Error en el conjunto de prueba: 3.36
Coeficiente de Determinación (R²): -1.88


### 2.1.3 Classification Models

 #### 2.1.3.1 Support Vector Machines

In [171]:
# Create support vector machine model object from ModeloML class
svm_model = SVC()

In [172]:
# params from support vector machine model
svm_params_grid = {
    'classify__C': [0.1, 1, 10],  
    'classify__kernel': ['linear', 'rbf']
}

In [173]:
# obtain best params from support vector machine model using grid search
svm_best_params = grid_search_function(svm_model, param_grid=svm_params_grid, X_train=X_train, y_train=y_train)

##### 2.1.3.1.1 Training

In [174]:
# train support vector machine model with best params 
svm_model = train(svm_model, svm_best_params)

##### 2.1.3.1.2 Testing

In [175]:
# testing support vector machine model
y_pred = test(svm_model, is_classification=True)

Precisión del modelo: 1.0


#### 2.1.3.2 Decision Tree

In [176]:
# create decision tree model object by ModeloML class
decision_tree_model = DecisionTreeClassifier()

In [177]:
# params for decision tree model
decision_tree_model_params_grid = {
    'classify__max_depth': [None, 10, 20],  # Ajusta estos valores
    'classify__min_samples_split': [2, 5, 10],
    'classify__min_samples_leaf': [1, 2, 4]
}

In [178]:
#obtain best params grid to decision tree model using grid search
decision_tree_model_best_params_grid = grid_search_function(decision_tree_model, param_grid=decision_tree_model_params_grid, X_train=X_train, y_train=y_train)

##### 2.1.3.2.1 Training

In [179]:
# train decision tree model with best params
decision_tree_model = train(decision_tree_model, decision_tree_model_best_params_grid)

##### 2.1.3.2.2 Testing

In [180]:
# test decision tree model
y_pred = test(decision_tree_model, is_classification=True)

Precisión del modelo: 1.0


#### 2.1.3.3 Random Forest

In [181]:
#create random forest model object by ModeloML class
random_forest_model = RandomForestClassifier()

In [182]:
# params for random forest model
random_forest_params_grid = {
    'classify__max_depth': [None, 5, 10],  # Reducir la profundidad del árbol
    'classify__min_samples_split': [2, 5, 10],  # Aumentar min_samples_split
    'classify__min_samples_leaf': [1, 2, 4],  # Aumentar min_samples_leaf
    'classify__ccp_alpha': [0.0, 0.1, 0.2]  # Añadir ccp_alpha
}

In [183]:
#obtain best params from random forest model with grid search
random_forest_best_params_grid = grid_search_function(model= random_forest_model, param_grid=random_forest_params_grid, X_train=X_train, y_train=y_train)

##### 2.1.3.3.1 Training

In [184]:
# train random forest model with best params
random_forest_model = train(random_forest_model, random_forest_best_params_grid)

##### 2.1.3.3.2 Testing

In [185]:
# test random forest model
y_pred = test(random_forest_model, is_classification=True)

Precisión del modelo: 1.0


#### 2.1.3.4 K-Neighbors

In [186]:
# create knn model by ModeloML class
knn_model = KNeighborsClassifier()

In [187]:
# params for knn model
knn_params_grid = {
    'classify__n_neighbors': [3, 5, 7],  # Número de vecinos a considerar
    'classify__weights': ['uniform', 'distance'],  # Peso dado a los vecinos (uniforme o ponderado por la inversa de la distancia)
    'classify__metric': ['euclidean', 'manhattan'],  # Métrica de distancia utilizada
}

In [188]:
# obtain best params from knn model with grid search
knn_best_params_grid = grid_search_function(model=knn_model, param_grid=knn_params_grid, X_train=X_train, y_train=y_train)

##### 2.1.3.4.1 Training

In [189]:
# train knn model with best params
knn_model = train(knn_model, best_params=knn_best_params_grid)

##### 2.1.3.4.2 Testing

In [190]:
# test knn model
y_pred = test(knn_model, is_classification=True)

Precisión del modelo: 1.0


### 2.1.4 Best ML Model

#### 2.1.4.1 Saved Best Model

In [191]:

# No model obtained the necessary precision to be used in imputation.

## 2.2 Target Column: Education Level

In [192]:
# load dataset pre-encode into data frame 'df'
df = load_csv_into_dataframe(CODEC_DF_PATH)

CSV file successfully load in DataFrame.


In [193]:
# filter df without null values 
df = df[df['ULTIMO_NIVEL_EDUCATIVO'] != -1]

### 2.2.1 Data Separation

In [194]:
# separate columns 
# target column
target_column = 'ULTIMO_NIVEL_EDUCATIVO'
y = df[target_column].copy()
# features columns
X = df.copy().drop(target_column, axis='columns')

In [195]:
# separate feature columns and target column into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 2.2.2 Regression Models

#### 2.2.2.1 Linear Regression

In [196]:
# create logistic regression instance from ModeloML class
linear_regression_model = LinearRegression()

In [197]:
# params from logistic regression model
linear_regression_model_params_grid = {
        'classify__fit_intercept': [True, False],  # Indica si debe ajustarse o no el intercepto
}

In [198]:
# obtain the best params from logistic regression model with grid search
linear_regression_model_best_params_grid = grid_search_function(linear_regression_model, param_grid=linear_regression_model_params_grid,  X_train=X_train, y_train=y_train)

##### 2.2.2.1.1 Training

In [199]:
# train logistic regression model with best params
linear_regression_model_best_params_grid = train(linear_regression_model, best_params=linear_regression_model_best_params_grid)

##### 2.2.2.1.2 Testing

In [200]:
# test logistic regression model
linear_regression_model = test(linear_regression_model, is_classification=False)

Mean Squared Error en el conjunto de prueba: 3.01
Coeficiente de Determinación (R²): 0.12


#### 2.2.2.2 Logistic Regression

In [201]:
# create logistic regression model instance from ModeloML class
logistic_regression_model = LogisticRegression()

In [202]:
# logistic regression params
logistic_regression_params_grid = {
    'classify__penalty': ['l1', 'l2'], 
    'classify__C': [0.1, 1, 10], 
    'classify__fit_intercept': [True, False],  
}

In [203]:
# obtain logistic regression best params with grid search
logistic_regression_best_params_grid = grid_search_function(logistic_regression_model, logistic_regression_params_grid, X_train=X_train, y_train=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

##### 2.2.2.2.1 Training

In [204]:
# train logistic regression model with best params
logistic_regression_model = train(logistic_regression_model, logistic_regression_best_params_grid)

##### 2.2.2.2.2 Testing

In [205]:
# test logistic regression model
y_pred = test(logistic_regression_model, is_classification=False)

Mean Squared Error en el conjunto de prueba: 3.57
Coeficiente de Determinación (R²): -0.04


### 2.2.3 Classification Models

 #### 2.1.3.1 Support Vector Machines

In [206]:
# Create support vector machine model object from ModeloML class
svm_model = SVC()

In [207]:
# params from support vector machine model
svm_params_grid = {
    'classify__C': [0.1, 1, 10],  
    'classify__kernel': ['linear', 'rbf']
}

In [208]:
# obtain best params from support vector machine model using grid search
svm_best_params = grid_search_function(svm_model, param_grid=svm_params_grid, X_train=X_train, y_train=y_train)

##### 2.1.3.1.1 Training

In [209]:
# train support vector machine model with best params 
svm_model = train(svm_model, svm_best_params)

##### 2.1.3.1.2 Testing

In [210]:
# testing support vector machine model
y_pred = test(svm_model, is_classification=True)

Precisión del modelo: 0.31158055484856195


#### 2.1.3.2 Decision Tree

In [211]:
# create decision tree model object by ModeloML class
decision_tree_model = DecisionTreeClassifier()

In [212]:
# params for decision tree model
decision_tree_model_params_grid = {
    'classify__max_depth': [None, 10, 20],  # Ajusta estos valores
    'classify__min_samples_split': [2, 5, 10],
    'classify__min_samples_leaf': [1, 2, 4]
}

In [213]:
#obtain best params grid to decision tree model using grid search
decision_tree_model_best_params_grid = grid_search_function(decision_tree_model, param_grid=decision_tree_model_params_grid, X_train=X_train, y_train=y_train)

##### 2.1.3.2.1 Training

In [214]:
# train decision tree model with best params
decision_tree_model = train(decision_tree_model, decision_tree_model_best_params_grid)

##### 2.1.3.2.2 Testing

In [215]:
# test decision tree model
y_pred = test(decision_tree_model, is_classification=True)

Precisión del modelo: 0.7640621023161109


#### 2.1.3.3 Random Forest

In [216]:
#create random forest model object by ModeloML class
random_forest_model = RandomForestClassifier()

In [217]:
# params for random forest model
random_forest_params_grid = {
    'classify__max_depth': [None, 5, 10],  # Reducir la profundidad del árbol
    'classify__min_samples_split': [2, 5, 10],  # Aumentar min_samples_split
    'classify__min_samples_leaf': [1, 2, 4],  # Aumentar min_samples_leaf
    'classify__ccp_alpha': [0.0, 0.1, 0.2]  # Añadir ccp_alpha
}

In [218]:
#obtain best params from random forest model with grid search
random_forest_best_params_grid = grid_search_function(model= random_forest_model, param_grid=random_forest_params_grid, X_train=X_train, y_train=y_train)

##### 2.1.3.3.1 Training

In [219]:
# train random forest model with best params
random_forest_model = train(random_forest_model, random_forest_best_params_grid)

##### 2.1.3.3.2 Testing

In [220]:
# test random forest model
y_pred = test(random_forest_model, is_classification=True)

Precisión del modelo: 0.780198523797404


#### 2.1.3.4 K-Neighbors

In [221]:
# create knn model by ModeloML class
knn_model = KNeighborsClassifier()

In [222]:
# params for knn model
knn_params_grid = {
    'classify__n_neighbors': [3, 5, 7],  # Número de vecinos a considerar
    'classify__weights': ['uniform', 'distance'],  # Peso dado a los vecinos (uniforme o ponderado por la inversa de la distancia)
    'classify__metric': ['euclidean', 'manhattan'],  # Métrica de distancia utilizada
}

In [223]:
# obtain best params from knn model with grid search
knn_best_params_grid = grid_search_function(model=knn_model, param_grid=knn_params_grid, X_train=X_train, y_train=y_train)

##### 2.1.3.4.1 Training

In [224]:
# train knn model with best params
knn_model = train(knn_model, best_params=knn_best_params_grid)

##### 2.1.3.4.2 Testing

In [225]:
# test knn model
y_pred = test(knn_model, is_classification=True)

Precisión del modelo: 0.6786459658946297


### 2.2.4 Best ML Model

 #### 2.2.4.1 Saved Best Model

In [227]:
# export the model that obtain best precision value for this column
joblib.dump(random_forest_model, '../data/ml_best_model/ultimo_nivel_educativo_random_forest.pkl')

['../data/ml_best_model/ultimo_nivel_educativo_random_forest.pkl']