Creación de un pipeline ANOVA-SVM --- 5:52 min
===

* 5:52 min | Última modificación: Octubre 1, 2021 | [YouTube](https://youtu.be/8HY4KOneF9s)

Referencia (documentación de scikit-learn): https://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection_pipeline.html#sphx-glr-auto-examples-feature-selection-plot-feature-selection-pipeline-py

Construcción del pipeline
---

In [1]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

# -----------------------------------------------------------------------------
# Step 1 -- univariate feature selection: keep only the k best-scoring
# features.
feature_selector = SelectKBest(
    # -------------------------------------------------------------------------
    # Scoring callable. It receives the arrays X and y and returns either a
    # (scores, pvalues) pair or a single array of scores. f_classif, which is
    # also the default, is the ANOVA F-value between label and feature for
    # classification tasks.
    score_func=f_classif,
    # -------------------------------------------------------------------------
    # How many of the top-scoring features are kept.
    k=3,
)

# -----------------------------------------------------------------------------
# Step 2 -- linear support vector classifier, trained on the features kept by
# step 1.
linear_classifier = LinearSVC(
    # -------------------------------------------------------------------------
    # Norm applied in the penalization term.
    penalty="l2",
    # -------------------------------------------------------------------------
    # Loss function to minimize.
    loss="squared_hinge",
    # -------------------------------------------------------------------------
    # Which formulation of the optimization problem is solved: the dual
    # (True) or the primal (False).
    dual=True,
    # -------------------------------------------------------------------------
    # Regularization parameter.
    C=1.0,
    # -------------------------------------------------------------------------
    # Whether the model also estimates an intercept term.
    fit_intercept=True,
    # -------------------------------------------------------------------------
    # Upper bound on the number of solver iterations.
    max_iter=1000,
    # -------------------------------------------------------------------------
    # Seed for the solver's pseudo-random number generator. None leaves it
    # unseeded, so repeated fits are not guaranteed to be identical; pass an
    # int to make the fit reproducible.
    random_state=None,
)

# -----------------------------------------------------------------------------
# Chain both steps. make_pipeline names every step after its lowercased class
# name: "selectkbest" and "linearsvc".
anova_svm = make_pipeline(feature_selector, linear_classifier)

Preparación de los datos
---

In [2]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(
    # -------------------------------------------------------------------------
    # Number of samples to generate. 100 is the library default, written out
    # here so the dataset size is visible.
    n_samples=100,
    # -------------------------------------------------------------------------
    # Total number of features (columns of X).
    n_features=20,
    # -------------------------------------------------------------------------
    # Number of informative features. It matches the k=3 requested from the
    # feature selector of the pipeline.
    n_informative=3,
    # -------------------------------------------------------------------------
    # Number of redundant features (linear combinations of the informative
    # ones). None are generated.
    n_redundant=0,
    # -------------------------------------------------------------------------
    # Number of classes of the classification problem (binary here).
    n_classes=2,
    # -------------------------------------------------------------------------
    # Number of clusters per class.
    n_clusters_per_class=2,
    # -------------------------------------------------------------------------
    # Fixed seed so the same dataset is generated on every run.
    random_state=42,
)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    # -------------------------------------------------------------------------
    # Size of the train split: a float in (0.0, 1.0) is a proportion of the
    # dataset, an int is an absolute number of samples, and None means "the
    # complement of the test size".
    train_size=None,
    # -------------------------------------------------------------------------
    # Size of the test split, with the same float / int convention. None means
    # "the complement of the train size"; because train_size is None as well,
    # the test proportion is set to 0.25.
    test_size=None,
    # -------------------------------------------------------------------------
    # Shuffle the samples before splitting. With shuffle=False, stratify must
    # be None.
    shuffle=True,
    # -------------------------------------------------------------------------
    # Fixed seed so the shuffle, and therefore the split, is the same on every
    # run.
    random_state=42,
    # -------------------------------------------------------------------------
    # Class labels to stratify the split by. None disables stratification.
    stratify=None,
)

Entrenamiento del pipeline
---

In [3]:
# Fit every step of the pipeline on the training split. The call is the last
# expression of the cell, so the fitted pipeline it returns is displayed.
anova_svm.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('selectkbest',
                 SelectKBest(k=3,
                             score_func=<function f_classif at 0x7f409c77a048>)),
                ('linearsvc',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='squared_hinge', max_iter=1000,
                           multi_class='ovr', penalty='l2', random_state=None,
                           tol=0.0001, verbose=0))],
         verbose=False)

Variables independientes seleccionadas
---

In [4]:
#
# Boolean mask over the input features: True marks a feature kept by the
# selector. The selector is looked up by the step name that make_pipeline
# generated for it ("selectkbest") instead of by its position.
#
anova_svm.named_steps["selectkbest"].get_support()

array([False, False,  True, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False,  True])