#Tecnicas avanzadas de Machine Learning

#Validacion Cruzada

In [2]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Cargamos los datos
iris = load_iris()
X = iris.data
y = iris.target

In [7]:
# Creamos el modelo
modelo = RandomForestClassifier()

In [9]:
# Realizar validacion cruzada
scores = cross_val_score(modelo, X, y, cv=5)
print("Puntuaciones de validacion cruzada: ", scores)
print("Media de puntuaciones: ", scores.mean())

Puntuaciones de validacion cruzada:  [0.96666667 0.96666667 0.9        0.96666667 1.        ]
Media de puntuaciones:  0.9600000000000002


#Ajuste de hiperparametros

In [11]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [12]:
# Carga de datos
iris = load_iris()
X = iris.data
y = iris.target

In [13]:
# Dividimos los datos para entrenamiento / prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Creamos el modelo
modelo = RandomForestClassifier()

In [15]:
# Definimos hiperparametros
param_grid = {
    'n_estimators': [10,50,100],
    'max_depth': [None, 10, 20, 30]}

In [16]:
# Realizamos busqueda de hiperparametros
grid_search = GridSearchCV(modelo, param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [17]:
# Mostramos los mejores parametros
print("Mejores hiperparametros: ", grid_search.best_params_)

Mejores hiperparametros:  {'max_depth': None, 'n_estimators': 50}


#Pipelines en Scikit-Learn

In [18]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [19]:
# Carga de datos
iris = load_iris()
X = iris.data
y = iris.target

In [20]:
# Dividimos los datos
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Creamos el pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier())
])

In [22]:
# Entrenamos el pipeline
pipeline.fit(X_train, y_train)

In [23]:
# Evaluamos el pipeline
accuracy = pipeline.score(X_test, y_test)
print("Precision del pipeline:  ", accuracy)

Precision del pipeline:   1.0


#Ejercicio practico

In [24]:
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [25]:
digits = load_digits()
X = digits.data
y = digits.target

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [27]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC())
])

In [28]:
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__gamma': [0.001,0.01,0.1]
}
#

In [29]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [31]:
accuracy = grid_search.score(X_test, y_test)
print("Mejores hiperparametros: ", grid_search.best_params_)
print("Precision del pipeline: ", accuracy)

Mejores hiperparametros:  {'classifier__C': 10, 'classifier__gamma': 0.01}
Precision del pipeline:  0.9805555555555555
