In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.kernel_approximation import RBFSampler
from sklearn import mixture
from sklearn.datasets import load_digits
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_openml

In [5]:
#MNIST
digits = fetch_openml('mnist_784')
X=digits.data
y=digits.target

In [6]:
X_train, X, y_train, y = train_test_split(X,y,test_size=0.05, stratify=y)
components=np.unique(y).shape[0]
print('Shape:',X.shape)
print('components:',components)

Shape: (3500, 784)
components: 10


In [7]:
montecarlo=200
gamma=0.01
steps = [('rff', RBFSampler(gamma=gamma,n_components=montecarlo,random_state=48)), 
            ('cluster', mixture.BayesianGaussianMixture(n_components=components,random_state=48))] #clasificador 
parameters ={'rff__gamma':[1e-8,1e-6,1e-3,1e-2,1,2],
             'rff__n_components':np.arange(1000,2000,200)
             }
method = Pipeline(steps)

In [8]:
grid_adjusted_rand_score=make_scorer(adjusted_rand_score)
Niter = 5 #numero particiones
adrs =np.zeros(Niter)#arreglo para guardar acierto
best_estimators = []#mejor metodo por iteracion
best_parameters = []#mejor metodo por iteracion
for j in range(Niter):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, stratify=y)
    grid_search = GridSearchCV(method, parameters,cv=3,verbose=10,scoring=grid_adjusted_rand_score,n_jobs=4)
    
    grid_search.fit(X_train,y_train)
    
    y_pred = grid_search.best_estimator_.predict(X_test)
    
    best_estimators.append(grid_search.best_estimator_)
    best_parameters.append(grid_search.best_params_)

    adrs[j]=adjusted_rand_score(y_pred, y_test)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  3.8min
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  6.3min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  8.6min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 11.9min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 15.9min
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed: 19.4min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 23.6min
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed: 28.4min
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed: 33.4min finished
Fitting 3 folds for each of 30 candidates, totalling 90 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done  10 tasks      | elap

In [9]:
me=np.mean(adrs)
std=np.std(adrs)
print('MNIST Adjusted rand score:'+str(me)+'+-'+str(std))

MNIST Adjusted rand score:0.27820684493257436+-0.052595081153130135


In [2]:
#GLASS
glass = fetch_openml('glass')
X=glass.data
y=glass.target
components=np.unique(y).shape[0]
print('Shape:',X.shape)
print('components:',components)

Shape: (214, 9)
components: 6
  warn("Multiple active versions of the dataset matching the name"


In [3]:
montecarlo=200
gamma=0.01
steps = [('rff', RBFSampler(gamma=gamma,n_components=montecarlo,random_state=48)), 
            ('cluster', mixture.BayesianGaussianMixture(n_components=components,random_state=48))] #clasificador 
parameters ={'rff__gamma':[1e-8,1e-6,1e-3,1e-2,1,2],
             'rff__n_components':np.arange(2,1202,200)
             }
method = Pipeline(steps)

In [4]:
grid_adjusted_rand_score=make_scorer(adjusted_rand_score)
Niter = 5 #numero particiones
adrs =np.zeros(Niter)#arreglo para guardar acierto
best_estimators = []#mejor metodo por iteracion
best_parameters = []#mejor metodo por iteracion
for j in range(Niter):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, stratify=y)
    grid_search = GridSearchCV(method, parameters,cv=3,verbose=10,scoring=grid_adjusted_rand_score,n_jobs=4)
    
    grid_search.fit(X_train,y_train)
    
    y_pred = grid_search.best_estimator_.predict(X_test)
    
    best_estimators.append(grid_search.best_estimator_)
    best_parameters.append(grid_search.best_params_)

    adrs[j]=adjusted_rand_score(y_pred, y_test)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    4.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   38.3s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  4.5min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  4.9min
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:  6.1min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  6.3min
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:  6.4min
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:  6.5min
[Parallel(n_jobs=4)]: Done 108 out of 108 | elapsed:  6.6min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.0860s.) Setting batch_size=2.
Fitting 3 folds for each o

In [5]:
me=np.mean(adrs)
std=np.std(adrs)
print('GLASS Adjusted rand score:'+str(me)+'+-'+str(std))

GLASS Adjusted rand score:0.14055270332982334+-0.052517092547343006


In [6]:
#IRIS
iris = fetch_openml('iris')
X=iris.data
y=iris.target
components=np.unique(y).shape[0]
print('Shape:',X.shape)
print('components:',components)

Shape: (150, 4)
components: 3


In [7]:
montecarlo=200
gamma=0.01
steps = [('rff', RBFSampler(gamma=gamma,n_components=montecarlo,random_state=48)), 
            ('cluster', mixture.BayesianGaussianMixture(n_components=components,random_state=48))] #clasificador 
parameters ={'rff__gamma':[1e-8,1e-6,1e-3,1e-2,1,2],
             'rff__n_components':np.arange(2,1202,200)
             }
method = Pipeline(steps)

In [8]:
grid_adjusted_rand_score=make_scorer(adjusted_rand_score)
Niter = 5 #numero particiones
adrs =np.zeros(Niter)#arreglo para guardar acierto
best_estimators = []#mejor metodo por iteracion
best_parameters = []#mejor metodo por iteracion
for j in range(Niter):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, stratify=y)
    grid_search = GridSearchCV(method, parameters,cv=3,verbose=10,scoring=grid_adjusted_rand_score,n_jobs=4)
    
    grid_search.fit(X_train,y_train)
    
    y_pred = grid_search.best_estimator_.predict(X_test)
    
    best_estimators.append(grid_search.best_estimator_)
    best_parameters.append(grid_search.best_params_)

    adrs[j]=adjusted_rand_score(y_pred, y_test)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    2.6s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   18.7s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   55.4s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.3min
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  2.7min
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:  2.8min
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:  2.9min
[Parallel(n_jobs=4)]: Done 108 out of 108 | elapsed:  2.9min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Batch computation too fast (0.0720s.) Setting batch_size=2.
Fitting 3 folds for each o

In [9]:
me=np.mean(adrs)
std=np.std(adrs)
print('IRIS Adjusted rand score:'+str(me)+'+-'+str(std))

IRIS Adjusted rand score:0.5361084795225104+-0.09413677767400576
