In [None]:
import numpy as np
from few import FEW
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
import time
from tqdm import tqdm 

def compare_configs(estimators,X,y,classification=False,trials=10):
    """Repeatedly cross-validate each estimator, recording mean score and elapsed time.

    Parameters
    ----------
    estimators : sequence
        Estimators to compare. Each one is handed unchanged to
        ``cross_val_score``; the sequence must support ``len()``.
    X, y : array-like
        Features and targets, forwarded to ``cross_val_score``.
    classification : bool, default False
        If True, folds come from a shuffled 3-split ``StratifiedKFold``;
        otherwise from a shuffled 5-split ``KFold``.
    trials : int, default 10
        How many times the cross-validation is repeated per estimator.

    Returns
    -------
    scores : numpy.ndarray, shape (len(estimators), trials)
        ``scores[e, t]`` is the mean of the per-fold scores returned by
        ``cross_val_score`` for estimator ``e`` in trial ``t``.
    times : numpy.ndarray, shape (len(estimators), trials)
        ``times[e, t]`` is the elapsed time, in seconds, of that same call.
    """
    # shuffle=True with no random_state: the splits are not pinned to a seed.
    if classification:
        cv = StratifiedKFold(n_splits=3,shuffle=True)
    else:
        cv = KFold(n_splits=5,shuffle=True)

    n_estimators = len(estimators)
    # Every cell is written in the loop below, so an uninitialised buffer is fine.
    scores = np.empty((n_estimators,trials))
    times = np.zeros((n_estimators,trials))

    # Wrap the sequence itself (not the enumerate object) so tqdm can read its
    # length and show a total, percentage and ETA instead of a bare counter.
    for e,est in enumerate(tqdm(estimators)):
        for t in range(trials):
            # perf_counter is monotonic, so the interval is unaffected by
            # system clock adjustments during these long-running fits.
            t0 = time.perf_counter()
            scores[e,t] = np.mean(cross_val_score(est,X,y,cv=cv,n_jobs=-1))
            times[e,t] = time.perf_counter() - t0

    return scores, times


In [None]:
# define FEW configurations to compare
from sklearn.linear_model import LassoLarsCV, LogisticRegressionCV
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
%matplotlib inline

# stall_count options
ms = np.arange(1,102,step=10)
estimators = {}
estimators['lasso'] = []
estimators['dt'] = []
for m in ms:
    estimators['lasso'].append(FEW(ml=LassoLarsCV(),generations=100,max_stall=m))
    estimators['dt'].append(FEW(ml=DecisionTreeRegressor(),generations=100,max_stall=m))
problems = ['concrete','enc','housing','uball5d','yacht']
# problems = ['enc','housing']
###################################################################################################### lasso
print('--- lasso ---')
h,ax = plt.subplots(len(problems),sharex=True)
for i,p in enumerate(problems):
    print('problem:',p)
    input_data = pd.read_csv('data/d_' + p + '.txt', sep=None, engine='python')
    X = StandardScaler().fit_transform(input_data.drop('label',axis=1).values)
    y = input_data['label'].values
    scores,times = compare_configs(estimators['lasso'],X,y)
    norm_scores = scores - np.median(scores[0,:])
    # plot results
    ax[i].boxplot(list(norm_scores),positions=ms,widths=5)
    ax[i].plot([0,np.max(ms)+1],[0,0],'-k')
    ax[i].set_xticks(ms)
    ax[i].set_title(p)
################################################################################################## decision tree
print('--- decision tree ---')
h2,ax2 = plt.subplots(len(problems),sharex=True)
for i,p in enumerate(problems):
    print('problem:',p)
    input_data = pd.read_csv('data/d_' + p + '.txt', sep=None, engine='python')
    X = StandardScaler().fit_transform(input_data.drop('label',axis=1).values)
    y = input_data['label'].values
    scores,times = compare_configs(estimators['dt'],X,y)
    norm_scores = scores - np.median(scores[0,:])
    # plot results
    ax2[i].boxplot(list(norm_scores),positions=ms,widths=5)
    ax2[i].plot([0,np.max(ms)+1],[0,0],'-k')
    ax2[i].set_xticks(ms)
    ax2[i].set_title(p)

plt.show()

0it [00:00, ?it/s]

--- lasso ---
problem: concrete


11it [32:58, 279.21s/it]
0it [00:00, ?it/s]

problem: enc


11it [35:21, 359.66s/it]
0it [00:00, ?it/s]

problem: housing


11it [1:18:18, 588.74s/it]
0it [00:00, ?it/s]

problem: uball5d


3it [14:12, 242.96s/it]Process ForkPoolWorker-29259:
Process ForkPoolWorker-29258:
Traceback (most recent call last):
  File "/home/bill/anaconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/bill/anaconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/bill/anaconda3/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
Process ForkPoolWorker-29257:
  File "/home/bill/anaconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Process ForkPoolWorker-29256:
  File "/home/bill/anaconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Process ForkPoolWorker-29255:
Process ForkPoolWorker-29254:
Traceback (most recent call last):
Process ForkPoolWorker-29251:
Process ForkPoolWorker-29250:
  File "/home/bill/anaconda3/lib/pyt

In [None]:
# define FEW configurations to compare
# CLASSIFICATION
from sklearn.linear_model import LassoLarsCV, LogisticRegressionCV
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
%matplotlib inline

# stall_count options
ms = np.arange(1,102,step=10)
estimators = {}
estimators['lasso'] = []
estimators['dt'] = []
for m in ms:
    estimators['lasso'].append(FEW(ml=LassoLarsCV(),generations=100,max_stall=m))
    estimators['dt'].append(FEW(ml=DecisionTreeRegressor(),generations=100,max_stall=m))
problems = ['concrete','enc','housing','uball5d','yacht']
# problems = ['enc','housing']
###################################################################################################### lasso
print('--- lasso ---')
h,ax = plt.subplots(len(problems),sharex=True)
for i,p in enumerate(problems):
    print('problem:',p)
    input_data = pd.read_csv('data/d_' + p + '.txt', sep=None, engine='python')
    X = StandardScaler().fit_transform(input_data.drop('label',axis=1).values)
    y = input_data['label'].values
    scores,times = compare_configs(estimators['lasso'],X,y)
    norm_scores = scores - np.median(scores[0,:])
    # plot results
    ax[i].boxplot(list(norm_scores),positions=ms,widths=5)
    ax[i].plot([0,np.max(ms)+1],[0,0],'-k')
    ax[i].set_xticks(ms)
    ax[i].set_title(p)
################################################################################################## decision tree
print('--- decision tree ---')
h2,ax2 = plt.subplots(len(problems),sharex=True)
for i,p in enumerate(problems):
    print('problem:',p)
    input_data = pd.read_csv('data/d_' + p + '.txt', sep=None, engine='python')
    X = StandardScaler().fit_transform(input_data.drop('label',axis=1).values)
    y = input_data['label'].values
    scores,times = compare_configs(estimators['dt'],X,y)
    norm_scores = scores - np.median(scores[0,:])
    # plot results
    ax2[i].boxplot(list(norm_scores),positions=ms,widths=5)
    ax2[i].plot([0,np.max(ms)+1],[0,0],'-k')
    ax2[i].set_xticks(ms)
    ax2[i].set_title(p)

plt.show()## 