In [2]:
import autosklearn.classification
import pandas as pd
import numpy as np
import sklearn.model_selection
import sklearn.metrics
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
import h2o
from h2o.automl import H2OAutoML
from tpot import TPOTClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
import time

  self.re = re.compile(self.reString)


In [10]:
def autosklearn_classification(X_train, y_train, X_test):
    automl = autosklearn.classification.AutoSklearnClassifier()
   
    automl.fit(X_train, y_train)
    
    predictions = automl.predict(X_test)
    
    predictions_proba = automl.predict_proba(X_test)[:,1]
    return (predictions,predictions_proba)
    

In [4]:
def tpot_classification(X_train, y_train, X_test):
    tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=1)
    tpot.fit(X_train, y_train)
    predictions = tpot.predict(X_test)
    predictions_proba = tpot.predict_proba(X_test)[:,1]
    return (predictions, predictions_proba)


In [3]:
def h2o_classifications(X_train, y_train, X_test, target):
    h2o.init()
    aml = H2OAutoML()
    train_hf = pd.concat([X_train, y_train], axis=1)
    train_hf = h2o.H2OFrame(train_hf)
    train_hf[target] = train_hf[target].asfactor()
    aml.train(y = target, training_frame = train_hf)
    response = aml.predict(h2o.H2OFrame(X_test))
    print("various models tested:")
    lb = h2o.automl.get_leaderboard(aml, extra_columns = 'ALL')
    print(lb)
    return response

In [5]:
def metric_calculator(framework, dataset, y_test, predictions):
    
    if framework =="H2O":   
        print("Framework: ", framework)
        ll = log_loss(y_test, predictions.as_data_frame().iloc[:,1:])
        print("log loss: ",ll)
        accuracy = sklearn.metrics.accuracy_score(predictions.as_data_frame().iloc[:,0], y_test.astype('int64'))
        print("accuracy: ",accuracy)
        print("Classification report")
        print(classification_report(y_test.astype('int64'), predictions.as_data_frame().iloc[:,0]))
    else:
        print("Framework: ", framework)
        ll = log_loss(y_test, predictions[1])
        print("log loss: ",ll)
        accuracy = sklearn.metrics.accuracy_score(predictions[0], y_test)
        print("accuracy: ",accuracy)
        print("Classification report")
        print(classification_report(y_test, predictions[0]))
            

### Dataset 1

In [6]:
ds1_df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",sep=";")

X, y = ds1_df.iloc[:,:-1], ds1_df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
start_time = time.time()
ds1_autosklearn_predictions = autosklearn_classification(X_train, y_train, X_test)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 44905 instead
  http_address["port"], self.http_server.port




In [20]:
metric_calculator("auto-sklearn", "wine quality", y_test, ds1_autosklearn_predictions)
end_time = time.time()
print("total time elapsed: ",end_time - start_time)

Framework:  auto-sklearn
log loss: 0.4268143501
accuracy:  0.6579591836734694
Classification report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.56      0.14      0.22        37
           5       0.72      0.63      0.67       368
           6       0.61      0.84      0.71       544
           7       0.78      0.44      0.56       233
           8       0.91      0.24      0.38        41

    accuracy                           0.66      1225
   macro avg       0.60      0.38      0.42      1225
weighted avg       0.68      0.66      0.64      1225

total time elapsed:  4493.640509843826


  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
start_time = time.time()
ds1_tpot_predictions = tpot_classification(X_train, y_train, X_test)

HBox(children=(HTML(value='Optimization Progress'), FloatProgress(value=0.0, max=300.0), HTML(value='')))


Generation 1 - Current best internal CV score: 0.6553237316725056

Generation 2 - Current best internal CV score: 0.6553237316725056

Generation 3 - Current best internal CV score: 0.6555928747520807

Generation 4 - Current best internal CV score: 0.6555928747520807

Generation 5 - Current best internal CV score: 0.6555928747520807

Best pipeline: KNeighborsClassifier(RobustScaler(input_matrix), n_neighbors=70, p=1, weights=distance)


In [18]:
metric_calculator("tpot", "wine quality", y_test, ds1_tpot_predictions)
end_time = time.time()
print("total time elapsed: ",end_time - start_time)

Framework:  tpot
accuracy:  0.6644897959183673
Classification report


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       1.00      0.03      0.05        37
           5       0.73      0.60      0.66       368
           6       0.61      0.85      0.71       544
           7       0.76      0.50      0.60       233
           8       1.00      0.32      0.48        41

    accuracy                           0.66      1225
   macro avg       0.68      0.38      0.42      1225
weighted avg       0.70      0.66      0.65      1225

total time elapsed:  2303.344337940216


In [7]:
start_time = time.time()
ds1_h2o_predictions = h2o_classifications(X_train, y_train, X_test, ds1_df.columns[-1])
end_time = time.time()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.9" 2020-10-20; OpenJDK Runtime Environment (build 11.0.9+11-post-Debian-1deb10u1); OpenJDK 64-Bit Server VM (build 11.0.9+11-post-Debian-1deb10u1, mixed mode, sharing)
  Starting server from /opt/conda/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpc14odxof
  JVM stdout: /tmp/tmpc14odxof/h2o_jupyter_started_from_python.out
  JVM stderr: /tmp/tmpc14odxof/h2o_jupyter_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.0.2
H2O_cluster_version_age:,27 days
H2O_cluster_name:,H2O_from_python_jupyter_155207
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.287 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |█████████████████████████████████████████████████████ (cancelled)  99%


H2OJobCancelled: Job<$03017f00000132d4ffffffff$_87b9f4236fa7b6614526e6ac157e49ea> was cancelled by the user.

In [None]:
metric_calculator("H2O", "wine quality",y_test, ds1_h2o_predictions)
print("total time elapsed: ",end_time - start_time)

## dataset 2


In [6]:
X, y = fetch_openml('spambase', version=1, return_X_y=True, as_frame=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [12]:
start_time = time.time()
ds2_autosklearn_predictions = autosklearn_classification(X_train, y_train, X_test)
end_time = time.time()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 43301 instead
  http_address["port"], self.http_server.port
  self._dask_client.shutdown()




In [30]:
metric_calculator("auto-sklearn", "spambase", y_test, ds2_autosklearn_predictions)
print("total time elapsed: ",end_time - start_time)

Framework:  auto-sklearn
log loss: 0.3834920926
accuracy:  0.9591659426585578
Classification report
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       701
           1       0.97      0.93      0.95       450

    accuracy                           0.96      1151
   macro avg       0.96      0.95      0.96      1151
weighted avg       0.96      0.96      0.96      1151

total time elapsed:  3627.5266301631927


In [20]:
start_time = time.time()
ds2_tpot_predictions = tpot_classification(X_train, y_train, X_test)
end_time = time.time()

HBox(children=(HTML(value='Optimization Progress'), FloatProgress(value=0.0, max=300.0), HTML(value='')))


Generation 1 - Current best internal CV score: 0.946376811594203

Generation 2 - Current best internal CV score: 0.9489855072463769

Generation 4 - Current best internal CV score: 0.9515942028985507

Generation 5 - Current best internal CV score: 0.952463768115942

Best pipeline: GradientBoostingClassifier(BernoulliNB(input_matrix, alpha=0.01, fit_prior=True), learning_rate=0.1, max_depth=6, max_features=0.5, min_samples_leaf=15, min_samples_split=5, n_estimators=100, subsample=0.7500000000000001)


In [21]:
metric_calculator("tpot", "spambase", y_test,ds2_tpot_predictions)
print("total time elapsed: ",end_time - start_time)

Framework:  tpot
log loss:  0.12303342451402823
accuracy:  0.9548218940052129
Classification report
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       701
           1       0.95      0.94      0.94       450

    accuracy                           0.95      1151
   macro avg       0.95      0.95      0.95      1151
weighted avg       0.95      0.95      0.95      1151

total time elapsed:  1716.766785621643


In [22]:
start_time = time.time()
ds2_h2o_predictions = h2o_classifications(X_train, y_train, X_test, y.name)
end_time = time.time()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 hour 42 mins
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.0.2
H2O_cluster_version_age:,27 days
H2O_cluster_name:,H2O_from_python_jupyter_htc3nt
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.163 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%
various models tested:


model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse,training_time_ms,predict_time_per_row_ms
StackedEnsemble_AllModels_AutoML_20201215_113110,0.988626,0.138365,0.982851,0.0481614,0.191989,0.0368598,2039,0.341688
GBM_grid__1_AutoML_20201215_113110_model_13,0.988322,0.131862,0.983112,0.0507669,0.192847,0.0371901,1265,0.038047
StackedEnsemble_BestOfFamily_AutoML_20201215_113110,0.987788,0.141036,0.982787,0.0493742,0.194917,0.0379924,449,0.044199
GBM_grid__1_AutoML_20201215_113110_model_8,0.987766,0.133093,0.982901,0.0499956,0.192318,0.0369862,1709,0.0319
GBM_grid__1_AutoML_20201215_113110_model_7,0.987754,0.132591,0.981016,0.0500404,0.191321,0.0366038,1104,0.035434
GBM_4_AutoML_20201215_113110,0.987694,0.133169,0.981948,0.0485133,0.191923,0.0368345,1446,0.029298
GBM_grid__1_AutoML_20201215_113110_model_6,0.987609,0.132999,0.98119,0.0491424,0.192117,0.0369089,1728,0.033811
GBM_grid__1_AutoML_20201215_113110_model_10,0.987591,0.134586,0.981039,0.0521966,0.194223,0.0377226,1166,0.029874
GBM_2_AutoML_20201215_113110,0.987268,0.137118,0.981027,0.0528777,0.195929,0.0383882,1234,0.026696
GBM_grid__1_AutoML_20201215_113110_model_1,0.987193,0.140357,0.981762,0.0518823,0.198237,0.0392977,1615,0.031847





In [1]:
metric_calculator("H2O", "spambase",y_test,ds2_h2o_predictions )
print("total time elapsed: ",end_time - start_time)

NameError: name 'metric_calculator' is not defined

## Dataset 3

In [None]:
Marketing. = pd.read_csv("bank-marketing",sep=";")

X, y = Marketing.iloc[:,:-1], Marketing.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
start_time = time.time()
ds3_autosklearn_predictions = autosklearn_classification(X_train, y_train, X_test)

1


Perhaps you already have a cluster running?
Hosting the HTTP server on port 38319 instead
  http_address["port"], self.http_server.port


In [None]:
metric_calculator("auto-sklearn", "bank-marketing",y_test, ds3_autosklearn_predictions)
end_time = time.time()
print("total time elapsed: ",end_time - start_time)

In [None]:
start_time = time.time()

Marketing.rename(columns={'y': 'class'}, inplace=True)

In [8]:
for cat in ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome' ,'class']:
    print("Number of levels in category '{0}': \b {1:2.2f} ".format(cat, Marketing[cat].unique().size))

Number of levels in category 'job': 12.00 
Number of levels in category 'marital': 4.00 
Number of levels in category 'education': 8.00 
Number of levels in category 'default': 3.00 
Number of levels in category 'housing': 3.00 
Number of levels in category 'loan': 3.00 
Number of levels in category 'contact': 2.00 
Number of levels in category 'month': 10.00 
Number of levels in category 'day_of_week': 5.00 
Number of levels in category 'poutcome': 3.00 
Number of levels in category 'class': 2.00 


In [9]:
for cat in ['contact', 'poutcome','class', 'marital', 'default', 'housing', 'loan']:
    print("Levels for catgeory '{0}': {1}".format(cat, Marketing[cat].unique()))

Levels for catgeory 'contact': ['telephone' 'cellular']
Levels for catgeory 'poutcome': ['nonexistent' 'failure' 'success']
Levels for catgeory 'class': ['no' 'yes']
Levels for catgeory 'marital': ['married' 'single' 'divorced' 'unknown']
Levels for catgeory 'default': ['no' 'unknown' 'yes']
Levels for catgeory 'housing': ['no' 'yes' 'unknown']
Levels for catgeory 'loan': ['no' 'yes' 'unknown']


In [10]:
Marketing['marital'] = Marketing['marital'].map({'married':0,'single':1,'divorced':2,'unknown':3})
Marketing['default'] = Marketing['default'].map({'no':0,'yes':1,'unknown':2})
Marketing['housing'] = Marketing['housing'].map({'no':0,'yes':1,'unknown':2})
Marketing['loan'] = Marketing['loan'].map({'no':0,'yes':1,'unknown':2})
Marketing['contact'] = Marketing['contact'].map({'telephone':0,'cellular':1})
Marketing['poutcome'] = Marketing['poutcome'].map({'nonexistent':0,'failure':1,'success':2})
Marketing['class'] = Marketing['class'].map({'no':0,'yes':1})

In [11]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

job_Trans = mlb.fit_transform([{str(val)} for val in Marketing['job'].values])
education_Trans = mlb.fit_transform([{str(val)} for val in Marketing['education'].values])
month_Trans = mlb.fit_transform([{str(val)} for val in Marketing['month'].values])
day_of_week_Trans = mlb.fit_transform([{str(val)} for val in Marketing['day_of_week'].values])

In [18]:
marketing_new = Marketing.drop(['marital','default','housing','loan','contact','poutcome','class','job','education','month','day_of_week'], axis=1)
marketing_new = np.hstack((marketing_new.values, job_Trans, education_Trans, month_Trans, day_of_week_Trans))

In [26]:
X, y = marketing_new[:,:-1], marketing_new[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [32]:
ds3_tpot_predictions = tpot_classification(X_train, y_train, X_test)
end_time = time.time()

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=300.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 1.0

Generation 2 - Current best internal CV score: 1.0

Generation 3 - Current best internal CV score: 1.0

Generation 4 - Current best internal CV score: 1.0

Generation 5 - Current best internal CV score: 1.0

Best pipeline: DecisionTreeClassifier(input_matrix, criterion=gini, max_depth=6, min_samples_leaf=20, min_samples_split=14)


In [35]:
metric_calculator("auto-sklearn", "spambase", y_test, ds3_tpot_predictions)
print("total time elapsed: ",end_time - start_time)

Framework:  auto-sklearn
log loss:  9.992007221626413e-16
accuracy:  1.0
Classification report
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      8233
         1.0       1.00      1.00      1.00      2064

    accuracy                           1.00     10297
   macro avg       1.00      1.00      1.00     10297
weighted avg       1.00      1.00      1.00     10297

total time elapsed:  6585.911735057831


In [None]:
start_time = time.time()
ds3_h2o_predictions = h2o_classifications(X_train, y_train, X_test, y.name)
end_time = time.time()

In [None]:
metric_calculator("H2O", "bank-marketing",y_test,ds3_h2o_predictions )

print("total time elapsed: ",end_time - start_time)