In [2]:
# Surpress warnings
import warnings
warnings.filterwarnings("ignore")

# Import libraries
from pmlb import dataset_names, classification_dataset_names, fetch_data
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb

# Import SK-learn and AutoSK-Learn
import autosklearn.classification
import autosklearn.regression
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics

  from collections import Mapping, defaultdict


In [50]:
usable_datasets = []
dataset_min_count = 1
dataset_max_count = 50
count = 0

for dataset in classification_dataset_names:
    if count < dataset_min_count:
        count += 1
        continue
    usable_datasets.append(dataset)
    count += 1
    if count >= dataset_max_count:
        break


In [51]:
usable_datasets

['GAMETES_Epistasis_2-Way_20atts_0.1H_EDM-1_1',
 'GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1',
 'GAMETES_Epistasis_3-Way_20atts_0.2H_EDM-1_1',
 'GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_50_EDM-2_001',
 'GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_75_EDM-2_001',
 'Hill_Valley_with_noise',
 'Hill_Valley_without_noise',
 'adult',
 'agaricus-lepiota',
 'allbp',
 'allhyper',
 'allhypo',
 'allrep',
 'analcatdata_aids',
 'analcatdata_asbestos',
 'analcatdata_authorship',
 'analcatdata_bankruptcy',
 'analcatdata_boxing1',
 'analcatdata_boxing2',
 'analcatdata_creditscore',
 'analcatdata_cyyoung8092',
 'analcatdata_cyyoung9302',
 'analcatdata_dmft',
 'analcatdata_fraud',
 'analcatdata_germangss',
 'analcatdata_happiness',
 'analcatdata_japansolvent',
 'analcatdata_lawsuit',
 'ann-thyroid',
 'appendicitis',
 'australian',
 'auto',
 'backache',
 'balance-scale',
 'banana',
 'biomed',
 'breast',
 'breast-cancer',
 'breast-cancer-wisconsin',
 'breast-w',
 'buggyCrx',
 'bupa',
 'calendarDOW'

In [52]:
np.random.seed(0)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

normal_logit_scores = []
junk_logit_scores = []
naive_ens_logit_scores = []

for dataset in usable_datasets:
    print("")
    print("NEW DATASET")
    X, y = fetch_data(dataset, return_X_y=True)
    
    m, d = X.shape
    
    # Create and append junk data of feature size 100 (half discrete, half contintinous)
    junk_discrete = np.round(np.random.rand(m, 50))
    junk_continuous = np.random.rand(m, 50)
    junk = np.hstack((junk_discrete, junk_continuous))
    X_junk = np.hstack((X, junk))    
    
    logit_norm = LogisticRegression()
    logit_junk = LogisticRegression()
    
    # Run models on the normal data and assess performance with regular ML models
    train_X_reg, test_X_reg, train_y_reg, test_y_reg = train_test_split(X, y)
    logit_norm.fit(train_X_reg, train_y_reg)
    normal_logit_score = logit_norm.score(test_X_reg, test_y_reg)
    normal_logit_scores.append(normal_logit_score)
    print("Score Logit Normal: ", normal_logit_score)
    
    # Run models on the normal+junk data and assess performance with regular ML models
    train_X_junk, test_X_junk, train_y_junk, test_y_junk = train_test_split(X_junk, y)
    logit_junk.fit(train_X_junk, train_y_junk)
    junk_logit_score = logit_junk.score(test_X_junk, test_y_junk)
    junk_logit_scores.append(junk_logit_score)
    print("Score Logit Junk: ", junk_logit_score)
    
    # Run naive ensembling with naive average with regular ML models
    train_X_reg, test_X_reg, train_y, test_y = train_test_split(X, y, random_state=0)
    train_X_junk, test_X_junk, train_y, test_y = train_test_split(junk, y, random_state=0)
    
    logit_p1 = LogisticRegression()
    logit_p2 = LogisticRegression()
    
    logit_p1.fit(train_X_reg, train_y)
    logit_p2.fit(train_X_junk, train_y)    
    
    pred1 = logit_p1.predict(test_X_reg)
    pred2 = logit_p2.predict(test_X_junk)
    
    pred_y = np.round(0.5 * pred1 + 0.5 * pred2)
    naive_ens_score = accuracy_score(pred_y, test_y)
    print("Naive Ensemble: ", naive_ens_score)
    naive_ens_logit_scores.append(naive_ens_score)
    
                
    # Run Auto-SKLearn on the normal + junk data

    # TODO: Run Block Regression
    # TODO: Run Block Regression and Auto-SkLearn
    
    # TODO: Repeat this entire process but with different size junk groups and different numbers
    
print("")    
print("Average score for normal logit: ", str(sum(normal_logit_scores)/float(len(normal_logit_scores))))
print("Average score for junk logit: ", str(sum(junk_logit_scores)/float(len(junk_logit_scores))))
print("Average score for naive ensemble: ", str(sum(naive_ens_logit_scores)/float(len(naive_ens_logit_scores))))
    
    
    
    
    


NEW DATASET
Score Logit Normal:  0.48
Score Logit Junk:  0.4925
Naive Ensemble:  0.5

NEW DATASET
Score Logit Normal:  0.5025
Score Logit Junk:  0.52
Naive Ensemble:  0.5

NEW DATASET
Score Logit Normal:  0.5375
Score Logit Junk:  0.4975
Naive Ensemble:  0.5175

NEW DATASET
Score Logit Normal:  0.4675
Score Logit Junk:  0.5075
Naive Ensemble:  0.49

NEW DATASET
Score Logit Normal:  0.4975
Score Logit Junk:  0.53
Naive Ensemble:  0.51

NEW DATASET
Score Logit Normal:  0.9603960396039604
Score Logit Junk:  0.9306930693069307
Naive Ensemble:  0.735973597359736

NEW DATASET
Score Logit Normal:  1.0
Score Logit Junk:  0.976897689768977
Naive Ensemble:  0.7458745874587459

NEW DATASET
Score Logit Normal:  0.7986241913029236
Score Logit Junk:  0.7930554418147572
Naive Ensemble:  0.8017361395463107

NEW DATASET
Score Logit Normal:  0.9430535100638193
Score Logit Junk:  0.9450171821305842
Naive Ensemble:  0.7039764359351989

NEW DATASET
Score Logit Normal:  0.9671261930010604
Score Logit Junk:

In [14]:
X_junk

array([[1.        , 0.        , 0.        , ..., 0.40292058, 0.31297249,
        0.4414766 ],
       [0.        , 1.        , 0.        , ..., 0.12702129, 0.8205428 ,
        0.86351832],
       [0.        , 0.        , 0.        , ..., 0.71576606, 0.58220464,
        0.57942425],
       ...,
       [2.        , 0.        , 0.        , ..., 0.47287437, 0.01392644,
        0.63438904],
       [1.        , 0.        , 0.        , ..., 0.42546666, 0.67785646,
        0.18979169],
       [1.        , 0.        , 0.        , ..., 0.78412312, 0.61925482,
        0.15752893]])

array([0, 0, 0, ..., 0, 0, 0])