# AutoGluon: How To Get The Same Performance With Only 5% of Your Features

In this tutorial we illustrate, using 38 real-world regression and classification problems, that thanks to the `kxy` package you can achieve the same AutoGluon performance, or better, with only 5% of the features.

## Necessary Imports

In [1]:
import os
import json
import logging
logging.basicConfig(level=logging.WARNING)

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, roc_auc_score

from kxy_datasets.regressions import all_regression_datasets
from kxy_datasets.classifications import all_classification_datasets

from kxy.learning import get_autogluon_learner
from kxy.misc.predictors import NaivePredictor
from kxy.learning.leanml_predictor import LeanMLPredictor

## The Basic Function To Load/Train/Save A Compressed Version of Your Model

In [2]:
def train_model(train_features_df, target_column, problem_type, learner_func, \
                feature_selection_method, path):
    '''
    Load a previously saved predictor from disk or, failing that, train one and save it.

    Parameters
    ----------
    train_features_df : pandas.DataFrame
        Training data on which ``kxy.fit`` is called with ``target_column``.
    target_column : str
        Name of the column to predict.
    problem_type : str
        Forwarded to ``kxy.fit`` (the callers in this notebook pass ``'regression'``
        or ``'classification'``).
    learner_func : callable
        Forwarded to both ``load`` and ``kxy.fit``.
    feature_selection_method : str
        ``'leanml'`` selects ``LeanMLPredictor``; any other value selects ``NaivePredictor``.
    path : str
        Location used both for loading and for saving the predictor.

    Returns
    -------
    The loaded or freshly trained predictor.

    Raises
    ------
    Exception
        Whatever the second fitting attempt (on NA-free rows) raises, or whatever
        ``save`` raises.
    '''
    cls = LeanMLPredictor if feature_selection_method == 'leanml' else NaivePredictor
    try:
        # First try to load the model from disk
        return cls.load(path, learner_func)
    except Exception:
        # `Exception` rather than a bare `except`, so that Ctrl-C / SystemExit
        # interrupt the run instead of silently triggering a retrain.
        logging.debug('Could not load a predictor from %s; training from scratch', path, \
            exc_info=True)

    fit_kwargs = {
        'problem_type': problem_type,
        'feature_selection_method': feature_selection_method,
        'path': path,
    }
    try:
        # Train the model from scratch if it is not found on disk
        results = train_features_df.kxy.fit(target_column, learner_func, **fit_kwargs)
    except Exception:
        # Some models do not like NAs: retry on complete rows only
        logging.warning('Training failed; retrying after dropping rows with missing values', \
            exc_info=True)
        train_features_df = train_features_df.dropna(axis=0)
        results = train_features_df.kxy.fit(target_column, learner_func, **fit_kwargs)
    predictor = results['predictor']

    # Save the trained model to disk
    predictor.save(path)

    return predictor

## Some Utility Functions To Run Experiments

In [3]:
# Make sure the directory holding the cached models and JSON results exists.
os.makedirs('./cache/', exist_ok=True)

# Utility functions
def regression_benchmark(learner_func, model_name):
    '''
    Run all regression experiments for a given model.

    For every dataset class in ``all_regression_datasets``, features are generated,
    split 80/20 into train and test sets (``random_state=0``), and a model is trained
    once with LeanML feature selection and once without. The test R-squared and the
    number of selected features are printed, and the running results are written as
    JSON under ``./cache/`` after each dataset.

    Parameters
    ----------
    learner_func : callable
        Forwarded to ``train_model``.
    model_name : str
        Used to build the names of the cached model and result files.
    '''
    perfs_path = './cache/%s_regression_benchmark_perfs.json' % model_name
    n_features_path = './cache/%s_regression_benchmark_n_features.json' % model_name
    n_rows_path = './cache/regression_n_rows.json'

    # Resume from previously cached results when available.
    cached = []
    for cache_path in (perfs_path, n_features_path, n_rows_path):
        try:
            with open(cache_path, 'r') as f:
                cached.append(json.load(f))
        except (OSError, ValueError):
            # Missing or unreadable cache file: start from an empty record.
            cached.append({})
    perfs, n_features, n_rows = cached

    for dataset_cls in all_regression_datasets:
        print()
        print('Dataset: %s' % dataset_cls.__name__)
        dataset = dataset_cls()
        target_column = dataset.y_column
        dataset_name = dataset.name
        perfs.setdefault(dataset_name, {})
        n_features.setdefault(dataset_name, {})
        df = dataset.df
        n_rows[dataset_name] = df.shape[0]

        # Features generation
        features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', \
            exclude=[target_column])
        train_features_df, test_features_df = train_test_split(features_df, test_size=0.2, \
            random_state=0)
        test_labels_df = test_features_df.loc[:, [target_column]]
        test_features_df = test_features_df.drop(target_column, axis=1)

        print('%s %d Features, Target: %s' % (dataset_name, train_features_df.shape[1]-1, target_column))

        # LeanML vs. No Feature Selection
        dataset_performance = perfs[dataset_name].copy()
        dataset_n_features = n_features[dataset_name].copy()
        # NOTE(review): both feature selection methods share this path -- confirm that
        # the two predictor classes do not overwrite each other's saved files.
        path = './cache/%s-%s-regression-benchmark.sav' % (model_name, dataset_name)

        for feature_selection_method in ['leanml', 'none']:
            # Training
            predictor = train_model(train_features_df, target_column, 'regression', learner_func, \
                feature_selection_method, path)
            n_selected_features = len(predictor.selected_variables)

            # Evaluation
            try:
                try:
                    test_predictions_df = predictor.predict(test_features_df)
                except Exception:
                    # Retry on rows without NAs. Features and labels are filtered
                    # together, BEFORE the retry, so that they stay aligned for the
                    # next feature selection method even if the retry fails too.
                    complete_rows = np.logical_not(test_features_df.isna().any(axis=1))
                    test_features_df = test_features_df.loc[complete_rows, :]
                    test_labels_df = test_labels_df.loc[complete_rows, :]
                    test_predictions_df = predictor.predict(test_features_df)

                perf = r2_score(
                    test_labels_df[target_column].values,
                    test_predictions_df[target_column].values)
            except Exception:
                # Best effort: record a null score rather than abort the whole benchmark.
                logging.exception('Somthing bad happened')
                perf = 0.0

            print('%s, Feature Selection Method: %s --- R-Squared: %.2f, Number of Selected Features: %d' % (\
                dataset_name, feature_selection_method, perf, n_selected_features))

            dataset_performance[feature_selection_method] = float(perf)
            dataset_n_features[feature_selection_method] = int(n_selected_features)

        # Persist after every dataset so that an interrupted run can be resumed.
        perfs[dataset_name] = dataset_performance.copy()
        with open(perfs_path, 'w') as f:
            json.dump(perfs, f)

        n_features[dataset_name] = dataset_n_features.copy()
        with open(n_features_path, 'w') as f:
            json.dump(n_features, f)

        with open(n_rows_path, 'w') as f:
            json.dump(n_rows, f)


def classification_benchmark(learner_func, model_name):
    '''
    Run all classification experiments for a given model.

    For every dataset class in ``all_classification_datasets``, features are generated
    and, if the target is still among the generated columns, it is one-hot encoded.
    The first target-derived column is then used as the label. Data are split 80/20
    into train and test sets (``random_state=0``), and a model is trained once with
    LeanML feature selection and once without. The test AUC and the number of selected
    features are printed, and the running results are written as JSON under
    ``./cache/`` after each dataset.

    Parameters
    ----------
    learner_func : callable
        Forwarded to ``train_model``.
    model_name : str
        Used to build the names of the cached model and result files.
    '''
    perfs_path = './cache/%s_classification_benchmark_perfs.json' % model_name
    n_features_path = './cache/%s_classification_benchmark_n_features.json' % model_name
    n_rows_path = './cache/classification_n_rows.json'

    # Resume from previously cached results when available.
    cached = []
    for cache_path in (perfs_path, n_features_path, n_rows_path):
        try:
            with open(cache_path, 'r') as f:
                cached.append(json.load(f))
        except (OSError, ValueError):
            # Missing or unreadable cache file: start from an empty record.
            cached.append({})
    perfs, n_features, n_rows = cached

    for dataset_cls in all_classification_datasets:
        print()
        print('Dataset: %s' % dataset_cls.__name__)
        dataset = dataset_cls()
        target_column = dataset.y_column
        dataset_name = dataset.name
        perfs.setdefault(dataset_name, {})
        n_features.setdefault(dataset_name, {})
        df = dataset.df
        n_rows[dataset_name] = df.shape[0]

        # Features generation
        features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', \
            exclude=[target_column])

        if target_column in features_df:
            # Replace the raw target by its one-hot encoding
            target_df = pd.get_dummies(df[target_column], prefix=str(target_column))
            features_df = features_df.drop(target_column, axis=1)
            features_df = pd.concat([features_df, target_df], axis=1)

        train_features_df, test_features_df = train_test_split(features_df, test_size=0.2, \
            random_state=0)
        # NOTE(review): prefix matching would also capture any feature whose name starts
        # with the target name -- confirm generate_features never produces such names.
        target_columns = [col for col in features_df.columns \
            if str(col).startswith(str(target_column))]
        # The first target-derived column becomes the label to predict
        target_column = target_columns[0]
        other_target_columns = [col for col in target_columns if col != target_column]
        test_labels_df = test_features_df.loc[:, [target_column]]

        # The test features lose every target-derived column; the training frame
        # keeps only the chosen label.
        test_features_df = test_features_df.drop(target_columns, axis=1)
        train_features_df = train_features_df.drop(other_target_columns, axis=1)

        print('%s %d Features, Target: %s' % (dataset_name, train_features_df.shape[1]-1, target_column))

        # LeanML vs. No Feature Selection
        dataset_performance = perfs[dataset_name].copy()
        dataset_n_features = n_features[dataset_name].copy()
        # NOTE(review): both feature selection methods share this path -- confirm that
        # the two predictor classes do not overwrite each other's saved files.
        path = './cache/%s-%s-classification-benchmark.sav' % (model_name, dataset_name)

        for feature_selection_method in ['leanml', 'none']:
            # Training
            predictor = train_model(train_features_df, target_column, 'classification', learner_func, \
                feature_selection_method, path)
            n_selected_features = len(predictor.selected_variables)

            # Evaluation
            try:
                try:
                    test_predictions_df = predictor.predict(test_features_df)
                except Exception:
                    # Retry on rows without NAs. Features and labels are filtered
                    # together, BEFORE the retry, so that they stay aligned for the
                    # next feature selection method even if the retry fails too.
                    complete_rows = np.logical_not(test_features_df.isna().any(axis=1))
                    test_features_df = test_features_df.loc[complete_rows, :]
                    test_labels_df = test_labels_df.loc[complete_rows, :]
                    test_predictions_df = predictor.predict(test_features_df)
                perf = roc_auc_score(
                    test_labels_df[target_column].values,
                    test_predictions_df[target_column].values,
                    multi_class='ovr')
            except Exception:
                # Best effort: record a chance-level AUC rather than abort the whole benchmark.
                logging.exception('Somthing bad happened')
                perf = 0.5

            dataset_performance[feature_selection_method] = float(perf)
            dataset_n_features[feature_selection_method] = int(n_selected_features)

            print('%s, Feature Selection Method: %s --- AUC: %.2f, Number of Selected Features: %d' % (\
                dataset_name, feature_selection_method, perf, n_selected_features))

        # Persist after every dataset so that an interrupted run can be resumed.
        perfs[dataset_name] = dataset_performance.copy()
        with open(perfs_path, 'w') as f:
            json.dump(perfs, f)

        n_features[dataset_name] = dataset_n_features.copy()
        with open(n_features_path, 'w') as f:
            json.dump(n_features, f)

        with open(n_rows_path, 'w') as f:
            json.dump(n_rows, f)
            
            
        
def summarize_results():
    '''
    Print aggregate statistics and return a dataframe summarizing all datasets used.

    The cached AutoGluon results (number of features, performance, number of rows) are
    read from ``./cache/`` for the classification and regression benchmarks. A problem
    type whose cache files cannot be read is logged and skipped, and so is any dataset
    that lacks a ``'none'`` or ``'leanml'`` entry in the cached results.

    As a side effect, pandas' ``display.float_format`` option is set to two decimals.

    Returns
    -------
    pandas.DataFrame
        One row per dataset, sorted by number of candidate features, number of rows
        and problem type. Empty (but with all the columns) when no results are cached.
    '''
    print()
    model_name = 'autogluon'
    columns = ['Dataset', 'Number of Rows', 'Number of Candidate Features', \
               'Number of Features Selected', 'Performance (Full Model)', \
               'Performance (Compressed Model)', 'Problem Type', 'Source']
    methods = ('none', 'leanml')
    records = []

    for problem_type in ('classification', 'regression'):
        try:
            with open('./cache/%s_%s_benchmark_n_features.json' % (model_name, problem_type), 'r') as f:
                n_features = json.load(f)
            with open('./cache/%s_%s_benchmark_perfs.json' % (model_name, problem_type), 'r') as f:
                perfs = json.load(f)
            with open('./cache/%s_n_rows.json' % problem_type, 'r') as f:
                n_rows = json.load(f)
        except (OSError, ValueError):
            # Missing or unreadable cache: nothing to report for this problem type.
            logging.exception('Something bad happened')
            continue

        for dataset_name, n_dataset_rows in n_rows.items():
            feature_counts = n_features.get(dataset_name, {})
            scores = perfs.get(dataset_name, {})
            # The n_rows file is not model-specific, so it may list datasets for which
            # this model has no (or only partial) results.
            if not all(m in feature_counts and m in scores for m in methods):
                logging.warning('Skipping %s: incomplete cached results', dataset_name)
                continue

            records.append((
                dataset_name.replace('UCI', '').replace('Kaggle', ''),
                n_dataset_rows,
                feature_counts['none'],
                feature_counts['leanml'],
                scores['none'],
                scores['leanml'],
                problem_type,
                'UCI' if 'UCI' in dataset_name else 'Kaggle',
            ))

    df = pd.DataFrame(records, columns=columns)
    df = df.sort_values(by=['Number of Candidate Features', 'Number of Rows', 'Problem Type'])
    df.reset_index(drop=True, inplace=True)
    pd.set_option('display.float_format','{:.2f}'.format)

    if df.empty:
        # Averages over zero datasets are NaN, which cannot be formatted as integers.
        logging.warning('No cached benchmark results found; nothing to summarize')
        return df

    print('Avg. Performance Full Models: %.2f' % np.mean(df['Performance (Full Model)']))
    # Negative scores are clipped to 0 before averaging
    print('Avg. Performance Full Models (That Did Not Overfit): %.2f' % np.mean(\
                    np.maximum(df['Performance (Full Model)'], 0.0)))
    print('Avg. Performance Compressed Models: %.2f' % np.mean(df['Performance (Compressed Model)']))
    # Unweighted: average of the per-dataset fraction of features dropped
    print('Avg. Compression Rate: %d%s' % (int(np.round(100.*(1.-np.mean(1.*\
                    df['Number of Features Selected']/df['Number of Candidate Features'])))), '%'))
    # Weighted: total features dropped over total candidate features
    print('Weighted Avg. Compression Rate: %d%s' % (int(np.round(100.*(1.-np.sum(1.*\
                    df['Number of Features Selected']/np.sum(df['Number of Candidate Features']))))), '%'))

    return df


In [4]:
def autogluon_regression_benchmark():
    '''
    Benchmark AWS' AutoGluon on every regression dataset.

    An AutoGluon learner is obtained from ``get_autogluon_learner`` and handed to
    ``regression_benchmark`` under the model name ``'autogluon'``.
    '''
    learner = get_autogluon_learner(verbosity=0, problem_type='regression')
    regression_benchmark(learner, model_name='autogluon')


def autogluon_classification_benchmark():
    '''
    Benchmark AWS' AutoGluon on every classification dataset.

    An AutoGluon learner configured for binary problems is obtained from
    ``get_autogluon_learner`` and handed to ``classification_benchmark`` under the
    model name ``'autogluon'``.
    '''
    learner = get_autogluon_learner(verbosity=0, problem_type='binary')
    classification_benchmark(learner, model_name='autogluon')
    
    
def run_experiments():
    '''
    Run every AutoGluon experiment: regression datasets first, then classification.
    '''
    rule = '========================'
    # Banner, framed by a blank line above and below
    for banner_line in ('', rule, '    Model: AutoGluon    ', rule, ''):
        print(banner_line)

    print('Regression Datasets')
    autogluon_regression_benchmark()

    print('')
    print('Classification Datasets')
    autogluon_classification_benchmark()
    

## Run All Experiments

In [5]:
# Runs the regression benchmark, then the classification benchmark.
run_experiments()


    Model: AutoGluon    

Regression Datasets

Dataset: Abalone
UCIAbalone 38 Features, Target: Age
UCIAbalone, Feature Selection Method: leanml --- R-Squared: 0.58, Number of Selected Features: 5
UCIAbalone, Feature Selection Method: none --- R-Squared: 0.58, Number of Selected Features: 38

Dataset: AirFoil
UCIAirFoil 25 Features, Target: Sound Pressure
UCIAirFoil, Feature Selection Method: leanml --- R-Squared: 0.94, Number of Selected Features: 14
UCIAirFoil, Feature Selection Method: none --- R-Squared: 0.95, Number of Selected Features: 25

Dataset: AirQuality
UCIAirQuality 70 Features, Target: C6H6(GT)
UCIAirQuality, Feature Selection Method: leanml --- R-Squared: 1.00, Number of Selected Features: 2
UCIAirQuality, Feature Selection Method: none --- R-Squared: 1.00, Number of Selected Features: 70

Dataset: BikeSharing
UCIBikeSharing 90 Features, Target: cnt
UCIBikeSharing, Feature Selection Method: leanml --- R-Squared: 1.00, Number of Selected Features: 3
UCIBikeSharing, Feat

UCILetterRecognition, Feature Selection Method: none --- AUC: 0.99, Number of Selected Features: 80

Dataset: MagicGamma
UCIMagicGamma 50 Features, Target: 10_g
UCIMagicGamma, Feature Selection Method: leanml --- AUC: 0.86, Number of Selected Features: 13
UCIMagicGamma, Feature Selection Method: none --- AUC: 0.86, Number of Selected Features: 50

Dataset: SensorLessDrive
UCISensorLessDrive 240 Features, Target: 48_1.0
UCISensorLessDrive, Feature Selection Method: leanml --- AUC: 1.00, Number of Selected Features: 19
UCISensorLessDrive, Feature Selection Method: none --- AUC: 1.00, Number of Selected Features: 240

Dataset: Shuttle
UCIShuttle 45 Features, Target: 9_1
UCIShuttle, Feature Selection Method: leanml --- AUC: 1.00, Number of Selected Features: 4
UCIShuttle, Feature Selection Method: none --- AUC: 1.00, Number of Selected Features: 45

Dataset: SkinSegmentation
UCISkinSegmentation 15 Features, Target: y_1
UCISkinSegmentation, Feature Selection Method: leanml --- AUC: 1.00, Nu

## Visualize All Results

In [6]:
# Gather the cached benchmark results into one dataframe (aggregate statistics are printed along the way).
results_df = summarize_results()


Avg. Performance Full Models: 0.45
Avg. Performance Full Models (That Did Not Overfit): 0.82
Avg. Performance Compressed Models: 0.82
Avg. Compression Rate: 83%
Weighted Avg. Compression Rate: 95%


In [7]:
# Total number of features kept by LeanML vs. total number of candidate features, across all datasets.
print('Number of Selected Features: %d, Number of Candidate Features: %d' % 
      (results_df['Number of Features Selected'].sum(), results_df['Number of Candidate Features'].sum()))

Number of Selected Features: 540, Number of Candidate Features: 10229
