In [42]:
import json
import time
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from autosklearn.metrics import balanced_accuracy, precision, recall, f1
import autosklearn.classification

from smac.tae import StatusType

In [4]:
def generate_sample(df,n):
    pos = df.loc[df['aft_net_sign_helpful'] > 0].sample(n)
    neg = df.loc[df['aft_net_sign_helpful'] < 0].sample(n)
    sample = pos.append(neg)
    return sample

In [48]:
def results_to_table(cls):
    results = pd.DataFrame.from_dict(cls.cv_results_)
    params = results['params'].apply(pd.Series)
    results = pd.concat([results.drop(['params'],axis=1),params],axis=1)
    return results

In [45]:
def k_fold_train(features,lables):
    skf = StratifiedKFold(n_splits=k,shuffle=True)
    cls = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=120,
        per_run_time_limit=30,
        ensemble_size=0,
        scoring_functions=[balanced_accuracy, precision, recall, f1],
        resampling_strategy = StratifiedKFold,
        n_jobs = threads
    )
    cls.fit(features, labels)
    return cls

In [66]:
infile_path = '../datasets/vectorized/aft_vectorized_01-29-21.json'
sample_size = 10
k = 5
threads = 4

def main(cls=None):
    with open(infile_path,'r') as filestream:
        full_df = pd.DataFrame(json.load(filestream))
    sample_df = generate_sample(full_df,sample_size)
        
    features = pd.DataFrame(sample_df['feature_vector'].values.tolist()).to_numpy()
    labels = sample_df['aft_net_sign_helpful'].to_numpy()

    features_train, features_test, labels_train, labels_test = train_test_split(features, labels, random_state=1)

    if not cls:
        cls = k_fold_train(features, labels)
        
    #print(results_to_table(cls))
    
    print(['Numerical'] * np.shape(features)[1])

    
main(cls)

['Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical', 'Numerical']


In [38]:
results = pd.DataFrame.from_dict(cls.cv_results_)
params = pd.concat([results[['rank_test_scores','mean_test_score']],results['params'].apply(pd.Series)],axis=1).sort_values('rank_test_scores')
params

Unnamed: 0,rank_test_scores,mean_test_score,balancing:strategy,classifier:__choice__,data_preprocessing:categorical_transformer:categorical_encoding:__choice__,data_preprocessing:categorical_transformer:category_coalescence:__choice__,data_preprocessing:numerical_transformer:imputation:strategy,data_preprocessing:numerical_transformer:rescaling:__choice__,feature_preprocessor:__choice__,classifier:random_forest:bootstrap,...,classifier:adaboost:max_depth,classifier:adaboost:n_estimators,classifier:mlp:validation_fraction,classifier:passive_aggressive:C,classifier:passive_aggressive:average,classifier:passive_aggressive:fit_intercept,classifier:passive_aggressive:loss,classifier:passive_aggressive:tol,feature_preprocessor:kitchen_sinks:gamma,feature_preprocessor:kitchen_sinks:n_components
10,1,0.74,none,random_forest,no_encoding,no_coalescense,median,quantile_transformer,random_trees_embedding,True,...,,,,,,,,,,
13,2,0.62,weighting,mlp,one_hot_encoding,no_coalescense,most_frequent,standardize,feature_agglomeration,,...,,,,,,,,,,
8,2,0.62,none,k_nearest_neighbors,no_encoding,minority_coalescer,most_frequent,standardize,extra_trees_preproc_for_classification,,...,,,,,,,,,,
23,4,0.6,weighting,passive_aggressive,no_encoding,minority_coalescer,most_frequent,robust_scaler,kitchen_sinks,,...,,,,0.148332,True,True,squared_hinge,0.000165,0.000136,148.0
14,4,0.6,none,libsvm_svc,no_encoding,no_coalescense,median,standardize,feature_agglomeration,,...,,,,,,,,,,
12,4,0.6,none,gradient_boosting,no_encoding,no_coalescense,median,minmax,liblinear_svc_preprocessor,,...,,,,,,,,,,
6,7,0.6,weighting,random_forest,no_encoding,minority_coalescer,most_frequent,robust_scaler,extra_trees_preproc_for_classification,True,...,,,,,,,,,,
22,8,0.58,weighting,mlp,one_hot_encoding,no_coalescense,most_frequent,standardize,extra_trees_preproc_for_classification,,...,,,0.1,,,,,,,
5,8,0.58,weighting,random_forest,no_encoding,minority_coalescer,mean,quantile_transformer,extra_trees_preproc_for_classification,False,...,,,,,,,,,,
15,8,0.58,weighting,decision_tree,one_hot_encoding,no_coalescense,mean,normalize,no_preprocessing,,...,,,,,,,,,,


In [44]:
def get_runhistory_models_performance(automl):
    metric = cls.automl_._metric
    data = automl.automl_.runhistory_.data
    performance_list = []
    for run_key, run_value in data.items():
        if run_value.status != StatusType.SUCCESS:
            # Ignore crashed runs
            continue
        # Alternatively, it is possible to also obtain the start time with ``run_value.starttime``
        endtime = pd.Timestamp(time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime(run_value.endtime)))
        print(run_value.additional_info)
        val_score = metric._optimum - (metric._sign * run_value.cost)
        test_score = metric._optimum - (metric._sign * run_value.additional_info['test_loss'])
        train_score = metric._optimum - (metric._sign * run_value.additional_info['train_loss'])
        performance_list.append({
            'Timestamp': endtime,
            'single_best_optimization_score': val_score,
            'single_best_test_score': test_score,
            'single_best_train_score': train_score,
        })
    return pd.DataFrame(performance_list)

get_runhistory_models_performance(cls)

{'accuracy': 0.44000000000000006, 'balanced_accuracy': 0.43527777777777776, 'precision': 0.4163174603174603, 'recall': 0.5166666666666667, 'f1': 0.480952380952381, 'duration': 8.533705949783325, 'num_run': 2, 'train_loss': 0.039999999999999994, 'configuration_origin': 'Initial design'}


KeyError: 'test_loss'

In [26]:
resutls

NameError: name 'resutls' is not defined

In [56]:
pd.DataFrame.from_dict(cls.cv_results_).to_dict('records')

[{'mean_test_score': 0.5599999999999999,
  'metric_balanced_accuracy': 0.5647222222222222,
  'metric_precision': 0.5836825396825397,
  'metric_recall': 0.4833333333333333,
  'metric_f1': 0.519047619047619,
  'mean_fit_time': 11.139771938323975,
  'params': {'balancing:strategy': 'none',
   'classifier:__choice__': 'random_forest',
   'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding',
   'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer',
   'data_preprocessing:numerical_transformer:imputation:strategy': 'mean',
   'data_preprocessing:numerical_transformer:rescaling:__choice__': 'standardize',
   'feature_preprocessor:__choice__': 'no_preprocessing',
   'classifier:random_forest:bootstrap': 'True',
   'classifier:random_forest:criterion': 'gini',
   'classifier:random_forest:max_depth': 'None',
   'classifier:random_forest:max_features': 0.5,
   'classifier:random_forest:max_leaf_nodes': 'None

In [58]:
'../datasets/vectorized/aft_vectorized_01-29-21.json'.rsplit('.',1)

['../datasets/vectorized/aft_vectorized_01-29-21', 'json']