In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, Imputer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.model_selection._search import RandomizedSearchCV
from sklearn.feature_selection.variance_threshold import VarianceThreshold
from sklearn.ensemble.weight_boosting import AdaBoostClassifier
from sklearn.tree.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
%matplotlib inline  
%load_ext autotime

In [7]:
def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a parameter search.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        (array-likes supporting element-wise ``==``) and 'params'
        (a sequence indexed the same way).
    n_top : int, default 3
        Highest rank to print; ranks 1..n_top are reported in order.
        Candidates tied on a rank are all printed.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    for rank in range(1, n_top + 1):
        # Positions of every candidate holding this rank (ties give several)
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")


time: 18 ms


In [8]:
# Name of the column holding the class label to predict
target_column = 'A15'

# Read the raw dataset from the CSV file in the working directory
df_australian = pd.read_csv("phpelnJ6y.csv")

time: 28.3 ms


In [9]:
# Consider a variable nominal if it has fewer than 20 unique values (cutoff open to change).
# nunique() ignores missing values, so NaNs do not inflate the distinct count.
# DataFrame.items() is used because iteritems() was removed in pandas 2.0.
nominal_vars = [name for name, value in df_australian.items()
                if name != target_column and value.nunique() < 20]

# Columns stored with object dtype are treated as categorical as well
object_vars = [name for name, dtype in df_australian.dtypes.items()
               if dtype == object and name != target_column]

# Combine both groups. A column can qualify under both rules, so de-duplicate
# while keeping the DataFrame's own column order. A list is passed to
# get_dummies rather than a set: set iteration order is arbitrary (unstable
# dummy-column order between runs) and pandas 2.x rejects sets as indexers.
nominal_vars.extend(object_vars)
selected = set(nominal_vars)
factors = [name for name in df_australian.columns if name in selected]

# One-hot encode each categorical variable as 0 or 1.
# NOTE: this overwrites df_australian, so re-running this cell without
# reloading the data would encode the generated dummy columns again.
df_australian = pd.get_dummies(df_australian, columns=factors)

time: 114 ms


In [10]:
# Predictor columns: every column except the target, in DataFrame order.
# Filtering on the name (instead of seeding X with columns[0]) guarantees the
# target can never end up in X, even when it is the first column of the frame.
feature_columns = [name for name in df_australian.columns if name != target_column]

# Stack the predictor columns side by side into a 2-D array in a single call
# (one allocation, rather than re-copying X for every column in a loop).
X = np.column_stack([np.asarray(df_australian[name]) for name in feature_columns])

# Get target variable into proper format: class labels encoded as integers
le = LabelEncoder()
y = le.fit_transform(df_australian[target_column])

time: 33.9 ms


In [19]:
# Pipeline stages, listed in the order they are applied to the data
estimators = [
    ('imputer', Imputer()),
    ('reduce_dim', PCA()),
    ('clf', AdaBoostClassifier()),
]

# Chain the stages into a single estimator
pipe = Pipeline(steps=estimators)

# Display the names of all parameters the pipeline exposes
# (these are the keys usable in a parameter search)
pipe.get_params().keys()

dict_keys(['steps', 'imputer', 'reduce_dim', 'clf', 'imputer__axis', 'imputer__copy', 'imputer__missing_values', 'imputer__strategy', 'imputer__verbose', 'reduce_dim__copy', 'reduce_dim__iterated_power', 'reduce_dim__n_components', 'reduce_dim__random_state', 'reduce_dim__svd_solver', 'reduce_dim__tol', 'reduce_dim__whiten', 'clf__algorithm', 'clf__base_estimator', 'clf__learning_rate', 'clf__n_estimators', 'clf__random_state'])

time: 12 ms


In [13]:
# Number of predictor variables (columns of the 2-D feature matrix X).
# X.shape[1] is used instead of len(X[1]) so a single-row X does not fail.
num_feat = X.shape[1]

# Candidate PCA sizes: 2 up to (but excluding) num_feat, stepping by roughly a
# quarter of the feature count. The step is clamped to at least 1 because
# int(num_feat/4) is 0 when there are fewer than 4 features, and np.arange
# cannot use a zero step.
n_components_step = max(1, num_feat // 4)
n_components_grid = np.arange(2, num_feat, n_components_step)
if n_components_grid.size == 0:
    # Two or fewer features leave the range empty; fall back to one valid size
    # so the search still has a value to sample.
    n_components_grid = np.array([min(2, num_feat)])

# Parameters to search over
param_dist = {'clf__base_estimator': [DecisionTreeClassifier(), RandomForestClassifier(),
                                      GradientBoostingClassifier(), ExtraTreesClassifier()],
              'clf__learning_rate': [.0001, .001, .01],
              'clf__n_estimators': [10, 50, 100, 250, 500],
              'reduce_dim__n_components': n_components_grid}

time: 6.96 ms


In [18]:
# Initialize random search over the pipeline and parameter distributions.
# random_state fixes WHICH parameter combinations are sampled, so a re-run
# evaluates the same n_iter candidates. The estimators inside the pipeline are
# still unseeded, so individual scores can vary slightly between runs.
n_iter = 10
ran_search = RandomizedSearchCV(estimator=pipe, param_distributions=param_dist,
                                n_iter=n_iter, random_state=0)

# Fit random search to data
ran_search.fit(X, y)

# See results of search (five best-ranked candidates)
report(ran_search.cv_results_, n_top=5)

Model with rank: 1
Mean validation score: 0.814 (std: 0.027)
Parameters: {'reduce_dim__n_components': 12, 'clf__n_estimators': 500, 'clf__learning_rate': 0.0001, 'clf__base_estimator': ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)}

Model with rank: 2
Mean validation score: 0.633 (std: 0.024)
Parameters: {'reduce_dim__n_components': 2, 'clf__n_estimators': 10, 'clf__learning_rate': 0.01, 'clf__base_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fractio

In [11]:
# Distribution of flows per dataset (to see which ones have pipelines)

time: 1.16 ms


In [None]:
# Count the rows recorded for each flow in the blood-transfusion results export.
# The directory is spelled "results/" (lowercase) like the other result cells
# in this notebook; the mixed-case "Results/" only resolves on
# case-insensitive filesystems.
flows = pd.read_csv("results/OpenML Supervised Classification on blood-transfusion-service-center.csv")
ff = flows.groupby('Flow')["Flow"].count()
ff

In [4]:
# Load the credit-g results export and count its rows per flow
flows = pd.read_csv(
    "results/OpenML Supervised Classification on credit-g.csv"
)
ff = (
    flows
    .groupby('Flow')["Flow"]
    .count()
)
ff

time: 1.88 ms


In [2]:
# Load the Australian results export and count its rows per flow
flows = pd.read_csv(
    "results/OpenML Supervised Classification on Australian.csv"
)
ff = (
    flows
    .groupby('Flow')["Flow"]
    .count()
)
ff

time: 294 ms
