# Multioutput-Ensemble-Base Combinations for Ordering Contributing Factors

In [1]:
import numpy as np
import csv
import time
import pandas
import signal
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate

In [3]:
from sklearn.multioutput import ClassifierChain
from sklearn.multioutput import MultiOutputRegressor
from sklearn.multioutput import MultiOutputClassifier

from sklearn.ensemble.weight_boosting import AdaBoostClassifier
from sklearn.ensemble.weight_boosting import AdaBoostRegressor
from sklearn.ensemble.bagging import BaggingClassifier
from sklearn.ensemble.bagging import BaggingRegressor
from sklearn.ensemble.forest import ExtraTreesClassifier
from sklearn.ensemble.forest import ExtraTreesRegressor
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.ensemble.forest import RandomForestRegressor

from sklearn.linear_model.bayes import ARDRegression
from sklearn.linear_model.bayes import BayesianRidge
from sklearn.naive_bayes import BernoulliNB
from sklearn.cross_decomposition.cca_ import CCA
from sklearn.tree.tree import DecisionTreeClassifier
from sklearn.tree.tree import DecisionTreeRegressor
from sklearn.linear_model.coordinate_descent import ElasticNet
from sklearn.tree.tree import ExtraTreeClassifier
from sklearn.tree.tree import ExtraTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process.gpc import GaussianProcessClassifier
from sklearn.gaussian_process.gpr import GaussianProcessRegressor
from sklearn.linear_model.huber import HuberRegressor
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.neighbors.regression import KNeighborsRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.semi_supervised.label_propagation import LabelPropagation
from sklearn.semi_supervised.label_propagation import LabelSpreading
from sklearn.linear_model.least_angle import Lars
from sklearn.linear_model.coordinate_descent import Lasso
from sklearn.linear_model.least_angle import LassoLars
from sklearn.linear_model.least_angle import LassoLarsIC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model.base import LinearRegression
from sklearn.svm.classes import LinearSVC
from sklearn.svm.classes import LinearSVR
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.neural_network.multilayer_perceptron import MLPClassifier
from sklearn.neural_network.multilayer_perceptron import MLPRegressor
from sklearn.linear_model.coordinate_descent import MultiTaskElasticNet
from sklearn.linear_model.coordinate_descent import MultiTaskLasso
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.svm.classes import NuSVC
from sklearn.svm.classes import NuSVR
from sklearn.linear_model.omp import OrthogonalMatchingPursuit
from sklearn.cross_decomposition.pls_ import PLSCanonical
from sklearn.cross_decomposition.pls_ import PLSRegression
from sklearn.linear_model.passive_aggressive import PassiveAggressiveClassifier
from sklearn.linear_model.passive_aggressive import PassiveAggressiveRegressor
from sklearn.linear_model.perceptron import Perceptron
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model.ransac import RANSACRegressor
from sklearn.neighbors.classification import RadiusNeighborsClassifier
from sklearn.neighbors.regression import RadiusNeighborsRegressor
from sklearn.linear_model.ridge import Ridge
from sklearn.linear_model.ridge import RidgeClassifier
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn.linear_model.stochastic_gradient import SGDRegressor
from sklearn.svm.classes import SVC
from sklearn.svm.classes import SVR
from sklearn.linear_model.theil_sen import TheilSenRegressor

In [4]:
# Load the TF-IDF feature table; missing cells are replaced with empty strings.
tfidf = pandas.read_csv(
    '07 NM CF TFIDF.csv',
    delimiter=',',
    encoding='latin-1',
).fillna('')
# Preview the first three rows.
tfidf.head(3)

Unnamed: 0,____________________,_____________________,_____________________ follow,able,acceptable,access,accessory,accessory require,accidentally,accommodate,...,wrong ct,wrong info,wrong info mri,wrong patient,wrong pt,wrong set,wrong tattoo,xrt,xrt date,xrt pt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Load the one-hot encoded label table; missing cells are replaced with empty strings.
ohe = pandas.read_csv(
    '07 NM CF OHE.csv',
    delimiter=',',
    encoding='latin-1',
).fillna('')
# Preview the first three rows.
ohe.head(3)

Unnamed: 0,Failure to identify potential risks,Staff behaviour,Staff education or training inadequate,Equipment quality assurance and/or maintenance inadequate,Other,Organizational and/or workspace resources inadequate (excluding human resources),Expectation bias involving staff,Policies and/or procedures non-existent or inadequate,Policies and/or procedures not followed,Handoffs inadequate,...,Patient or family member medical condition preference or behaviour,"Equipment software or hardware commissioning, calibration or acceptance testing inadequate","Patient or family member medical condition, preference or behaviour",Communication or documentation inadequate (patient specific),Change management,"Equipment software or hardware design, including 'human factors' design, inadequate",Human resources inadequate,Unfamiliar treatment approach or radiation treatment technique,External factors beyond programmatic control,Patient education inadequate
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Candidate estimator classes, grouped by the role they play in a
# multioutput(ensemble(base())) combination.
multioutputs = [
    ClassifierChain,
    MultiOutputRegressor,
    MultiOutputClassifier,
]
ensembles = [
    AdaBoostClassifier,
    AdaBoostRegressor,
    BaggingClassifier,
    BaggingRegressor,
    ExtraTreesClassifier,
    ExtraTreesRegressor,
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
]
bases = [
    ARDRegression,
    BayesianRidge,
    BernoulliNB,
    CCA,
    DecisionTreeClassifier,
    DecisionTreeRegressor,
    ElasticNet,
    ExtraTreeClassifier,
    ExtraTreeRegressor,
    GaussianNB,
    GaussianProcessClassifier,
    GaussianProcessRegressor,
    HuberRegressor,
    KNeighborsClassifier,
    KNeighborsRegressor,
    KernelRidge,
    LabelPropagation,
    LabelSpreading,
    Lars,
    Lasso,
    LassoLars,
    LassoLarsIC,
    LinearDiscriminantAnalysis,
    LinearRegression,
    LinearSVC,
    LinearSVR,
    LogisticRegression,
    MLPClassifier,
    MLPRegressor,
    MultiTaskElasticNet,
    MultiTaskLasso,
    MultinomialNB,
    NearestCentroid,
    NuSVC,
    NuSVR,
    OrthogonalMatchingPursuit,
    PLSCanonical,
    PLSRegression,
    PassiveAggressiveClassifier,
    PassiveAggressiveRegressor,
    Perceptron,
    QuadraticDiscriminantAnalysis,
    RANSACRegressor,
    RadiusNeighborsClassifier,
    RadiusNeighborsRegressor,
    Ridge,
    RidgeClassifier,
    SGDClassifier,
    SGDRegressor,
    SVC,
    SVR,
    TheilSenRegressor,
]

In [7]:
# Load the earlier model-selection results and keep only the rows that use
# MultiOutputRegressor together with a non-empty ensemble and base name.
metas = pandas.read_csv(
    '11 PS MS.csv',
    delimiter=',',
    encoding='latin-1',
).fillna('')
metas = metas.loc[
    metas['Multioutput'].eq('MultiOutputRegressor')
    & metas['Ensemble'].ne('')
    & metas['Base'].ne('')
]
metas

Unnamed: 0,Multioutput,Ensemble,Base,Time,Score
0,MultiOutputRegressor,BaggingRegressor,KernelRidge,128.26,1.69
2,MultiOutputRegressor,BaggingRegressor,Ridge,87.49,1.69
13,MultiOutputRegressor,AdaBoostRegressor,Ridge,277.67,1.76
15,MultiOutputRegressor,AdaBoostRegressor,LinearSVR,130.54,1.77
18,MultiOutputRegressor,BaggingRegressor,PassiveAggressiveRegressor,64.53,1.82
21,MultiOutputRegressor,BaggingRegressor,ExtraTreeRegressor,79.21,1.84
26,MultiOutputRegressor,BaggingRegressor,Perceptron,33.29,1.84
28,MultiOutputRegressor,BaggingRegressor,LinearSVR,67.64,1.84
30,MultiOutputRegressor,BaggingRegressor,DecisionTreeClassifier,83.5,1.87
31,MultiOutputRegressor,BaggingRegressor,ExtraTreeClassifier,20.07,1.87


In [8]:
# This is a custom scoring function that implements the following:
# if a model were to order the labels in a drop-down list,
# how far down the list would the user need to look in order to find all the correct labels?
def lowest_correct(trues, preds):
    """Return how far down a score-ranked list the last correct label appears.

    Labels are ranked by descending predicted score (the reverse of
    ``np.argsort(preds)``). The result is the 1-based position of the
    lowest-ranked label whose true value equals 1, i.e. how many entries must
    be read to have seen every correct label.

    Parameters
    ----------
    trues : sequence
        Ground-truth indicators, one per label; entries equal to 1 mark the
        correct labels.
    preds : sequence of numbers
        Predicted scores, one per label, in the same order as ``trues``.

    Returns
    -------
    int
        1-based depth of the last correct label, or 0 when ``trues`` contains
        no correct label (nothing has to be looked up).
    """
    # Label indices ordered from highest to lowest predicted score.
    drop_down_options = np.argsort(preds)[::-1]
    correct_options = [i for i, true in enumerate(trues) if true == 1]

    # Without this guard max() of an empty list would raise ValueError and
    # abort the scoring of the whole model for a single unlabeled sample.
    if not correct_options:
        return 0

    # Inverse permutation: positions[label] is that label's zero-based rank,
    # which avoids one linear list.index() scan per correct label.
    positions = np.empty(len(drop_down_options), dtype=int)
    positions[drop_down_options] = np.arange(len(drop_down_options))
    return int(positions[correct_options].max()) + 1
def average_lowest_correct(list_of_trues, list_of_preds):
    """Average the ``lowest_correct`` depth over all samples.

    ``list_of_trues`` is read row by row through ``.iloc`` and
    ``list_of_preds`` through plain integer indexing; row ``i`` of each is
    converted to a list and scored with ``lowest_correct``. The mean of the
    per-sample depths is returned.
    """
    per_sample_depths = []
    for row_idx in range(len(list_of_trues)):
        true_row = list(list_of_trues.iloc[row_idx])
        pred_row = list(list_of_preds[row_idx])
        per_sample_depths.append(lowest_correct(true_row, pred_row))
    return np.mean(per_sample_depths)

In [9]:
def deadline(timeout, *args):
    """Build a decorator that aborts the wrapped call after ``timeout`` seconds.

    The limit is enforced by installing a ``SIGALRM`` handler and scheduling
    an alarm with ``signal.alarm`` right before the wrapped function runs.
    NOTE(review): this relies on ``signal.SIGALRM`` / ``signal.alarm``, which
    the Python docs describe as Unix-only and main-thread-only.

    Parameters
    ----------
    timeout : int
        Number of seconds passed to ``signal.alarm``.
    *args
        Ignored; accepted only so existing call sites keep working.

    Returns
    -------
    callable
        A decorator. The decorated function forwards its positional and
        keyword arguments to the original and returns its result.

    Raises
    ------
    TimeoutError
        Raised inside the wrapped call when the alarm fires. It is a subclass
        of ``Exception``, so callers using ``except Exception`` still catch it.
    """
    def decorate(f):
        def handler(signum, frame):
            raise TimeoutError(
                '{} exceeded the {} second deadline'.format(f.__name__, timeout)
            )

        def new_f(*args, **kwargs):
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(timeout)
            try:
                return f(*args, **kwargs)
            finally:
                # Cancel the pending alarm on every exit path. Previously this
                # call sat after the return statement and never ran, so the
                # alarm could fire later inside unrelated code.
                signal.alarm(0)

        new_f.__name__ = f.__name__
        new_f.__doc__ = f.__doc__
        return new_f
    return decorate

In [10]:
@deadline(500)
def cv_ensemble(multioutput, ensemble, base, xs, ys):
    """Cross-validate ``multioutput(ensemble(base()))`` on ``xs`` / ``ys``.

    Runs a 2-fold ``cross_validate`` scored with ``average_lowest_correct``
    and returns a five-item list: the three class names, the summed fit and
    score time rounded to two decimals, and the mean test score rounded to
    two decimals.
    """
    estimator = multioutput(ensemble(base()))
    scorer = make_scorer(average_lowest_correct)
    cv_results = cross_validate(
        estimator,
        xs,
        ys,
        return_train_score=True,
        scoring=scorer,
        n_jobs=-1,
        cv=2,
    )
    total_time = np.sum(cv_results['fit_time']) + np.sum(cv_results['score_time'])
    mean_test_score = np.mean(cv_results['test_score'])
    return [
        multioutput.__name__,
        ensemble.__name__,
        base.__name__,
        total_time.round(2),
        mean_test_score.round(2),
    ]

In [11]:
# Map each estimator class name to the class itself so that the names stored
# in the results CSV can be turned back into constructors.
all_models = [*multioutputs, *ensembles, *bases]
name_to_model = {model.__name__: model for model in all_models}

In [12]:
# Re-evaluate every shortlisted (multioutput, ensemble, base) combination and
# collect one result row per combination that finishes successfully.
result_columns = ['Multioutput', 'Ensemble', 'Base', 'Time', 'Score']
records = []
failures = []

for meta in metas.itertuples(index=False):
    # Read the estimator names by column name rather than by position.
    combo = (meta.Multioutput, meta.Ensemble, meta.Base)
    try:
        results = cv_ensemble(
            name_to_model[combo[0]],
            name_to_model[combo[1]],
            name_to_model[combo[2]],
            tfidf,
            ohe,
        )
    except Exception as error:
        # Best-effort sweep: a timeout or an incompatible combination must not
        # stop the run, but it is reported instead of being silently dropped.
        failures.append(combo + (repr(error),))
        print('FAILED', combo[0], combo[1], combo[2], repr(error))
        continue
    print(results[0], results[1], results[2], results[3], results[4])
    records.append(results)

# Build the table once from the collected rows instead of growing it cell by cell.
models = pandas.DataFrame(records, columns=result_columns)

MultiOutputRegressor BaggingRegressor KernelRidge 563.15 5.7
MultiOutputRegressor BaggingRegressor Ridge 263.4 5.36
MultiOutputRegressor AdaBoostRegressor Ridge 895.82 5.31
MultiOutputRegressor AdaBoostRegressor LinearSVR 323.79 5.61
MultiOutputRegressor BaggingRegressor PassiveAggressiveRegressor 148.48 7.45
MultiOutputRegressor BaggingRegressor ExtraTreeRegressor 249.46 6.93
MultiOutputRegressor BaggingRegressor LinearSVR 144.66 8.15
MultiOutputRegressor BaggingRegressor DecisionTreeClassifier 254.11 6.8
MultiOutputRegressor BaggingRegressor ExtraTreeClassifier 41.01 6.54
MultiOutputRegressor BaggingRegressor DecisionTreeRegressor 195.41 6.77
MultiOutputRegressor AdaBoostRegressor PassiveAggressiveRegressor 322.62 6.57
MultiOutputRegressor BaggingRegressor RidgeClassifier 210.85 9.07
MultiOutputRegressor BaggingRegressor BernoulliNB 132.56 8.63
MultiOutputRegressor BaggingRegressor SGDRegressor 73.98 5.36
MultiOutputRegressor BaggingRegressor OrthogonalMatchingPursuit 296.14 8.58
Mul

In [13]:
# Display the results table filled in by the evaluation loop.
models

Unnamed: 0,Multioutput,Ensemble,Base,Time,Score
0,MultiOutputRegressor,BaggingRegressor,KernelRidge,563.15,5.7
1,MultiOutputRegressor,BaggingRegressor,Ridge,263.4,5.36
2,MultiOutputRegressor,AdaBoostRegressor,Ridge,895.82,5.31
3,MultiOutputRegressor,AdaBoostRegressor,LinearSVR,323.79,5.61
4,MultiOutputRegressor,BaggingRegressor,PassiveAggressiveRegressor,148.48,7.45
5,MultiOutputRegressor,BaggingRegressor,ExtraTreeRegressor,249.46,6.93
6,MultiOutputRegressor,BaggingRegressor,LinearSVR,144.66,8.15
7,MultiOutputRegressor,BaggingRegressor,DecisionTreeClassifier,254.11,6.8
8,MultiOutputRegressor,BaggingRegressor,ExtraTreeClassifier,41.01,6.54
9,MultiOutputRegressor,BaggingRegressor,DecisionTreeRegressor,195.41,6.77


In [14]:
# Save the results table as CSV without the index column; non-numeric fields are quoted.
models.to_csv(
    '16 CF MEB.csv',
    index=False,
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC,
)