In [1]:
import sys
sys.path.append("../")
from ortho_lib3_Copy2 import *

filename = '../Pickle/def_exercises_sliced_transformed_data_all_categories.pickle'
data_dir = '..//sliced_transformed_data/'
category = ['Category_1', 'Category_2', 'Category_3', 'Category_4']
extype = ['AB', 'AF', 'RF', 'EL']

try:
    all_exercises = Exercises.load(filename)
except:
    dffs = create_dfframes(category, extype = extype, data_dir = data_dir, print_errors=False)
    all_exercises = dffs_to_exercises(dffs)
    all_exercises.dump(filename)

# Multivariate functions

Below are class- and function definitions for multivariate logistic regression in combination with the OR-ensemble method.




In [2]:
from sklearn.model_selection import train_test_split, StratifiedKFold, RepeatedStratifiedKFold, RepeatedKFold, KFold, LeaveOneOut, ShuffleSplit
from sklearn.metrics import multilabel_confusion_matrix
from itertools import groupby as g

class MultivariableExperiment(Experiment):
    
    def __init__(self, exercises, cols = None, class_weight=None):
        self.exercises = exercises
        self.df = exercises.df
        if cols is None:
            self.cols = self.df.columns
        else:
            self.cols = cols
            self.df = self.df[cols]
        
        # y values are category numbers
        self.y = [int(c[-1]) for c in exercises.y]
        self.ids = exercises.patients
    
    def keep_inliers(self, X, y, factor=1.1):
        X0 = X[y == 0]
        mean0 = np.mean(X0)
        if np.mean(X[y == 1]) < mean0:
            min0 = mean0 - factor * max(mean0 - X0)
            keep = ((y == 0)|(X < min0 ).flatten())
        else:
            max0 = mean0 + factor * max(X0 - mean0)
            keep = ((y == 0)|(X > max0).flatten())
        return X[keep], y[keep]
    
    def fit(self, X_train, y_train):
        """
        Fit a model using multinomial logistic regression.
        """
        model = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l2', max_iter=10000)
        model.fit(X_train, y_train)
        return model
    
    def fit_predict(self, X_train, y_train, X_valid):
        model = self.fit(X_train, y_train)
        return model.predict(X_valid)
    
    def fit_predict_p(self, X_train, y_train, X_valid):
        model = self.fit(X_train, y_train)
        return model.predict_proba(X_valid)

    
class MultivariableResults(Results):
    
    def store(self, p_id, feature, tp, tn, fp, fn):
        """
        todo: Add category number?
        """
        self[(p_id, feature)] = {'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn }
        
    def score(self, fs, y_valid, y_pred):
        """
        Score the predictions for multinomial logistic regression.
        
        This method aggregates the results per category to a single confusion matrix.
        """
        y_valid = np.array(y_valid)
        y_pred = np.array(y_pred)
        all_patients = np.array(exp.exercises.patients)
        
        cats = sorted(exp.exercises.categories)
        tp = []
        tn = []
        fp = []
        fn = []
        # iterate over categories
        for cat in cats:
            c = int(cat[-1])
            exs = exp.exercises.select_category(c)
            # get the y values for the patients in the current category
            y_cat_valid = y_valid[np.isin(all_patients, np.array(exs.patients))]
            y_cat_pred = y_pred[np.isin(all_patients, np.array(exs.patients))]
            
            # aggregate the confusion matrix with previous results
            tp = tp + list((y_cat_valid == c) & (y_cat_pred == c))
            tn = tn + list((y_cat_valid != c) & (y_cat_pred != c))
            fp = fp + list((y_cat_valid != c) & (y_cat_pred == c))
            fn = fn + list((y_cat_valid == c) & (y_cat_pred != c))
            
        # store the results to the Results object
        for id, *p in zip(self.exercises.patients, tp, tn, fp, fn):
            self.store( id, str(fs), *p)

    def score_per_category(self, fs, y_valid, y_pred):
        """
        Score the predictions for multinomial logistic regression.
        
        This method *should* create a confusion matrix per category label.
        """
        y_valid = np.array(y_valid)
        y_pred = np.array(y_pred)
        all_patients = np.array(exp.exercises.patients)
        
        cats = sorted(exp.exercises.categories)
        
        # init confusion dicts
        tp = {}
        tn = {}
        fp = {}
        fn = {}
        # iterate over category labels
        for cat in cats:
            c = int(cat[-1])
            exs = exp.exercises.select_category(c)
            
            # get y values for the patients in the current category
            y_cat_valid = y_valid[np.isin(all_patients, np.array(exs.patients))]
            y_cat_pred = y_pred[np.isin(all_patients, np.array(exs.patients))]
            y_not_cat_valid = y_valid[np.isin(all_patients, np.array(exs.patients), invert=True)]
            y_not_cat_pred = y_pred[np.isin(all_patients, np.array(exs.patients), invert=True)]
            
            tp[c] = y_cat_pred[y_cat_pred == c]
            
            print(f"""Category {c}:
y_valid in category {c}: {y_cat_valid}
y_pred in category {c}: {y_cat_pred}
y_valid not in category {c}: {y_not_cat_valid}
y_pred not in category {c}: {y_not_cat_pred} \n""")
            # store confusion per category
#             tp[c] = ((y_cat_valid == c) & (y_cat_pred == c))
#             tn[c] = ((y_cat_valid != c) & (y_cat_pred != c))
#             fp[c] = ((y_cat_valid != c) & (y_cat_pred == c))
#             fn[c] = ((y_cat_valid == c) & (y_cat_pred != c))
            
        return (tp, tn, fp, fn)

        
def most_common_element(l):
    return sorted(l, key=lambda e: l.count(e))[-1]

def keep_inliers(self, X, y, c, factor=1.1):
    X0 = X[y == c]
    mean0 = np.mean(X0)
    if np.mean(X[y != c]) < mean0:
        min0 = mean0 - factor * max(mean0 - X0)
        keep = ((y != c)|(X < min0 ).flatten())
    else:
        max0 = mean0 + factor * max(X0 - mean0)
        keep = ((y != 0)|(X > max0).flatten())
    return X[keep], y[keep]

def fit_inliers1(exp, feature, factor=1.1, splitter = 'loo', model = None):
    _split = model_selectors[splitter]
    
    fs = FeatureSet(feature)
    X = exp.X(fs).to_numpy()
    y = np.array(exp.y)
    
    y_pred = np.zeros(y.shape)
    if model is None:
        model = LogisticRegression(multi_class='multinomial', 
                                   solver='sag', 
                                   penalty='l2', 
                                   max_iter=10000, 
                                   warm_start=True)
    for train_index, test_index in _split.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_train_scaled, X_test_scaled = exp.scale(X_train, X_test)
        
        model.fit(X_train_scaled, y_train)
        
        y_pred[test_index] = model.predict(X_test_scaled)
    
        
    return y_pred, model

def fit_inliers_ensemble(exp, featureset, factor=1.1, results=None, name=None, splitter = 'loo'):
    y_pred = [fit_inliers1(exp, f, factor=factor, splitter=splitter) for f in featureset]
    y_pred = [most_common_element(list(y)) for y in zip(*y_pred)]
    return y_pred

In [3]:
from sklearn.metrics import *

exs = all_exercises
exp = MultivariableExperiment(exs)

In [4]:
model_selectors = {
    'loo': LeaveOneOut(),
    'kfold': KFold(n_splits=10),
    'skfold': StratifiedKFold(n_splits=10),
    'rkfold': RepeatedKFold(n_splits=10),
    'rskfold': RepeatedStratifiedKFold(n_splits=10),
}
C_param_range = [0.1,1,10,100]

param_combinations = {
    'lbfgs': ['l2'],
    'sag': ['l2'],
    'saga': ['l1', 'l2', 'elasticnet'],
    'newton-cg': ['l2']
}
    
ensemble_res = Results.load(f'../Pickle/Def_results_1_234/results_f1.7.pickle')

target_names = ['Category 1', 'Category 2', 'Category 3', 'Category 4']


In [5]:

results = {}
for splitter, obj in model_selectors.items():
    print(f'CV: {type(obj).__name__}')
    results[type(obj).__name__] = []
    for solver, penalties in param_combinations.items():
        for pen in penalties:
            for C in C_param_range:
                args = dict(
                    multi_class='multinomial', 
                    solver=solver, 
                    penalty=pen, 
                    max_iter=10000, 
                    C=C
                )
                if pen == 'elasticnet':
                    l1_ratio = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
                else:
                    l1_ratio = [None]
                for r in l1_ratio:
                    args['l1_ratio'] = r
                    model = LogisticRegression(**args)
                    try:
                        y_pred, model = fit_inliers1(exp, exp.cols, factor=1.1, splitter=splitter, model=model)
                    except Exception as e:
                        print(f'fitting with params (solver: {solver} | penalty: {pen} | C: {C}) FAILED\n',e)
                        continue

                    res = MultivariableResults(exp)

                    report = classification_report(exp.y, y_pred, 
                                                                output_dict=False, 
                                                                target_names=target_names
                                                               )
                    report_dict = classification_report(exp.y, y_pred, 
                                                                output_dict=True, 
                                                                target_names=target_names
                                                               )
                    results[type(obj).__name__].append({
                        'penalty': pen,
                        'C': C,
                        'report': report_dict,
                        'model': model,
                        'l1_ratio': r
                    })
                    print(f"""solver: {solver} | penalty: {pen} | C: {C} | l1_ratio: {r}
{report}""")
            print('-------------------------------------- CHANGING PENALTY')
                
        print('-------------------------------------- CHANGING SOLVER')
#     res.score(exp.cols, exp.y, y_pred)
#     print(type(obj), '\n', res.report())

CV: LeaveOneOut
solver: lbfgs | penalty: l2 | C: 0.1 | l1_ratio: None
              precision    recall  f1-score   support

  Category 1       0.76      0.96      0.85        23
  Category 2       0.54      0.66      0.59        32
  Category 3       0.53      0.34      0.42        29
  Category 4       0.80      0.70      0.74        23

    accuracy                           0.64       107
   macro avg       0.66      0.66      0.65       107
weighted avg       0.64      0.64      0.63       107

solver: lbfgs | penalty: l2 | C: 1 | l1_ratio: None
              precision    recall  f1-score   support

  Category 1       0.84      0.91      0.87        23
  Category 2       0.55      0.53      0.54        32
  Category 3       0.50      0.45      0.47        29
  Category 4       0.80      0.87      0.83        23

    accuracy                           0.66       107
   macro avg       0.67      0.69      0.68       107
weighted avg       0.65      0.66      0.66       107

solver: 

solver: saga | penalty: elasticnet | C: 0.1 | l1_ratio: 0.2
              precision    recall  f1-score   support

  Category 1       0.73      0.96      0.83        23
  Category 2       0.54      0.66      0.59        32
  Category 3       0.47      0.31      0.38        29
  Category 4       0.79      0.65      0.71        23

    accuracy                           0.63       107
   macro avg       0.63      0.64      0.63       107
weighted avg       0.62      0.63      0.61       107

solver: saga | penalty: elasticnet | C: 0.1 | l1_ratio: 0.4
              precision    recall  f1-score   support

  Category 1       0.70      0.91      0.79        23
  Category 2       0.47      0.53      0.50        32
  Category 3       0.36      0.28      0.31        29
  Category 4       0.79      0.65      0.71        23

    accuracy                           0.57       107
   macro avg       0.58      0.59      0.58       107
weighted avg       0.56      0.57      0.56       107

solver: sa

solver: saga | penalty: elasticnet | C: 100 | l1_ratio: 0
              precision    recall  f1-score   support

  Category 1       0.73      0.83      0.78        23
  Category 2       0.45      0.41      0.43        32
  Category 3       0.42      0.38      0.40        29
  Category 4       0.73      0.83      0.78        23

    accuracy                           0.58       107
   macro avg       0.58      0.61      0.59       107
weighted avg       0.56      0.58      0.57       107

solver: saga | penalty: elasticnet | C: 100 | l1_ratio: 0.2
              precision    recall  f1-score   support

  Category 1       0.73      0.83      0.78        23
  Category 2       0.45      0.41      0.43        32
  Category 3       0.42      0.38      0.40        29
  Category 4       0.73      0.83      0.78        23

    accuracy                           0.58       107
   macro avg       0.58      0.61      0.59       107
weighted avg       0.56      0.58      0.57       107

solver: saga

solver: sag | penalty: l2 | C: 10 | l1_ratio: None
              precision    recall  f1-score   support

  Category 1       0.72      0.78      0.75        23
  Category 2       0.42      0.47      0.44        32
  Category 3       0.41      0.38      0.39        29
  Category 4       0.63      0.52      0.57        23

    accuracy                           0.52       107
   macro avg       0.54      0.54      0.54       107
weighted avg       0.53      0.52      0.52       107

solver: sag | penalty: l2 | C: 100 | l1_ratio: None
              precision    recall  f1-score   support

  Category 1       0.69      0.78      0.73        23
  Category 2       0.40      0.44      0.42        32
  Category 3       0.39      0.38      0.39        29
  Category 4       0.61      0.48      0.54        23

    accuracy                           0.50       107
   macro avg       0.52      0.52      0.52       107
weighted avg       0.51      0.50      0.50       107

---------------------------

solver: saga | penalty: elasticnet | C: 1 | l1_ratio: 0.2
              precision    recall  f1-score   support

  Category 1       0.83      0.83      0.83        23
  Category 2       0.42      0.50      0.46        32
  Category 3       0.39      0.41      0.40        29
  Category 4       0.60      0.39      0.47        23

    accuracy                           0.52       107
   macro avg       0.56      0.53      0.54       107
weighted avg       0.54      0.52      0.52       107

solver: saga | penalty: elasticnet | C: 1 | l1_ratio: 0.4
              precision    recall  f1-score   support

  Category 1       0.86      0.78      0.82        23
  Category 2       0.46      0.56      0.51        32
  Category 3       0.39      0.41      0.40        29
  Category 4       0.69      0.48      0.56        23

    accuracy                           0.55       107
   macro avg       0.60      0.56      0.57       107
weighted avg       0.57      0.55      0.56       107

solver: saga |

solver: newton-cg | penalty: l2 | C: 1 | l1_ratio: None
              precision    recall  f1-score   support

  Category 1       0.77      0.74      0.76        23
  Category 2       0.38      0.50      0.43        32
  Category 3       0.38      0.38      0.38        29
  Category 4       0.57      0.35      0.43        23

    accuracy                           0.49       107
   macro avg       0.53      0.49      0.50       107
weighted avg       0.51      0.49      0.49       107

solver: newton-cg | penalty: l2 | C: 10 | l1_ratio: None
              precision    recall  f1-score   support

  Category 1       0.74      0.74      0.74        23
  Category 2       0.38      0.44      0.41        32
  Category 3       0.38      0.38      0.38        29
  Category 4       0.61      0.48      0.54        23

    accuracy                           0.50       107
   macro avg       0.53      0.51      0.52       107
weighted avg       0.51      0.50      0.50       107

solver: newton-cg

solver: saga | penalty: l2 | C: 1 | l1_ratio: None
              precision    recall  f1-score   support

  Category 1       0.88      0.91      0.89        23
  Category 2       0.50      0.47      0.48        32
  Category 3       0.46      0.45      0.46        29
  Category 4       0.76      0.83      0.79        23

    accuracy                           0.64       107
   macro avg       0.65      0.66      0.66       107
weighted avg       0.63      0.64      0.63       107

solver: saga | penalty: l2 | C: 10 | l1_ratio: None
              precision    recall  f1-score   support

  Category 1       0.75      0.91      0.82        23
  Category 2       0.50      0.41      0.45        32
  Category 3       0.48      0.45      0.46        29
  Category 4       0.73      0.83      0.78        23

    accuracy                           0.62       107
   macro avg       0.62      0.65      0.63       107
weighted avg       0.60      0.62      0.60       107

solver: saga | penalty: l2 

solver: saga | penalty: elasticnet | C: 10 | l1_ratio: 0.4
              precision    recall  f1-score   support

  Category 1       0.75      0.91      0.82        23
  Category 2       0.46      0.38      0.41        32
  Category 3       0.46      0.45      0.46        29
  Category 4       0.72      0.78      0.75        23

    accuracy                           0.60       107
   macro avg       0.60      0.63      0.61       107
weighted avg       0.58      0.60      0.59       107

solver: saga | penalty: elasticnet | C: 10 | l1_ratio: 0.6
              precision    recall  f1-score   support

  Category 1       0.78      0.91      0.84        23
  Category 2       0.44      0.38      0.41        32
  Category 3       0.44      0.41      0.43        29
  Category 4       0.69      0.78      0.73        23

    accuracy                           0.59       107
   macro avg       0.59      0.62      0.60       107
weighted avg       0.57      0.59      0.58       107

solver: saga

solver: lbfgs | penalty: l2 | C: 100 | l1_ratio: None
              precision    recall  f1-score   support

  Category 1       0.80      0.87      0.83        23
  Category 2       0.50      0.44      0.47        32
  Category 3       0.46      0.45      0.46        29
  Category 4       0.65      0.74      0.69        23

    accuracy                           0.60       107
   macro avg       0.60      0.62      0.61       107
weighted avg       0.59      0.60      0.59       107

-------------------------------------- CHANGING PENALTY
-------------------------------------- CHANGING SOLVER
solver: sag | penalty: l2 | C: 0.1 | l1_ratio: None
              precision    recall  f1-score   support

  Category 1       0.75      0.91      0.82        23
  Category 2       0.56      0.62      0.59        32
  Category 3       0.54      0.45      0.49        29
  Category 4       0.84      0.70      0.76        23

    accuracy                           0.65       107
   macro avg       0.6

solver: saga | penalty: elasticnet | C: 0.1 | l1_ratio: 0.8
              precision    recall  f1-score   support

  Category 1       0.64      0.61      0.62        23
  Category 2       0.41      0.59      0.49        32
  Category 3       0.30      0.24      0.27        29
  Category 4       0.81      0.57      0.67        23

    accuracy                           0.50       107
   macro avg       0.54      0.50      0.51       107
weighted avg       0.52      0.50      0.50       107

solver: saga | penalty: elasticnet | C: 0.1 | l1_ratio: 1.0
              precision    recall  f1-score   support

  Category 1       0.68      0.65      0.67        23
  Category 2       0.42      0.69      0.52        32
  Category 3       0.27      0.14      0.18        29
  Category 4       0.72      0.57      0.63        23

    accuracy                           0.50       107
   macro avg       0.52      0.51      0.50       107
weighted avg       0.50      0.50      0.49       107

solver: sa

solver: saga | penalty: elasticnet | C: 100 | l1_ratio: 0.6
              precision    recall  f1-score   support

  Category 1       0.76      0.83      0.79        23
  Category 2       0.50      0.47      0.48        32
  Category 3       0.46      0.41      0.44        29
  Category 4       0.69      0.78      0.73        23

    accuracy                           0.60       107
   macro avg       0.60      0.62      0.61       107
weighted avg       0.59      0.60      0.59       107

solver: saga | penalty: elasticnet | C: 100 | l1_ratio: 0.8
              precision    recall  f1-score   support

  Category 1       0.73      0.83      0.78        23
  Category 2       0.52      0.53      0.52        32
  Category 3       0.43      0.34      0.38        29
  Category 4       0.64      0.70      0.67        23

    accuracy                           0.58       107
   macro avg       0.58      0.60      0.59       107
weighted avg       0.57      0.58      0.57       107

solver: sa

solver: saga | penalty: l1 | C: 1 | l1_ratio: None
              precision    recall  f1-score   support

  Category 1       0.69      0.87      0.77        23
  Category 2       0.57      0.50      0.53        32
  Category 3       0.46      0.38      0.42        29
  Category 4       0.69      0.78      0.73        23

    accuracy                           0.61       107
   macro avg       0.60      0.63      0.61       107
weighted avg       0.59      0.61      0.60       107

solver: saga | penalty: l1 | C: 10 | l1_ratio: None
              precision    recall  f1-score   support

  Category 1       0.80      0.87      0.83        23
  Category 2       0.48      0.44      0.46        32
  Category 3       0.48      0.45      0.46        29
  Category 4       0.73      0.83      0.78        23

    accuracy                           0.62       107
   macro avg       0.62      0.65      0.63       107
weighted avg       0.60      0.62      0.61       107

solver: saga | penalty: l1 

solver: saga | penalty: elasticnet | C: 1 | l1_ratio: 0.8
              precision    recall  f1-score   support

  Category 1       0.79      0.96      0.86        23
  Category 2       0.55      0.50      0.52        32
  Category 3       0.42      0.38      0.40        29
  Category 4       0.75      0.78      0.77        23

    accuracy                           0.63       107
   macro avg       0.63      0.65      0.64       107
weighted avg       0.61      0.63      0.62       107

solver: saga | penalty: elasticnet | C: 1 | l1_ratio: 1.0
              precision    recall  f1-score   support

  Category 1       0.78      0.91      0.84        23
  Category 2       0.52      0.50      0.51        32
  Category 3       0.42      0.38      0.40        29
  Category 4       0.74      0.74      0.74        23

    accuracy                           0.61       107
   macro avg       0.61      0.63      0.62       107
weighted avg       0.60      0.61      0.60       107

solver: saga |

solver: newton-cg | penalty: l2 | C: 100 | l1_ratio: None
              precision    recall  f1-score   support

  Category 1       0.73      0.83      0.78        23
  Category 2       0.50      0.44      0.47        32
  Category 3       0.52      0.48      0.50        29
  Category 4       0.65      0.74      0.69        23

    accuracy                           0.60       107
   macro avg       0.60      0.62      0.61       107
weighted avg       0.59      0.60      0.59       107

-------------------------------------- CHANGING PENALTY
-------------------------------------- CHANGING SOLVER


In [6]:
file = 'dumps/results_dump_mlp_14_12.pickle'
with open(file, 'wb') as fout:
    pickle.dump(results, fout)

In [7]:
results

{'LeaveOneOut': [{'penalty': 'l2',
   'C': 0.1,
   'report': {'Category 1': {'precision': 0.7586206896551724,
     'recall': 0.9565217391304348,
     'f1-score': 0.8461538461538461,
     'support': 23},
    'Category 2': {'precision': 0.5384615384615384,
     'recall': 0.65625,
     'f1-score': 0.5915492957746479,
     'support': 32},
    'Category 3': {'precision': 0.5263157894736842,
     'recall': 0.3448275862068966,
     'f1-score': 0.4166666666666667,
     'support': 29},
    'Category 4': {'precision': 0.8,
     'recall': 0.6956521739130435,
     'f1-score': 0.7441860465116279,
     'support': 23},
    'accuracy': 0.6448598130841121,
    'macro avg': {'precision': 0.6558495043975987,
     'recall': 0.6633128748125937,
     'f1-score': 0.6496389637766972,
     'support': 107},
    'weighted avg': {'precision': 0.6387121774539722,
     'recall': 0.6448598130841121,
     'f1-score': 0.631689049807738,
     'support': 107}},
   'model': LogisticRegression(C=0.1, max_iter=10000, multi

In [8]:
sums = []
for cv, ress in results.items():
    l = [(cv, res['penalty'], res['C'], res['l1_ratio'], res['report']['macro avg']['precision'] + res['report']['macro avg']['recall'], res['model'], res['report']) for res in ress]
    sums.append(sorted(l, key=lambda z: z[4], reverse=True))
    
best_params = [s[0:3] for s in sums]

In [9]:
best_params

[[('LeaveOneOut',
   'l2',
   1,
   None,
   1.3626304136238332,
   LogisticRegression(C=1, max_iter=10000, multi_class='multinomial'),
   {'Category 1': {'precision': 0.84,
     'recall': 0.9130434782608695,
     'f1-score': 0.8749999999999999,
     'support': 23},
    'Category 2': {'precision': 0.5483870967741935,
     'recall': 0.53125,
     'f1-score': 0.5396825396825397,
     'support': 32},
    'Category 3': {'precision': 0.5,
     'recall': 0.4482758620689655,
     'f1-score': 0.4727272727272727,
     'support': 29},
    'Category 4': {'precision': 0.8,
     'recall': 0.8695652173913043,
     'f1-score': 0.8333333333333333,
     'support': 23},
    'accuracy': 0.6635514018691588,
    'macro avg': {'precision': 0.6720967741935484,
     'recall': 0.6905336394302849,
     'f1-score': 0.6801857864357863,
     'support': 107},
    'weighted avg': {'precision': 0.6520410009044317,
     'recall': 0.6635514018691588,
     'f1-score': 0.6567345686504564,
     'support': 107}}),
  ('Leav