### TODO
Note that the comparison in the end is to be made between the assembler using voting rules
and the accuracy of the grid search model battery.

In [284]:
# load packages
import os
import json
import joblib

from corankco.dataset import Dataset
from corankco.scoringscheme import ScoringScheme
from corankco.kemrankagg import KemRankAgg
from corankco.algorithms.enumeration import Algorithm

import random 
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from itertools import cycle

from numba import jit
from sklearn.preprocessing import LabelBinarizer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.model_selection import learning_curve, ShuffleSplit
from joblib import dump, load

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import ComplementNB, BernoulliNB 
from sklearn.metrics import roc_auc_score, accuracy_score , classification_report, ConfusionMatrixDisplay,precision_score,recall_score, f1_score,roc_auc_score,roc_curve
from sklearn import metrics

from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import LearningCurveDisplay

import warnings
warnings.filterwarnings('ignore')

from IPython.display import display
pd.set_option('display.max_columns', None)

# FITTING PIPELINE

In [3]:
write = True

def fit_models(X, Y, X_t, Y_t, target_names, models, params, run="", folder="", gs=1):
    
    if folder == "":
        raise Exception("Please provide a folder...")
        
    print("Fitting for parameters...")
    print("X:", X)
    print("-"*50)
    print("Y:", Y)
    print("-"*50)
    print("X_t:", X_t)
    print("-"*50)
    print("Y_t:", Y_t)
    print("-"*50)
    print("models:", models)
    print("-"*50)
    print("params:", params)
    print("-"*50)
    print("run:", run)
    print("-"*50)
    print("folder:", folder)    
            
    @jit(target_backend='cuda')
    def results(X,Y,model_name, model, data_type):
        model_accuracy = accuracy_score(Y, model.predict(X)) 
        model_f1 = f1_score(Y, model.predict(X), average='weighted') 
        model_precision = precision_score(Y, model.predict(X),average='weighted') 
        model_recall = recall_score(Y, model.predict(X),average='weighted')

        print('Model performance for ' + data_type + ' ' + model_name)
        print("- Accuracy: {:.4f}".format(model_accuracy))
        print('- F1 score: {:4f}'.format(model_f1))
        print('- Precision: {:4f}'.format(model_precision))
        print('- Recall: {:4f}'.format(model_recall))

        print('----------------------------------')
        print('='*35)
        
        if write:
            if os.path.isfile("./models/" + folder + "/summaries/" + model_name + ".txt"):
                f = open("./models/" + folder + "/summaries/" + model_name + ".txt", 'a')
            else:
                f = open("./models/" + folder + "/summaries/" + model_name + ".txt", 'w')

            f.write("Model performance for " + data_type + " " + model_name + "\n")
            f.write("- Accuracy: " + str(round(model_accuracy,4)) + "\n")
            f.write("- F1 score: " + str(round(model_f1,4)) + "\n")
            f.write("- Precision: " + str(round(model_precision,4)) + "\n")
            f.write("- Recall: " + str(round(model_recall,4)) + "\n")

            f.write('----------------------------------\n')
            f.write('='*35 + "\n")
            f.close()
    
    @jit(target_backend='cuda')
    def fit(model, X, Y):

        model.fit(X,Y)
        return model

    def plots(model, X, Y, X_t, Y_t, name):
        label_binarizer = LabelBinarizer().fit(Y)
        y_onehot_test = label_binarizer.transform(Y_t)
        y_score = model.predict_proba(X_t)
        n_classes = len(np.unique(Y))
        
        colors = plt.cm.get_cmap('tab10', n_classes)

        # Plot ROC curve for each class
        for class_id in range(n_classes):
            if n_classes > 2:  # Multiclass classification
                y_onehot = np.zeros(y_onehot_test.shape[0])
                y_onehot[y_onehot_test[:, class_id] == 1] = 1
            else:  # Binary classification
                y_onehot = y_onehot_test[:, 0]

            RocCurveDisplay.from_predictions(
                y_onehot,
                y_score[:, class_id],
                name=f"ROC curve for {target_names[class_id]}",
                color=colors(class_id),
                ax=ax[0][1]
            )
        
#         colors = cycle(["aqua", "darkorange", "cornflowerblue"])
#         for class_id, color in zip(range(n_classes), colors):
#             RocCurveDisplay.from_predictions(
#                 y_onehot_test[:, class_id],
#                 y_score[:, class_id],
#                 name=f"ROC curve for {target_names[class_id]}",                    # SPECIFIC TO IRIS
#                 color=color,
#                 ax=ax[0][1],
#             )
        ax[0][1].plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")
        ax[0][1].set_xlabel("False Positive Rate")
        ax[0][1].set_ylabel("True Positive Rate")
        ax[0][1].set_title("Micro-averaged One-vs-Rest\nReceiver Operating Characteristic")

        ConfusionMatrixDisplay.from_estimator(model, X, Y, ax=ax[1][0])
        _ = ax[1][0].set_title(
            f"Confusion Matrix on train set for {name}"
        )

        ConfusionMatrixDisplay.from_estimator(model, X_t, Y_t, ax=ax[1][1])
        _ = ax[1][1].set_title(
            f"Confusion Matrix on test set for {name}"
        )

    params_plot = {
        "X": X,
        "y": Y,
        "score_type": "both",
        "n_jobs": -2,
        "line_kw": {"marker": "o"},
        "std_display_style": "fill_between",
        "score_name": "accuracy"
    }
    
    dic_models = {}
    models_ensemble = []
    for i in range(len(list(models))):
        models_ensemble.append((list(models.keys())[i], list(models.values())[i]))
        dic_models[list(models.keys())[i]] = list(models.values())[i]
        
    for i in range(len(list(models))):
        print(list(models.keys())[i] + "\n")
        print("#"*50)

        fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
        
        if gs:
            # FOR TRAINING MODEL BATTERY USING GRIDSEARCH             
            model = make_pipeline(StandardScaler(), GridSearchCV(list(models.values())[i], list(params.values())[i],
                                                                 scoring='accuracy', refit=True, n_jobs=-2))
        else:
            # FOR TRAINING MODEL BATTERY WITHOUT GRIDSEARCH            
            model = make_pipeline(StandardScaler(), list(models.values())[i])

        LearningCurveDisplay.from_estimator(model, **params_plot, ax=ax[0][0])
        handles, label = ax[0][0].get_legend_handles_labels()
        ax[0][0].legend(handles[:2], ["Training Score", "Test Score"])
        ax[0][0].set_title(f"Learning Curve for {list(models.keys())[i]}")

        model = fit(model, X, Y)
        if write:
            dump(model, "./models/" + folder + "/" + run + "_" + list(models.keys())[i] + ".joblib")

        plots(model, X, Y, X_t, Y_t, list(models.keys())[i])

        fig.tight_layout()
        if write:
            plt.savefig("./models/" + folder +  "/result_images/" + run + "_" + list(models.keys())[i] + ".png", bbox_inches='tight')
            plt.close()
        plt.show()

        results(X,Y, list(models.keys())[i] + "_" + run, model, "Training set")
        results(X_t,Y_t, list(models.keys())[i] + "_" + run, model, "Test set")
        saved_models.append(model)
        
    print("################################################")
    print("################### ENSEMBLE ###################")

    dic_models["ensemble"] = VotingClassifier(models_ensemble, voting='soft', n_jobs=-2, verbose=True)

    fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))

    LearningCurveDisplay.from_estimator(dic_models["ensemble"], **params_plot, ax=ax[0][0])
    handles, label = ax[0][0].get_legend_handles_labels()
    ax[0][0].legend(handles[:2], ["Training Score", "Test Score"])
    ax[0][0].set_title(f"Learning Curve for ENSEMBLE")

    dic_models["ensemble"] = fit(dic_models["ensemble"], X, Y)

    results(X, Y, "ENSEMBLE"  + "_" + run, dic_models["ensemble"], "train set")
    results(X_t, Y_t, "ENSEMBLE"  + "_" + run, dic_models["ensemble"], "test set")
    
    plots(dic_models["ensemble"], X, Y, X_t, Y_t, "ENSEMBLE")
    if write:
        dump(dic_models["ensemble"], "./models/" + folder + "/" + run + "_" + "ensembler.joblib")

    fig.tight_layout()
    if write:
        plt.savefig("./models/" + folder + "/result_images/" + run + "_" + "ensembler.png", bbox_inches='tight')
        plt.close()

In [42]:
models = {
    "Logistic Regression"      :LogisticRegression(),
    "Decision Tree"            :DecisionTreeClassifier(),
    "Random Forest"            :RandomForestClassifier(random_state=42),
    "Gradient Boosting"        :GradientBoostingClassifier(),
    "K-Nearest Neighbors"      :KNeighborsClassifier(),
    "SVM"                      :svm.SVC(probability=True)
}

params = {
    "Logistic Regression"     :{
        'solver': ['newton-cholesky', 'liblinear', 'newton-cg', 'sag', 'saga'],
        'penalty':['l2'],
        'C':[0.001, 0.01, 0.1, 1, 100],
        'max_iter':[10000],
    },
    "Decision Tree"            :{
        'criterion':['gini','entropy'],
        'max_depth':[4,6,8,10,15,50,90]
    },
    "Random Forest"            :{ 
        'n_estimators': [200, 500],
        'max_features': [None, 'sqrt', 'log2'],
        'max_depth' : [4,5,6,7,8],
        'criterion' :['gini', 'entropy']
    },
    "Gradient Boosting"        :{
        "loss":["log_loss"],
        "learning_rate": [0.1, 0.01],
        "max_depth":[3,5,8],
        "max_features":["log2","sqrt"],
        "criterion": ["friedman_mse", "squared_error"],
        "subsample":[0.5, 0.8, 1.0],
        "n_estimators":[10, 100, 1000]
    },
    "K-Nearest Neighbors"      :{
         'weights': ['uniform', 'distance'],'leaf_size': [5, 15, 25]
    },
    "SVM"                      :{
    "kernel":['rbf'], "C":[0.1, 1, 10, 100, 1000], "degree":[1,5], 
        'gamma': [1, 0.1, 0.01, 0.001]
    }
}

# FITTING THE MODEL BATTERY

# WINE DATASET

In [46]:
data_wine=pd.read_csv('./data/wine.data',names=['Cultivars','Alcohol','Malic acid','Ash','Alcalinity of ash',
'Magnesium','Total Phenols','Flavanoids','Nonflavanoid phenols',
'Proanthocyanins','Color intensity','Hue','OD280/OD315 of diluted wines','Proline'])
data_wine_target_names = np.array(['1','2','3'])
print(data_wine)
x=data_wine.drop(['Cultivars'], axis=1)
y=data_wine['Cultivars']

     Cultivars  Alcohol  Malic acid   Ash  Alcalinity of ash  Magnesium  \
0            1    14.23        1.71  2.43               15.6        127   
1            1    13.20        1.78  2.14               11.2        100   
2            1    13.16        2.36  2.67               18.6        101   
3            1    14.37        1.95  2.50               16.8        113   
4            1    13.24        2.59  2.87               21.0        118   
..         ...      ...         ...   ...                ...        ...   
173          3    13.71        5.65  2.45               20.5         95   
174          3    13.40        3.91  2.48               23.0        102   
175          3    13.27        4.28  2.26               20.0        120   
176          3    13.17        2.59  2.37               20.0        120   
177          3    14.13        4.10  2.74               24.5         96   

     Total Phenols  Flavanoids  Nonflavanoid phenols  Proanthocyanins  \
0             2.80        

In [47]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

In [48]:
# GRIDSEARCH
fit_models(X_train, y_train, X_test, y_test, data_wine_target_names, models, params, "wine_Data_gridsearch", "experiment_wine_gridsearch",1)

Fitting for parameters...
X:      Alcohol  Malic acid   Ash  Alcalinity of ash  Magnesium  Total Phenols  \
158    14.34        1.68  2.70               25.0         98           2.80   
137    12.53        5.51  2.64               25.0         96           1.79   
98     12.37        1.07  2.10               18.5         88           3.52   
159    13.48        1.67  2.64               22.5         89           2.60   
38     13.07        1.50  2.10               15.5         98           2.40   
..       ...         ...   ...                ...        ...            ...   
71     13.86        1.51  2.67               25.0         86           2.95   
106    12.25        1.73  2.12               19.0         80           1.65   
14     14.38        1.87  2.38               12.0        102           3.30   
92     12.69        1.53  2.26               20.7         80           1.38   
102    12.34        2.45  2.46               21.0         98           2.56   

     Flavanoids  Nonfl



Model performance for Training set Logistic Regression_wine_Data_gridsearch
- Accuracy: 0.9930
- F1 score: 0.992970
- Precision: 0.993130
- Recall: 0.992958
----------------------------------
Model performance for Test set Logistic Regression_wine_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Decision Tree

##################################################




Model performance for Training set Decision Tree_wine_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set Decision Tree_wine_Data_gridsearch
- Accuracy: 0.9167
- F1 score: 0.909382
- Precision: 0.925463
- Recall: 0.916667
----------------------------------
Random Forest

##################################################




Model performance for Training set Random Forest_wine_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set Random Forest_wine_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Gradient Boosting

##################################################




Model performance for Training set Gradient Boosting_wine_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set Gradient Boosting_wine_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
K-Nearest Neighbors

##################################################




Model performance for Training set K-Nearest Neighbors_wine_Data_gridsearch
- Accuracy: 0.9859
- F1 score: 0.985872
- Precision: 0.986240
- Recall: 0.985915
----------------------------------
Model performance for Test set K-Nearest Neighbors_wine_Data_gridsearch
- Accuracy: 0.9444
- F1 score: 0.943604
- Precision: 0.949383
- Recall: 0.944444
----------------------------------
SVM

##################################################




Model performance for Training set SVM_wine_Data_gridsearch
- Accuracy: 0.9789
- F1 score: 0.978912
- Precision: 0.979418
- Recall: 0.978873
----------------------------------
Model performance for Test set SVM_wine_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
################################################
################### ENSEMBLE ###################


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model performance for train set ENSEMBLE_wine_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for test set ENSEMBLE_wine_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------


In [50]:
# NO GRIDSEARCH
fit_models(X_train, y_train, X_test, y_test, data_wine_target_names, models, params, "wine_Data_no_gridsearch", "experiment_wine_no_gridsearch",0)

Fitting for parameters...
X:      Alcohol  Malic acid   Ash  Alcalinity of ash  Magnesium  Total Phenols  \
158    14.34        1.68  2.70               25.0         98           2.80   
137    12.53        5.51  2.64               25.0         96           1.79   
98     12.37        1.07  2.10               18.5         88           3.52   
159    13.48        1.67  2.64               22.5         89           2.60   
38     13.07        1.50  2.10               15.5         98           2.40   
..       ...         ...   ...                ...        ...            ...   
71     13.86        1.51  2.67               25.0         86           2.95   
106    12.25        1.73  2.12               19.0         80           1.65   
14     14.38        1.87  2.38               12.0        102           3.30   
92     12.69        1.53  2.26               20.7         80           1.38   
102    12.34        2.45  2.46               21.0         98           2.56   

     Flavanoids  Nonfl

Model performance for Training set Logistic Regression_wine_Data_no_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set Logistic Regression_wine_Data_no_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Decision Tree

##################################################
Model performance for Training set Decision Tree_wine_Data_no_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set Decision Tree_wine_Data_no_gridsearch
- Accuracy: 0.9444
- F1 score: 0.943997
- Precision: 0.946296
- Recall: 0.944444
----------------------------------
Random Forest

##################################################
Model performance for Training set Random Forest_wine_Data_no_gridsearch
- Accuracy: 1.0000
- F1 score: 1.00

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model performance for train set ENSEMBLE_wine_Data_no_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for test set ENSEMBLE_wine_Data_no_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------


# HEART DATASET

In [67]:
data_heart=pd.read_csv('./data/heart.data',names=['age','sex','chest pain type (4 values)', 'resting blood pressure', 'serum cholesterol in mg/dl', 'fasting blood sugar > 120 mg/dl', 'resting electrocardiographic results (values 0,1,2)', 'maximum heart rate achieved', 'exercise induced angina','oldpeak = ST depression induced by exercise relative to rest', 'the slope of the peak exercise ST segment','number of major vessels (0-3) colored by flourosopy','thal: 3 = normal; 6 = fixed defect; 7 = reversable defect', 'absence = 1, presence = 2'], delim_whitespace=True)
# print(data_heart)
data_heart_target_names = np.array(['Absence','Presence'])
x=data_heart.drop(['absence = 1, presence = 2'], axis=1)
y=data_heart['absence = 1, presence = 2']

In [68]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

In [69]:
# GRIDSEARCH
fit_models(X_train, y_train, X_test, y_test,data_heart_target_names, models, params, "heart_Data_gridsearch", "experiment_heart_gridsearch",1)

Fitting for parameters...
X:       age  sex  chest pain type (4 values)  resting blood pressure  \
115  49.0  0.0                         2.0                   134.0   
33   59.0  1.0                         4.0                   170.0   
184  53.0  1.0                         3.0                   130.0   
142  50.0  1.0                         3.0                   140.0   
197  54.0  0.0                         3.0                   110.0   
..    ...  ...                         ...                     ...   
20   67.0  1.0                         4.0                   120.0   
188  62.0  0.0                         4.0                   140.0   
71   57.0  0.0                         4.0                   120.0   
106  51.0  1.0                         3.0                   100.0   
102  49.0  0.0                         4.0                   130.0   

     serum cholesterol in mg/dl  fasting blood sugar > 120 mg/dl  \
115                       271.0                              0

Model performance for Training set Logistic Regression_heart_Data_gridsearch
- Accuracy: 0.8426
- F1 score: 0.842593
- Precision: 0.842593
- Recall: 0.842593
----------------------------------
Model performance for Test set Logistic Regression_heart_Data_gridsearch
- Accuracy: 0.8704
- F1 score: 0.866176
- Precision: 0.879203
- Recall: 0.870370
----------------------------------
Decision Tree

##################################################
Model performance for Training set Decision Tree_heart_Data_gridsearch
- Accuracy: 0.8935
- F1 score: 0.892362
- Precision: 0.900207
- Recall: 0.893519
----------------------------------
Model performance for Test set Decision Tree_heart_Data_gridsearch
- Accuracy: 0.8148
- F1 score: 0.807099
- Precision: 0.822792
- Recall: 0.814815
----------------------------------
Random Forest

##################################################
Model performance for Training set Random Forest_heart_Data_gridsearch
- Accuracy: 0.9398
- F1 score: 0.939665
- Pre

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model performance for train set ENSEMBLE_heart_Data_gridsearch
- Accuracy: 0.9907
- F1 score: 0.990741
- Precision: 0.990741
- Recall: 0.990741
----------------------------------
Model performance for test set ENSEMBLE_heart_Data_gridsearch
- Accuracy: 0.8519
- F1 score: 0.850327
- Precision: 0.851295
- Recall: 0.851852
----------------------------------


In [70]:
# NO GRIDSEARCH
fit_models(X_train, y_train, X_test, y_test, data_heart_target_names, models, params, "heart_Data_no_gridsearch", "experiment_heart_no_gridsearch",0)

Fitting for parameters...
X:       age  sex  chest pain type (4 values)  resting blood pressure  \
115  49.0  0.0                         2.0                   134.0   
33   59.0  1.0                         4.0                   170.0   
184  53.0  1.0                         3.0                   130.0   
142  50.0  1.0                         3.0                   140.0   
197  54.0  0.0                         3.0                   110.0   
..    ...  ...                         ...                     ...   
20   67.0  1.0                         4.0                   120.0   
188  62.0  0.0                         4.0                   140.0   
71   57.0  0.0                         4.0                   120.0   
106  51.0  1.0                         3.0                   100.0   
102  49.0  0.0                         4.0                   130.0   

     serum cholesterol in mg/dl  fasting blood sugar > 120 mg/dl  \
115                       271.0                              0

Model performance for Training set Logistic Regression_heart_Data_no_gridsearch
- Accuracy: 0.8380
- F1 score: 0.837742
- Precision: 0.837847
- Recall: 0.837963
----------------------------------
Model performance for Test set Logistic Regression_heart_Data_no_gridsearch
- Accuracy: 0.9074
- F1 score: 0.906969
- Precision: 0.907190
- Recall: 0.907407
----------------------------------
Decision Tree

##################################################
Model performance for Training set Decision Tree_heart_Data_no_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set Decision Tree_heart_Data_no_gridsearch
- Accuracy: 0.6852
- F1 score: 0.689028
- Precision: 0.704518
- Recall: 0.685185
----------------------------------
Random Forest

##################################################
Model performance for Training set Random Forest_heart_Data_no_gridsearch
- Accuracy: 1.0000
- F1 score:

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model performance for train set ENSEMBLE_heart_Data_no_gridsearch
- Accuracy: 0.9907
- F1 score: 0.990741
- Precision: 0.990741
- Recall: 0.990741
----------------------------------
Model performance for test set ENSEMBLE_heart_Data_no_gridsearch
- Accuracy: 0.8333
- F1 score: 0.832545
- Precision: 0.832353
- Recall: 0.833333
----------------------------------


# LYMPHOGRAPHY DATASET

In [72]:
data_lymphography=pd.read_csv('./data/lymphography.data',names=['class: normal find, metastases, malign lymph, fibrosis','lymphatics: normal, arched, deformed, displaced','block of affere: no, yes','bl. of lymph. c: no, yes','bl. of lymph. s: no, yes','by pass: no, yes','extravasates: no, yes', 'regeneration of: no, yes', 'early uptake in: no, yes', 'lym.nodes dimin: 0-3', 'lym.nodes enlar: 1-4', 'changes in lym.: bean, oval, round', 'defect in node: no, lacunar, lac. marginal, lac. central', 'changes in node: no, lacunar, lac. margin, lac. central', 'changes in stru: no, grainy, drop-like, coarse, diluted, reticular, stripped, faint', 'special forms: no, chalices, vesicles', 'dislocation of: no, yes', 'exclusion of no: no, yes', 'no. of nodes in: 0-9, 10-19, 20-29, 30-39, 40-49, 50-59, 60-69, >=70'])
print(data_lymphography)
data_lymphography_target_names = np.array(['normal find', 'metastases', 'malign lymph', 'fibrosis'])
x=data_lymphography.drop(['class: normal find, metastases, malign lymph, fibrosis'], axis=1)
y=data_lymphography['class: normal find, metastases, malign lymph, fibrosis']

     class: normal find, metastases, malign lymph, fibrosis  \
0                                                    3        
1                                                    2        
2                                                    3        
3                                                    3        
4                                                    2        
..                                                 ...        
143                                                  3        
144                                                  2        
145                                                  3        
146                                                  2        
147                                                  2        

     lymphatics: normal, arched, deformed, displaced  \
0                                                  4   
1                                                  3   
2                                                  3   
3                  

In [73]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

In [74]:
# GRIDSEARCH
fit_models(X_train, y_train, X_test, y_test,data_lymphography_target_names, models, params, "lymphography_Data_gridsearch", "experiment_lymphography_gridsearch", 1)

Fitting for parameters...
X:      lymphatics: normal, arched, deformed, displaced  \
136                                                1   
45                                                 2   
119                                                4   
27                                                 4   
4                                                  3   
..                                               ...   
71                                                 3   
106                                                3   
14                                                 3   
92                                                 3   
102                                                2   

     block of affere: no, yes  bl. of lymph. c: no, yes  \
136                         1                         1   
45                          1                         1   
119                         2                         1   
27                          1                         1   
4  



Model performance for Training set Logistic Regression_lymphography_Data_gridsearch
- Accuracy: 0.8729
- F1 score: 0.863975
- Precision: 0.863526
- Recall: 0.872881
----------------------------------
Model performance for Test set Logistic Regression_lymphography_Data_gridsearch
- Accuracy: 0.9000
- F1 score: 0.896169
- Precision: 0.904444
- Recall: 0.900000
----------------------------------
Decision Tree

##################################################




Model performance for Training set Decision Tree_lymphography_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set Decision Tree_lymphography_Data_gridsearch
- Accuracy: 0.7000
- F1 score: 0.687901
- Precision: 0.679808
- Recall: 0.700000
----------------------------------
Random Forest

##################################################




Model performance for Training set Random Forest_lymphography_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set Random Forest_lymphography_Data_gridsearch
- Accuracy: 0.8333
- F1 score: 0.804598
- Precision: 0.777778
- Recall: 0.833333
----------------------------------
Gradient Boosting

##################################################




Model performance for Training set Gradient Boosting_lymphography_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set Gradient Boosting_lymphography_Data_gridsearch
- Accuracy: 0.9000
- F1 score: 0.899872
- Precision: 0.901880
- Recall: 0.900000
----------------------------------
K-Nearest Neighbors

##################################################




Model performance for Training set K-Nearest Neighbors_lymphography_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set K-Nearest Neighbors_lymphography_Data_gridsearch
- Accuracy: 0.8333
- F1 score: 0.833120
- Precision: 0.834872
- Recall: 0.833333
----------------------------------
SVM

##################################################




Model performance for Training set SVM_lymphography_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set SVM_lymphography_Data_gridsearch
- Accuracy: 0.8333
- F1 score: 0.833120
- Precision: 0.834872
- Recall: 0.833333
----------------------------------
################################################
################### ENSEMBLE ###################


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model performance for train set ENSEMBLE_lymphography_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for test set ENSEMBLE_lymphography_Data_gridsearch
- Accuracy: 0.8667
- F1 score: 0.862835
- Precision: 0.871111
- Recall: 0.866667
----------------------------------


In [75]:
# NO GRIDSEARCH
fit_models(X_train, y_train, X_test, y_test,data_lymphography_target_names, models, params, "lymphography_Data_no_gridsearch", "experiment_lymphography_no_gridsearch", 0)


Fitting for parameters...
X:      lymphatics: normal, arched, deformed, displaced  \
136                                                1   
45                                                 2   
119                                                4   
27                                                 4   
4                                                  3   
..                                               ...   
71                                                 3   
106                                                3   
14                                                 3   
92                                                 3   
102                                                2   

     block of affere: no, yes  bl. of lymph. c: no, yes  \
136                         1                         1   
45                          1                         1   
119                         2                         1   
27                          1                         1   
4  

Model performance for Training set Logistic Regression_lymphography_Data_no_gridsearch
- Accuracy: 0.8898
- F1 score: 0.889221
- Precision: 0.889847
- Recall: 0.889831
----------------------------------
Model performance for Test set Logistic Regression_lymphography_Data_no_gridsearch
- Accuracy: 0.8333
- F1 score: 0.833120
- Precision: 0.834872
- Recall: 0.833333
----------------------------------
Decision Tree

##################################################
Model performance for Training set Decision Tree_lymphography_Data_no_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set Decision Tree_lymphography_Data_no_gridsearch
- Accuracy: 0.6667
- F1 score: 0.654260
- Precision: 0.651961
- Recall: 0.666667
----------------------------------
Random Forest

##################################################
Model performance for Training set Random Forest_lymphography_Data_no_gridse

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model performance for train set ENSEMBLE_lymphography_Data_no_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for test set ENSEMBLE_lymphography_Data_no_gridsearch
- Accuracy: 0.8667
- F1 score: 0.862835
- Precision: 0.871111
- Recall: 0.866667
----------------------------------


# GLASS DATASET

In [76]:
data_glass=pd.read_csv('./data/glass.data',names=['Id number: 1 to 214', 'RI: refractive index', 'Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)', 'Mg: Magnesium', 'Al: Aluminum', 'Si: Silicon','K: Potassium', 'Ca: Calcium','Ba: Barium','Fe: Iron','Type of glass: (class attribute)'])
print(data_glass)
data_glass_target_names = np.array(['1 building_windows_float_processed', '2 building_windows_non_float_processed', '3 vehicle_windows_float_processed','4 vehicle_windows_non_float_processed', '5 containers', '6 tableware', '7 headlamps'])
x=data_glass.drop(['Type of glass: (class attribute)'], axis=1)
y=data_glass['Type of glass: (class attribute)']

     Id number: 1 to 214  RI: refractive index  \
0                      1               1.52101   
1                      2               1.51761   
2                      3               1.51618   
3                      4               1.51766   
4                      5               1.51742   
..                   ...                   ...   
209                  210               1.51623   
210                  211               1.51685   
211                  212               1.52065   
212                  213               1.51651   
213                  214               1.51711   

     Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)  \
0                                                13.64                                              
1                                                13.89                                              
2                                                13.53                                          

In [77]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

In [79]:
# GRIDSEARCH
fit_models(X_train, y_train, X_test, y_test,data_glass_target_names, models, params, "glass_Data_gridsearch", "experiment_glass_gridsearch",1)

Fitting for parameters...
X:      Id number: 1 to 214  RI: refractive index  \
79                    80               1.51590   
161                  162               1.51934   
109                  110               1.51818   
127                  128               1.52081   
95                    96               1.51860   
..                   ...                   ...   
106                  107               1.53125   
14                    15               1.51763   
92                    93               1.51588   
179                  180               1.51852   
102                  103               1.51820   

     Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)  \
79                                               12.82                                              
161                                              13.64                                              
109                                              13.72             



Model performance for Training set Logistic Regression_glass_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set Logistic Regression_glass_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Decision Tree

##################################################




Model performance for Training set Decision Tree_glass_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set Decision Tree_glass_Data_gridsearch
- Accuracy: 0.9767
- F1 score: 0.976591
- Precision: 0.978295
- Recall: 0.976744
----------------------------------
Random Forest

##################################################




Model performance for Training set Random Forest_glass_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set Random Forest_glass_Data_gridsearch
- Accuracy: 0.9535
- F1 score: 0.953336
- Precision: 0.960853
- Recall: 0.953488
----------------------------------
Gradient Boosting

##################################################




Model performance for Training set Gradient Boosting_glass_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set Gradient Boosting_glass_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
K-Nearest Neighbors

##################################################




Model performance for Training set K-Nearest Neighbors_glass_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set K-Nearest Neighbors_glass_Data_gridsearch
- Accuracy: 0.9070
- F1 score: 0.900506
- Precision: 0.917636
- Recall: 0.906977
----------------------------------
SVM

##################################################




Model performance for Training set SVM_glass_Data_gridsearch
- Accuracy: 0.9942
- F1 score: 0.994067
- Precision: 0.994245
- Recall: 0.994152
----------------------------------
Model performance for Test set SVM_glass_Data_gridsearch
- Accuracy: 0.9767
- F1 score: 0.975711
- Precision: 0.981395
- Recall: 0.976744
----------------------------------
################################################
################### ENSEMBLE ###################


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model performance for train set ENSEMBLE_glass_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for test set ENSEMBLE_glass_Data_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------


In [80]:
# NO GRIDSEARCH
fit_models(X_train, y_train, X_test, y_test,data_glass_target_names, models, params, "glass_Data_no_gridsearch", "experiment_glass_no_gridsearch",0)

Fitting for parameters...
X:      Id number: 1 to 214  RI: refractive index  \
79                    80               1.51590   
161                  162               1.51934   
109                  110               1.51818   
127                  128               1.52081   
95                    96               1.51860   
..                   ...                   ...   
106                  107               1.53125   
14                    15               1.51763   
92                    93               1.51588   
179                  180               1.51852   
102                  103               1.51820   

     Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)  \
79                                               12.82                                              
161                                              13.64                                              
109                                              13.72             

Model performance for Training set Logistic Regression_glass_Data_no_gridsearch
- Accuracy: 0.9766
- F1 score: 0.976281
- Precision: 0.978026
- Recall: 0.976608
----------------------------------
Model performance for Test set Logistic Regression_glass_Data_no_gridsearch
- Accuracy: 0.9302
- F1 score: 0.926155
- Precision: 0.938630
- Recall: 0.930233
----------------------------------
Decision Tree

##################################################
Model performance for Training set Decision Tree_glass_Data_no_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set Decision Tree_glass_Data_no_gridsearch
- Accuracy: 0.9767
- F1 score: 0.976591
- Precision: 0.978295
- Recall: 0.976744
----------------------------------
Random Forest

##################################################
Model performance for Training set Random Forest_glass_Data_no_gridsearch
- Accuracy: 1.0000
- F1 score:

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model performance for train set ENSEMBLE_glass_Data_no_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for test set ENSEMBLE_glass_Data_no_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
[Voting] ............ (2 of 6) Processing Decision Tree, total=   0.0s
[Voting] ...... (5 of 6) Processing K-Nearest Neighbors, total=   0.0s
[Voting] ...................... (6 of 6) Processing SVM, total=   0.0s
[Voting] ...... (1 of 6) Processing Logistic Regression, total=   0.1s
[Voting] ............ (3 of 6) Processing Random Forest, total=   0.2s
[Voting] ........ (4 of 6) Processing Gradient Boosting, total=   0.5s
[Voting] ............ (2 of 6) Processing Decision Tree, total=   0.0s
[Voting] ...... (1 of 6) Processing Logistic Regression, total=   0.0s
[Voting] ...... (5 of 6) Processing K-Nearest Neighbors, total=   0.0

[Voting] ............ (2 of 6) Processing Decision Tree, total=   0.0s
[Voting] ...... (5 of 6) Processing K-Nearest Neighbors, total=   0.0s
[Voting] ...................... (6 of 6) Processing SVM, total=   0.0s
[Voting] ...... (1 of 6) Processing Logistic Regression, total=   0.1s
[Voting] ............ (3 of 6) Processing Random Forest, total=   0.2s
[Voting] ........ (4 of 6) Processing Gradient Boosting, total=   0.5s
[Voting] ............ (2 of 6) Processing Decision Tree, total=   0.0s
[Voting] ...... (5 of 6) Processing K-Nearest Neighbors, total=   0.0s
[Voting] ...................... (6 of 6) Processing SVM, total=   0.0s
[Voting] ...... (1 of 6) Processing Logistic Regression, total=   0.1s
[Voting] ............ (3 of 6) Processing Random Forest, total=   0.2s
[Voting] ........ (4 of 6) Processing Gradient Boosting, total=   0.5s
[Voting] ............ (2 of 6) Processing Decision Tree, total=   0.0s
[Voting] ...................... (6 of 6) Processing SVM, total=   0.0s
[Votin

[Voting] ............ (2 of 6) Processing Decision Tree, total=   0.0s
[Voting] ...... (5 of 6) Processing K-Nearest Neighbors, total=   0.0s
[Voting] ...................... (6 of 6) Processing SVM, total=   0.0s
[Voting] ...... (1 of 6) Processing Logistic Regression, total=   0.1s
[Voting] ............ (3 of 6) Processing Random Forest, total=   0.3s
[Voting] ........ (4 of 6) Processing Gradient Boosting, total=   0.3s
[Voting] ...... (5 of 6) Processing K-Nearest Neighbors, total=   0.0s
[Voting] ...................... (6 of 6) Processing SVM, total=   0.0s
[Voting] ............ (2 of 6) Processing Decision Tree, total=   0.0s
[Voting] ...... (1 of 6) Processing Logistic Regression, total=   0.1s
[Voting] ............ (3 of 6) Processing Random Forest, total=   0.2s
[Voting] ........ (4 of 6) Processing Gradient Boosting, total=   0.4s
[Voting] ............ (2 of 6) Processing Decision Tree, total=   0.0s
[Voting] ...... (5 of 6) Processing K-Nearest Neighbors, total=   0.0s
[Votin

## IRIS DATASET

In [20]:
iris = load_iris()
target_names = iris.target_names
X, y = iris.data, iris.target

# X = df[0]
# y = df[1]
y = np.asarray(y)
y = y.reshape(-1,)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [11]:
# GRIDSEARCH
fit_models(X_train, y_train, X_test, y_test, iris.target_names, models, params, "iris_Data", "experiment_iris_gridsearch", 1)

Fitting for parameters...
X: [[4.6 3.6 1.  0.2]
 [5.7 4.4 1.5 0.4]
 [6.7 3.1 4.4 1.4]
 [4.8 3.4 1.6 0.2]
 [4.4 3.2 1.3 0.2]
 [6.3 2.5 5.  1.9]
 [6.4 3.2 4.5 1.5]
 [5.2 3.5 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.2 4.1 1.5 0.1]
 [5.8 2.7 5.1 1.9]
 [6.  3.4 4.5 1.6]
 [6.7 3.1 4.7 1.5]
 [5.4 3.9 1.3 0.4]
 [5.4 3.7 1.5 0.2]
 [5.5 2.4 3.7 1. ]
 [6.3 2.8 5.1 1.5]
 [6.4 3.1 5.5 1.8]
 [6.6 3.  4.4 1.4]
 [7.2 3.6 6.1 2.5]
 [5.7 2.9 4.2 1.3]
 [7.6 3.  6.6 2.1]
 [5.6 3.  4.5 1.5]
 [5.1 3.5 1.4 0.2]
 [7.7 2.8 6.7 2. ]
 [5.8 2.7 4.1 1. ]
 [5.2 3.4 1.4 0.2]
 [5.  3.5 1.3 0.3]
 [5.1 3.8 1.9 0.4]
 [5.  2.  3.5 1. ]
 [6.3 2.7 4.9 1.8]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.1 3.3 1.7 0.5]
 [5.6 2.7 4.2 1.3]
 [5.1 3.4 1.5 0.2]
 [5.7 3.  4.2 1.2]
 [7.7 3.8 6.7 2.2]
 [4.6 3.2 1.4 0.2]
 [6.2 2.9 4.3 1.3]
 [5.7 2.5 5.  2. ]
 [5.5 4.2 1.4 0.2]
 [6.  3.  4.8 1.8]
 [5.8 2.7 5.1 1.9]
 [6.  2.2 4.  1. ]
 [5.4 3.  4.5 1.5]
 [6.2 3.4 5.4 2.3]
 [5.5 2.3 4.  1.3]
 [5.4 3.9 1.7 0.4]
 [5.  2.3 3.3 1. ]
 [6.4 2.7 5.3 1.9]
 [



Model performance for Training set Logistic Regression_iris_Data
- Accuracy: 0.9667
- F1 score: 0.966667
- Precision: 0.967459
- Recall: 0.966667
----------------------------------
Model performance for Test set Logistic Regression_iris_Data
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Decision Tree

##################################################




Model performance for Training set Decision Tree_iris_Data
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set Decision Tree_iris_Data
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Random Forest

##################################################




Model performance for Training set Random Forest_iris_Data
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set Random Forest_iris_Data
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Gradient Boosting

##################################################




Model performance for Training set Gradient Boosting_iris_Data
- Accuracy: 0.9750
- F1 score: 0.974988
- Precision: 0.975178
- Recall: 0.975000
----------------------------------
Model performance for Test set Gradient Boosting_iris_Data
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
K-Nearest Neighbors

##################################################




Model performance for Training set K-Nearest Neighbors_iris_Data
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for Test set K-Nearest Neighbors_iris_Data
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
SVM

##################################################




Model performance for Training set SVM_iris_Data
- Accuracy: 0.9667
- F1 score: 0.966625
- Precision: 0.969767
- Recall: 0.966667
----------------------------------
Model performance for Test set SVM_iris_Data
- Accuracy: 0.9667
- F1 score: 0.966411
- Precision: 0.969444
- Recall: 0.966667
----------------------------------
################################################
################### ENSEMBLE ###################


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model performance for train set ENSEMBLE_iris_Data
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for test set ENSEMBLE_iris_Data
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------


In [12]:
# NO GRIDSEARCH
fit_models(X_train, y_train, X_test, y_test, iris.target_names, models, params, "iris_Data_no_gridsearch", "experiment_iris_no_gridsearch", 0)

Fitting for parameters...
X: [[4.6 3.6 1.  0.2]
 [5.7 4.4 1.5 0.4]
 [6.7 3.1 4.4 1.4]
 [4.8 3.4 1.6 0.2]
 [4.4 3.2 1.3 0.2]
 [6.3 2.5 5.  1.9]
 [6.4 3.2 4.5 1.5]
 [5.2 3.5 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.2 4.1 1.5 0.1]
 [5.8 2.7 5.1 1.9]
 [6.  3.4 4.5 1.6]
 [6.7 3.1 4.7 1.5]
 [5.4 3.9 1.3 0.4]
 [5.4 3.7 1.5 0.2]
 [5.5 2.4 3.7 1. ]
 [6.3 2.8 5.1 1.5]
 [6.4 3.1 5.5 1.8]
 [6.6 3.  4.4 1.4]
 [7.2 3.6 6.1 2.5]
 [5.7 2.9 4.2 1.3]
 [7.6 3.  6.6 2.1]
 [5.6 3.  4.5 1.5]
 [5.1 3.5 1.4 0.2]
 [7.7 2.8 6.7 2. ]
 [5.8 2.7 4.1 1. ]
 [5.2 3.4 1.4 0.2]
 [5.  3.5 1.3 0.3]
 [5.1 3.8 1.9 0.4]
 [5.  2.  3.5 1. ]
 [6.3 2.7 4.9 1.8]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.1 3.3 1.7 0.5]
 [5.6 2.7 4.2 1.3]
 [5.1 3.4 1.5 0.2]
 [5.7 3.  4.2 1.2]
 [7.7 3.8 6.7 2.2]
 [4.6 3.2 1.4 0.2]
 [6.2 2.9 4.3 1.3]
 [5.7 2.5 5.  2. ]
 [5.5 4.2 1.4 0.2]
 [6.  3.  4.8 1.8]
 [5.8 2.7 5.1 1.9]
 [6.  2.2 4.  1. ]
 [5.4 3.  4.5 1.5]
 [6.2 3.4 5.4 2.3]
 [5.5 2.3 4.  1.3]
 [5.4 3.9 1.7 0.4]
 [5.  2.3 3.3 1. ]
 [6.4 2.7 5.3 1.9]
 [

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model performance for train set ENSEMBLE_iris_Data_no_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------
Model performance for test set ENSEMBLE_iris_Data_no_gridsearch
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
----------------------------------


# DEPLOYING THE VOTING SYSTEM

In [237]:
def get_best_model_index(l):
    best_i = 0
    best_i_val = 0
    for i in range(len(l)):
        if l[i] > best_i_val:
            best_i_val = l[i]
            best_i = i
        if l[i] == best_i_val:
            best_i = random.choice([best_i, i])
            best_i_val = l[best_i]
    return best_i
        
def sortingPref(preference):
    return np.argsort(preference)[::-1][:]

def copeland(preferences):
    ncandidates=len(preferences[0][0])
    scores=np.zeros((len(preferences),ncandidates), dtype="i2")
    n_model=len(preferences[0])

    '''
    For each preference in the list of preferences
    '''
    for l in range(len(preferences)):
        comparison = np.zeros((ncandidates,ncandidates), dtype="i2")
        profile=np.zeros((n_model,ncandidates), dtype="i2")

        profile = np.flip(np.argsort(preferences[l]))

        for i in range(0, ncandidates):
            for j in range(0,ncandidates):
                _,i1=np.where(profile==i)
                _,i2=np.where(profile==j)
                comparison[i,j]=np.count_nonzero(np.less(i1,i2))
        '''
        Count how many pairwise is won or lose by any candidate
        '''
        comparison = comparison - comparison.T
        '''
        Remove negative numbers, i.e. candidates who lose pairwise
        '''
        comparison[comparison <0 ]=0
        '''
        For each candidate, count how many pairwise comparisons are won
        '''
        scores[l] = np.count_nonzero(comparison, axis=1)

    return scores

def rankaggr_brute(preferences):
    '''
    For each sample compute the score given the scoring vector
    '''
    n_candidates=len(preferences[0][0])
    n_model=len(preferences[0])
    n_samples = len(preferences)
    scores=np.zeros((n_samples,n_candidates), dtype="f4")
   
    for l in range(len(preferences)):
        profile=[]
        temp_ordered = np.flip(np.unique(preferences[l], axis=1), axis=1)
        
        for i in range(n_model):
            temp = temp_ordered[i] 
            temp=[np.where(preferences[l][i]==temp[j])[0] for j in range(len(temp))]
            profile.append(temp)

        ranks = Dataset(profile)
        sc = ScoringScheme()
        if len(profile[0])>5:
            consensus = KemRankAgg.compute_consensus(ranks, sc, Algorithm.ParCons)
        else:
            consensus = KemRankAgg.compute_consensus(ranks, sc, Algorithm.Exact)

        for c in range(len(consensus.consensus_rankings[0])):
            candidate = consensus.consensus_rankings[0][c][0]
            scores[l][candidate] = n_candidates - c

    return scores

def scoringVec(scoring, preferences, weights=None):
    n_candidate=len(preferences[0][0])
    n_model=len(preferences[0])
    if weights==None:
        weights = np.ones(n_model, dtype="f4")
    if scoring is 'Plurality':
        scoring=np.zeros(n_candidate, dtype='i2')
        scoring[0]=1
    if scoring is 'HalfApproval':
        scoring=np.zeros(n_candidate, dtype='i2')
        n=int(n_candidate/2)
        if n_candidate%2 == 1: 
            n += 1
        for i in range(n):
            scoring[i]=1
    if scoring is 'Borda':
        scoring=[i for i in range(n_candidate)]
        scoring.sort(reverse=True)

    '''
    For each sample compute the score given the scoring vector
    '''
    n_samples = len(preferences)
    scores=np.zeros((n_samples,n_candidate), dtype="f4")
    for j in range(n_samples):
        profile=np.zeros((n_model,n_candidate), dtype="i2")
        '''
        For each model sort its preferences
        '''
        profile = np.flip(np.argsort(preferences[j]))
        for i in range(n_candidate):
            for m in range(n_model):
                pos = profile[m,i]
                scores[j,pos] += weights[m] * scoring[i]
    return scores

def votingSystem(scoring, models, x, y=[], classes=10, best_index=0, weights=None, proba=False):
    if y != []:
        y_val = np.reshape(y, (len(y), 1))
    scores = []
    if y != []:
        fscores = []
        
    preferences = np.zeros((x.shape[0], len(models), classes))
    for i in range(len(models)):
        if hasattr(models[i], 'predict_proba'):
            y_pred = models[i].predict_proba(x)
        elif hasattr(models[i], 'decision_function'):
            y_pred = models[i].decision_function(x)
            y_pred = np.exp(y_pred) / np.sum(np.exp(y_pred), axis=1, keepdims=True)
        else:
            raise ValueError("Model doesn't have predict_proba or decision_function method")
        
        preferences[:, i, :] = y_pred

    '''
    Compute voting winner for each sample given rankings from each model
    '''
    if scoring == 'Copeland':
        scores = copeland(preferences)
    elif scoring == "Sum":
        scores = np.sum(preferences, axis=1)
    elif scoring == "Mean":
        scores = np.mean(preferences, axis=1)
    elif scoring == "Kemeny":
        scores = rankaggr_brute(preferences)
    else:
        scores = scoringVec(scoring, preferences, weights=weights)

    if y != []:
        return scores, fscores
    return scores, None

def predict(x, models, best_index, bestClassifier=False, nClasses=10, voting="Sum", argMax=False, weighted=False, tiebreak="best", epsilon=False ,weights=[]):
    temp_fscores=None
    if bestClassifier:
        y_pred = models[best_index].predict(x)
    else:
        if weighted==False:
            y_pred, temp_fscores=votingSystem(voting, models, x, classes=nClasses, proba=True)
        else:
            y_pred, temp_fscores=votingSystem(voting, models, x, classes=nClasses, proba=True, weights=weights)

    if tiebreak is "best":
        best_pred = models[best_index].predict(x)
        return_y_pred=[]
        if voting is "Plurality":
            epsilon = 0.0

        for i in range(len(y_pred)):
            pred=y_pred[i]
            winner = np.argwhere(pred == np.amax(pred))

            if len(winner)>1: 
                pred=best_pred[i]
            return_y_pred.append(pred)
        y_pred=return_y_pred


        if voting is "Plurality":
            epsilon = epsilon / len(y_pred)

    if argMax:
        return np.argmax(y_pred, axis=1),temp_fscores

    return y_pred, temp_fscores

In [76]:
def get_model_weights(X_test, y_test, folder_path):

    weights_gridsearch=[]
    model_weights_gridsearch = {
        "Logistic Regression"   :   0,
        "Random Forest"         :   0,
        "Decision Tree"         :   0,
        "K-Nearest Neighbors"   :   0,
        "Gradient Boosting"     :   0,
        "SVM"                   :   0,
        "ensemble"              :   0
    }

    # create an empty list to store the loaded files
    loaded_models = []

    # loop over the joblib files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith(".joblib"):
            # load the joblib file
            filepath = os.path.join(folder_path, filename)
            loaded_model = joblib.load(filepath)

            # add the loaded file to the list
            loaded_models.append(loaded_model)

    '''
    Compute accuracy to store weights
    '''
    for m, i in zip(range(len(loaded_models)), model_weights_gridsearch):
        print(loaded_models[m])
        y_pred = loaded_models[m].predict(X_test)
        tmp_f1 = f1_score(y_test, y_pred, average='weighted') 
        weights_gridsearch.append(tmp_f1)
        model_weights_gridsearch[i] = round(tmp_f1/len(loaded_models) * 100,3)

    return weights_gridsearch, model_weights_gridsearch, loaded_models

## IRIS

In [276]:
weights = []
fscores = []
saved_models = []
best_index = 0
epsilon = 0.0

iris = load_iris()
target_names = iris.target_names
X, y = iris.data, iris.target
y = np.asarray(y)
y = y.reshape(-1,)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# define the folder containing the joblib files
folder_path_gridsearch = "./models/experiment_iris_gridsearch/"
folder_path_no_gridsearch = "./models/experiment_iris_no_gridsearch/"

weights_gridsearch_iris_no_gridsearch, model_weights_iris_no_gridsearch, iris_models = get_model_weights(X_test, y_test, folder_path_no_gridsearch)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kneighborsclassifier', KNeighborsClassifier())])
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=42))])
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(probability=True))])
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('decisiontreeclassifier', DecisionTreeClassifier())])
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])
VotingClassifier(estimators=[('Logistic Regression', LogisticRegression()),
                             ('Decision Tree', DecisionTreeClassifier()),
                             ('Random Forest',
                              RandomForestClassifier(random_state=42)),
                             ('Gradient Boosting',
                              GradientBoostingClassifier(

In [277]:
print("MODEL WEIGHTS FOR IRIS WITHOUT GRIDSEARCH - USING TEST DATA")
print(weights_gridsearch_iris_no_gridsearch)
print(model_weights_iris_no_gridsearch)
print("-"*50)

best_index = get_best_model_index(weights_gridsearch_iris_no_gridsearch)
print(best_index)
print(weights_gridsearch_iris_no_gridsearch[best_index])


MODEL WEIGHTS FOR IRIS WITHOUT GRIDSEARCH - USING TEST DATA
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
{'Logistic Regression': 14.286, 'Random Forest': 14.286, 'Decision Tree': 14.286, 'K-Nearest Neighbors': 14.286, 'Gradient Boosting': 14.286, 'SVM': 14.286, 'ensemble': 14.286}
--------------------------------------------------
6
1.0


In [278]:
y_pred_vorace,_ = predict(X_test, iris_models, best_index, False, len(np.unique(y_test)), "Plurality", True, False, "best", False, weights_gridsearch_iris_no_gridsearch)
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace) 
print(conf_mat)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
1.0
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [279]:
y_pred_vorace,_ = predict(X_test, iris_models, best_index, False, len(np.unique(y_test)), "Sum", True, False, "best", False, weights_gridsearch_iris_no_gridsearch)
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace) 
print(conf_mat)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
1.0
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [280]:
y_pred_vorace,_ = predict(X_test, iris_models, best_index, False, len(np.unique(y_test)), "Copeland", True, False, "best", False, weights_gridsearch_iris_no_gridsearch)
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace) 
print(conf_mat)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
1.0
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [281]:
y_pred_vorace,_ = predict(X_test, iris_models, best_index, False, len(np.unique(y_test)), "Borda", True, False, "best", False, weights_gridsearch_iris_no_gridsearch)
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace) 
print(conf_mat)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
1.0
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [285]:
y_pred_vorace,_ = predict(X_test, iris_models, best_index, False, len(np.unique(y_test)), "Kemeny", True, False, "best", False, weights_gridsearch_iris_no_gridsearch)
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace) 
print(conf_mat)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
1.0
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


## WINE

In [286]:
weights = []
fscores = []
saved_models = []
best_index = 0
epsilon = 0.0

data_wine=pd.read_csv('./data/wine.data',names=['Cultivars','Alcohol','Malic acid','Ash','Alcalinity of ash',
'Magnesium','Total Phenols','Flavanoids','Nonflavanoid phenols',
'Proanthocyanins','Color intensity','Hue','OD280/OD315 of diluted wines','Proline'])
data_wine_target_names = np.array(['1','2','3'])
print(data_wine)
x=data_wine.drop(['Cultivars'], axis=1)
y=data_wine['Cultivars']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

# define the folder containing the joblib files
folder_path_gridsearch = "./models/experiment_wine_gridsearch/"
folder_path_no_gridsearch = "./models/experiment_wine_no_gridsearch/"

weights_gridsearch_wine_no_gridsearch, model_weights_wine_no_gridsearch, wine_models = get_model_weights(X_test, y_test, folder_path_no_gridsearch)

     Cultivars  Alcohol  Malic acid   Ash  Alcalinity of ash  Magnesium  \
0            1    14.23        1.71  2.43               15.6        127   
1            1    13.20        1.78  2.14               11.2        100   
2            1    13.16        2.36  2.67               18.6        101   
3            1    14.37        1.95  2.50               16.8        113   
4            1    13.24        2.59  2.87               21.0        118   
..         ...      ...         ...   ...                ...        ...   
173          3    13.71        5.65  2.45               20.5         95   
174          3    13.40        3.91  2.48               23.0        102   
175          3    13.27        4.28  2.26               20.0        120   
176          3    13.17        2.59  2.37               20.0        120   
177          3    14.13        4.10  2.74               24.5         96   

     Total Phenols  Flavanoids  Nonflavanoid phenols  Proanthocyanins  \
0             2.80        

In [287]:
print("MODEL WEIGHTS FOR WINE WITHOUT GRIDSEARCH - USING TEST DATA")
print(weights_gridsearch_wine_no_gridsearch)
print(model_weights_wine_no_gridsearch)
print("-"*50)

best_index = get_best_model_index(weights_gridsearch_wine_no_gridsearch)
print(best_index)
print(weights_gridsearch_wine_no_gridsearch[best_index])


MODEL WEIGHTS FOR WINE WITHOUT GRIDSEARCH - USING TEST DATA
[1.0, 0.9439974457215836, 0.9439974457215836, 0.9436036129748098, 1.0, 1.0, 1.0]
{'Logistic Regression': 14.286, 'Random Forest': 13.486, 'Decision Tree': 13.486, 'K-Nearest Neighbors': 13.48, 'Gradient Boosting': 14.286, 'SVM': 14.286, 'ensemble': 14.286}
--------------------------------------------------
6
1.0


In [288]:
y_pred_vorace,_ = predict(X_test, wine_models, best_index, False, len(np.unique(y_test)), "Plurality", True, False, "best", False, weights_gridsearch_wine_no_gridsearch)
y_pred_vorace += 1 
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[1 1 3 1 2 1 2 3 2 3 1 3 1 2 1 2 2 2 1 2 1 2 2 3 3 3 2 2 2 1 1 2 3 1 1 1]
19     1
45     1
140    3
30     1
67     2
16     1
119    2
174    3
109    2
141    3
24     1
150    3
41     1
118    2
15     1
111    2
113    2
82     2
9      1
114    2
18     1
66     2
60     2
169    3
171    3
164    3
117    2
65     2
90     2
55     1
29     1
128    2
145    3
31     1
12     1
42     1
Name: Cultivars, dtype: int64
1.0
[[14  0  0]
 [ 0 14  0]
 [ 0  0  8]]


In [289]:
y_pred_vorace,_ = predict(X_test, wine_models, best_index, False, len(np.unique(y_test)), "Sum", True, False, "best", False, weights_gridsearch_wine_no_gridsearch)
y_pred_vorace += 1 
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[1 1 3 1 2 1 2 3 2 3 1 3 1 2 1 2 2 2 1 2 1 2 2 3 3 3 2 2 2 1 1 2 3 1 1 1]
19     1
45     1
140    3
30     1
67     2
16     1
119    2
174    3
109    2
141    3
24     1
150    3
41     1
118    2
15     1
111    2
113    2
82     2
9      1
114    2
18     1
66     2
60     2
169    3
171    3
164    3
117    2
65     2
90     2
55     1
29     1
128    2
145    3
31     1
12     1
42     1
Name: Cultivars, dtype: int64
1.0
[[14  0  0]
 [ 0 14  0]
 [ 0  0  8]]


In [290]:
y_pred_vorace,_ = predict(X_test, wine_models, best_index, False, len(np.unique(y_test)), "Kemeny", True, False, "best", False, weights_gridsearch_wine_no_gridsearch)
y_pred_vorace += 1 
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[1 1 3 1 2 1 2 3 2 3 1 3 1 2 1 2 2 2 1 2 1 2 2 3 3 3 2 2 2 1 1 2 3 1 1 1]
19     1
45     1
140    3
30     1
67     2
16     1
119    2
174    3
109    2
141    3
24     1
150    3
41     1
118    2
15     1
111    2
113    2
82     2
9      1
114    2
18     1
66     2
60     2
169    3
171    3
164    3
117    2
65     2
90     2
55     1
29     1
128    2
145    3
31     1
12     1
42     1
Name: Cultivars, dtype: int64
1.0
[[14  0  0]
 [ 0 14  0]
 [ 0  0  8]]


In [291]:
y_pred_vorace,_ = predict(X_test, wine_models, best_index, False, len(np.unique(y_test)), "Borda", True, False, "best", False, weights_gridsearch_wine_no_gridsearch)
y_pred_vorace += 1 
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[1 1 3 1 2 1 2 3 2 3 1 3 1 2 1 2 2 2 1 2 1 2 2 3 3 3 2 2 2 1 1 2 3 1 1 1]
19     1
45     1
140    3
30     1
67     2
16     1
119    2
174    3
109    2
141    3
24     1
150    3
41     1
118    2
15     1
111    2
113    2
82     2
9      1
114    2
18     1
66     2
60     2
169    3
171    3
164    3
117    2
65     2
90     2
55     1
29     1
128    2
145    3
31     1
12     1
42     1
Name: Cultivars, dtype: int64
1.0
[[14  0  0]
 [ 0 14  0]
 [ 0  0  8]]


In [292]:
y_pred_vorace,_ = predict(X_test, wine_models, best_index, False, len(np.unique(y_test)), "Copeland", True, False, "best", False, weights_gridsearch_wine_no_gridsearch)
y_pred_vorace += 1 
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[1 1 3 1 2 1 2 3 2 3 1 3 1 2 1 2 2 2 1 2 1 2 2 3 3 3 2 2 2 1 1 2 3 1 1 1]
19     1
45     1
140    3
30     1
67     2
16     1
119    2
174    3
109    2
141    3
24     1
150    3
41     1
118    2
15     1
111    2
113    2
82     2
9      1
114    2
18     1
66     2
60     2
169    3
171    3
164    3
117    2
65     2
90     2
55     1
29     1
128    2
145    3
31     1
12     1
42     1
Name: Cultivars, dtype: int64
1.0
[[14  0  0]
 [ 0 14  0]
 [ 0  0  8]]


## LYMPHOGRAPHY

In [327]:
weights = []
fscores = []
saved_models = []
best_index = 0
epsilon = 0.0

data_lymphography=pd.read_csv('./data/lymphography.data',names=['class: normal find, metastases, malign lymph, fibrosis','lymphatics: normal, arched, deformed, displaced','block of affere: no, yes','bl. of lymph. c: no, yes','bl. of lymph. s: no, yes','by pass: no, yes','extravasates: no, yes', 'regeneration of: no, yes', 'early uptake in: no, yes', 'lym.nodes dimin: 0-3', 'lym.nodes enlar: 1-4', 'changes in lym.: bean, oval, round', 'defect in node: no, lacunar, lac. marginal, lac. central', 'changes in node: no, lacunar, lac. margin, lac. central', 'changes in stru: no, grainy, drop-like, coarse, diluted, reticular, stripped, faint', 'special forms: no, chalices, vesicles', 'dislocation of: no, yes', 'exclusion of no: no, yes', 'no. of nodes in: 0-9, 10-19, 20-29, 30-39, 40-49, 50-59, 60-69, >=70'])
print(data_lymphography)
data_lymphography_target_names = np.array(['normal find', 'metastases', 'malign lymph', 'fibrosis'])
x=data_lymphography.drop(['class: normal find, metastases, malign lymph, fibrosis'], axis=1)
y=data_lymphography['class: normal find, metastases, malign lymph, fibrosis']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

# define the folder containing the joblib files
folder_path_gridsearch = "./models/experiment_lymphography_gridsearch/"
folder_path_no_gridsearch = "./models/experiment_lymphography_no_gridsearch/"

weights_gridsearch_lymphography_no_gridsearch, model_weights_lymphography_no_gridsearch, lymphography_models = get_model_weights(X_test, y_test, folder_path_no_gridsearch)

     class: normal find, metastases, malign lymph, fibrosis  \
0                                                    3        
1                                                    2        
2                                                    3        
3                                                    3        
4                                                    2        
..                                                 ...        
143                                                  3        
144                                                  2        
145                                                  3        
146                                                  2        
147                                                  2        

     lymphatics: normal, arched, deformed, displaced  \
0                                                  4   
1                                                  3   
2                                                  3   
3                  

In [328]:
print("MODEL WEIGHTS FOR LYMPHOGRAPHY WITHOUT GRIDSEARCH - USING TEST DATA")
print(weights_gridsearch_lymphography_no_gridsearch)
print(model_weights_lymphography_no_gridsearch)
print("-"*50)

best_index = get_best_model_index(weights_gridsearch_lymphography_no_gridsearch)
print(best_index)
print(weights_gridsearch_lymphography_no_gridsearch[best_index])


MODEL WEIGHTS FOR LYMPHOGRAPHY WITHOUT GRIDSEARCH - USING TEST DATA
[0.6542597187758478, 0.7415372361608921, 0.8045977011494254, 0.8628352490421456, 0.8331204767986378, 0.8045977011494254, 0.8477453580901858]
{'Logistic Regression': 9.347, 'Random Forest': 10.593, 'Decision Tree': 11.494, 'K-Nearest Neighbors': 12.326, 'Gradient Boosting': 11.902, 'SVM': 11.494, 'ensemble': 12.111}
--------------------------------------------------
3
0.8628352490421456


In [329]:
y_pred_vorace,_ = predict(X_test, lymphography_models, best_index, False, len(np.unique(y_test))+1, "Plurality", True, False, "best", False, weights_gridsearch_lymphography_no_gridsearch)
y_pred_vorace += 1
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[3 3 4 2 3 2 2 3 2 2 3 2 2 2 2 3 2 3 2 3 2 3 3 3 3 2 2 3 3 2]
125    3
51     3
139    4
19     2
104    2
12     2
76     2
31     3
81     2
9      3
26     3
96     2
144    2
67     2
135    2
66     3
18     2
69     3
124    2
30     3
29     3
105    3
36     4
118    3
55     3
22     2
64     2
131    3
82     3
11     2
Name: class: normal find, metastases, malign lymph, fibrosis, dtype: int64
0.8666666666666667
[[13  1  0]
 [ 2 12  0]
 [ 0  1  1]]


In [330]:
y_pred_vorace,_ = predict(X_test, lymphography_models, best_index, False, len(np.unique(y_test))+1, "Sum", True, False, "best", False, weights_gridsearch_lymphography_no_gridsearch)
y_pred_vorace += 1
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[3 3 4 2 3 2 2 3 2 2 3 2 2 2 2 3 2 3 2 3 2 3 3 3 3 2 2 3 3 2]
125    3
51     3
139    4
19     2
104    2
12     2
76     2
31     3
81     2
9      3
26     3
96     2
144    2
67     2
135    2
66     3
18     2
69     3
124    2
30     3
29     3
105    3
36     4
118    3
55     3
22     2
64     2
131    3
82     3
11     2
Name: class: normal find, metastases, malign lymph, fibrosis, dtype: int64
0.8666666666666667
[[13  1  0]
 [ 2 12  0]
 [ 0  1  1]]


In [331]:
y_pred_vorace,_ = predict(X_test, lymphography_models, best_index, False, len(np.unique(y_test))+1, "Copeland", True, False, "best", False, weights_gridsearch_lymphography_no_gridsearch)
y_pred_vorace += 1
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[3 3 4 2 3 2 2 3 2 2 3 2 2 2 2 3 2 3 2 3 2 3 3 3 3 2 2 3 3 2]
125    3
51     3
139    4
19     2
104    2
12     2
76     2
31     3
81     2
9      3
26     3
96     2
144    2
67     2
135    2
66     3
18     2
69     3
124    2
30     3
29     3
105    3
36     4
118    3
55     3
22     2
64     2
131    3
82     3
11     2
Name: class: normal find, metastases, malign lymph, fibrosis, dtype: int64
0.8666666666666667
[[13  1  0]
 [ 2 12  0]
 [ 0  1  1]]


In [298]:
y_pred_vorace,_ = predict(X_test, lymphography_models, best_index, False, len(np.unique(y_test))+1, "Borda", True, False, "best", False, weights_gridsearch_lymphography_no_gridsearch)
y_pred_vorace += 1
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[3 3 4 2 3 2 2 3 2 2 3 2 2 2 2 3 2 3 2 3 2 3 3 3 3 2 2 3 3 2]
125    3
51     3
139    4
19     2
104    2
12     2
76     2
31     3
81     2
9      3
26     3
96     2
144    2
67     2
135    2
66     3
18     2
69     3
124    2
30     3
29     3
105    3
36     4
118    3
55     3
22     2
64     2
131    3
82     3
11     2
Name: class: normal find, metastases, malign lymph, fibrosis, dtype: int64
0.8666666666666667
[[13  1  0]
 [ 2 12  0]
 [ 0  1  1]]


In [332]:
y_pred_vorace,_ = predict(X_test, lymphography_models, best_index, False, len(np.unique(y_test))+1, "Kemeny", True, False, "best", False, weights_gridsearch_lymphography_no_gridsearch)
y_pred_vorace += 1
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[3 3 2 2 3 2 2 2 2 2 3 2 1 2 2 3 3 3 2 2 2 3 3 3 2 2 2 3 3 2]
125    3
51     3
139    4
19     2
104    2
12     2
76     2
31     3
81     2
9      3
26     3
96     2
144    2
67     2
135    2
66     3
18     2
69     3
124    2
30     3
29     3
105    3
36     4
118    3
55     3
22     2
64     2
131    3
82     3
11     2
Name: class: normal find, metastases, malign lymph, fibrosis, dtype: int64
0.6666666666666666
[[ 0  0  0  0]
 [ 1 11  2  0]
 [ 0  5  9  0]
 [ 0  1  1  0]]


## HEART

In [300]:
weights = []
fscores = []
saved_models = []
best_index = 0
epsilon = 0.0

data_heart=pd.read_csv('./data/heart.data',names=['age','sex','chest pain type (4 values)', 'resting blood pressure', 'serum cholesterol in mg/dl', 'fasting blood sugar > 120 mg/dl', 'resting electrocardiographic results (values 0,1,2)', 'maximum heart rate achieved', 'exercise induced angina','oldpeak = ST depression induced by exercise relative to rest', 'the slope of the peak exercise ST segment','number of major vessels (0-3) colored by flourosopy','thal: 3 = normal; 6 = fixed defect; 7 = reversable defect', 'absence = 1, presence = 2'], delim_whitespace=True)
# print(data_heart)
data_heart_target_names = np.array(['Absence','Presence'])
x=data_heart.drop(['absence = 1, presence = 2'], axis=1)
y=data_heart['absence = 1, presence = 2']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

# define the folder containing the joblib files
folder_path_gridsearch = "./models/experiment_heart_gridsearch/"
folder_path_no_gridsearch = "./models/experiment_heart_no_gridsearch/"

weights_gridsearch_heart_no_gridsearch, model_weights_heart_no_gridsearch, heart_models = get_model_weights(X_test, y_test, folder_path_no_gridsearch)

VotingClassifier(estimators=[('Logistic Regression', LogisticRegression()),
                             ('Decision Tree', DecisionTreeClassifier()),
                             ('Random Forest',
                              RandomForestClassifier(random_state=42)),
                             ('Gradient Boosting',
                              GradientBoostingClassifier()),
                             ('K-Nearest Neighbors', KNeighborsClassifier()),
                             ('SVM', SVC(probability=True))],
                 n_jobs=-2, verbose=True, voting='soft')
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('decisiontreeclassifier', DecisionTreeClassifier())])
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kneighborsclassifier', KNeighborsClassifier())])
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])
Pipeline(steps=[('standardscaler', StandardScaler()),
     

In [301]:
print("MODEL WEIGHTS FOR HEART WITHOUT GRIDSEARCH - USING TEST DATA")
print(weights_gridsearch_heart_no_gridsearch)
print(model_weights_heart_no_gridsearch)
print("-"*50)

best_index = get_best_model_index(weights_gridsearch_heart_no_gridsearch)
print(best_index)
print(weights_gridsearch_heart_no_gridsearch[best_index])

MODEL WEIGHTS FOR HEART WITHOUT GRIDSEARCH - USING TEST DATA
[0.8325445941026575, 0.6890284075495097, 0.810359231411863, 0.9069692189459209, 0.8877450980392156, 0.7380718954248366, 0.7552334943639291]
{'Logistic Regression': 11.893, 'Random Forest': 9.843, 'Decision Tree': 11.577, 'K-Nearest Neighbors': 12.957, 'Gradient Boosting': 12.682, 'SVM': 10.544, 'ensemble': 10.789}
--------------------------------------------------
3
0.9069692189459209


In [303]:
y_pred_vorace,_ = predict(X_test, heart_models, best_index, False, len(np.unique(y_test)), "Plurality", True, False, "best", False, weights_gridsearch_heart_no_gridsearch)
y_pred_vorace += 1
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[2 2 1 1 1 2 2 1 1 1 1 1 2 2 2 1 2 1 2 1 2 1 1 1 1 2 1 2 1 1 1 1 1 1 1 2 2
 1 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 2]
30     2
116    2
79     1
127    1
196    1
137    2
209    1
45     1
158    1
247    1
183    1
268    1
227    2
82     2
165    1
194    1
226    2
146    2
104    2
60     1
221    2
266    1
46     2
42     1
185    1
9      2
22     1
199    2
109    1
24     1
113    1
68     1
144    2
224    1
252    2
6      2
120    2
67     2
119    2
118    1
25     1
125    1
244    1
19     1
77     1
216    1
90     1
208    2
93     2
180    1
15     1
152    1
232    1
250    2
Name: absence = 1, presence = 2, dtype: int64
0.8518518518518519
[[30  3]
 [ 5 16]]


In [304]:
y_pred_vorace,_ = predict(X_test, heart_models, best_index, False, len(np.unique(y_test)), "Sum", True, False, "best", False, weights_gridsearch_heart_no_gridsearch)
y_pred_vorace += 1
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[2 2 1 1 1 2 2 1 1 1 1 1 2 2 2 1 2 1 2 1 2 1 1 1 1 2 1 2 1 1 1 1 1 1 1 2 2
 1 2 2 1 1 1 1 1 1 1 2 2 2 1 1 1 2]
30     2
116    2
79     1
127    1
196    1
137    2
209    1
45     1
158    1
247    1
183    1
268    1
227    2
82     2
165    1
194    1
226    2
146    2
104    2
60     1
221    2
266    1
46     2
42     1
185    1
9      2
22     1
199    2
109    1
24     1
113    1
68     1
144    2
224    1
252    2
6      2
120    2
67     2
119    2
118    1
25     1
125    1
244    1
19     1
77     1
216    1
90     1
208    2
93     2
180    1
15     1
152    1
232    1
250    2
Name: absence = 1, presence = 2, dtype: int64
0.8333333333333334
[[29  4]
 [ 5 16]]


In [305]:
y_pred_vorace,_ = predict(X_test, heart_models, best_index, False, len(np.unique(y_test)), "Borda", True, False, "best", False, weights_gridsearch_heart_no_gridsearch)
y_pred_vorace += 1
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[2 2 1 1 1 2 2 1 1 1 1 1 2 2 2 1 2 1 2 1 2 1 1 1 1 2 1 2 1 1 1 1 1 1 1 2 2
 1 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 2]
30     2
116    2
79     1
127    1
196    1
137    2
209    1
45     1
158    1
247    1
183    1
268    1
227    2
82     2
165    1
194    1
226    2
146    2
104    2
60     1
221    2
266    1
46     2
42     1
185    1
9      2
22     1
199    2
109    1
24     1
113    1
68     1
144    2
224    1
252    2
6      2
120    2
67     2
119    2
118    1
25     1
125    1
244    1
19     1
77     1
216    1
90     1
208    2
93     2
180    1
15     1
152    1
232    1
250    2
Name: absence = 1, presence = 2, dtype: int64
0.8518518518518519
[[30  3]
 [ 5 16]]


In [306]:
y_pred_vorace,_ = predict(X_test, heart_models, best_index, False, len(np.unique(y_test)), "Kemeny", True, False, "best", False, weights_gridsearch_heart_no_gridsearch)
y_pred_vorace += 1
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[2 2 1 1 2 2 1 1 1 1 2 1 2 2 2 1 2 1 2 1 2 1 1 1 1 2 1 2 1 1 1 1 1 1 2 2 2
 1 2 2 1 1 1 1 1 1 1 2 1 1 1 1 1 2]
30     2
116    2
79     1
127    1
196    1
137    2
209    1
45     1
158    1
247    1
183    1
268    1
227    2
82     2
165    1
194    1
226    2
146    2
104    2
60     1
221    2
266    1
46     2
42     1
185    1
9      2
22     1
199    2
109    1
24     1
113    1
68     1
144    2
224    1
252    2
6      2
120    2
67     2
119    2
118    1
25     1
125    1
244    1
19     1
77     1
216    1
90     1
208    2
93     2
180    1
15     1
152    1
232    1
250    2
Name: absence = 1, presence = 2, dtype: int64
0.8333333333333334
[[29  4]
 [ 5 16]]


In [307]:
y_pred_vorace,_ = predict(X_test, heart_models, best_index, False, len(np.unique(y_test)), "Copeland", True, False, "best", False, weights_gridsearch_heart_no_gridsearch)
y_pred_vorace += 1
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[2 2 1 1 1 2 2 1 1 1 1 1 2 2 2 1 2 1 2 1 2 1 1 1 1 2 1 2 1 1 1 1 1 1 1 2 2
 1 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 2]
30     2
116    2
79     1
127    1
196    1
137    2
209    1
45     1
158    1
247    1
183    1
268    1
227    2
82     2
165    1
194    1
226    2
146    2
104    2
60     1
221    2
266    1
46     2
42     1
185    1
9      2
22     1
199    2
109    1
24     1
113    1
68     1
144    2
224    1
252    2
6      2
120    2
67     2
119    2
118    1
25     1
125    1
244    1
19     1
77     1
216    1
90     1
208    2
93     2
180    1
15     1
152    1
232    1
250    2
Name: absence = 1, presence = 2, dtype: int64
0.8518518518518519
[[30  3]
 [ 5 16]]


# GLASS

In [311]:
weights = []
fscores = []
saved_models = []
best_index = 0
epsilon = 0.0

data_glass=pd.read_csv('./data/glass.data',names=['Id number: 1 to 214', 'RI: refractive index', 'Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)', 'Mg: Magnesium', 'Al: Aluminum', 'Si: Silicon','K: Potassium', 'Ca: Calcium','Ba: Barium','Fe: Iron','Type of glass: (class attribute)'])
print(data_glass)
data_glass_target_names = np.array(['1 building_windows_float_processed', '2 building_windows_non_float_processed', '3 vehicle_windows_float_processed','4 vehicle_windows_non_float_processed', '5 containers', '6 tableware', '7 headlamps'])
x=data_glass.drop(['Type of glass: (class attribute)'], axis=1)
y=data_glass['Type of glass: (class attribute)']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

# define the folder containing the joblib files
folder_path_gridsearch = "./models/experiment_glass_gridsearch/"
folder_path_no_gridsearch = "./models/experiment_glass_no_gridsearch/"

weights_gridsearch_glass_no_gridsearch, model_weights_glass_no_gridsearch, glass_models = get_model_weights(X_test, y_test, folder_path_no_gridsearch)

     Id number: 1 to 214  RI: refractive index  \
0                      1               1.52101   
1                      2               1.51761   
2                      3               1.51618   
3                      4               1.51766   
4                      5               1.51742   
..                   ...                   ...   
209                  210               1.51623   
210                  211               1.51685   
211                  212               1.52065   
212                  213               1.51651   
213                  214               1.51711   

     Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)  \
0                                                13.64                                              
1                                                13.89                                              
2                                                13.53                                          

In [312]:
print("MODEL WEIGHTS FOR GLASS WITHOUT GRIDSEARCH - USING TEST DATA")
print(weights_gridsearch_glass_no_gridsearch)
print(model_weights_glass_no_gridsearch)
print("-"*50)

best_index = get_best_model_index(weights_gridsearch_glass_no_gridsearch)
print(best_index)
print(weights_gridsearch_glass_no_gridsearch[best_index])

MODEL WEIGHTS FOR GLASS WITHOUT GRIDSEARCH - USING TEST DATA
[0.797535070651112, 1.0, 0.926154647905674, 0.8468992248062015, 0.9765914385000192, 0.9765914385000192, 1.0]
{'Logistic Regression': 11.393, 'Random Forest': 14.286, 'Decision Tree': 13.231, 'K-Nearest Neighbors': 12.099, 'Gradient Boosting': 13.951, 'SVM': 13.951, 'ensemble': 14.286}
--------------------------------------------------
6
1.0


In [313]:
y_pred_vorace,_ = predict(X_test, glass_models, best_index, False, len(np.unique(y_test)), "Plurality", True, False, "best", False, weights_gridsearch_glass_no_gridsearch)
y_pred_vorace += 1
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[1 6 1 6 2 2 1 2 2 2 5 4 2 2 5 4 6 1 1 5 2 6 6 6 3 2 1 1 4 1 1 2 3 2 1 6 4
 3 2 2 2 6 1]
9      1
197    7
66     1
191    7
117    2
111    2
15     1
86     2
75     2
144    2
182    6
170    5
141    2
73     2
178    6
167    5
190    7
18     1
45     1
184    6
98     2
209    7
211    7
195    7
148    3
104    2
30     1
25     1
175    5
16     1
55     1
138    2
158    3
93     2
69     1
203    7
171    5
152    3
97     2
84     2
101    2
200    7
60     1
Name: Type of glass: (class attribute), dtype: int64
0.6511627906976745
[[11  0  0  0  0  0  0]
 [ 0 14  0  0  0  0  0]
 [ 0  0  3  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  4  0  0  0]
 [ 0  0  0  0  3  0  0]
 [ 0  0  0  0  0  8  0]]


In [314]:
y_pred_vorace,_ = predict(X_test, glass_models, best_index, False, len(np.unique(y_test)), "Borda", True, False, "best", False, weights_gridsearch_glass_no_gridsearch)
y_pred_vorace += 1
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[1 6 1 6 2 2 1 2 2 2 5 4 2 2 5 4 6 1 1 5 2 6 6 6 3 2 1 1 4 1 1 2 3 2 1 6 4
 3 2 2 2 6 1]
9      1
197    7
66     1
191    7
117    2
111    2
15     1
86     2
75     2
144    2
182    6
170    5
141    2
73     2
178    6
167    5
190    7
18     1
45     1
184    6
98     2
209    7
211    7
195    7
148    3
104    2
30     1
25     1
175    5
16     1
55     1
138    2
158    3
93     2
69     1
203    7
171    5
152    3
97     2
84     2
101    2
200    7
60     1
Name: Type of glass: (class attribute), dtype: int64
0.6511627906976745
[[11  0  0  0  0  0  0]
 [ 0 14  0  0  0  0  0]
 [ 0  0  3  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  4  0  0  0]
 [ 0  0  0  0  3  0  0]
 [ 0  0  0  0  0  8  0]]


In [315]:
y_pred_vorace,_ = predict(X_test, glass_models, best_index, False, len(np.unique(y_test)), "Sum", True, False, "best", False, weights_gridsearch_glass_no_gridsearch)
y_pred_vorace += 1
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[1 6 1 6 2 2 1 2 2 2 5 4 2 2 5 4 6 1 1 5 2 6 6 6 3 2 1 1 4 1 1 2 3 2 1 6 4
 3 2 2 2 6 1]
9      1
197    7
66     1
191    7
117    2
111    2
15     1
86     2
75     2
144    2
182    6
170    5
141    2
73     2
178    6
167    5
190    7
18     1
45     1
184    6
98     2
209    7
211    7
195    7
148    3
104    2
30     1
25     1
175    5
16     1
55     1
138    2
158    3
93     2
69     1
203    7
171    5
152    3
97     2
84     2
101    2
200    7
60     1
Name: Type of glass: (class attribute), dtype: int64
0.6511627906976745
[[11  0  0  0  0  0  0]
 [ 0 14  0  0  0  0  0]
 [ 0  0  3  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  4  0  0  0]
 [ 0  0  0  0  3  0  0]
 [ 0  0  0  0  0  8  0]]


In [317]:
y_pred_vorace,_ = predict(X_test, glass_models, best_index, False, len(np.unique(y_test)), "Kemeny", True, False, "best", False, weights_gridsearch_glass_no_gridsearch)
y_pred_vorace += 1
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

AttributeError: module 'collections' has no attribute 'Iterable'

In [318]:
y_pred_vorace,_ = predict(X_test, glass_models, best_index, False, len(np.unique(y_test)), "Copeland", True, False, "best", False, weights_gridsearch_glass_no_gridsearch)
y_pred_vorace += 1
print(y_pred_vorace)
print(y_test)
accuracy = accuracy_score(y_test, y_pred_vorace)
print(accuracy)
conf_mat = confusion_matrix(y_test, y_pred_vorace)
print(conf_mat)

[1 6 1 6 2 2 1 2 2 2 5 4 2 2 5 4 6 1 1 5 2 6 6 6 3 2 1 1 4 1 1 2 3 2 1 6 4
 3 2 2 2 6 1]
9      1
197    7
66     1
191    7
117    2
111    2
15     1
86     2
75     2
144    2
182    6
170    5
141    2
73     2
178    6
167    5
190    7
18     1
45     1
184    6
98     2
209    7
211    7
195    7
148    3
104    2
30     1
25     1
175    5
16     1
55     1
138    2
158    3
93     2
69     1
203    7
171    5
152    3
97     2
84     2
101    2
200    7
60     1
Name: Type of glass: (class attribute), dtype: int64
0.6511627906976745
[[11  0  0  0  0  0  0]
 [ 0 14  0  0  0  0  0]
 [ 0  0  3  0  0  0  0]
 [ 0  0  0  0  0  0  0]
 [ 0  0  0  4  0  0  0]
 [ 0  0  0  0  3  0  0]
 [ 0  0  0  0  0  8  0]]
