# Run the experiments

Run all the tuned approaches on all the datasets in 4 different ways:
- Without feature selection and without calibration
- With feature selection and without calibration
- Without feature selection and with calibration
- With feature selection and with calibration
Protocol: we repeat a 5-fold cross-validation 10 times on the training set (80% of the dataset).

In [3]:
import os, sys
sys.path.append("..")

import pickle
import numpy as np
import pandas as pd

from utils.rms_score import rms_metric
from utils.filter import snr
from utils.load_classifiers import load_classifiers, load_classifiers_names
from utils.load_dataset import load_datasets_names, load_big_datasets_names, load_small_datasets_names

from sklearn.cross_validation import train_test_split, KFold, LeaveOneOut, cross_val_score, StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV

## Load datasets and classifiers

In [22]:
# Open the HDF5 store that the experiment cells below read their data from
# (they look up the keys "<dataset name>_data" and "<dataset name>_target").
datasets_file = "../data/datasets.h5"
datasets = pd.HDFStore(datasets_file)

In [23]:
# Dataset name lists: every dataset, then the "small" and "big" groups
# (presumably a partition of the full list -- see utils.load_dataset).
data_names, small_data, big_data = (
    load_datasets_names(),
    load_small_datasets_names(),
    load_big_datasets_names(),
)

# Names of the classifiers that the experiments iterate over.
clf_names = load_classifiers_names()

### Without feature selection and without calibration

In [None]:
# Experiment: no feature selection, no calibration.
#
# For every dataset, each classifier is scored with n_iter repetitions of an
# n_folds-fold cross-validation on the training part (80%) of the data.
# Per-fold accuracy, ROC AUC and RMS scores go into one DataFrame per metric
# (one column per classifier, n_iter * n_folds rows). The three tables are
# pickled after each classifier so an interrupted run keeps partial results.
rms = rms_metric()
tuning_seed = 0
tuning_path = './results/tuning/'
n_iter = 10
n_folds = 5
n_jobs = 1
# Same output root as the other experiments of this notebook.
results_path = "./results/performances/"

metric_names = ("ACC", "AUC", "RMS")
all_clfs = load_classifiers_names()

for data_name in data_names:

    data_path = os.path.join(results_path, data_name)
    no_fs_path = os.path.join(data_path, "noFS")
    if not os.path.isdir(no_fs_path):
        # makedirs also creates the missing parent directories.
        os.makedirs(no_fs_path)

    # Resume from previously pickled tables when they exist, otherwise start
    # from all-zero tables.
    results = {}
    for metric in metric_names:
        pkl_path = os.path.join(no_fs_path, "results_%s.pkl" % metric)
        if os.path.isfile(pkl_path):
            with open(pkl_path, 'rb') as fp:
                results[metric] = pickle.load(fp)
        else:
            results[metric] = pd.DataFrame(np.zeros((n_iter * n_folds, len(all_clfs))), columns=all_clfs)
    result_noFS_ACC = results["ACC"]
    result_noFS_AUC = results["AUC"]
    result_noFS_RMS = results["RMS"]

    X = datasets[data_name + '_data']
    y = datasets[data_name + '_target']

    # The hold-out split is seeded with tuning_seed, so it is the same for
    # every classifier and iteration: compute it once per dataset.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=tuning_seed, stratify=y)

    for clf_name in clf_names:
        # Classifiers whose name starts with "Rot" are only evaluated on the
        # small datasets; elsewhere their column is left untouched.
        if clf_name.startswith("Rot") and data_name not in small_data:
            continue

        # CART and ET keep the parameters load_classifiers gives them; every
        # other classifier gets the tuned parameters stored for this dataset.
        best_params = None
        if clf_name not in ('CART', 'ET'):
            params_path = os.path.join(tuning_path, clf_name, data_name, "noFS", "best_params.pkl")
            with open(params_path, 'rb') as fp:
                best_params = pickle.load(fp)

        for iteration in range(n_iter):
            print("===== CV of %s ===== on %s ===== Iteration N°%s " % (clf_name, data_name, iteration))
            clf = load_classifiers(clf_name)
            if best_params is not None:
                clf.set_params(**best_params)

            # The iteration number seeds the shuffling, which makes the folds
            # of every repetition reproducible.
            kf = KFold(X_train.shape[0], n_folds=n_folds, shuffle=True, random_state=iteration)

            cv_result_acc = cross_val_score(clf, X_train, y_train, scoring="accuracy", cv=kf, n_jobs=n_jobs)
            cv_result_auc = cross_val_score(clf, X_train, y_train, scoring="roc_auc", cv=kf, n_jobs=n_jobs)
            cv_result_rms = cross_val_score(clf, X_train, y_train, scoring=rms, cv=kf, n_jobs=n_jobs)

            # Write through a single positional indexer: the chained form
            # frame[col].iloc[rows] = ... may assign to a temporary copy.
            rows = slice(n_folds * iteration, n_folds * (iteration + 1))
            for frame, scores in ((result_noFS_ACC, cv_result_acc),
                                  (result_noFS_AUC, cv_result_auc),
                                  (result_noFS_RMS, cv_result_rms)):
                frame.iloc[rows, frame.columns.get_loc(clf_name)] = scores

        # Checkpoint after each classifier.
        for metric in metric_names:
            with open(os.path.join(no_fs_path, "results_%s.pkl" % metric), 'wb') as fp:
                pickle.dump(results[metric], fp)

### With feature selection and without calibration

In [None]:
# Experiment: with feature selection, no calibration (big datasets only).
#
# Each dataset is reduced to its n_selected best features according to snr().
# Every classifier is then scored with n_iter repetitions of an n_folds-fold
# cross-validation on the training part (80%) of the data. Per-fold accuracy,
# ROC AUC and RMS scores go into one DataFrame per metric (one column per
# classifier, n_iter * n_folds rows), pickled after each classifier.
rms = rms_metric()
tuning_seed = 0
tuning_path = './results/tuning/'
n_iter = 10
n_folds = 5
n_selected = 100
n_jobs = 1
results_path = "./results/performances/"

metric_names = ("ACC", "AUC", "RMS")
all_clfs = load_classifiers_names()

for data_name in big_data:

    data_path = os.path.join(results_path, data_name)
    fs_path = os.path.join(data_path, "FS")
    if not os.path.isdir(fs_path):
        # makedirs also creates the missing parent directories.
        os.makedirs(fs_path)

    # Resume from previously pickled tables when they exist, otherwise start
    # from all-zero tables.
    results = {}
    for metric in metric_names:
        pkl_path = os.path.join(fs_path, "results_%s.pkl" % metric)
        if os.path.isfile(pkl_path):
            with open(pkl_path, 'rb') as fp:
                results[metric] = pickle.load(fp)
        else:
            results[metric] = pd.DataFrame(np.zeros((n_iter * n_folds, len(all_clfs))), columns=all_clfs)
    result_FS_ACC = results["ACC"]
    result_FS_AUC = results["AUC"]
    result_FS_RMS = results["RMS"]

    X = datasets[data_name + '_data']
    y = datasets[data_name + '_target']
    # NOTE(review): the features are ranked on the whole dataset, i.e. before
    # the hold-out split and outside the cross-validation folds, so the scores
    # below may be optimistically biased -- confirm this is intended.
    best_features = snr(X, y)
    X = X.iloc[:, best_features[:n_selected]]

    # The hold-out split is seeded with tuning_seed, so it is the same for
    # every classifier and iteration: compute it once per dataset.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=tuning_seed, stratify=y)

    for clf_name in clf_names:
        # CART and ET keep the parameters load_classifiers gives them; every
        # other classifier gets the tuned parameters stored for this dataset.
        best_params = None
        if clf_name not in ('CART', 'ET'):
            params_path = os.path.join(tuning_path, clf_name, data_name, "FS", "best_params.pkl")
            with open(params_path, 'rb') as fp:
                best_params = pickle.load(fp)

        for iteration in range(n_iter):
            print("===== CV of %s ===== on %s ===== Iteration N°%s " % (clf_name, data_name, iteration))
            clf = load_classifiers(clf_name)
            if best_params is not None:
                clf.set_params(**best_params)

            # The iteration number seeds the shuffling, which makes the folds
            # of every repetition reproducible.
            kf = KFold(X_train.shape[0], n_folds=n_folds, shuffle=True, random_state=iteration)

            cv_result_acc = cross_val_score(clf, X_train, y_train, scoring="accuracy", cv=kf, n_jobs=n_jobs)
            cv_result_auc = cross_val_score(clf, X_train, y_train, scoring="roc_auc", cv=kf, n_jobs=n_jobs)
            cv_result_rms = cross_val_score(clf, X_train, y_train, scoring=rms, cv=kf, n_jobs=n_jobs)

            # Write through a single positional indexer: the chained form
            # frame[col].iloc[rows] = ... may assign to a temporary copy.
            rows = slice(n_folds * iteration, n_folds * (iteration + 1))
            for frame, scores in ((result_FS_ACC, cv_result_acc),
                                  (result_FS_AUC, cv_result_auc),
                                  (result_FS_RMS, cv_result_rms)):
                frame.iloc[rows, frame.columns.get_loc(clf_name)] = scores

        # Checkpoint after each classifier.
        for metric in metric_names:
            with open(os.path.join(fs_path, "results_%s.pkl" % metric), 'wb') as fp:
                pickle.dump(results[metric], fp)

### Without feature selection and with calibration

In [None]:
# Experiment: no feature selection, with calibration.
#
# Every classifier is wrapped in an isotonic CalibratedClassifierCV and scored
# with n_iter repetitions of a stratified n_folds-fold cross-validation on the
# training part (80%) of the data. Per-fold accuracy, ROC AUC and RMS scores
# go into one DataFrame per metric (one column per classifier,
# n_iter * n_folds rows), pickled after each classifier.
rms = rms_metric()
tuning_seed = 0
cal_seed = 2
tuning_path = './results/tuning/'
n_iter = 10
n_folds = 5
n_jobs = 1
results_path = "./results/performances/"

metric_names = ("ACC", "AUC", "RMS")
all_clfs = load_classifiers_names()

for data_name in data_names:

    data_path = os.path.join(results_path, data_name)
    # The variable keeps its historical name; it points to the "noFSCal" folder.
    no_fs_path = os.path.join(data_path, "noFSCal")
    if not os.path.isdir(no_fs_path):
        # makedirs also creates the missing parent directories.
        os.makedirs(no_fs_path)

    # Resume from previously pickled tables when they exist, otherwise start
    # from all-zero tables.
    results = {}
    for metric in metric_names:
        pkl_path = os.path.join(no_fs_path, "results_%s.pkl" % metric)
        if os.path.isfile(pkl_path):
            with open(pkl_path, 'rb') as fp:
                results[metric] = pickle.load(fp)
        else:
            results[metric] = pd.DataFrame(np.zeros((n_iter * n_folds, len(all_clfs))), columns=all_clfs)
    result_noFSCal_ACC = results["ACC"]
    result_noFSCal_AUC = results["AUC"]
    result_noFSCal_RMS = results["RMS"]

    X = datasets[data_name + '_data']
    y = datasets[data_name + '_target']

    # The hold-out split is seeded with tuning_seed, so it is the same for
    # every classifier and iteration: compute it once per dataset. Doing it
    # here also guarantees the split always belongs to the current dataset.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=tuning_seed, stratify=y)

    for clf_name in clf_names:
        # Classifiers whose name starts with "Rot" are only evaluated on the
        # small datasets. The whole evaluation is skipped for them elsewhere;
        # guarding only the split would score them on a stale split.
        if clf_name.startswith("Rot") and data_name not in small_data:
            continue

        # CART and ET keep the parameters load_classifiers gives them; every
        # other classifier gets the tuned "noFS" parameters for this dataset.
        best_params = None
        if clf_name not in ('CART', 'ET'):
            params_path = os.path.join(tuning_path, clf_name, data_name, "noFS", "best_params.pkl")
            with open(params_path, 'rb') as fp:
                best_params = pickle.load(fp)

        for iteration in range(n_iter):
            print("===== CV of %s ===== on %s ===== Iteration N°%s " % (clf_name, data_name, iteration))
            clf = load_classifiers(clf_name)
            if best_params is not None:
                clf.set_params(**best_params)

            # Calibration
            # NOTE(review): these folds are built from the size of the
            # hold-out part (X_val), yet CalibratedClassifierCV presumably
            # applies them to the data it is fitted on (the CV training folds
            # of X_train) -- confirm this is intended.
            kf_cal = KFold(X_val.shape[0], n_folds=3, shuffle=True, random_state=cal_seed)
            cal = CalibratedClassifierCV(base_estimator=clf, method='isotonic', cv=kf_cal)

            # The iteration number seeds the shuffling, which makes the folds
            # of every repetition reproducible.
            kf = StratifiedKFold(y_train, n_folds=n_folds, shuffle=True, random_state=iteration)

            cv_result_acc = cross_val_score(cal, X_train, y_train, scoring="accuracy", cv=kf, n_jobs=n_jobs)
            cv_result_auc = cross_val_score(cal, X_train, y_train, scoring="roc_auc", cv=kf, n_jobs=n_jobs)
            cv_result_rms = cross_val_score(cal, X_train, y_train, scoring=rms, cv=kf, n_jobs=n_jobs)

            # Write through a single positional indexer: the chained form
            # frame[col].iloc[rows] = ... may assign to a temporary copy.
            rows = slice(n_folds * iteration, n_folds * (iteration + 1))
            for frame, scores in ((result_noFSCal_ACC, cv_result_acc),
                                  (result_noFSCal_AUC, cv_result_auc),
                                  (result_noFSCal_RMS, cv_result_rms)):
                frame.iloc[rows, frame.columns.get_loc(clf_name)] = scores

        # Checkpoint after each classifier.
        for metric in metric_names:
            with open(os.path.join(no_fs_path, "results_%s.pkl" % metric), 'wb') as fp:
                pickle.dump(results[metric], fp)

### With feature selection and with calibration

In [None]:
# Experiment: with feature selection, with calibration (big datasets only).
#
# Each dataset is reduced to its n_selected best features according to snr().
# Every classifier is wrapped in an isotonic CalibratedClassifierCV and scored
# with n_iter repetitions of a stratified n_folds-fold cross-validation on the
# training part (80%) of the data. Per-fold accuracy, ROC AUC and RMS scores
# go into one DataFrame per metric (one column per classifier,
# n_iter * n_folds rows), pickled after each classifier.
rms = rms_metric()
tuning_seed = 0
cal_seed = 2
tuning_path = './results/tuning/'
n_iter = 10
n_folds = 5
n_selected = 100
n_jobs = 1
results_path = "./results/performances/"

metric_names = ("ACC", "AUC", "RMS")
all_clfs = load_classifiers_names()

for data_name in big_data:

    data_path = os.path.join(results_path, data_name)
    # The variable keeps its historical name; it points to the "FSCal" folder.
    no_fs_path = os.path.join(data_path, "FSCal")
    if not os.path.isdir(no_fs_path):
        # makedirs also creates the missing parent directories.
        os.makedirs(no_fs_path)

    # Resume from previously pickled tables when they exist, otherwise start
    # from all-zero tables.
    results = {}
    for metric in metric_names:
        pkl_path = os.path.join(no_fs_path, "results_%s.pkl" % metric)
        if os.path.isfile(pkl_path):
            with open(pkl_path, 'rb') as fp:
                results[metric] = pickle.load(fp)
        else:
            results[metric] = pd.DataFrame(np.zeros((n_iter * n_folds, len(all_clfs))), columns=all_clfs)
    result_FSCal_ACC = results["ACC"]
    result_FSCal_AUC = results["AUC"]
    result_FSCal_RMS = results["RMS"]

    X = datasets[data_name + '_data']
    y = datasets[data_name + '_target']
    # NOTE(review): the features are ranked on the whole dataset, i.e. before
    # the hold-out split and outside the cross-validation folds, so the scores
    # below may be optimistically biased -- confirm this is intended.
    best_features = snr(X, y)
    X = X.iloc[:, best_features[:n_selected]]

    # The hold-out split is seeded with tuning_seed, so it is the same for
    # every classifier and iteration: compute it once per dataset.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=tuning_seed, stratify=y)

    for clf_name in clf_names:
        # Classifiers whose name starts with "Rot" are only evaluated on the
        # small datasets; elsewhere their column is left untouched.
        if clf_name.startswith("Rot") and data_name not in small_data:
            continue

        # CART and ET keep the parameters load_classifiers gives them; every
        # other classifier gets the tuned "FS" parameters for this dataset.
        best_params = None
        if clf_name not in ('CART', 'ET'):
            params_path = os.path.join(tuning_path, clf_name, data_name, "FS", "best_params.pkl")
            with open(params_path, 'rb') as fp:
                best_params = pickle.load(fp)

        for iteration in range(n_iter):
            print("===== CV of %s ===== on %s ===== Iteration N°%s " % (clf_name, data_name, iteration))
            clf = load_classifiers(clf_name)
            if best_params is not None:
                clf.set_params(**best_params)

            # Calibration
            # NOTE(review): these folds are built from the hold-out labels
            # (y_val), yet CalibratedClassifierCV presumably applies them to
            # the data it is fitted on (the CV training folds of X_train) --
            # confirm this is intended.
            kf_cal = StratifiedKFold(y_val, n_folds=2, shuffle=True, random_state=cal_seed)
            cal = CalibratedClassifierCV(base_estimator=clf, method='isotonic', cv=kf_cal)

            # The iteration number seeds the shuffling, which makes the folds
            # of every repetition reproducible.
            kf = StratifiedKFold(y_train, n_folds=n_folds, shuffle=True, random_state=iteration)

            cv_result_acc = cross_val_score(cal, X_train, y_train, scoring="accuracy", cv=kf, n_jobs=n_jobs)
            cv_result_auc = cross_val_score(cal, X_train, y_train, scoring="roc_auc", cv=kf, n_jobs=n_jobs)
            cv_result_rms = cross_val_score(cal, X_train, y_train, scoring=rms, cv=kf, n_jobs=n_jobs)

            # Write through a single positional indexer: the chained form
            # frame[col].iloc[rows] = ... may assign to a temporary copy.
            rows = slice(n_folds * iteration, n_folds * (iteration + 1))
            for frame, scores in ((result_FSCal_ACC, cv_result_acc),
                                  (result_FSCal_AUC, cv_result_auc),
                                  (result_FSCal_RMS, cv_result_rms)):
                frame.iloc[rows, frame.columns.get_loc(clf_name)] = scores

        # Checkpoint after each classifier.
        for metric in metric_names:
            with open(os.path.join(no_fs_path, "results_%s.pkl" % metric), 'wb') as fp:
                pickle.dump(results[metric], fp)