In [1]:
# load train and test data
def load_train_test_data():
    '''
    Load the preprocessed diabetes dataset and split it into training and testing data.

    The split uses a fixed random_state and is stratified on the target column, so every notebook that 
    calls this function gets the same training and testing data.

    We decided to do a 85-15 split since our dataset is not very big and we want to maximize the training 
    data while preserving the test data to some extent. 
    (Original author note: "We have a bit more data due to the oversampling now." - TODO confirm that this 
    still applies to the file loaded here.)

    A preview of the training features and the size of both datasets are printed. The preview relies on 
    `display` being available, as it is inside a Jupyter/IPython session.

    returns: X_train, X_test, y_train, y_test
    '''
    # import packages needed
    import pandas as pd
    from sklearn.model_selection import train_test_split

    target_col = 'Diabetic'

    # read the preprocessed dataset from disk
    data = pd.read_csv('data/diabetes_dataset_preprocessed.csv')

    # everything except the target column is a feature
    target = data[target_col]
    features = data.drop(target_col, axis=1)

    # reproducible 85-15 split that keeps the class proportions of the target
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.15, random_state=0, stratify=target
    )

    # show a preview of the training features and report the size of both datasets
    print('A snippet of our training data:')
    display(X_train.head())
    print("There are {} entries with {} columns in our training data.".format(*X_train.shape))
    print("There are {} entries with {} columns in our testing data.".format(*X_test.shape))

    return (X_train, X_test, y_train, y_test)

In [2]:
# load train and test data with oversampled training samples
def load_train_test_oversampled():
    '''
    Load our second preprocessed dataset, in which (according to the authors) the training data was 
    oversampled after splitting.

    The file already contains both datasets: every row is tagged as 'train' or 'test' in the 'type' column. 
    The rows are separated by that tag, the 'type' column is removed and the target column 'Diabetic' is 
    split off from the features.

    A preview of the training features and the size of both datasets are printed. The preview relies on 
    `display` being available, as it is inside a Jupyter/IPython session.

    returns: X_train, X_test, y_train, y_test
    '''
    # import packages needed
    import pandas as pd

    # read the dataset holding both the train and the test rows
    tagged = pd.read_csv('data/diabetes_dataset_preprocessed_oversampled.csv')

    def _features_and_target(kind):
        # keep only the rows tagged with `kind`, then split features from target
        rows = tagged[tagged['type'] == kind].drop('type', axis=1)
        return rows.drop('Diabetic', axis=1), rows['Diabetic']

    X_train, y_train = _features_and_target('train')
    X_test, y_test = _features_and_target('test')

    # show a preview of the training features and report the size of both datasets
    print('A snippet of our training data:')
    display(X_train.head())
    print("There are {} entries with {} columns in our training data.".format(*X_train.shape))
    print("There are {} entries with {} columns in our testing data.".format(*X_test.shape))

    return (X_train, X_test, y_train, y_test)

In [3]:
# model evaluation
def model_eval(model, X_train, X_test, y_train, y_test,acc=True,f1=True,recall=True,precision=True,confusion=False,roc=False):
    '''
    Evaluate a fitted model on the training and testing data with the standard metrics selected from the list:
    f1 score (macro-averaged) | accuracy | recall | precision | confusion matrix | ROC curve

    Every enabled metric is computed for both datasets and printed as a percentage. The confusion matrix and the 
    ROC curve are plotted for the test data only.

    model: fitted estimator with a predict() method (classes_ is additionally read when confusion=True)
    X_train, X_test, y_train, y_test: features and targets of the training and testing data
    acc, f1, recall, precision: switch the respective metric on or off
    confusion, roc: switch the respective plot on or off

    Note: recall and precision are called with the scikit-learn default settings, only the f1 score is 
    macro-averaged.

    returns: list of (train value, test value) tuples, one per enabled metric, always in the order
             f1 score, accuracy, recall, precision (disabled metrics are left out, plots add nothing)
    '''
    # import packages needed
    import matplotlib.pyplot as plt
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import f1_score
    from sklearn.metrics import recall_score 
    from sklearn.metrics import precision_score 
    from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
    from sklearn.metrics import RocCurveDisplay

    def macro_f1(y_true, y_pred):
        # The averaging mode has to be passed as an argument. Setting an `average` attribute on the 
        # f1_score function (as done before) has no effect on the computation and modifies the function 
        # object shared by everyone who imports it.
        return f1_score(y_true, y_pred, average='macro')

    # (switch, printed name, metric function) in the order in which the results are reported
    metrics = [
        (f1, 'f1_score', macro_f1),
        (acc, 'accuracy_score', accuracy_score),
        (recall, 'recall_score', recall_score),
        (precision, 'precision_score', precision_score),
    ]
    results = []

    # predict once and reuse the predictions for all metrics and the confusion matrix
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    for enabled, label, metric in metrics:
        if not enabled:
            continue
        print(f'\nEvaluation: {label}')
        v_train = metric(y_train, pred_train)
        v_test = metric(y_test, pred_test)
        print('{0:.2%} for the train data'.format(v_train))
        print('{0:.2%} for the test data'.format(v_test))
        results.append((v_train, v_test))

    if confusion:
        print(f'\nEvaluation: confusion_matrix')
        cm = confusion_matrix(y_test, pred_test, labels=model.classes_)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
        disp.plot()
        plt.show()

    if roc:
        print(f'\nEvaluation: roc')
        RocCurveDisplay.from_estimator(model, X_test, y_test)
        plt.show()

    return results

In [4]:
def run_gridsearchCV(model, grid_params, X_train, y_train, scoring='f1_macro', ros='',verbose=2, cv=10):
    '''
    This function will run a grid search on a given model with a grid defined by the grid parameters. Scoring used for 
    cross-validation can be changed. 

    It will save the best estimator (refitted on the whole training data) as a pickle file and print the best 
    hyperparameter settings as well as the best cross-validated score.

    model: estimator to tune
    grid_params: parameter grid handed to GridSearchCV as param_grid
    X_train, y_train: training data the search is fitted on
    scoring: scoring used for cross-validation, it is also part of the file name
    ros: free text that becomes part of the file name (e.g. to mark runs on the oversampled data)
    verbose: verbosity handed to GridSearchCV
    cv: number of cross-validation folds (default 10)

    The file is written to 'data/<model class name>_<ros>_<scoring>_best_model_grid.pkl'.

    returns: None
    '''
    # import packages needed
    from sklearn.model_selection import GridSearchCV
    import pickle

    # define the grid search
    grid_search = GridSearchCV(estimator=model, param_grid=grid_params, scoring=scoring, 
                       cv=cv, refit=True, verbose=verbose, return_train_score=True) 

    # fit the grid search to training data
    grid_search.fit(X_train, y_train)

    # save the best estimator; the with-block makes sure the file is flushed and closed
    model_path = 'data/{}_{}_{}_best_model_grid.pkl'.format(type(model).__name__, ros, scoring)
    with open(model_path, 'wb') as model_file:
        pickle.dump(grid_search.best_estimator_, model_file)

    # print best parameter settings and the cross-validated score for this best setting
    print('\nBest hyperparameters :', grid_search.best_params_)
    print('Best cross-validated {} :'.format(scoring), grid_search.best_score_)

    return

In [5]:
def run_randomizedsearchCV(model, params, X_train, y_train, scoring='f1_macro', ros='',verbose=2, n_iter=2400, cv=10):
    '''
    This function will run a random search on a given model within defined parameters. 
    Scoring used for cross-validation can be changed. 

    It will save the best estimator (refitted on the whole training data) as a pickle file and print the best 
    hyperparameter settings as well as the best cross-validated score.

    model: estimator to tune
    params: parameter distributions handed to RandomizedSearchCV as param_distributions
    X_train, y_train: training data the search is fitted on
    scoring: scoring used for cross-validation, it is also part of the file name
    ros: free text that becomes part of the file name (e.g. to mark runs on the oversampled data)
    verbose: verbosity handed to RandomizedSearchCV
    n_iter: number of sampled parameter settings (default 2400)
    cv: number of cross-validation folds (default 10)

    The sampling uses random_state=0, so repeated runs draw the same parameter settings.
    The file is written to 'data/<model class name>_<ros>_<scoring>_best_model_random.pkl'.

    returns: None
    '''
    # import packages needed
    from sklearn.model_selection import RandomizedSearchCV
    import pickle

    # define the randomized search
    random_search = RandomizedSearchCV(estimator=model, param_distributions=params, n_iter=n_iter, scoring=scoring, random_state=0,
                       cv=cv, refit=True, verbose=verbose, return_train_score=True) 

    # fit the randomized search to training data
    random_search.fit(X_train, y_train)

    # save the best estimator; the with-block makes sure the file is flushed and closed
    model_path = 'data/{}_{}_{}_best_model_random.pkl'.format(type(model).__name__, ros, scoring)
    with open(model_path, 'wb') as model_file:
        pickle.dump(random_search.best_estimator_, model_file)

    # print best parameter settings and the cross-validated score for this best setting
    print('\nBest hyperparameters :', random_search.best_params_)
    print('Best cross-validated {} :'.format(scoring), random_search.best_score_)

    return

In [6]:
def load_model(model, scoring, ros='', search='grid'):
    '''
    This function will load any of our saved best models after a grid search or a random search. The scoring 
    method used in the search must be specified.

    model: estimator of the same class as the one that was tuned (only its class name is used)
    scoring: scoring that was used in the search
    ros: the free text that was used when the model was saved
    search: 'grid' or 'random', depending on which search saved the model

    The file is read from 'data/<model class name>_<ros>_<scoring>_best_model_<search>.pkl'.

    returns: best model if it exists, otherwise None (a message is printed in that case)
    '''
    import pickle

    model_path = 'data/{}_{}_{}_best_model_{}.pkl'.format(type(model).__name__, ros, scoring, search)
    try:
        # NOTE: unpickling can execute arbitrary code - only load files that were written by our own searches
        with open(model_path, 'rb') as model_file:
            return pickle.load(model_file)
    except FileNotFoundError:
        print('Model is not saved.')
        return None
    except (pickle.UnpicklingError, EOFError):
        # the file exists but is empty or damaged (e.g. an interrupted save)
        print('Saved model could not be read.')
        return None