In [1]:
def load_train_test_data(df, verbose=True, test_size=0.15):
    '''
    Splits a preprocessed dataframe into training and testing data. The split is stratified on the label and uses
    a fixed random state, so calling this function with the same dataframe gives the same split in every notebook.

    The default is an 85-15 split to have enough training data while keeping a suitable amount of data for testing.

    df: dataframe holding the feature columns plus the target column 'label'
    verbose: if True, shows the head of the training features and prints the shapes of both feature sets
    test_size: handed to train_test_split as test_size

    returns: X_train, X_test, y_train, y_test
    '''
    # import packages needed
    import pandas as pd
    from sklearn.model_selection import train_test_split

    # everything except the 'label' column is a feature
    target = df['label']
    features = df.drop('label', axis=1)

    # stratified split with a fixed seed for reproducibility
    split = train_test_split(features, target, test_size=test_size, random_state=0, stratify=target)
    X_train, X_test, y_train, y_test = split

    if not verbose:
        return (X_train, X_test, y_train, y_test)

    # print information about the dataset
    print('A snippet of the training data:')
    # display is not imported here - presumably the notebook (IPython) built-in
    display(X_train.head())
    print("There are {} entries with {} columns in the training data.".format(*X_train.shape))
    print("There are {} entries with {} columns in the testing data.".format(*X_test.shape))

    return (X_train, X_test, y_train, y_test)

In [2]:
def find_all_hard_cases(cm, threshold=0.2, emotions='all'):
    """
    Prints the classes a model confuses most often, derived from its confusion matrix.

    For every label the matching row of the matrix is divided by the row sum and the matching column by the
    column sum. Every other class whose share is strictly greater than the threshold is reported as a hard case:
    hits in the row are reported as false negatives, hits in the column as false positives. Labels without any
    hard case are not printed.

    cm: square confusion matrix (anything pd.DataFrame accepts); the labels are expected to be 0..n-1 so that
        they can be used as positions in the list of emotion names
    threshold: minimum share (exclusive) of a row / column a wrong class needs to count as a hard case
    emotions: 'all' for the seven emotions, '5emos' for the five-emotion subset, or a sequence with one display
              name per label

    returns: None, the result is only printed
    raises: ValueError if emotions is a string other than 'all' or '5emos'
    """
    # import packages needed
    import numpy as np
    import pandas as pd

    # define emotion labels in correct order (once, before the loop, so a wrong option fails immediately)
    presets = {
        'all': ['anger', 'boredom', 'disgust', 'fear', 'happiness', 'neutral', 'sadness'],
        '5emos': ['boredom', 'disgust', 'happiness', 'neutral', 'sadness'],
    }
    if isinstance(emotions, str):
        if emotions not in presets:
            raise ValueError(f"Unknown emotions option {emotions!r}, expected one of {sorted(presets)} "
                             "or a sequence of label names.")
        emotion_labels = presets[emotions]
    else:
        emotion_labels = list(emotions)

    def describe(cases):
        # turns label indices into 'name(index)' strings for printing
        return [f'{emotion_labels[elem]}({elem})' for elem in cases]

    # convert the confusion matrix to a pandas dataframe
    cm = pd.DataFrame(cm)

    # iterate over each label
    for label in cm.index.tolist():
        # calculate false negative hard cases: share of each class within the row of this label
        row = cm.loc[label]
        fn_hard_cases = np.where(np.array(row / row.sum()) > threshold)[0]
        fn_hard_cases = fn_hard_cases[fn_hard_cases != label]  # the diagonal is the correct prediction

        # calculate false positive hard cases: share of each class within the column of this label
        column = cm[label]
        fp_hard_cases = np.where(np.array(column / column.sum()) > threshold)[0]
        fp_hard_cases = fp_hard_cases[fp_hard_cases != label]

        if len(fn_hard_cases) == 0 and len(fp_hard_cases) == 0:
            continue

        # print results
        print(f'Label: {emotion_labels[label]}({label})')
        if len(fn_hard_cases) > 0:
            print(f'Hard cases of false negatives: {describe(fn_hard_cases)}')
        if len(fp_hard_cases) > 0:
            print(f'Hard cases of false positives: {describe(fp_hard_cases)}')
        print()

In [3]:
def model_eval(model, X_train, X_test, y_train, y_test, f1=True, acc=True, recall=True, precision=True, confusion=False,
               hardcases_thrd=0.2, emotions='all'):
    '''
    Evaluates a fitted model on the train and the test data with the chosen standard metrics and prints the scores.
    f1, precision and recall are macro-averaged. If chosen, the confusion matrix of the test predictions is plotted
    and the hard cases of the model are printed.

    model: fitted model with a predict method, or a neural net whose class is named 'Net' and which is called
           directly on the data (the arg-max over dim 1 of its output is used as the prediction)
    X_train, X_test: features of the train / test data
    y_train, y_test: true labels of the train / test data
    f1, acc, recall, precision: switches for the individual metrics
    confusion: if True, plots the confusion matrix and prints the hard cases; this part uses model.classes_ and
               is skipped for 'Net' models
    hardcases_thrd: threshold handed to find_all_hard_cases
    emotions: emotions option handed to find_all_hard_cases

    returns: None, the results are only printed / plotted
    '''
    # import packages needed
    import matplotlib.pyplot as plt
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import f1_score
    from sklearn.metrics import recall_score
    from sklearn.metrics import precision_score
    from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

    is_neural_net = type(model).__name__ == 'Net'

    if is_neural_net:
        # softmax does not change which entry is the largest, so the arg-max is taken on the raw output;
        # this avoids the dependency on a globally imported F and the softmax call without an explicit dim
        pred_train = model(X_train).argmax(dim=1).float()
        pred_test = model(X_test).argmax(dim=1).float()
    else:
        pred_train = model.predict(X_train)
        pred_test = model.predict(X_test)

    # metrics are reported in this fixed order
    metrics = [(f1, f1_score), (acc, accuracy_score), (precision, precision_score), (recall, recall_score)]
    for active, func in metrics:
        if not active:
            continue
        # accuracy has no averaging option, all other metrics are macro-averaged over the classes
        options = {} if func is accuracy_score else {'average': 'macro'}
        v_train = func(y_train, pred_train, **options)
        v_test = func(y_test, pred_test, **options)
        print(f'\nEvaluation: {func.__name__}')
        print('{0:.2%} for the train data'.format(v_train))
        print('{0:.2%} for the test data'.format(v_test))

    if confusion and not is_neural_net:
        print('\nEvaluation: confusion_matrix of test predictions')
        cm = confusion_matrix(y_test, pred_test, labels=model.classes_)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
        disp.plot()
        plt.show()

        print('\nHard cases for the model:\n')
        find_all_hard_cases(cm, threshold=hardcases_thrd, emotions=emotions)

In [4]:
def run_gridsearchCV(model, grid_params, X_train, y_train, scoring='f1_macro', verbose=2, name_spec=None):
    '''
    This function runs a 5-fold cross-validated grid search on a given model with a grid defined by the grid parameters.
    Scoring used for cross-validation can be changed.

    It will save the best estimator (refitted on the whole training data) as a pickle file and print the best
    hyperparameter settings as well as the best cross-validated score.

    model: estimator to tune
    grid_params: parameter grid handed to GridSearchCV as param_grid
    X_train, y_train: training data the grid search is fitted on
    scoring: scoring used for the cross-validation, also part of the file name
    verbose: verbosity of GridSearchCV
    name_spec: optional suffix for the file name to keep several runs of the same model apart

    The file is written to ../results/models/<model class>_<scoring>_best_model[_<name_spec>].pkl

    returns: None
    '''
    # import packages needed
    import os
    import pickle
    from sklearn.model_selection import GridSearchCV

    # define the grid search
    grid_search = GridSearchCV(estimator=model, param_grid=grid_params, scoring=scoring,
                               cv=5, refit=True, verbose=verbose, return_train_score=True)

    # fit the grid search to training data
    grid_search.fit(X_train, y_train)

    # build the file name, the name_spec part is only added if it was given
    suffix = '' if name_spec is None else '_{}'.format(name_spec)
    model_path = '../results/models/{}_{}_best_model{}.pkl'.format(type(model).__name__, scoring, suffix)

    # save the best estimator; the folder is created if needed so a finished search is not lost,
    # and the file is closed again right after writing
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    with open(model_path, 'wb') as model_file:
        pickle.dump(grid_search.best_estimator_, model_file)

    # print best parameter settings and the cross-validated score for this best setting
    print('\nBest hyperparameters :', grid_search.best_params_)
    print('Best cross-validated {} :'.format(scoring), grid_search.best_score_)

In [5]:
def load_model(model, scoring, name_spec=None):
    '''
    Loads a saved model from ../results/models/<model class>_<scoring>_best_model[_<name_spec>].pkl.
    The scoring method must be specified.

    model: instance of the model class, only the name of its class is used for the file name
    scoring: scoring method that is part of the file name
    name_spec: optional suffix of the file name

    returns: best model if it exists, None if the file does not exist or cannot be loaded (a message is printed
             in both cases)
    '''
    # import packages needed
    import pickle

    # build the file name, the name_spec part is only added if it was given
    suffix = '' if name_spec is None else '_{}'.format(name_spec)
    model_path = '../results/models/{}_{}_best_model{}.pkl'.format(type(model).__name__, scoring, suffix)

    try:
        # NOTE: unpickling can execute arbitrary code, only load model files from a trusted source
        with open(model_path, 'rb') as model_file:
            return pickle.load(model_file)
    except FileNotFoundError:
        print('Model is not saved.')
        return None
    except Exception as err:
        # the file exists but is unreadable or broken - report this instead of claiming it was never saved
        print('Model could not be loaded: {!r}'.format(err))
        return None

In [6]:
def feature_importance(model, X_test, y_test, top=-1):
    """
    Plots the feature importances of a model as a horizontal bar chart, restricted to the top features chosen.
    Possible for SVC with linear or rbf kernel:
      - rbf: mean permutation importance computed on X_test / y_test
      - linear: absolute values of the first row of model.coef_
    If the input is a model or kernel unknown to this function it will print a corresponding statement.

    model: fitted model; only models whose class is named 'SVC' are supported
    X_test: test features, the column names are used as feature names
    y_test: test labels, only needed for the rbf kernel
    top: number of most important features to plot; -1 (or any value outside 0..number of features)
         plots all features

    returns: None
    """
    model_name = type(model).__name__
    if model_name != 'SVC':
        print(f'Sorry! {model_name} model is not supported for plotting feature importances.')
        return

    kernel = model.get_params()['kernel']
    if kernel not in ('rbf', 'linear'):
        print(f'Sorry! {kernel} kernel is not supported for plotting feature importances in SVC.')
        return

    # import packages needed (only once it is clear that something will be plotted)
    import numpy as np
    import matplotlib.pyplot as plt

    features = np.array(X_test.columns)
    # clamp top: plotting more bars than there are features would break the plot
    if top < 0 or top > len(features):
        top = len(features)  # show all features

    if kernel == 'rbf':
        from sklearn.inspection import permutation_importance  # feature importance rbf
        print('RBF kernel. Computing permutation importance.')
        importance = permutation_importance(model, X_test, y_test, random_state=0).importances_mean
        title = "Feature Permutation Importance"
    else:
        # NOTE(review): only the first row of coef_ is used; for a multi-class SVC this presumably covers a
        # single pair of classes only - confirm whether an aggregate over all rows is wanted
        importance = abs(model.coef_[0])
        title = "Feature Importance"

    # sorts in ascending order, so the most important features are at the end (and at the top of the plot)
    importance_sorted, features_sorted = zip(*sorted(zip(importance, features)))
    first_shown = len(importance_sorted) - top
    positions = range(top)
    plt.barh(positions, importance_sorted[first_shown:], align='center')
    plt.yticks(positions, features_sorted[first_shown:])
    plt.title(title)
    plt.show()