In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_classif

## Create the evaluation function

In [114]:
def feature_selection(X, Y, p_threshold=0.001):
    """Return the columns of ``X`` that are significantly related to ``Y``.

    Each column of ``X`` is scored against the class labels ``Y`` with
    ``f_classif`` (ANOVA F-test), and the columns whose p-value is strictly
    below ``p_threshold`` are kept.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column labels are used as the feature names.
    Y : array-like
        Class labels, one per row of ``X``.
    p_threshold : float, default 0.001
        Significance level a feature must beat to be selected. Using the
        common three-level convention (***: p < 0.001, **: p < 0.01,
        *: p < 0.05), the default keeps only the "***" features; pass 0.01
        or 0.05 for a more permissive selection.

    Returns
    -------
    list
        Labels of the selected columns, in the column order of ``X``.
        The list is empty when no feature passes the threshold.
    """
    # Only the p-values drive the selection; the F-statistics are not needed.
    _, p_values = f_classif(X, Y)

    # A NaN p-value (f_classif can produce one, e.g. for a constant column)
    # compares False here, so such a feature is never selected.
    significant = p_values < p_threshold
    relevant_features = X.columns[significant].tolist()

    return relevant_features

In [115]:
def evaluate_model(X, Y, preprocess, classifier, folds=5, random_state=None):
    """Estimate a classifier's weighted F1 score with shuffled K-fold CV.

    For every fold the chosen preprocessing is fitted on the training part
    only and then applied to the test part, a fresh copy of ``classifier``
    is trained, and the weighted F1 score on the test part is recorded.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (rows are selected positionally with ``.iloc``).
    Y : pandas.Series
        Target labels aligned with the rows of ``X``.
    preprocess : str
        ``"none"`` leaves the features untouched, ``"featureselection"``
        keeps the columns returned by ``feature_selection`` on the training
        fold, and any other value standardises the features with
        ``StandardScaler``.
    classifier : estimator
        Unfitted scikit-learn style estimator exposing ``fit`` and
        ``predict``. It is cloned for every fold, so the object passed in
        is left untouched.
    folds : int, default 5
        Number of cross-validation folds.
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` gives a different split on
        every call; pass an integer to make the score reproducible.

    Returns
    -------
    float
        Mean of the per-fold weighted F1 scores.

    Raises
    ------
    ValueError
        If ``preprocess == "featureselection"`` and no feature is selected
        on some training fold.
    """
    # Imported locally so this definition does not rely on another cell.
    from sklearn.base import clone

    kf = KFold(n_splits=folds, shuffle=True, random_state=random_state)
    f1_folds = []

    # K-fold cross-validation
    for train_index, test_index in kf.split(X, Y):
        # split the data into train and test for this fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

        # preprocess the data (always fitted on the training fold only,
        # so no information leaks from the test fold)
        if preprocess == "none":
            X_train_preprocessed = X_train
            X_test_preprocessed = X_test
        elif preprocess == "featureselection":
            rel_features = feature_selection(X_train, y_train)
            if not rel_features:
                # Fail with a clear message instead of an opaque error from
                # fitting the estimator on a zero-column frame.
                raise ValueError(
                    "feature selection kept no features for this fold"
                )
            X_train_preprocessed = X_train[rel_features]
            X_test_preprocessed = X_test[rel_features]
        else:
            scaler = StandardScaler()
            X_train_preprocessed = scaler.fit_transform(X_train)
            X_test_preprocessed = scaler.transform(X_test)

        # train a fresh, unfitted copy of the estimator for this fold and
        # predict on the held-out part
        clf = clone(classifier)
        clf.fit(X_train_preprocessed, y_train)
        Y_pred_fold = clf.predict(X_test_preprocessed)

        # weighted F1 score of this fold
        f1_folds.append(f1_score(y_test, Y_pred_fold, average="weighted"))

    # mean weighted F1 score over all folds
    f1_mean = np.mean(f1_folds)

    return f1_mean

## Load the data

### Congressional voting
Iterative imputation has already been applied to the data.

In [116]:
# Read the preprocessed Congressional Voting data from disk.
test_df = pd.read_csv("./preprocessed-datasets/CongressionVoting_prepro.csv")

# The "class" column is the target; every other column is a feature.
test_df_Y = test_df["class"]
test_df_X = test_df.drop("class", axis=1)

## Evaluation

The **evaluate_model** function takes:
* the input features (*X*),
* the dependent variable (*Y*),
* the preprocessing method as a string:
    * the options implemented so far are listed in *preprocess_options*. The *else* option applies *StandardScaler()* (any string other than *none* or *featureselection* is treated the same way); it could be combined with additional feature selection to achieve a better result.
* the classifier as a scikit-learn estimator instance:
    * so far the only one tried is *SVC(C=3, kernel='sigmoid')*
* folds, the number of cross-validation folds. The default value is 5.

It returns the weighted F1 score averaged over all folds.

In [117]:
# Preprocessing modes understood by evaluate_model.
preprocess_options = [
    "none",              # use the features as they are
    "featureselection",  # keep only the features picked by feature_selection
    "else",              # standardise the features with StandardScaler
]

In [118]:
# Mean weighted F1 of a sigmoid-kernel SVC on the voting data, using the
# F-test based feature selection as preprocessing (default 5 folds).
evaluate_model(
    X=test_df_X,
    Y=test_df_Y,
    preprocess="featureselection",
    classifier=SVC(C=3, kernel='sigmoid'),
)

0.847269679150177