In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold, chi2
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OrdinalEncoder, LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
import time

## Create the evaluation function

In [4]:
# Function for feature selection version 2
def select_features(X, y, method, corr_threshold=0.5):
    
    if method is None:
        return X.columns
        
    # Feature selection using chi-squared
    elif method == 'p-value':
        # Remove constant features
        X = X.loc[:, X.apply(pd.Series.nunique) != 1]
        p_values = []
        for feature in X.columns:
            f, p = chi2(X[[feature]], y)
            p_values.append(p)

        p_values = np.array(p_values).reshape(-1)
        p_values = pd.Series(p_values, index=X.columns).sort_values()
        selected_features = p_values[p_values < 0.05].index
    
    # Feature selection using Pearson correlation
    elif method == 'correlation':
        X = X.loc[:, X.apply(pd.Series.nunique) != 1]
        corr = X.corr()
        col_corr = set()

        corr_matrix = np.abs(corr.values)

        n = corr_matrix.shape[0]

        for i in range(n):
            for j in range(i):
                if corr_matrix[i, j] > corr_threshold:
                    colname = X.columns[i]
                    col_corr.add(colname)

        selected_features = [col for col in X.columns if col not in col_corr]
                
    return selected_features

In [5]:
# Function for scaling the data
def scale(X_train, X_test, method):
    
    if method is None:
        return X_train, X_test

    # Scaling using StandardScaler 
    elif method == 'standard':
        scaler = StandardScaler().fit(X_train)

    # Scaling using MinMaxScaler
    elif method == 'minmax':
        scaler = MinMaxScaler().fit(X_train)

    # Scaling using RobustScaler
    elif method == 'robust':
        scaler = RobustScaler().fit(X_train)

    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
                
    return X_train_scaled, X_test_scaled

In [6]:
def evaluate_model(X, Y, feature_selection, scaling, classifier, splitting, folds=5):
    if splitting == "cv":
        kf = KFold(n_splits=folds, shuffle=True, random_state=18)
        metric_results = {}
        #do the KFold cross validation
        for train_index, test_index in kf.split(X,Y):
            # split the data into train and test for this fold
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]   
            
            # preprocess the data
            # feature selection
            preprocess_start_time = time.time()
            rel_features = select_features(X_train, y_train, feature_selection)
            n_features_orig = X_train.shape[1]
            n_features = len(rel_features)
            # scaling the data
            X_train_preprocessed, X_test_preprocessed = scale(X_train[rel_features], X_test[rel_features], scaling)
            preprocess_end_time = time.time()
            preprocess_time = preprocess_end_time - preprocess_start_time
                
            # train and evaluate the model
            clf = classifier
            train_start_time = time.time()
            clf.fit(X_train_preprocessed, y_train)
            train_end_time = time.time()
            training_time = train_end_time - train_start_time
            
            pred_start_time = time.time()
            Y_pred_fold = clf.predict(X_test_preprocessed)
            pred_end_time = time.time()
            prediction_time = pred_end_time - pred_start_time
                
            # calculate the accuracy, precision, recall and f1 score per fold
            accuracy = metrics.accuracy_score(y_test, Y_pred_fold)
            precision = metrics.precision_score(y_test, Y_pred_fold, average='weighted')
            recall = metrics.recall_score(y_test, Y_pred_fold, average='weighted')
            f1 = metrics.f1_score(y_test, Y_pred_fold, average='weighted')

            # add the scores to the dict
            metric_results.setdefault('accuracy', []).append(accuracy)
            metric_results.setdefault('precision', []).append(precision)
            metric_results.setdefault('recall', []).append(recall)
            metric_results.setdefault('f1', []).append(f1)
            metric_results.setdefault('training_time', []).append(training_time)
            metric_results.setdefault('prediction_time', []).append(prediction_time)
            metric_results.setdefault('preprocess_time', []).append(preprocess_time)
            
            
        # Calculate the mean of the metrics over all folds
        accuracy_mean = np.mean(metric_results['accuracy'])
        precision_mean = np.mean(metric_results['precision'])
        recall_mean = np.mean(metric_results['recall'])
        f1_mean = np.mean(metric_results['f1'])
        training_time_mean = np.mean(metric_results['training_time'])
        prediction_time_mean = np.mean(metric_results['prediction_time'])
        preprocess_time_mean = np.mean(metric_results['preprocess_time'])
        
        # Calculate the standard deviation of f1
        f1_std = np.std(metric_results['f1'])

        return {'classifier': classifier.__class__.__name__, 'splitting': splitting, 'feature_selection': feature_selection, 'scaling': scaling, 'n_features_orig': n_features_orig, 'n_features': n_features, 'accuracy': accuracy_mean, 'precision': precision_mean, 'recall': recall_mean, 'f1': f1_mean, 'f1_std': f1_std, 'preprocess_time': preprocess_time_mean,'training_time': training_time_mean, 'prediction_time': prediction_time_mean}

    elif splitting == "holdout":
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=18)
        
        # preprocess the data
        # feature selection
        preprocess_start_time = time.time()
        rel_features = select_features(X_train, y_train, feature_selection)
        n_features_orig = X_train.shape[1]
        n_features = len(rel_features)
        # scaling the data
        X_train_preprocessed, X_test_preprocessed = scale(X_train[rel_features], X_test[rel_features], scaling)
        preprocess_end_time = time.time()
        preprocess_time = preprocess_end_time - preprocess_start_time


        

        # train and evaluate the model
        clf = classifier
        train_start_time = time.time()
        clf.fit(X_train_preprocessed, y_train)
        train_end_time = time.time()
        training_time = train_end_time - train_start_time
        
        pred_start_time = time.time()
        Y_pred = clf.predict(X_test_preprocessed)
        pred_end_time = time.time()
        prediction_time = pred_end_time - pred_start_time

        # calculate the accuracy, precision, recall and f1 score per fold
        accuracy = metrics.accuracy_score(y_test, Y_pred)
        precision = metrics.precision_score(y_test, Y_pred, average='weighted')
        recall = metrics.recall_score(y_test, Y_pred, average='weighted')
        f1 = metrics.f1_score(y_test, Y_pred, average='weighted')

        return {'classifier': classifier.__class__.__name__, 'splitting': splitting, 'feature_selection': feature_selection, 'scaling': scaling, 'n_features_orig': n_features_orig, 'n_features': n_features, 'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1, 'preprocess_time': preprocess_time,'training_time': training_time, 'prediction_time': prediction_time}
    

### Smote function for balancing class column

For those datasets that have an imbalanced response variable the smote method can be implimented to compensate for the class imbalance in the data

In [7]:
def apply_smote(df, target_column, k_neighbors=4, random_state=321):

    X = df.drop(target_column, axis=1)
    y = df[target_column]

    sm = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
    X_resampled, y_resampled = sm.fit_resample(X, y)

    return X_resampled, y_resampled

## Evaluation

The `evaluate_model` function is designed for assessing the performance of machine learning models. It takes the following parameters:

- **X**: Input features.
- **Y**: Dependent variable.
- **feature_selection**: A string specifying the feature selection method (p-value or correlation) or None if no feature selection should be applied.
- **scaling**: A string specyfying the scaling method (standard, robust, minmax) or None if no scaling should be applied.
- **classifier**: Classifier function (e.g. RandomForestClassifier).
- **splitting**: A string indicating the splitting method:
  - `"cv"` for K-Fold Cross-Validation.
  - `"holdout"` for a holdout set.
- **folds**: The number of folds for cross validation (default is 5).

Return values:
- **classifier:** The name of the classifier (e.g., 'RandomForestClassifier').
- **splitting:** The splitting method used ('cv' or 'holdout').
- **feature_selection:** The method of feature selection.
- **scaling:** The method used for scaling.
- **n_features_orig:** The original number of features in the dataset.
- **n_features:** The number of features after feature selection.
- **accuracy:** Accuracy score (mean for 'cv', single value for 'holdout').
- **precision:** Precision score (mean for 'cv', single value for 'holdout').
- **recall:** Recall score (mean for 'cv', single value for 'holdout').
- **f1:** F1 score (mean for 'cv', single value for 'holdout').
- **f1_std:** Standard deviation of F1 scores for 'cv' (0 for 'holdout').
- **preprocess_time:** Total time taken for preprocessing (feature selection and scaling).
- **training_time:** Total time taken for training the model.
- **prediction_time:** Total time taken for making predictions.

### Loading the datasets

In [11]:
bank_marketing = pd.read_csv("./preprocessed-datasets/bank_marketing_prepro.csv")
bank_marketing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 35 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Unnamed: 0                     41188 non-null  int64  
 1   age                            41188 non-null  int64  
 2   default                        41188 non-null  float64
 3   housing                        41188 non-null  float64
 4   loan                           41188 non-null  float64
 5   campaign                       41188 non-null  int64  
 6   pdays                          41188 non-null  int64  
 7   previous                       41188 non-null  int64  
 8   emp.var.rate                   41188 non-null  float64
 9   cons.price.idx                 41188 non-null  float64
 10  cons.conf.idx                  41188 non-null  float64
 11  euribor3m                      41188 non-null  float64
 12  nr.employed                    41188 non-null 

In [12]:
wine_quality = pd.read_csv("./preprocessed-datasets/wine_quality_prepro.csv")
wine_quality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            6497 non-null   int64  
 1   fixed acidity         6497 non-null   float64
 2   volatile acidity      6497 non-null   float64
 3   citric acid           6497 non-null   float64
 4   residual sugar        6497 non-null   float64
 5   chlorides             6497 non-null   float64
 6   free sulfur dioxide   6497 non-null   float64
 7   total sulfur dioxide  6497 non-null   float64
 8   density               6497 non-null   float64
 9   pH                    6497 non-null   float64
 10  sulphates             6497 non-null   float64
 11  alcohol               6497 non-null   float64
 12  class                 6497 non-null   int64  
 13  wine_type             6497 non-null   int64  
dtypes: float64(11), int64(3)
memory usage: 710.7 KB


In [13]:
congression_voting = pd.read_csv("./preprocessed-datasets/CongressionVoting_prepro.csv")
congression_voting.set_index('ID', inplace=True)
congression_voting.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 217 entries, 140 to 324
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   handicapped-infants                     217 non-null    float64
 1   water-project-cost-sharing              217 non-null    float64
 2   adoption-of-the-budget-resolution       217 non-null    float64
 3   physician-fee-freeze                    217 non-null    float64
 4   el-salvador-aid                         217 non-null    float64
 5   religious-groups-in-schools             217 non-null    float64
 6   anti-satellite-test-ban                 217 non-null    float64
 7   aid-to-nicaraguan-contras               217 non-null    float64
 8   mx-missile                              217 non-null    float64
 9   immigration                             217 non-null    float64
 10  synfuels-crporation-cutback             217 non-null    floa

In [14]:
reviews = pd.read_csv("./preprocessed-datasets/Review_prepro.csv")
reviews.set_index('ID', inplace=True)
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 750 entries, 0 to 749
Columns: 10001 entries, V1 to class
dtypes: int64(10000), object(1)
memory usage: 57.2+ MB


### Different combinations of feature selection, scaling and classifiers are evaluated on the different datasets:

In [None]:
datasets = {
    #'wine_quality': wine_quality,
    'reviews': reviews,
    #'congression_voting': congression_voting,
    #'bank_marketing': bank_marketing
}

feature_selection_methods = [None, 'p-value', 'correlation']
scaling_methods = [None, 'standard', 'robust', 'minmax']
classifiers = [KNeighborsClassifier(), DecisionTreeClassifier(), RandomForestClassifier()]
splitting_methods = ['holdout', 'cv']

results = []

for dataset_name, dataset in datasets.items():
    for feature_selection_method in feature_selection_methods:
        for scaling_method in scaling_methods:
            for classifier in classifiers:
                for splitting_method in splitting_methods: # ,'cv']:
                    if dataset_name == 'wine_quality' or dataset_name == 'bank_marketing':
                        target_column = 'class'
                        X,Y = apply_smote(dataset,target_column)
                    else:
                        X = dataset.drop(columns=["class"])
                        Y = dataset["class"]
                    evaluation_result = evaluate_model(X, Y, feature_selection_method, scaling_method, classifier, splitting_method)
                    print(f"Time taken for {dataset_name}, {feature_selection_method}, {scaling_method}, {classifier.__class__.__name__}, {splitting_method}: preprocessing time: {evaluation_result['preprocess_time']} seconds, training time: {evaluation_result['training_time']} seconds")
                    results.append({'dataset': dataset_name, 'result': evaluation_result})


In [186]:
df = pd.DataFrame(results)
results_df = pd.json_normalize(df['result'])
results_df.insert(0, 'dataset', df['dataset'])
#results_df.to_csv('./data/results_reviews.csv', index=False)
results_df

Unnamed: 0,dataset,classifier,splitting,feature_selection,scaling,n_features_orig,n_features,accuracy,precision,recall,f1,preprocess_time,training_time,prediction_time,f1_std
0,reviews,KNeighborsClassifier,holdout,,,10000,10000,0.168889,0.226259,0.168889,0.167173,0.183552,0.588872,0.371689,
1,reviews,KNeighborsClassifier,cv,,,10000,10000,0.190667,0.22928,0.190667,0.177675,0.033993,0.188579,0.299308,0.042618
2,reviews,DecisionTreeClassifier,holdout,,,10000,10000,0.333333,0.390857,0.333333,0.340607,0.044375,1.612597,0.094909,
3,reviews,DecisionTreeClassifier,cv,,,10000,10000,0.312,0.370688,0.312,0.312211,0.029096,1.686393,0.094763,0.048509
4,reviews,RandomForestClassifier,holdout,,,10000,10000,0.484444,0.592447,0.484444,0.481131,0.049906,2.715339,0.146309,
5,reviews,RandomForestClassifier,cv,,,10000,10000,0.529333,0.59512,0.529333,0.518244,0.031126,1.710614,0.121008,0.053394
6,reviews,KNeighborsClassifier,holdout,,standard,10000,10000,0.062222,0.082339,0.062222,0.043111,0.506777,0.048974,0.105852,
7,reviews,KNeighborsClassifier,cv,,standard,10000,10000,0.036,0.034935,0.036,0.017145,0.556181,0.057065,0.073821,0.01251
8,reviews,DecisionTreeClassifier,holdout,,standard,10000,10000,0.342222,0.388153,0.342222,0.341065,0.482351,1.311808,0.005961,
9,reviews,DecisionTreeClassifier,cv,,standard,10000,10000,0.321333,0.363486,0.321333,0.317793,0.471154,1.408854,0.006259,0.051515
