### From previous investigations we identified models that performed well on this task, together with the hyperparameters that made them work.

In [None]:
import ast
import itertools
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from utils import get_data, get_param_combinations

sns.set_theme()

In [None]:
# Base name (without extension) of the results CSV that is read from ../results/.
filename = "experiments_joined"

In [None]:
# Load the experiment log and expand the stringified classifier parameters
# into one column per hyperparameter.
results = pd.read_csv(f"../results/{filename}.csv")

# Single quotes are swapped for double quotes so that json.loads can parse
# each entry of the column.
results['classifier_params'] = results['classifier_params'].str.replace("'", '"')
clf_params_df = pd.json_normalize(results['classifier_params'].apply(json.loads))

results = pd.concat(
    [results.drop(columns=['classifier_params']), clf_params_df],
    axis=1,
)

# Keep only the runs that used fewer than 10 features.
results = results[results['n_features'] < 10]

# Display-only view: the result is not assigned, so `results` keeps these columns.
results.drop(columns=['random_state', 'n_jobs', 'probability', 'device'])

In [None]:
# Best values (by top-20% accuracy) for every (n_features, classifier) pair.
# The sort must be descending: `first()` takes the leading entries of each group,
# so an ascending sort would show the worst runs instead. This also matches the
# per-classifier "best" tables built in the later cells.
results.sort_values('accuracy_top_20pc', ascending=False).groupby(['n_features', 'classifier']).first()

### In the ensemble we plan to use SVM, XGBoost and NN (and possibly RF, if combining it with XGBoost proves viable)

##### Which hyperparameters are best for SVM at each value of n_features:

In [None]:
# All SVC runs and, per feature count, the leading values after ranking the runs
# by top-20% accuracy (highest first).
svc_mask = results['classifier'] == 'SVC'
SVM_df = results[svc_mask]
SVM_best_df = (
    SVM_df
    .sort_values('accuracy_top_20pc', ascending=False)
    .groupby(['n_features'])
    .first()
)
# NOTE(review): GroupBy.first() skips NaN per column by default, so a row here may
# combine values from more than one run when the top run has missing entries — confirm.
SVM_best_df[['accuracy', 'accuracy_top_20pc', 'kernel', 'degree', 'gamma', 'coef0', 'probability']]

In [None]:
# Best values (by top-20% accuracy) for every SVM hyperparameter setting and feature count.
# The sort is descending so that `first()` reports the highest-scoring entries of each
# group, consistent with how SVM_best_df is built.
SVM_df.sort_values('accuracy_top_20pc', ascending=False).groupby(['n_features','kernel','degree', 'gamma', 'coef0']).first()

### SVM models by kernel vs n_features
The rbf kernel seems to perform best.

In [None]:
# Top-20% accuracy of the SVM runs against the feature count, one line per kernel.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(
    data=SVM_df,
    x="n_features",
    y="accuracy_top_20pc",
    hue="kernel",
    markers=True,
    ax=ax,
)
# Anchor the legend's upper-left corner at the top-right corner of the axes.
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.show()

### In the ensemble we will try SVM with the poly, rbf or sigmoid kernel; the linear kernel is not suited to the task

# Which hyperparameters are best for XGBoost

In [None]:
# All XGBClassifier runs and, per feature count, the leading values after ranking
# the runs by top-20% accuracy (highest first).
xgb_mask = results['classifier'] == 'XGBClassifier'
XGB_df = results[xgb_mask]
XGB_best_df = (
    XGB_df
    .sort_values('accuracy_top_20pc', ascending=False)
    .groupby(['n_features'])
    .first()
)
# NOTE(review): GroupBy.first() skips NaN per column by default, so a row here may
# combine values from more than one run when the top run has missing entries — confirm.
XGB_best_df[['accuracy', 'accuracy_top_20pc','n_estimators', 'learning_rate','booster','min_child_weight', 'max_depth', 'tree_method']]

# Which hyperparameters are best for NN

In [None]:
# List the distinct classifier names present in the results (used as filter values elsewhere in this notebook).
results.classifier.unique()

In [None]:
# All MLPClassifier runs and, per feature count, the leading values after ranking
# the runs by top-20% accuracy (highest first).
mlp_mask = results['classifier'] == 'MLPClassifier'
NN_df = results[mlp_mask]
NN_best_df = (
    NN_df
    .sort_values('accuracy_top_20pc', ascending=False)
    .groupby(['n_features'])
    .first()
)
# NOTE(review): GroupBy.first() skips NaN per column by default, so a row here may
# combine values from more than one run when the top run has missing entries — confirm.
NN_best_df[['accuracy', 'accuracy_top_20pc','hidden_layer_sizes', 'activation', 'solver', 'alpha', 'learning_rate_init', 'max_iter']]

# ensembling:

In [None]:
def score_function(proba, y_test, n_features):
    """
    Score predictions on the 20% of samples with the highest class-1 probability.

    The score is ``hits * 10 * 1000 / k - n_features * 200``, where ``k`` is the
    number of selected samples and ``hits`` is the sum of their labels.

    Parameters:
    proba (array-like): Predicted probabilities of shape (n_samples, n_classes);
                        column 1 is used to rank the samples.
    y_test (array-like): Labels aligned with the rows of ``proba``.
    n_features (int): Number of features used; each one lowers the score by 200.

    Returns:
    float: The score of the selected top fraction.

    Raises:
    ValueError: If ``proba`` contains no samples.
    """
    scores = np.asarray(proba)[:, 1]
    labels = np.asarray(y_test, dtype=float)
    if len(scores) == 0:
        raise ValueError("score_function needs at least one sample")
    # Always select at least one sample: int(n * 0.2) is 0 for n < 5, which
    # previously caused a division by zero.
    k = max(1, int(len(scores) * 0.2))
    # Ascending argsort, reversed, ranks samples from highest to lowest probability.
    top = scores.argsort()[::-1][:k]
    hits = labels[top].sum()
    return hits * 10 * 1000 / k - n_features * 200


def accuracy_top_20pc(proba, y_test):
    """
    Accuracy of the rounded class-1 probability on the top 20% of samples.

    Samples are ranked by their class-1 probability (highest first); the
    probabilities of the selected samples are rounded and compared with the labels.

    Parameters:
    proba (array-like): Predicted probabilities of shape (n_samples, n_classes);
                        column 1 is used for ranking and prediction.
    y_test (array-like): Labels aligned with the rows of ``proba``.

    Returns:
    float: ``accuracy_score`` of the selected samples.

    Raises:
    ValueError: If ``proba`` contains no samples.
    """
    scores = np.asarray(proba)[:, 1]
    labels = np.asarray(y_test, dtype=float)
    if len(scores) == 0:
        raise ValueError("accuracy_top_20pc needs at least one sample")
    # Always select at least one sample: int(n * 0.2) is 0 for n < 5, which
    # would otherwise hand an empty selection to accuracy_score.
    k = max(1, int(len(scores) * 0.2))
    # Ascending argsort, reversed, ranks samples from highest to lowest probability.
    top = scores.argsort()[::-1][:k]
    return accuracy_score(labels[top], np.round(scores[top]))

    

class Ensemble(BaseEstimator, ClassifierMixin):
    """
    Fit several independent classifiers and compare soft-voting combinations of them.

    Every classifier is built from a ``(classifier_class, params_dict)`` pair and
    trained on the same data. ``compare`` averages the predicted probabilities of
    each non-empty subset of the fitted classifiers and scores every average.
    """

    def __init__(self, classifiers_params):
        """
        Initialize the ensemble with classifiers and their parameters.

        Parameters:
        classifiers_params (list): List of tuples where each tuple is
                                   (classifier_class, params_dict).
        """
        self.classifiers_params = classifiers_params
        # Fitted estimators, filled by fit() in the order of classifiers_params.
        self.classifiers = []
        # Per-classifier probabilities from the most recent predict_proba() call.
        self.probas = None

    def fit(self, X, y):
        """
        Fit all classifiers to the training data.

        Any classifiers from a previous call are discarded first.

        Parameters:
        X (array-like): Training features.
        y (array-like): Training labels.

        Returns:
        Ensemble: This instance, to allow call chaining.
        """
        self.classifiers = []
        for clf_class, params in self.classifiers_params:
            clf = clf_class(**params)
            clf.fit(X, y)
            self.classifiers.append(clf)
        return self

    def predict_proba(self, X):
        """
        Predict class probabilities with every fitted classifier.

        This does not return one averaged array: the averaging (soft voting)
        over subsets of classifiers is done in ``compare``.

        Parameters:
        X (array-like): Features to predict on.

        Returns:
        dict: Maps a classifier label to that classifier's ``predict_proba(X)``
              output. The label is the class name; when the same class occurs
              more than once, later occurrences get a ``_2``, ``_3``, ... suffix
              so that no classifier's output is overwritten.
        """
        occurrences = {}
        probas = {}
        for clf in self.classifiers:
            base = type(clf).__name__
            occurrences[base] = occurrences.get(base, 0) + 1
            # The first occurrence keeps the bare class name; duplicates are numbered.
            label = base if occurrences[base] == 1 else f"{base}_{occurrences[base]}"
            probas[label] = clf.predict_proba(X)
        self.probas = probas
        return self.probas

    def compare(self, X_test, y_test):
        """
        Compare all possible combinations of classifiers using a custom score function.

        For every non-empty subset of the fitted classifiers the predicted
        probabilities are averaged and evaluated with ``score_function``,
        ``accuracy_score`` and ``accuracy_top_20pc``.

        Parameters:
        X_test (array-like): Test features; ``X_test.shape[1]`` is reported as n_features.
        y_test (array-like): Test labels.

        Returns:
        DataFrame: One row per combination with the columns ``n_features``,
                   ``Combination``, ``score``, ``accuracy`` and ``accuracy_top_20pc``.
        """
        self.predict_proba(X_test)
        classifier_names = list(self.probas)
        n_features = X_test.shape[1]

        rows = []
        for r in range(1, len(classifier_names) + 1):
            for subset in itertools.combinations(classifier_names, r):
                # Soft voting: unweighted mean of the members' probabilities.
                avg_proba = np.mean([self.probas[name] for name in subset], axis=0)
                predicted = np.round(avg_proba[:, 1]).astype(int)
                rows.append({
                    'n_features': n_features,
                    'Combination': subset,
                    'score': score_function(avg_proba, y_test, n_features),
                    'accuracy': accuracy_score(y_test, predicted),
                    'accuracy_top_20pc': accuracy_top_20pc(avg_proba, y_test),
                })

        return pd.DataFrame(rows)


In [None]:
# get_data() comes from the project's utils module. X is later indexed as
# X[:, columns], so it is expected to be a 2-D array with one column per feature — TODO confirm.
X, y = get_data()
# Column indices into X, ordered from most to least important.
features_by_importance = [105,100,101,102,103,104,8,113,2,391] # top 10 features by importance from RFE

In [None]:
# Evaluate every SVC / XGBoost / MLP combination for 1-6 features, using the best
# hyperparameters recorded for each feature count, over 11 seeded train/test splits.
split_frames = []
for n_features in range(1, 7):
    X_features = X[:, features_by_importance[:n_features]]

    # Look the rows up by their n_features index label rather than by position, so a
    # missing or non-contiguous feature count cannot silently select the wrong row.
    best_svm = SVM_best_df.loc[n_features]
    best_xgb = XGB_best_df.loc[n_features]
    best_nn = NN_best_df.loc[n_features]

    # int(...) works for NumPy and plain Python numbers alike (``.astype`` only
    # exists on NumPy scalars).
    SVC_params = {
        'kernel': best_svm.kernel,
        'degree': int(best_svm.degree),
        'gamma': best_svm.gamma,
        'coef0': best_svm.coef0,
        # Required so that SVC exposes predict_proba.
        'probability': True,
    }
    XGB_params = {
        'n_estimators': int(best_xgb.n_estimators),
        'learning_rate': best_xgb.learning_rate,
        'booster': best_xgb.booster,
        'min_child_weight': int(best_xgb.min_child_weight),
        'max_depth': int(best_xgb.max_depth),
        'tree_method': best_xgb.tree_method,
    }
    MLP_params = {
        'hidden_layer_sizes': best_nn.hidden_layer_sizes,
        'activation': best_nn.activation,
        'solver': best_nn.solver,
        'alpha': best_nn.alpha,
        'learning_rate_init': best_nn.learning_rate_init,
        'max_iter': int(best_nn.max_iter),
    }
    classifier_params = [
        (SVC, SVC_params),
        (XGBClassifier, XGB_params),
        (MLPClassifier, MLP_params),
    ]

    # One split per seed in 42..52.
    for r in range(42, 53):
        ensemble = Ensemble(classifier_params)
        X_train, X_test, y_train, y_test = train_test_split(X_features, y, random_state=r, test_size=0.2)
        ensemble.fit(X_train, y_train)
        out = ensemble.compare(X_test, y_test)
        split_frames.append(out)

# Concatenate once at the end instead of re-copying the growing table on every split.
results = pd.concat(split_frames)

In [None]:
# Write the per-split comparison rows to disk.
# NOTE(review): the aggregation cell that follows writes `grouped` to this same path,
# which replaces this file — confirm that overwriting the raw rows is intended.
results.to_csv('../results/ensemble_plot_data.csv')

In [None]:
# Mean and standard deviation of every remaining column per (n_features, Combination).
group_keys = ['n_features', 'Combination']
grouped = results.groupby(group_keys).agg(['mean', 'std'])
# Flatten the two-level (column, statistic) header into names such as "score_mean".
grouped.columns = [f"{metric}_{stat}" for metric, stat in grouped.columns]
grouped.to_csv('../results/ensemble_plot_data.csv')

### 4 features, grid search for SVM + XGBoost

In [None]:
# Feature count used by the SVM + XGBoost grid search and the final evaluation cells.
n_features = 4

In [None]:
# Best recorded SVC and XGBoost hyperparameters for the chosen feature count.
# Rows are looked up by the n_features index label rather than by position, so a
# missing or non-contiguous feature count cannot silently select the wrong row.
best_svm = SVM_best_df.loc[n_features]
best_xgb = XGB_best_df.loc[n_features]

# int(...) works for NumPy and plain Python numbers alike (``.astype`` only exists
# on NumPy scalars).
SVC_params = {
    'kernel': best_svm.kernel,
    'degree': int(best_svm.degree),
    'gamma': best_svm.gamma,
    'coef0': best_svm.coef0,
    # Required so that SVC exposes predict_proba.
    'probability': True,
}
XGB_params = {
    'n_estimators': int(best_xgb.n_estimators),
    'learning_rate': best_xgb.learning_rate,
    'booster': best_xgb.booster,
    'min_child_weight': int(best_xgb.min_child_weight),
    'max_depth': int(best_xgb.max_depth),
    'tree_method': best_xgb.tree_method,
}

In [None]:
# Grid search over XGBoost x SVC hyperparameters on the first `n_features` features.
# The reset below is left disabled, so rows already held in `results` are kept and
# written out together with the new ones; uncomment it to start from an empty table.
# results = pd.DataFrame()
classifiers = [
    (
        XGBClassifier,
        {
            "n_estimators": [150],
            "learning_rate": [0.001, 0.01],
            "min_child_weight": [0, 1, 2, 3, 4],
            "subsample": [1/2],
            "lambda": [0.1, 0.5],
            "max_depth": [2, 6, 8],
            "tree_method": ['approx', 'hist'],
        },
    ),
    (
        SVC,
        {
            "kernel": ['poly'],
            "degree": [3, 4, 5],
            "gamma": ['auto'],
            "coef0": [0.01, 0.2],
            "probability": [True],
        },
    ),
]
XGB, XGB_params = classifiers[0]
SVM, SVM_params = classifiers[1]
# Materialise both grids: the SVM grid is iterated once per XGB setting, which would
# yield nothing after the first pass if the helper returned a one-shot iterator.
XGB_param_combinations = list(get_param_combinations(XGB_params))
SVM_param_combinations = list(get_param_combinations(SVM_params))
# The selected columns do not change between iterations, so slice them once.
X_selected = X[:, features_by_importance[:n_features]]

for XGB_param_set in XGB_param_combinations:
    for SVM_param_set in SVM_param_combinations:
        # Time all splits of this parameter pair; the report is printed after the loop.
        start = time.time()
        for k in range(42, 53):
            ensemble = Ensemble([(XGB, XGB_param_set), (SVM, SVM_param_set)])
            # Use k as the seed so every parameter pair is scored on the same 11 splits.
            X_train, X_test, y_train, y_test = train_test_split(
                X_selected, y, test_size=0.2, random_state=k
            )

            ensemble.fit(X_train, y_train)
            out = ensemble.compare(X_test, y_test)
            out['XGB_params'] = [XGB_param_set]*out.shape[0]
            out['SVM_params'] = [SVM_param_set]*out.shape[0]
            results = pd.concat([results, out])
            # Rewrite the file after every split so an interrupted run keeps its progress.
            results.to_csv('../results/ensemble_grid_search.csv', index = False)
            print(out['score'])
        # Elapsed time is "now minus start"; the reverse order printed a negative value.
        print(f"{time.time() - start:.2f}s")

In [None]:
# Show the 20 rows with the highest score.
results.sort_values('score', ascending = False).head(20)

# The top 3 models are run on more train/test splits for a more reliable evaluation

In [None]:
# Aggregate the grid-search log per (Combination, XGB_params, SVM_params) and keep
# the three groups with the highest mean score.
df = pd.read_csv('../results/ensemble_grid_search.csv')
group_keys = ['Combination', 'XGB_params', 'SVM_params']
grouped = df.groupby(group_keys).agg(['mean', 'std'])
# Flatten the two-level (column, statistic) header into names such as "score_mean".
grouped.columns = [f"{metric}_{stat}" for metric, stat in grouped.columns]
ranked = grouped.sort_values('score_mean', ascending=False)
top_3 = ranked.head(3)

# The parameter strings live in the group index; pull them out as ordinary columns.
top_3_params = top_3.index.to_frame(index=False)[['XGB_params', 'SVM_params']]

In [None]:
# Parameter strings of the selected groups, one Series per model type.
XGB_param_sets, SVM_param_sets = top_3_params['XGB_params'], top_3_params['SVM_params']

In [None]:
# The grid-search CSV holds each parameter dict as its Python text form (single
# quotes, bare True), which ast.literal_eval parses directly. This replaces the
# quote swapping and the 'True' workaround that json.loads needed, and it no longer
# assumes that exactly three parameter sets are present.
XGB_params = [ast.literal_eval(raw) for raw in XGB_param_sets]
# probability=True is enforced for SVC because the ensemble calls predict_proba on it.
SVM_params = [{**ast.literal_eval(raw), 'probability': True} for raw in SVM_param_sets]

In [None]:
# Columns of X for the first `n_features` entries of the importance ranking.
X_4 = X[:,features_by_importance[:n_features]]

In [None]:
# Re-evaluate each selected (XGB, SVM) parameter pair on 100 random splits for each
# of two test sizes, rewriting the accumulated table to disk after every split.
results = pd.DataFrame()
for XGB_param_set, SVM_param_set in zip(XGB_params, SVM_params):
    # The label only appears in the progress print.
    for split_size, label in ((0.2, 'first'), (0.35, 'second')):
        for k in range(100):
            ensemble = Ensemble([(XGBClassifier, XGB_param_set), (SVC, SVM_param_set)])
            X_train, X_test, y_train, y_test = train_test_split(X_4, y, test_size=split_size)
            ensemble.fit(X_train, y_train)
            out = ensemble.compare(X_test, y_test)
            n_rows = out.shape[0]
            # Repeat the settings on every row so each row is self-describing.
            out['XGB_params'] = [XGB_param_set] * n_rows
            out['SVM_params'] = [SVM_param_set] * n_rows
            out['split_size'] = [split_size] * n_rows
            results = pd.concat([results, out])
            results.to_csv('../results/final_ensembles.csv', index = False)
            print(k, label)

In [None]:
# Same evaluation as the previous cell, with the feature ranking and the selected
# columns re-declared here and the output written to a separate file.
features_by_importance = [105,100,101,102,103,104,8,113,2,391]
X_4 = X[:,features_by_importance[:n_features]]
results = pd.DataFrame()
for XGB_param_set, SVM_param_set in zip(XGB_params, SVM_params):
    # 100 random splits per test size; the label only appears in the progress print.
    for split_size, label in ((0.2, 'first'), (0.35, 'second')):
        for k in range(100):
            ensemble = Ensemble([(XGBClassifier, XGB_param_set), (SVC, SVM_param_set)])
            X_train, X_test, y_train, y_test = train_test_split(X_4, y, test_size=split_size)
            ensemble.fit(X_train, y_train)
            out = ensemble.compare(X_test, y_test)
            n_rows = out.shape[0]
            # Repeat the settings on every row so each row is self-describing.
            out['XGB_params'] = [XGB_param_set] * n_rows
            out['SVM_params'] = [SVM_param_set] * n_rows
            out['split_size'] = [split_size] * n_rows
            results = pd.concat([results, out])
            # Rewritten after every split so an interrupted run keeps its progress.
            results.to_csv('../results/final_ensembles_orig_features.csv', index = False)
            print(k, label)