In [1]:
import time

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import (
    RFE,
    SelectFromModel,
    SelectKBest,
    SequentialFeatureSelector,
    f_classif,
    mutual_info_classif,
)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from utils import (
    experiment,
    get_data,
    get_param_combinations,
    get_params_json,
    save_results,
)

from xgboost import XGBClassifier
X, y = get_data()

In [12]:
import json
import time
from itertools import product

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


def get_data():
    X = pd.read_csv("../data/x_train.txt", sep=" ", header=None)
    y = pd.read_csv("../data/y_train.txt", header=None)
    scaler = StandardScaler()

    X = scaler.fit_transform(X)
    y = y.values.ravel()

    return X, y


def get_param_combinations(param_dict):
    value_prod = list(product(*param_dict.values()))
    keys = param_dict.keys()
    return [dict(zip(keys, values)) for values in value_prod]


def param_json_to_str(param_json):
    if type(param_json).__name__ == "function":
        return param_json.__name__
    return param_json


def get_params_json(params):
    params_mapped = {k: param_json_to_str(v) for k, v in params.items()}
    return json.dumps(params_mapped).replace('"', "'")


def save_results(results, filename):
    df = pd.DataFrame(
        results,
        columns=[
            "feature_selector",
            "feature_selector_params",
            "classifier",
            "classifier_params",
            "n_features",
            "accuracy",
            "accuracy_std",
            "accuracy_top_20pc",
            "elapsed_time",
        ],
    )
    df.to_csv(f"../results/{filename}.csv", index=False)


def experiment(
    X,
    y,
    fs_cls,
    fs_kwargs,
    clf_cls,
    clf_kwargs,
    n_features,
    k_param_name,
    requires_estimator,
    train_test_seeds,
):
    # Run experiment
    start = time.time()
    accs, accs_top_20pc = _experiment_internal(
        X,
        y,
        fs_cls,
        fs_kwargs,
        clf_cls,
        clf_kwargs,
        n_features,
        k_param_name,
        requires_estimator,
        train_test_seeds,
    )
    elapsed = time.time() - start
    elapsed = elapsed / len(train_test_seeds)

    acc = accs.mean()
    acc_std = accs.std()
    acc_top_20pc = accs_top_20pc.mean()

    result = (
        fs_cls.__name__,
        get_params_json(fs_kwargs),
        clf_cls.__name__,
        get_params_json(clf_kwargs),
        n_features,
        acc,
        acc_std,
        acc_top_20pc,
        elapsed,
    )

    return result


def _experiment_internal(
    X,
    y,
    fs_cls,
    fs_kwargs,
    clf_cls,
    clf_kwargs,
    n_features,
    k_param_name,
    requires_estimator,
    train_test_seeds,
):
    clf = clf_cls(**clf_kwargs)

    fs_kwargs = {
        k_param_name: n_features,
        **fs_kwargs,
    }
    if requires_estimator:
        feature_selector = fs_cls(estimator=clf, **fs_kwargs)
    else:
        feature_selector = fs_cls(**fs_kwargs)

    accs = []
    accs_top_20pc = []

    for seed in train_test_seeds:
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=0.2,
            stratify=y,
            random_state=seed,
        )

        # Feature selection
        cols = [105,100,101,102,103,104,8,113,2,391]

        X_train = X_train[:, cols[:n_features]]
        X_test = X_test[:, cols[:n_features]]

        # Training
        clf.fit(X_train, y_train)

        # Prediction
        pred = clf.predict(X_test)

        proba_1 = clf.predict_proba(X_test)[:, 1]
        proba_1 = np.array([proba_1, y_test]).T
        proba_1 = proba_1[proba_1[:, 0].argsort()][::-1]

        # Evaluation
        acc = accuracy_score(y_test, pred)
        top_20pc = proba_1[: int(len(proba_1) * 0.2)]
        acc_top_20pc = accuracy_score(top_20pc[:, 1], np.round(top_20pc[:, 0]))

        print(seed, acc)

        accs.append(acc)
        accs_top_20pc.append(acc_top_20pc)

    return np.array(accs), np.array(accs_top_20pc)

In [13]:
def run_experiment(feature_selectors, classifiers, ks, results, train_test_seeds=[42]):
    for fs in feature_selectors:
        for clf in classifiers:
            for k in ks:
                # Generate parameter combinations
                fs_cls, fs_params, k_param_name, requires_estimator = fs
                clf_cls, clf_params = clf

                fs_param_combinations = get_param_combinations(fs_params)
                clf_param_combinations = get_param_combinations(clf_params)

                for fs_params in fs_param_combinations:
                    for clf_params in clf_param_combinations:
                        result = experiment(
                            X,
                            y,
                            fs_cls,
                            fs_params,
                            clf_cls,
                            clf_params,
                            k,
                            k_param_name,
                            requires_estimator,
                            train_test_seeds,
                        )

                        print(result)
                        print(f"Elapsed time: {result[-1]:.2f}s\n")
                        results.append(result)
                        save_results(results, filename)

In [14]:
results = []

filename = "svm_grid_search"

feature_selectors = [
     (SelectFromModel, {"threshold": [-np.inf]}, "max_features", True)
]

classifiers = [
    (
        SVC,
        {"kernel" : ['linear', 'poly', 'rbf', 'sigmoid'],
         "degree" : [3,4,5],
         'gamma' : ['scale', 'auto'],
         'coef0' : [0, 1/8, 1/4, 1/2, 1],
         'probability' : [True]
        },
    )
]

ks = np.arange(1, 7, 1)
train_test_seeds = list(range(42, 47))

run_experiment(feature_selectors, classifiers, ks, results, train_test_seeds)

42 0.509
43 0.522
44 0.54
45 0.528
46 0.514
('SelectFromModel', "{'threshold': -Infinity}", 'SVC', "{'kernel': 'linear', 'degree': 3, 'gamma': 'scale', 'coef0': 0, 'probability': true}", 1, 0.5226000000000001, 0.010873821775254557, 0.5589999999999999, 1.6629682540893556)
Elapsed time: 1.66s

42 0.509
43 0.522
44 0.54
45 0.528
46 0.514
('SelectFromModel', "{'threshold': -Infinity}", 'SVC', "{'kernel': 'linear', 'degree': 3, 'gamma': 'scale', 'coef0': 0.125, 'probability': true}", 1, 0.5226000000000001, 0.010873821775254557, 0.5650000000000001, 1.7007025241851808)
Elapsed time: 1.70s

42 0.509
43 0.522
44 0.54
45 0.528
46 0.514
('SelectFromModel', "{'threshold': -Infinity}", 'SVC', "{'kernel': 'linear', 'degree': 3, 'gamma': 'scale', 'coef0': 0.25, 'probability': true}", 1, 0.5226000000000001, 0.010873821775254557, 0.556, 1.7879930973052978)
Elapsed time: 1.79s

42 0.509
43 0.522
44 0.54
45 0.528
46 0.514
('SelectFromModel', "{'threshold': -Infinity}", 'SVC', "{'kernel': 'linear', 'degre