In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier


import random
random.seed(42)
np.random.seed(42)

In [2]:
X = pd.read_csv("data/x_train.txt", header=None, sep=" ")
y = pd.read_csv("data/y_train.txt", header=None)[0]

In [35]:
best_features = [105, 100, 102, 8, 2]

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [1]:
def custom_cost_function(features, y_test, y_pred):
    posit_count_true = np.sum(y==1)
    precision = precision_score(y_test, y_pred)
    numb_of_features = len(features)
    return 10*posit_count_true*precision - 200*numb_of_features

In [6]:
def get_classfier_param_grid(code, best_params={}):
    if code == 'rf':
        return Pipeline(
            [
                ("scaler", StandardScaler()),
                ("rf", RandomForestClassifier(n_estimators=50, random_state=0, **best_params)),
            ]), {
            "rf__max_depth": [2, 4, 6],
            "rf__min_samples_split": [2, 4, 6],
            "rf__min_samples_leaf": [2, 5, 10,],
            "rf__max_features": ["sqrt", "log2"],
            },

    elif code == 'mlp':
        return Pipeline(
            [
                ("scaler", StandardScaler()),
                ("mlp", MLPClassifier(early_stopping=True, tol=0.005, n_iter_no_change=8, **best_params)),
            ]), {
        'mlp__hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
        'mlp__activation': ['tanh', 'relu'],
        'mlp__solver': ['sgd', 'adam'],
        'mlp__alpha': [0.0001, 0.05],
        'mlp__learning_rate': ['constant','adaptive']}
    
    
    elif code == 'svm':
        return Pipeline(
            [
                ("scaler", StandardScaler()),
                ("svm", SVC(random_state=0, **best_params)),
            ]),{
            "svm__C": [0.1, 1, 10],
            "svm__kernel": ["linear", "poly", "rbf", "sigmoid"],
            "svm__gamma": ["scale", "auto"],
        },


    elif code == 'gp':
        return Pipeline(
            [
                ("scaler", StandardScaler()),
                ("gpc", GaussianProcessClassifier(kernel=RBF(1.0))),
            ]
        ), {}
    

    elif code == 'qda':
        return Pipeline(
            [
                ("scaler", StandardScaler()),
                ("qda", QuadraticDiscriminantAnalysis()),
            ]
        ), {}
    

    elif code == 'xgb':
        return Pipeline(
            [
                ("scaler", StandardScaler()),
                ("xgb", XGBClassifier(random_state=0, use_label_encoder=False, **best_params)),
            ]),{
            "xgb__n_estimators": [100, 200, 300],
            "xgb__max_depth": [3, 4, 5],
            "xgb__learning_rate": [0.01, 0.1, 0.3],
            "xgb__subsample": [0.8, 1.0],
        }
    
    
    elif code == 'nb':
        return Pipeline(
            [
                ("scaler", StandardScaler()),
                ("nb", GaussianNB())
            ]), {
            "nb__var_smoothing": [1e-9, 1e-8, 1e-7]
        }
    
    elif code == 'knn':
        return Pipeline(
            [
                ("scaler", StandardScaler()),
                ("knn", KNeighborsClassifier())
            ]), {
            "knn__n_neighbors": [3, 5, 7, 9],
            "knn__weights": ["uniform", "distance"],
            "knn__p": [1, 2]
        }
    
    elif code == 'ada':
        return Pipeline([
                ("scaler", StandardScaler()),
                ("ada", AdaBoostClassifier())
            ]), {
            "ada__n_estimators": [50, 100, 200],
            "ada__learning_rate": [0.1, 0.5, 1.0],
            "ada__algorithm": ["SAMME", "SAMME.R"]
        }
    elif code == 'lr':
        return Pipeline([
                ("scaler", StandardScaler()),
                ("lr", LogisticRegression())
            ]), {
            "lr__C": [0.001, 0.01, 0.1, 1.0],
            "lr__penalty": ["l1", "l2"],
            "lr__solver": ["liblinear", "saga"]
        }

    elif code == 'ert':
        return Pipeline([
                ("scaler", StandardScaler()),
                ("ert", ExtraTreesClassifier(n_estimators=50))
            ]), {
            "ert__max_depth": [None, 10, 20],
            "ert__min_samples_split": [2, 5, 10],
            "ert__min_samples_leaf": [1, 2, 4],
            "ert__max_features": ["sqrt", "log2"]
        }
    else:
        raise Exception('Code not recognizible')

In [7]:
def fit_grid_search(X, y, clf_code):
    clf, param_grid = get_classfier_param_grid(clf_code)
    return GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        scoring="accuracy",
        cv=StratifiedKFold(n_splits=4, shuffle=True, random_state=0),
        n_jobs=-1,
    ).fit(X=X, y=y)

In [9]:
model_codes = ['rf', 'svm', 'qda', 'nb', 'ert']

In [10]:
def rename_hyperparameters(params, code):
    new_params = {}
    for key, val in params.items():
        new_key = key.replace(f'{code}__', '') 
        new_params[new_key] = val
    return new_params

In [23]:
feature_subsets = [best_features,
                    [100, 102, 8, 2],
                    [105, 102, 8, 2],
                    [105, 100, 8, 2],
                    [105, 100, 102, 8],
                    [105, 100, 102, 2]]

In [30]:
column_names = ['Model', 'hyperparameters', 'Features', 'Accuracy', 'Precision', 'Recall', 'Profit']
model_library_2 = pd.DataFrame(columns=column_names)

In [31]:
for features in feature_subsets:  
    for code in model_codes:
       X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1 + random.randint(0, 100))
       print(f"Features: {features}, Model: {code}")
       clf = fit_grid_search(X_train[features], y_train, code)
       y_pred = clf.predict(X_test[features])
       row = [code, rename_hyperparameters(clf.best_params_, code), features,
              accuracy_score(y_test, y_pred),
              precision_score(y_test, y_pred),
              recall_score(y_test, y_pred),
              custom_cost_function(features, y_test, y_pred)]
       model_library_2.loc[len(model_library_2)] = row


Features: [105, 100, 102, 8, 2], Model: rf
Features: [105, 100, 102, 8, 2], Model: svm
Features: [105, 100, 102, 8, 2], Model: qda
Features: [105, 100, 102, 8, 2], Model: nb
Features: [105, 100, 102, 8, 2], Model: ert
Features: [100, 102, 8, 2], Model: rf
Features: [100, 102, 8, 2], Model: svm
Features: [100, 102, 8, 2], Model: qda
Features: [100, 102, 8, 2], Model: nb
Features: [100, 102, 8, 2], Model: ert
Features: [105, 102, 8, 2], Model: rf
Features: [105, 102, 8, 2], Model: svm
Features: [105, 102, 8, 2], Model: qda
Features: [105, 102, 8, 2], Model: nb
Features: [105, 102, 8, 2], Model: ert
Features: [105, 100, 8, 2], Model: rf
Features: [105, 100, 8, 2], Model: svm
Features: [105, 100, 8, 2], Model: qda
Features: [105, 100, 8, 2], Model: nb
Features: [105, 100, 8, 2], Model: ert
Features: [105, 100, 102, 8], Model: rf
Features: [105, 100, 102, 8], Model: svm
Features: [105, 100, 102, 8], Model: qda
Features: [105, 100, 102, 8], Model: nb
Features: [105, 100, 102, 8], Model: ert


In [33]:
model_library_2.to_csv('model_library_2.csv', index=False)

In [32]:
model_library_2.sort_values('Profit', ascending=False).head(25)

Unnamed: 0,Model,hyperparameters,Features,Accuracy,Precision,Recall,Profit
21,svm,"{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}","[105, 100, 102, 8]",0.655,0.710956,0.579848,16945.454545
23,nb,{'var_smoothing': 1e-09},"[105, 100, 102, 8]",0.646,0.70202,0.540856,16722.424242
8,nb,{'var_smoothing': 1e-09},"[100, 102, 8, 2]",0.624,0.684492,0.498054,16284.919786
4,ert,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","[105, 100, 102, 8, 2]",0.666,0.689655,0.57377,16213.793103
22,qda,{},"[105, 100, 102, 8]",0.662,0.681093,0.60161,16200.091116
25,rf,"{'max_depth': 6, 'max_features': 'sqrt', 'min_...","[105, 100, 102, 2]",0.645,0.680488,0.554672,16184.97561
20,rf,"{'max_depth': 6, 'max_features': 'sqrt', 'min_...","[105, 100, 102, 8]",0.652,0.678241,0.583665,16128.888889
17,qda,{},"[105, 100, 8, 2]",0.656,0.676923,0.547718,16096.0
2,qda,{},"[105, 100, 102, 8, 2]",0.658,0.682984,0.587174,16047.272727
5,rf,"{'max_depth': 6, 'max_features': 'sqrt', 'min_...","[100, 102, 8, 2]",0.633,0.672457,0.535573,15984.516129
