In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier

from code.model_managing import get_classfier_param_grid

import random
random.seed(42)
np.random.seed(42)

In [11]:
model_library = pd.read_csv('model_library.csv')

In [3]:
X = pd.read_csv("data/x_train.txt", header=None, sep=" ")
y = pd.read_csv("data/y_train.txt", header=None)[0]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [43]:
def recreate_clf(X, y, model_library, index):
    code, params, features = model_library.loc[index][['Model', 'hyperparameters', 'Features']]
    model, _ = get_classfier_param_grid(code, eval(params))
    return model.fit(X[eval(features)], y), eval(features)

In [40]:
clf = recreate_clf(X_train)

TypeError: recreate_clf() missing 3 required positional arguments: 'y', 'model_library', and 'index'

In [70]:
def create_meta_sets(X_train, y_train, X_test, model_library, model_indeces):
    df_train, df_test = pd.DataFrame(), pd.DataFrame()
    for i in range(len(model_indeces)):
        clf, features = recreate_clf(X_train, y_train, model_library, model_indeces[i])
        df_train[f'pred_{i}'] = clf.predict(X_train[features])
        df_test[f'pred_{i}'] = clf.predict(X_test[features])
        try: 
            df_train[f'proba_{i}'] = clf.predict_proba(X_train[features])[:,0]
            df_test[f'proba_{i}'] = clf.predict_proba(X_test[features])[:,0]
        except Exception:
            continue       
    return df_train, df_test

In [78]:
def stacking(X_train, y_train, X_test, y_test, model_library, model_indeces):
    df_train, df_test = create_meta_sets(X_train, y_train, X_test, model_library, model_indeces)
    meta_learner = XGBClassifier()
    meta_learner.fit(df_train, y_train)
    return meta_learner.predict(df_test)

In [None]:
def custom_cost_function(features, y_test, y_pred):
    posit_count_true = np.sum(y==1)
    precision = precision_score(y_test, y_pred)
    numb_of_features = len(features)
    return 10*posit_count_true*precision - 200*numb_of_features

In [82]:
model_indeces = [102, 51, 114, 214, 50, 389]

In [83]:
y_pred = stacking(X_train, y_train, X_test, y_test, model_library, model_indeces)

102
51
114
214
50
389


In [85]:
precision_score(y_test, y_pred)

0.7011494252873564

In [84]:
accuracy_score(y_test, y_pred)

0.702