In [3]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")

num_features = 500
num_samples = 4000
if os.path.isfile("full_data_{}_{}.csv".format(num_features, num_samples)) == False:
    X, y = make_classification(random_state=42,     
                            n_samples=num_samples, 
                            n_features=num_features, 
                            n_informative=200,
                            n_redundant=100,
                            class_sep=1,
                            flip_y=0.2)

    Xdf = pd.DataFrame(data = X)
    ydf = pd.DataFrame(data = y)

    Xdf.to_csv("full_data_{}_{}.csv".format(num_features, num_samples))
    ydf.to_csv("full_label_{}_{}.csv".format(num_features, num_samples))
else:
    Xdf = pd.read_csv("full_data_{}_{}.csv".format(num_features, num_samples), index_col=[0])
    ydf = pd.read_csv("full_label_{}_{}.csv".format(num_features, num_samples), index_col=[0])
Xdf.columns = [i for i in range(num_features)]
Xdf.columns = ["feature_{}".format(i) for i in range(num_features)]
Xdf.index = ['sample_' + str(i) for i in range(num_samples)]

feature_mat = dict()
feature_mat["GW1"] = Xdf[[item for item in Xdf.columns if int(item.replace("feature_", "")) < 100]]
feature_mat["GW2"] = Xdf[[item for item in Xdf.columns if (int(item.replace("feature_", "")) < 250) and (int(item.replace("feature_", "")) >= 100)]]
feature_mat["GW3"] = Xdf[[item for item in Xdf.columns if (int(item.replace("feature_", "")) < 350) and (int(item.replace("feature_", "")) >= 250)]]
feature_mat["GW4"] = Xdf[[item for item in Xdf.columns if (int(item.replace("feature_", "")) < 500) and (int(item.replace("feature_", "")) >= 350)]]

##### train test split for each feature matrix
feature_train = dict()
feature_test = dict()
feature_unseen = dict()
feature_unseen2 = dict()
feature_train_for_stack = dict()
for i in feature_mat.keys():
    X  = feature_mat[i].to_numpy()
    y = ydf["0"].to_numpy()
    X_train, X_unseen, y_train, y_unseen = train_test_split(X, y, random_state=42, test_size=0.2)
    X_train, X_unseen2, y_train, y_unseen2 = train_test_split(X_train, y_train, random_state=42, test_size=0.2)
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=42, test_size=0.2)
    X_train_for_stack, X_train, y_train_for_stack, y_train = train_test_split(X_train, y_train, random_state=42, test_size=0.5)
    feature_train[i] = (X_train, y_train)
    feature_train_for_stack[i] = (X_train_for_stack, y_train_for_stack)
    feature_test[i] = (X_test, y_test)
    feature_unseen[i] = (X_unseen, y_unseen)
    feature_unseen2[i] = (X_unseen2, y_unseen2)
    
#####-----------------------------------------------------------------#####
##### DEFINE MODELS
#####-----------------------------------------------------------------#####
models = {
    'SVC': {
        'model': SVC(),
        'params': {
            'C': [0.1, 1],
            'kernel': ['rbf'],
            'probability': [True]
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [10, 50, 100],
            'max_depth': [30,40, 50]
        }
    },
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'C': [0.1, 1, 5, 10],
            'solver': ['liblinear', 'lbfgs']
        }
    },
    'KNeighbors': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance']
        }
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'max_depth': [10, 20, 30, 40],
            'criterion': ['gini', 'entropy']
        }
    },
    'XGBoost': {
        'model': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2],
            'probability': [True]
        }
    }
}

selected_model = dict()
best_score = dict()
selected_model_name = dict()
for feat in feature_mat.keys():
    X_train, y_train = feature_train_for_stack[feat][0], feature_train_for_stack[feat][1]
    best_models = {}
    best_scores = {}
    for model_name, model_info in models.items():
        clf = GridSearchCV(model_info['model'], model_info['params'], cv=5, scoring='accuracy')
        clf.fit(X_train, y_train)
        best_models[model_name] = clf.best_estimator_
        best_scores[model_name] = clf.best_score_ 
        
    selected_model_name[feat] = [item for item in best_scores.keys() if best_scores[item] == max(best_scores.values())][0]
    best_score[feat] = best_scores[selected_model_name[feat]]
    selected_model[feat] = best_models[selected_model_name[feat]]
    
acc_test = dict()
acc_unseen = dict()
acc_unseen2 = dict()
for feat in feature_mat.keys():
    acc_test[feat] = accuracy_score(selected_model[feat].predict(feature_test[feat][0]), feature_test[feat][1])
    acc_unseen[feat] = accuracy_score(selected_model[feat].predict(feature_unseen[feat][0]), feature_unseen[feat][1])
    acc_unseen2[feat] = accuracy_score(selected_model[feat].predict(feature_unseen2[feat][0]), feature_unseen2[feat][1])

modeldf = pd.DataFrame.from_dict(selected_model_name, orient='index', columns = ["model"]).reset_index()
cv_scoredf  = pd.DataFrame.from_dict(best_score, orient='index', columns = ["cv_score"]).reset_index()
acc_testdf = pd.DataFrame.from_dict(acc_test, orient='index', columns = ["acc_test"]).reset_index()
acc_unseendf = pd.DataFrame.from_dict(acc_unseen, orient='index', columns = ["acc_unseen"]).reset_index()
acc_unseen2df = pd.DataFrame.from_dict(acc_unseen2, orient='index', columns = ["acc_unseen2"]).reset_index()

resdf = pd.merge(modeldf, cv_scoredf, on='index')
tmpdf = pd.merge(acc_testdf, acc_unseendf, on="index")
resdf = pd.merge(resdf, tmpdf, on = "index")
resdf = pd.merge(resdf, acc_unseen2df, on = "index")

stack_feature_train = dict()
stack_feature_test = dict()
stack_feature_unseen = dict()
stack_feature_unseen2 = dict()

for feat in feature_mat.keys():
    tmp_model = selected_model[feat]
    stack_feature_train[feat] = tmp_model.predict_proba(feature_train[feat][0])[:,0]
    stack_feature_test[feat] = tmp_model.predict_proba(feature_test[feat][0])[:,0]
    stack_feature_unseen[feat] = tmp_model.predict_proba(feature_unseen[feat][0])[:,0]
    stack_feature_unseen2[feat] = tmp_model.predict_proba(feature_unseen2[feat][0])[:,0]

stack_feature_train = pd.DataFrame.from_dict(stack_feature_train, orient='index').T
stack_feature_test = pd.DataFrame.from_dict(stack_feature_test, orient='index').T
stack_feature_unseen = pd.DataFrame.from_dict(stack_feature_unseen, orient='index').T
stack_feature_unseen2 = pd.DataFrame.from_dict(stack_feature_unseen2, orient='index').T

best_models = {}
best_scores = {}
for model_name, model_info in models.items():
    clf = GridSearchCV(model_info['model'], model_info['params'], cv=5, scoring='accuracy', refit = True)
    clf.fit(stack_feature_train.to_numpy(), y_train)
    best_models[model_name] = clf.best_estimator_
    best_scores[model_name] = clf.best_score_ 
    
selected_model_name = [item for item in best_scores.keys() if best_scores[item] == max(best_scores.values())][0]
best_stack_score = best_scores[selected_model_name]
selected_stack_model = best_models[selected_model_name]
acc_test = accuracy_score(selected_stack_model.predict(stack_feature_test.to_numpy()), y_test)
acc_unseen = accuracy_score(selected_stack_model.predict(stack_feature_unseen.to_numpy()), y_unseen)
acc_unseen2 = accuracy_score(selected_stack_model.predict(stack_feature_unseen2.to_numpy()), y_unseen2)


stack_resdf = pd.DataFrame(data = [selected_stack_model], columns=["Model"])
stack_resdf["cv_score"] = best_stack_score
stack_resdf["acc_test"] = acc_test
stack_resdf["acc_unseen"] = acc_unseen
stack_resdf["acc_unseen2"] = acc_unseen2

In [4]:
resdf

Unnamed: 0,index,model,cv_score,acc_test,acc_unseen,acc_unseen2
0,GW1,RandomForest,0.611291,0.572266,0.60375,0.592187
1,GW2,SVC,0.616217,0.597656,0.64875,0.648438
2,GW3,SVC,0.652329,0.615234,0.63375,0.634375
3,GW4,SVC,0.630861,0.667969,0.6725,0.678125


In [5]:
stack_resdf

Unnamed: 0,Model,cv_score,acc_test,acc_unseen,acc_unseen2
0,DecisionTreeClassifier(max_depth=30),0.52736,0.490234,0.48625,0.49375
