In [18]:
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import KNNImputer
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

# Network variants to process; each name selects a sub-directory of the
# feature-extraction output that the driver cell reads from.
network_types = ["large"]
# Target databases to process; each name selects the
# `features_<name> Target.csv` file loaded by the driver cell.
target_dbs = ["DRH"]


def hyperparameter_search(features: pd.DataFrame, labels: pd.Series, scoring="roc_auc", random_state=1, n_iter=50,
                          sampling=None, n_repeats=5, n_splits=5, n_jobs=-1, test_size=0.2):
    """Run a randomized hyperparameter search for a balanced random forest pipeline.

    The pipeline is: variance threshold -> standard scaling -> KNN imputation
    -> (optional resampler) -> ``BalancedRandomForestClassifier``. A stratified
    fraction of the data (``test_size``) is split off first and is NOT used
    here; the search is cross-validated on the remaining training part only.

    Parameters
    ----------
    features : pd.DataFrame
        Feature matrix, one row per sample.
    labels : pd.Series
        Class labels matched to ``features`` row by row; also used to stratify
        the hold-out split.
    scoring : str, default "roc_auc"
        Single scoring metric passed to ``RandomizedSearchCV``.
    random_state : int, default 1
        Seed shared by the hold-out split, the CV splitter, the classifier and
        the parameter sampling.
    n_iter : int, default 50
        Number of parameter settings sampled by the search.
    sampling : imblearn sampler or None, default None
        Optional resampling step inserted right before the classifier.
    n_repeats, n_splits : int, default 5
        Configuration of the ``RepeatedStratifiedKFold`` splitter.
    n_jobs : int, default -1
        Parallelism passed to both the classifier and the search.
    test_size : float, default 0.2
        Fraction of the data held out (and left untouched) before the search.

    Returns
    -------
    pd.DataFrame
        One row per sampled parameter setting with the columns ``params``,
        ``mean_test_score``, ``std_test_score``, ``min_test_score`` and
        ``max_test_score``, sorted by ``mean_test_score`` in descending order.
    """
    params = {
        'clf__n_estimators': [500, 1000, 2000, 5000],
        'clf__criterion': ['gini', 'entropy'],
        'clf__max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 2, 4],
        'clf__max_features': ['log2', 'sqrt', None],
        'clf__bootstrap': [True, False],
        'clf__class_weight': [None, 'balanced', 'balanced_subsample'],
        'imputer__n_neighbors': [1, 5, 10, 15, 20],
        'imputer__weights': ['uniform', 'distance'],
        'var_thresh__threshold': [0.0, 0.05, 0.1, 0.15, 0.2],
    }
    cv = RepeatedStratifiedKFold(n_repeats=n_repeats, n_splits=n_splits, random_state=random_state)

    # Build the step list once; the sampler (if any) goes right before the classifier.
    steps = [
        ('var_thresh', VarianceThreshold()),
        ('scaler', StandardScaler()),
        ('imputer', KNNImputer()),
    ]
    if sampling is not None:
        steps.append(('sampler', sampling))
    steps.append(
        ('clf', BalancedRandomForestClassifier(random_state=random_state, n_jobs=n_jobs, sampling_strategy='all',
                                               replacement=True)))
    pipe = Pipeline(steps)

    # The held-out part is intentionally discarded here so that it stays
    # unseen by the search; only the training part is cross-validated.
    X_train, _, y_train, _ = train_test_split(features, labels, test_size=test_size, shuffle=True, stratify=labels,
                                              random_state=random_state)

    # refit=False: only cv_results_ is consumed below, so refitting the best
    # pipeline on the full training part would be wasted work.
    search = RandomizedSearchCV(pipe, params, random_state=random_state, n_iter=n_iter, scoring=scoring, cv=cv,
                                refit=False, n_jobs=n_jobs)
    search.fit(X_train, y_train)

    cv_results = pd.DataFrame(search.cv_results_)
    # Select exactly the per-split test scores (split<k>_test_score); an
    # anchored regex avoids accidentally matching param_* or train-score columns.
    split_scores = cv_results.filter(regex=r'^split\d+_test_score$')
    cv_results['min_test_score'] = split_scores.min(axis=1)
    cv_results['max_test_score'] = split_scores.max(axis=1)

    summary = cv_results.loc[:, ['params', 'mean_test_score', 'std_test_score', 'min_test_score', 'max_test_score']]
    return summary.sort_values('mean_test_score', ascending=False)


In [19]:
# Search configuration shared by every comparison below.
scoring = 'roc_auc'
random_state = 231222
n_iter = 100
sampling = RandomUnderSampler(random_state=random_state)

for network_type in network_types:
    print(network_type)
    for target_db in target_dbs:
        print(target_db)
        original = pd.read_csv(f'../../2_feature_extraction/{network_type}/features_{target_db} Target.csv',
                               index_col=0)
        original = original.drop(['Target', 'Neighbor'], axis=1)
        # Discard the two lowest-activity categories, then encode the rest:
        # Weak/Strong are merged into "Effect" (1), No-Effect is 0, Cyto is -1.
        original = original[~original['Outcome'].isin(['Very Weak', 'Low'])]
        original['Outcome'] = original['Outcome'].replace(
            {'Weak': 1, 'Strong': 1, 'Effect': 1, 'No-Effect': 0, 'Cyto': -1})
        outcomes = ['No-Effect vs. Effect', 'No-Effect vs. Cyto', 'Cyto vs. Effect']
        # NOTE(review): these two dicts are never populated in this cell; they
        # are kept only in case a later cell refers to them.
        results = {}
        results_random = {}
        for outcome in outcomes:
            print(" " + outcome)
            compared = outcome.split(' vs. ')
            # Work out which class code is excluded from this binary task and
            # how the remaining codes are relabelled to {0, 1}.
            if 'Effect' not in compared:
                drop_outcome, relabel = 1, {-1: 1}
            elif 'Cyto' not in compared:
                drop_outcome, relabel = -1, None
            elif 'No-Effect' not in compared:
                drop_outcome, relabel = 0, {-1: 0}
            else:
                raise ValueError(f'Cannot derive a binary task from outcome {outcome!r}')

            # Explicit copy so the relabelling below writes to an independent
            # frame instead of a slice of `original`.
            data = original[original['Outcome'] != drop_outcome].copy()
            if relabel is not None:
                data['Outcome'] = data['Outcome'].replace(relabel)

            # Rows with any missing value are removed before the search.
            data = data.dropna()
            labels = data['Outcome'].copy()
            features = data.drop('Outcome', axis=1)

            rs_result_df = hyperparameter_search(features=features, labels=labels, sampling=sampling,
                                                 scoring=scoring, random_state=random_state, n_iter=n_iter)

            # NOTE(review): the two output names below follow different
            # conventions and contain neither network_type nor target_db, so
            # adding more entries to those lists would overwrite these files.
            rs_result_df.to_csv(f'{outcome}.csv')
            display(rs_result_df)

            # Baseline: repeat the same search with the labels randomly permuted.
            shuffled_labels = labels.sample(frac=1.0, random_state=random_state)
            rs_result_random_df = hyperparameter_search(features=features, labels=shuffled_labels,
                                                        sampling=sampling, scoring=scoring,
                                                        random_state=random_state, n_iter=n_iter)
            rs_result_random_df.to_csv(f'{outcome.replace(" vs. ", "_")}_rand.csv')



large
DRH
 No-Effect vs. Effect


Unnamed: 0,params,mean_test_score,std_test_score,min_test_score,max_test_score
19,"{'var_thresh__threshold': 0.0, 'imputer__weigh...",0.701479,0.041535,0.616703,0.768264
18,"{'var_thresh__threshold': 0.0, 'imputer__weigh...",0.701209,0.043503,0.614886,0.777373
7,"{'var_thresh__threshold': 0.0, 'imputer__weigh...",0.700995,0.044495,0.609584,0.779998
13,"{'var_thresh__threshold': 0.0, 'imputer__weigh...",0.698672,0.044547,0.604281,0.781667
62,"{'var_thresh__threshold': 0.0, 'imputer__weigh...",0.697282,0.044193,0.599347,0.782085
...,...,...,...,...,...
43,"{'var_thresh__threshold': 0.0, 'imputer__weigh...",0.691836,0.039088,0.613511,0.755106
12,"{'var_thresh__threshold': 0.0, 'imputer__weigh...",0.691547,0.039960,0.606294,0.771701
97,"{'var_thresh__threshold': 0.0, 'imputer__weigh...",0.691246,0.042575,0.603226,0.779532
15,"{'var_thresh__threshold': 0.05, 'imputer__weig...",0.691123,0.039196,0.628535,0.774156


 No-Effect vs. Cyto


Unnamed: 0,params,mean_test_score,std_test_score,min_test_score,max_test_score
80,"{'var_thresh__threshold': 0.0, 'imputer__weigh...",0.773852,0.051530,0.690966,0.857022
76,"{'var_thresh__threshold': 0.0, 'imputer__weigh...",0.771191,0.053230,0.681464,0.872626
44,"{'var_thresh__threshold': 0.0, 'imputer__weigh...",0.770149,0.052421,0.683489,0.870081
53,"{'var_thresh__threshold': 0.0, 'imputer__weigh...",0.768700,0.053865,0.673520,0.866520
99,"{'var_thresh__threshold': 0.0, 'imputer__weigh...",0.768632,0.054168,0.669159,0.863467
...,...,...,...,...,...
15,"{'var_thresh__threshold': 0.05, 'imputer__weig...",0.723100,0.081920,0.606854,0.892300
17,"{'var_thresh__threshold': 0.1, 'imputer__weigh...",0.722740,0.081415,0.605763,0.887720
73,"{'var_thresh__threshold': 0.15, 'imputer__weig...",0.722480,0.082115,0.605062,0.890943
37,"{'var_thresh__threshold': 0.15, 'imputer__weig...",0.721610,0.081967,0.604517,0.889671


 Cyto vs. Effect


Unnamed: 0,params,mean_test_score,std_test_score,min_test_score,max_test_score
45,"{'var_thresh__threshold': 0.15, 'imputer__weig...",0.739506,0.057537,0.566986,0.820946
42,"{'var_thresh__threshold': 0.15, 'imputer__weig...",0.739428,0.058213,0.581340,0.831081
95,"{'var_thresh__threshold': 0.2, 'imputer__weigh...",0.739243,0.058170,0.564593,0.822072
87,"{'var_thresh__threshold': 0.1, 'imputer__weigh...",0.738116,0.061105,0.564593,0.824324
14,"{'var_thresh__threshold': 0.2, 'imputer__weigh...",0.738105,0.062188,0.550239,0.816441
...,...,...,...,...,...
19,"{'var_thresh__threshold': 0.0, 'imputer__weigh...",0.713221,0.079587,0.464115,0.860360
62,"{'var_thresh__threshold': 0.0, 'imputer__weigh...",0.710637,0.075029,0.472488,0.853070
13,"{'var_thresh__threshold': 0.0, 'imputer__weigh...",0.709580,0.078997,0.442584,0.839912
7,"{'var_thresh__threshold': 0.0, 'imputer__weigh...",0.706820,0.086585,0.430622,0.846847
