In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, auc, matthews_corrcoef

In [2]:
def permutation_test_between_clfs(y_test, pred_proba_1, pred_proba_2, nsamples=1000):
    auc_differences = []
    auc1 = roc_auc_score(y_test.ravel(), pred_proba_1.ravel())
    auc2 = roc_auc_score(y_test.ravel(), pred_proba_2.ravel())
    observed_difference = auc1 - auc2
    for _ in range(nsamples):
        mask = np.random.randint(2, size=len(pred_proba_1.ravel()))
        p1 = np.where(mask, pred_proba_1.ravel(), pred_proba_2.ravel())
        p2 = np.where(mask, pred_proba_2.ravel(), pred_proba_1.ravel())
        auc1 = roc_auc_score(y_test.ravel(), p1)
        auc2 = roc_auc_score(y_test.ravel(), p2)
        auc_differences.append(auc1 - auc2)
   # return observed_difference, np.mean(auc_differences >= observed_difference)
    return observed_difference, np.mean(auc_differences)

In [3]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
def sample_data(X, y, samp_type, samp_strat, seed=0):
    if samp_type == 'over':
        sampler = RandomOverSampler(sampling_strategy=samp_strat, random_state=seed)
    elif samp_type == 'under':
        sampler = RandomUnderSampler(sampling_strategy=samp_strat, random_state=seed)
    else:
        print("Invalid 'samp_type'")
        
    # fit and apply the transform
    X_res, y_res = sampler.fit_resample(X, y)
    # summarize class distribution
    #print(Counter(y_res))
    #print(X_res.shape)
    
    return X_res, y_res

In [4]:
algs = []
import itertools
for item in itertools.combinations(['a','b','c'], 2):
    print(item)

('a', 'b')
('a', 'c')
('b', 'c')


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef
def run_clf(X, y, clf, under=False, samp_strat=1.0, seed=0):
    stats_df = pd.DataFrame()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=seed)

    if under:
        # Undersample the training data
        #print('Undersampling')
        X_res, y_res = sample_data(X_train, y_train, "under", samp_strat=samp_strat, seed=seed)
    else:
        #print('No Undersampling')
        X_res, y_res = X, y # Not subsampled - for use with class_weight='balanced'
            
#        print(f'In kfold_cv: train_y.shape = {train_y.shape}')
#        if sample_weights:
#            weights = class_weight.compute_sample_weight('balanced', y=y_res)
#            print(f'np.unique(weights): {np.unique(weights)}')
#            clf.fit(X_res, y_res, sample_weight=weights)
#        else:
#            clf.fit(X_res, y_res)
#            
    clf.fit(X_res, y_res)
    y_pred = clf.predict(X_test)
    probs = clf.predict_proba(X_test)
    proba = clf.predict_proba(X_test)[:,1]
    prob1 = probs[:,1]
    for idx in range(len(proba)):
        assert(proba[idx] == prob1[idx])
        
    mcc = matthews_corrcoef(y_pred=y_pred, y_true=y_test)
    print(f'MCC = {mcc}')

    return y_test, prob1

In [6]:
df = pd.read_csv('../../data/csl/CSL_tl_PI.csv', index_col=0)
X = df.drop('trans_loss', axis=1, inplace=False)
#X = X[corr_vars]
y = df['trans_loss'].values

In [7]:
%%time
clf = LogisticRegression(C=0.1,solver="liblinear",class_weight="balanced",random_state=7)
print(clf.get_params())
y_test_LR, prob1_LR = run_clf(X, y, clf, under=True, samp_strat=0.5)

{'C': 0.1, 'class_weight': 'balanced', 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 7, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
MCC = 0.24344305950994968
CPU times: user 3.36 s, sys: 246 ms, total: 3.6 s
Wall time: 3.01 s


In [8]:
corr_df = pd.read_csv('../../data/csl/CramerTheil/Cramer_PI_Tl_coeff_Union50.csv', index_col=0, header=None, delimiter='\t')
corr_vars = list(corr_df.index.values)
len(corr_vars)

66

In [9]:
df = pd.read_csv('../../data/csl/CSL_tl_PI.csv', index_col=0)
X = df.drop('trans_loss', axis=1, inplace=False)
X = X[corr_vars]
y = df['trans_loss'].values

In [10]:
%%time
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=128, criterion="gini",max_depth=40, min_samples_leaf=16,
                                     min_samples_split=2,max_leaf_nodes=100,max_features="auto",
                                     class_weight="balanced",random_state=7)
print(clf)
y_test_RF, prob1_RF = run_clf(X, y, clf, under=False)

RandomForestClassifier(class_weight='balanced', max_depth=40,
                       max_leaf_nodes=100, min_samples_leaf=16,
                       n_estimators=128, random_state=7)
MCC = 0.25868352792106786
CPU times: user 19.6 s, sys: 49 ms, total: 19.7 s
Wall time: 19.7 s


In [11]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=70, criterion="friedman_mse",max_depth=11, min_samples_leaf=50,
                                     min_samples_split=900,max_leaf_nodes=None,max_features=12,subsample=0.9,
                                     learning_rate=0.1,random_state=7)
print(clf.get_params())
y_test_GB, prob1_GB = run_clf(X, y, clf, under=True)

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 11, 'max_features': 12, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 50, 'min_samples_split': 900, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 70, 'n_iter_no_change': None, 'random_state': 7, 'subsample': 0.9, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
MCC = 0.2608890511902654
CPU times: user 1.98 s, sys: 2.82 ms, total: 1.99 s
Wall time: 1.98 s


In [12]:
for idx in range(len(y_test_GB)):
    assert(y_test_GB[idx] == y_test_LR[idx])
    assert(y_test_GB[idx] == y_test_RF[idx])
    assert(y_test_LR[idx] == y_test_RF[idx])

In [13]:
%%time
permutation_test_between_clfs(y_test_LR, prob1_GB, prob1_RF, nsamples=1000)

CPU times: user 29.2 s, sys: 7.49 ms, total: 29.2 s
Wall time: 29.2 s


(-0.005299695569188856, -1.5352893877671692e-05)

In [14]:
%%time
permutation_test_between_clfs(y_test_LR, prob1_LR, prob1_RF, nsamples=1000)

CPU times: user 27.6 s, sys: 2.05 ms, total: 27.6 s
Wall time: 27.6 s


(-0.02640234209095549, -7.512043906406596e-05)

In [15]:
%%time
permutation_test_between_clfs(y_test_LR, prob1_LR, prob1_GB, nsamples=1000)

CPU times: user 27.9 s, sys: 6.19 ms, total: 27.9 s
Wall time: 27.9 s


(-0.021102646521766633, -3.2724699970076876e-05)