In [1]:
from experiments.helpers import *
from experiments.learning_curve import *

  from numpy.core.umath_tests import inner1d
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# Fetch the splice dataset from the 'fau-bigdata' S3 bucket:
# X is read with load_npz (sparse matrix format), Y with np.load (.npy array).
X = sp.load_npz(
    get_s3('semi_ssd/data/splice/splice_X.npz', bucket='fau-bigdata')
)
Y = np.load(
    get_s3('semi_ssd/data/splice/splice_Y.npy', bucket='fau-bigdata')
)

In [3]:
# Sanity check of the loaded data. Judging by the cell output, it reports the
# shape of X plus the shape and per-class counts of Y.
# NOTE(review): check_data is not defined in this notebook; presumably it comes
# from one of the star imports in the first cell — confirm.
check_data(X, Y)

(4627840, 100000)
(4627840,) (array([0, 1]), array([4613291,   14549]))


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2, SelectKBest, VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Base seed; run_model offsets it by the run index.
random_state = 42
# Fraction of positive labels in the full dataset (Y holds 0/1 labels, so the
# sum is the positive count). run_model uses it to turn a positive count into
# a total sample size.
pos_ratio = Y.sum() / len(Y)

In [5]:
# Ordered estimator steps; run_model assembles them with make_pipeline(*pipeline).
pipeline = [
    # drop features whose variance is not above 0 (i.e. constant columns)
    VarianceThreshold(threshold=0),
    # scale to unit variance without centering
    StandardScaler(with_mean=False),
    # keep the 1000 features with the highest chi-squared score
    SelectKBest(chi2, k=1000),
    LogisticRegression(random_state=random_state),
]

In [6]:
def run_model(X, Y, test_pos_size, train_pos_size, pseudo_pos_size=None, run=0):
    """Fit the module-level ``pipeline`` on a sampled training set and score it by ROC AUC.

    Each ``*_pos_size`` argument is converted into a total sample size with
    ``int(pos_size / pos_ratio)`` (``pos_ratio`` is a module-level global); the
    matching negative count is the total minus the positive count.

    Steps:
      1. Split ``X``/``Y`` with ``train_test_split`` so the test part has
         ``test_total`` rows. No ``stratify`` argument is passed, so
         ``test_pos_size`` is a nominal, not guaranteed, positive count.
      2. If ``pseudo_pos_size`` is truthy, split the training part again: the
         ``pseudo_total``-row part is passed to ``pseudo_label`` as labelled
         data and the remainder as unlabelled data. The training sample is then
         drawn from the unlabelled part using the returned pseudo labels.
         Otherwise the training sample is drawn from the training part with its
         true labels.
      3. Fit ``make_pipeline(*pipeline)`` on the sample and compute
         ``roc_auc_score`` on the test part from the column-1 probabilities.

    NOTE(review): ``pseudo_label`` and ``sample_data`` are not defined in this
    notebook (presumably they come from the star imports); their exact
    semantics should be confirmed against their source.

    Parameters
    ----------
    X, Y : features and labels, forwarded to ``train_test_split``.
    test_pos_size : int
        Nominal number of positives used to size the test split.
    train_pos_size : int
        Number of positives requested from ``sample_data``.
    pseudo_pos_size : int or None
        Nominal number of positives used to size the labelled subset for
        pseudo-labelling; ``None`` (or any falsy value) disables that path.
    run : int
        Run index; added to the global ``random_state`` to seed every split
        and the sampling.

    Returns
    -------
    dict
        Keys ``run``, ``test_pos_size``, ``test_neg_size``, ``train_pos_size``,
        ``train_neg_size``, ``auc``, ``time`` (elapsed ``timedelta``), plus
        ``pseudo_pos_size`` / ``pseudo_neg_size`` when pseudo-labelling ran.
    """
    start = datetime.now()
    seed = random_state + run  # one seed per run, shared by all splits and sampling

    # get test data
    test_total = int(test_pos_size / pos_ratio)
    test_neg_size = test_total - test_pos_size

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_total, random_state=seed)

    # get train data
    train_total = int(train_pos_size / pos_ratio)
    train_neg_size = train_total - train_pos_size

    if pseudo_pos_size:  # if pseudo
        pseudo_total = int(pseudo_pos_size / pos_ratio)
        pseudo_neg_size = pseudo_total - pseudo_pos_size
        # The "test" side of this split (pseudo_total rows) is the labelled
        # subset; everything else is treated as unlabelled.
        x_unlab, x_lab, y_unlab, y_lab = train_test_split(
            x_train, y_train, test_size=pseudo_total, random_state=seed)

        y_pseudo, _ = pseudo_label(pipeline, x_lab, y_lab, x_unlab, y_unlab)
        x_train_samp, y_train_samp = sample_data(
            x_unlab, y_pseudo, train_pos_size, train_neg_size, seed)
    else:
        x_train_samp, y_train_samp = sample_data(
            x_train, y_train, train_pos_size, train_neg_size, seed)

    # evaluate model
    model = make_pipeline(*pipeline)
    model.fit(x_train_samp, y_train_samp)

    y_predict = model.predict_proba(x_test)[:, 1]
    auc = roc_auc_score(y_test, y_predict)

    # Measure once so the returned and the printed duration agree.
    elapsed = datetime.now() - start

    out = {
        'run': run,
        'test_pos_size': test_pos_size,
        'test_neg_size': test_neg_size,
        'train_pos_size': train_pos_size,
        # Was a second 'test_neg_size' key, which silently overwrote the real
        # test negative count and left the train negative count unrecorded.
        'train_neg_size': train_neg_size,
        'auc': auc,
        'time': elapsed,
    }
    if pseudo_pos_size:
        out['pseudo_pos_size'] = pseudo_pos_size
        out['pseudo_neg_size'] = pseudo_neg_size
    # NOTE(review): "Size" reports test_pos_size, which is identical for every
    # configuration in this notebook — confirm that train_pos_size was not intended.
    print(f"RUN={run} Size={test_pos_size} AUC={round(auc, 4)} Time: {elapsed}")
    return out

In [7]:
# Each tuple is one positional argument list for run_model:
# (X, Y, test_pos_size, train_pos_size, pseudo_pos_size, run) — 25 runs per setting.
small_args = [(X, Y, 2500, 50, None, run_id) for run_id in range(25)]      # 50 training positives, no pseudo-labelling
full_args = [(X, Y, 2500, 2500, None, run_id) for run_id in range(25)]     # 2500 training positives, no pseudo-labelling
pseudo_args = [(X, Y, 2500, 2500, 50, run_id) for run_id in range(25)]     # 2500 training positives, pseudo_pos_size=50
run_args = [*small_args, *full_args, *pseudo_args]

In [8]:
# Execute every configuration sequentially, collecting one result dict per run.
# Appending inside the loop keeps the results gathered so far in `out` even if
# a later run raises.
out = []
for ar in run_args:
    out.append(run_model(*ar))
# Write the collected results as a CSV to the 'fau-bigdata' S3 bucket.
# NOTE(review): to_csv_s3 and pd are not defined in this notebook; presumably
# they come from the star imports in the first cell — confirm.
to_csv_s3(pd.DataFrame(out), 'semi_ssd/results/splice_semi_compare.csv', bucket='fau-bigdata')

RUN=0 Size=2500 AUC=0.4858 Time: 0:00:21.668933
RUN=1 Size=2500 AUC=0.4835 Time: 0:00:21.551915
RUN=2 Size=2500 AUC=0.5 Time: 0:00:21.633948
RUN=3 Size=2500 AUC=0.484 Time: 0:00:21.543689
RUN=4 Size=2500 AUC=0.5032 Time: 0:00:21.501067
RUN=5 Size=2500 AUC=0.4863 Time: 0:00:21.468974
RUN=6 Size=2500 AUC=0.4933 Time: 0:00:21.633713
RUN=7 Size=2500 AUC=0.5096 Time: 0:00:21.606536
RUN=8 Size=2500 AUC=0.4744 Time: 0:00:21.578911
RUN=9 Size=2500 AUC=0.4927 Time: 0:00:21.496983
RUN=10 Size=2500 AUC=0.5106 Time: 0:00:21.577023
RUN=11 Size=2500 AUC=0.5015 Time: 0:00:21.565767
RUN=12 Size=2500 AUC=0.5037 Time: 0:00:21.715034
RUN=13 Size=2500 AUC=0.5016 Time: 0:00:21.530517
RUN=14 Size=2500 AUC=0.4897 Time: 0:00:21.490785
RUN=15 Size=2500 AUC=0.5019 Time: 0:00:21.651124
RUN=16 Size=2500 AUC=0.5079 Time: 0:00:21.544587
RUN=17 Size=2500 AUC=0.4883 Time: 0:00:21.470754
RUN=18 Size=2500 AUC=0.4989 Time: 0:00:21.436814
RUN=19 Size=2500 AUC=0.4778 Time: 0:00:21.523248
RUN=20 Size=2500 AUC=0.497 Time: 0