In [1]:
import os
import os.path as osp
import pandas as pd
import numpy as np
import collections
from snorkel import SnorkelSession
from snorkel.models import Candidate, Label, LabelKey
from tcre import supervision
from tcre.supervision import SPLIT_DEV, SPLIT_TEST, SPLIT_VAL
from tcre.env import *
# Open the Snorkel database session; the helper functions defined later in this
# notebook read it as a global.
session = SnorkelSession()
# Candidate class definitions for the project (accessed below both as attributes,
# e.g. `classes.inducing_cytokine`, and by key, `classes[c]`).
classes = supervision.get_candidate_classes()

In [2]:
#candidate_class = classes.inducing_transcription_factor

In [3]:
# Load every candidate assigned to the dev, test or validation split and display the total count.
cands = session.query(Candidate).filter(Candidate.split.in_([SPLIT_DEV, SPLIT_TEST, SPLIT_VAL])).all()
len(cands)

3957

In [4]:
# Pull the (id, name) pair of every LabelKey row into a frame and summarise it.
df_label_key = pd.DataFrame(session.query(LabelKey.id, LabelKey.name).all())
df_label_key.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 2 columns):
id      66 non-null int64
name    66 non-null object
dtypes: int64(1), object(1)
memory usage: 1.1+ KB


In [5]:
df_label_key.head()

Unnamed: 0,id,name
0,1,LF_indck_comp_imexpresso_nonneg
1,2,LF_indck_comp_neg_sec
2,3,LF_indck_comp_xor
3,4,LF_indck_dsup_imexpresso
4,5,LF_indck_heur_closer_ck_to_ct


In [87]:
from snorkel.annotations import load_label_matrix
from snorkel.learning.utils import LabelBalancer

def get_features(candidate_class, split):
    """Return the label matrix for one candidate class and split as a DataFrame.

    Uses the notebook-level ``session``.

    Args:
        candidate_class: candidate class descriptor; its ``index`` attribute is
            passed to ``load_label_matrix`` as ``key_group``.
        split: split identifier forwarded to the candidate-id query and the
            matrix loader.

    Returns:
        pandas.DataFrame holding the dense label matrix. Row labels come from
        the matrix's ``row_index`` (presumably candidate ids -- confirm) and
        column labels are the LabelKey names looked up via ``col_index``.
    """
    name_by_key_id = {
        key_id: name
        for key_id, name in session.query(LabelKey.id, LabelKey.name).all()
    }
    matrix = load_label_matrix(
        session,
        split=split,
        load_as_array=False,
        cids_query=supervision.get_cids_query(session, candidate_class, split),
        key_group=candidate_class.index,
    )
    n_rows, n_cols = matrix.shape
    row_labels = [matrix.row_index[i] for i in np.arange(n_rows)]
    col_labels = [name_by_key_id[matrix.col_index[j]] for j in np.arange(n_cols)]
    return pd.DataFrame(matrix.toarray(), index=row_labels, columns=col_labels)

def get_labels(candidate_class, split):
    """Return the gold labels for ``candidate_class`` in ``split``.

    Thin wrapper that forwards the notebook-level ``session`` to
    ``supervision.get_gold_labels`` and returns its result unchanged.
    """
    return supervision.get_gold_labels(session, candidate_class, split)

def get_data(candidate_class, split, balance=.5):
    """Load features and 0/1 gold labels for one class and split, optionally rebalanced.

    Args:
        candidate_class: candidate class descriptor passed to get_features/get_labels.
        split: split identifier.
        balance: value forwarded as ``rebalance`` to ``LabelBalancer.get_train_idxs``;
            ``None`` skips rebalancing entirely.

    Returns:
        ``(X, y)`` where ``X`` is the feature frame and ``y`` the labels with
        -1 mapped to 0 and 1 kept as 1. Both share the same index.

    Raises:
        AssertionError: if any label is neither -1 nor 1 (it maps to NaN), or if
            the feature and label indexes disagree.
    """
    features = get_features(candidate_class, split)
    labels = get_labels(candidate_class, split).map({-1: 0, 1: 1})
    assert labels.notnull().all()

    if balance is not None:
        # Fixed seed so the same rows are kept on every run.
        rng = np.random.RandomState(TCRE_SEED)
        keep = LabelBalancer(labels.values).get_train_idxs(
            rebalance=balance, split=.5, rand_state=rng
        )
        features = features.iloc[keep]
        labels = labels.iloc[keep]

    assert np.all(features.index == labels.index)
    return features, labels

In [26]:
# Smoke test: balanced dev-split data for the inducing_cytokine class.
X, y = get_data(classes.inducing_cytokine, SPLIT_DEV, balance=.5)
#df = get_features(classes.secreted_cytokine, SPLIT_TEST)
#df = get_features(classes.inducing_transcription_factor, SPLIT_DEV)

In [27]:
y.value_counts(normalize=True)

1    0.5
0    0.5
dtype: float64

In [99]:
# Scratch check of how PredefinedSplit interprets `test_fold`: rows marked -1 stay in
# the training set and rows marked 0 form the single test fold.
# The import lives in this cell so the demo also runs on a fresh kernel; without it
# the name is only imported by a cell further down and a top-to-bottom run fails
# with NameError here.
from sklearn.model_selection import PredefinedSplit

# NOTE: X and y overwrite the globals of the same name assigned by an earlier cell.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
test_fold = [0, 0, -1, -1]
ps = PredefinedSplit(test_fold)
for tr, te in ps.split():
    print(tr, te)
# Second pass shows that split() can be iterated again and yields the same fold.
for tr, te in ps.split():
    print(tr, te)

[2 3] [0 1]
[2 3] [0 1]


In [106]:
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV
LogisticRegressionCV()

In [122]:
# Scratch check: fit RidgeClassifierCV with default settings on a three-sample toy problem.
est = RidgeClassifierCV()
est.fit(np.array([[0, 1], [3, 1], [0, 2]]), np.array([0, 0, 1]))

RidgeClassifierCV(alphas=array([ 0.1,  1. , 10. ]), class_weight=None, cv=None,
                  fit_intercept=True, normalize=False, scoring=None,
                  store_cv_values=False)

In [123]:
est.predict(np.array([[0, 1], [3, 1], [0, 2]]))

array([0, 0, 0])

In [None]:
# Display LogisticRegression's default hyper-parameters.
# Imported here because no earlier cell brings the name into scope, so a
# top-to-bottom run would otherwise stop with NameError on this cell.
from sklearn.linear_model import LogisticRegression
LogisticRegression()

In [143]:
# Preview of the regularisation grid (the same call builds the `C` grid in get_estimators).
# NOTE(review): the output recorded below lists 10 values, not 15 -- it appears to be
# left over from an earlier run with a different `num`; re-run the cell to refresh it.
np.logspace(-3, 2, 15)

array([1.00000000e-03, 3.59381366e-03, 1.29154967e-02, 4.64158883e-02,
       1.66810054e-01, 5.99484250e-01, 2.15443469e+00, 7.74263683e+00,
       2.78255940e+01, 1.00000000e+02])

In [179]:
import collections
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score, average_precision_score


def run_modeling(X, y, est):
    """Unfinished stub: calls get_data() and discards the result.

    NOTE(review): the parameters X, y and est are never used. The body reads
    `candidate_class` and `split` from global scope instead, so it raises
    NameError unless those globals exist, and it always returns None. Nothing
    in the visible notebook calls this function -- consider removing it.
    """
    get_data(candidate_class, split, balance=.5)
    
def get_cv_data(data):
    """Assemble train/test arrays and a one-fold CV splitter from per-split data.

    Args:
        data: mapping from split identifier to an ``(X, y)`` pair; must contain
            SPLIT_DEV, SPLIT_VAL and SPLIT_TEST.

    Returns:
        ``((X_train, y_train, X_test, y_test, X_val, y_val, X_dev, y_dev), cv)``.
        ``X_train``/``y_train`` are the DEV rows followed by the VAL rows, joined
        with ``np.concatenate``. ``cv`` is a ``PredefinedSplit`` with exactly one
        fold that trains on the DEV rows and tests on the VAL rows.

    Raises:
        AssertionError: if the splitter does not end up with exactly one fold.
    """
    dev, val, test = data[SPLIT_DEV], data[SPLIT_VAL], data[SPLIT_TEST]
    X_dev, y_dev = dev[0], dev[1]
    X_val, y_val = val[0], val[1]
    X_test, y_test = test[0], test[1]

    X_train = np.concatenate((X_dev, X_val))
    y_train = np.concatenate((y_dev, y_val))

    # -1 keeps a row out of every test fold (DEV); 0 assigns it to the single
    # test fold (VAL). Order matches the concatenation above.
    fold = np.concatenate((
        np.full(len(X_dev), -1, dtype=int),
        np.zeros(len(X_val), dtype=int),
    ))
    cv = PredefinedSplit(fold)
    assert cv.get_n_splits() == 1

    arrays = (X_train, y_train, X_test, y_test, X_val, y_val, X_dev, y_dev)
    return arrays, cv
    
def get_estimators(cv):
    """Build the named grid-search estimators compared in this notebook.

    Args:
        cv: cross-validation splitter handed to every ``GridSearchCV``.

    Returns:
        ``collections.OrderedDict`` mapping, in this order:
        ``'gbr'`` (grid-searched gradient boosting), ``'ridge'`` / ``'ridge2'``
        (L2 logistic regression, without / with degree-2 polynomial features)
        and ``'lasso'`` / ``'lasso2'`` (the L1 equivalents). The logistic models
        sit behind a ``StandardScaler`` inside a ``Pipeline``.
    """
    def scorer(est, X, y_true):
        # F1 on hard predictions. A model that predicts one class for every
        # sample gets NaN rather than an F1 value.
        y_pred = np.squeeze(est.predict(X))
        y_true = np.squeeze(y_true)
        for values in (y_pred, y_true):
            assert np.all(np.in1d(values, [0, 1]))
        if np.unique(y_pred).size < 2:
            return np.nan
        return f1_score(y_true, y_pred)

    def logistic_search(penalty, solver):
        # Grid search over the inverse regularisation strength C.
        model = LogisticRegression(random_state=TCRE_SEED, penalty=penalty, solver=solver)
        return GridSearchCV(model, param_grid=dict(C=np.logspace(-3, 2, 15)), cv=cv, scoring=scorer)

    def linear_pipeline(penalty, solver, quadratic=False):
        # Scale, optionally expand to degree-2 polynomial terms, then search.
        steps = [('normalize', StandardScaler())]
        if quadratic:
            steps.append(('feat', PolynomialFeatures(degree=2)))
        steps.append(('est', logistic_search(penalty, solver)))
        return Pipeline(steps)

    gbr_search = GridSearchCV(
        GradientBoostingClassifier(random_state=TCRE_SEED),
        param_grid=dict(
            learning_rate=[.1, .05, .01],
            max_depth=[1, 3, 5],
            min_samples_leaf=[1, 3],
        ),
        cv=cv,
        scoring=scorer,
    )

    return collections.OrderedDict([
        ('gbr', gbr_search),
        ('ridge', linear_pipeline('l2', 'lbfgs')),
        ('ridge2', linear_pipeline('l2', 'lbfgs', quadratic=True)),
        ('lasso', linear_pipeline('l1', 'liblinear')),
        ('lasso2', linear_pipeline('l1', 'liblinear', quadratic=True)),
    ])

def get_scores(y_true, y_pred, y_proba):
    """Compute the evaluation metrics reported for each model.

    Args:
        y_true: ground-truth labels.
        y_pred: hard class predictions; used for f1, accuracy, precision, recall.
        y_proba: scores for the positive class; used for roc_auc and pr_auc.

    Returns:
        dict with keys ``f1``, ``accuracy``, ``precision``, ``recall``,
        ``roc_auc`` and ``pr_auc`` (in that order).
    """
    label_metrics = [
        ('f1', f1_score),
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
    ]
    scores = {}
    for name, metric in label_metrics:
        scores[name] = metric(y_true, y_pred)
    scores['roc_auc'] = roc_auc_score(y_true, y_proba)
    scores['pr_auc'] = average_precision_score(y_true, y_proba)
    return scores
    
def process(candidate_class, balance=.5):
    """Fit and score every estimator from get_estimators() for one candidate class.

    Args:
        candidate_class: candidate class descriptor; its ``field`` attribute is
            used as the class name in log lines and results.
        balance: forwarded to get_data() for all three splits (``None`` disables
            rebalancing).

    Returns:
        list of dicts, one per estimator, with keys ``type``, ``est_name``,
        ``est`` (the fitted estimator), ``dev_scores``, ``validation_scores``
        and ``test_scores`` (each a get_scores() dict).

    NOTE(review): every estimator is fit on X_train, which get_cv_data() builds
    from the DEV and VAL rows. If the grid searches refit on the full training
    data (believed to be the scikit-learn default -- confirm), the dev and
    validation scores are in-sample.
    """
    splits = [SPLIT_DEV, SPLIT_VAL, SPLIT_TEST]
    data = {split: get_data(candidate_class, split, balance=balance) for split in splits}
    (X_train, y_train, X_test, y_test, X_val, y_val, X_dev, y_dev), cv = get_cv_data(data)
    ests = get_estimators(cv)
    res = []

    def score(est, X, y):
        # Hard predictions feed the label metrics; column 1 of predict_proba is
        # passed on as the positive-class score.
        y_pred = est.predict(X)
        y_proba = est.predict_proba(X)
        assert y_proba.ndim == 2
        assert y_proba.shape[1] == 2
        y_proba = y_proba[:, 1]
        return get_scores(y, y_pred, y_proba)

    for k, est in ests.items():
        print(f'Processing model {k} for class {candidate_class.field}')
        # Fix: the loop variable used to be reset to None right here, so the
        # fit() call below always failed with AttributeError on NoneType.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=ConvergenceWarning)
            fitted = est.fit(X_train, y_train)
        res.append(dict(
            type=candidate_class.field, est_name=k, est=fitted,
            dev_scores=score(fitted, X_dev, y_dev),
            validation_scores=score(fitted, X_val, y_val),
            test_scores=score(fitted, X_test, y_test)
        ))
    return res

In [180]:
# Run process() for every candidate class and flatten the per-class result lists
# into a single list of result dicts (one per class/model pair).
res = [
    r 
    for c in classes
    for r in process(classes[c])
]

Processing model gbr for class inducing_cytokine
Processing model ridge for class inducing_cytokine
Processing model ridge2 for class inducing_cytokine
Processing model lasso for class inducing_cytokine
Processing model lasso2 for class inducing_cytokine
Processing model gbr for class secreted_cytokine
Processing model ridge for class secreted_cytokine
Processing model ridge2 for class secreted_cytokine
Processing model lasso for class secreted_cytokine
Processing model lasso2 for class secreted_cytokine
Processing model gbr for class inducing_transcription_factor
Processing model ridge for class inducing_transcription_factor
Processing model ridge2 for class inducing_transcription_factor
Processing model lasso for class inducing_transcription_factor
Processing model lasso2 for class inducing_transcription_factor


In [174]:
# One row per (class, model, split): identifying columns followed by that split's metrics.
# Fixes over the previous version of this cell:
#   * process() stores scores under '<split>_scores' keys ('validation_scores',
#     'test_scores'), so indexing with the bare split name raised KeyError;
#   * Series.append no longer exists in pandas 2.x, so the two Series are joined
#     with pd.concat instead, which gives the same result.
dfs = pd.concat([
    pd.DataFrame([
        pd.concat([
            pd.Series(dict(type=r['type'], est_name=r['est_name'], split=split)),
            pd.Series(r[f'{split}_scores'])
        ])
        for r in res
    ])
    for split in ['validation', 'test']
])
dfs.head()

Unnamed: 0,type,est_name,split,f1,accuracy,precision,recall,roc_auc,pr_auc
0,inducing_cytokine,gbr,validation,0.363636,0.5,0.5,0.285714,0.520408,0.571006
1,inducing_cytokine,ridge,validation,0.363636,0.5,0.5,0.285714,0.484694,0.572957
2,inducing_cytokine,ridge2,validation,0.444444,0.464286,0.461538,0.428571,0.586735,0.663551
3,inducing_cytokine,lasso,validation,0.363636,0.5,0.5,0.285714,0.545918,0.583367
4,inducing_cytokine,lasso2,validation,0.5,0.571429,0.6,0.428571,0.535714,0.626056


In [175]:
dfs

Unnamed: 0,type,est_name,split,f1,accuracy,precision,recall,roc_auc,pr_auc
0,inducing_cytokine,gbr,validation,0.363636,0.5,0.5,0.285714,0.520408,0.571006
1,inducing_cytokine,ridge,validation,0.363636,0.5,0.5,0.285714,0.484694,0.572957
2,inducing_cytokine,ridge2,validation,0.444444,0.464286,0.461538,0.428571,0.586735,0.663551
3,inducing_cytokine,lasso,validation,0.363636,0.5,0.5,0.285714,0.545918,0.583367
4,inducing_cytokine,lasso2,validation,0.5,0.571429,0.6,0.428571,0.535714,0.626056
5,secreted_cytokine,gbr,validation,0.763158,0.75,0.725,0.805556,0.844907,0.795004
6,secreted_cytokine,ridge,validation,0.736842,0.722222,0.7,0.777778,0.827546,0.792029
7,secreted_cytokine,ridge2,validation,0.794872,0.777778,0.738095,0.861111,0.876157,0.84005
8,secreted_cytokine,lasso,validation,0.763158,0.75,0.725,0.805556,0.849923,0.79869
9,secreted_cytokine,lasso2,validation,0.738462,0.763889,0.827586,0.666667,0.848765,0.793121


In [75]:
# Select the evaluation split and candidate class used by the manual baseline cells that follow.
split = SPLIT_VAL
candidate_class = classes.secreted_cytokine

In [76]:
# Load balanced (X, y) pairs for all three splits of the selected class.
# The comprehension variable is also called `split`, but in Python 3 it is local to
# the comprehension, so the global `split` keeps its value.
splits = [SPLIT_DEV, SPLIT_VAL, SPLIT_TEST]
data = {split: get_data(candidate_class, split, balance=.5) for split in splits}

In [77]:
from sklearn.metrics import f1_score

In [78]:
from sklearn.ensemble import GradientBoostingClassifier
# Seed the baseline like the gradient-boosting model in get_estimators(), so that
# re-running the notebook reproduces the same fitted model and score.
est = GradientBoostingClassifier(random_state=TCRE_SEED)

In [79]:
# Train on the dev split only; `.values` hands the underlying arrays to the estimator.
est = est.fit(data[SPLIT_DEV][0].values, data[SPLIT_DEV][1].values)

In [80]:
# Hard predictions for the split held in the global `split`.
yp = est.predict(data[split][0].values)

In [81]:
# F1 of the fitted model next to the F1 of a trivial "always predict 1" baseline.
f1_score(data[split][1].values, yp), f1_score(data[split][1].values, np.ones(len(yp)))

(0.7123287671232876, 0.6666666666666666)