In [1]:
import os
import os.path as osp
import pandas as pd
import numpy as np
import collections
from snorkel import SnorkelSession
from snorkel.models import Candidate, Label, LabelKey
from tcre import supervision
from tcre.supervision import SPLIT_DEV, SPLIT_TEST, SPLIT_VAL
from tcre.env import *
# Create the SnorkelSession used by every query in the cells below.
session = SnorkelSession()
# Candidate classes as provided by tcre.supervision; later cells access them
# both by attribute (classes.inducing_cytokine) and by key (classes[c]).
classes = supervision.get_candidate_classes()

In [2]:
# Fetch every candidate assigned to the DEV, TEST or VAL split and show the count.
cands = (
    session.query(Candidate)
    .filter(Candidate.split.in_([SPLIT_DEV, SPLIT_TEST, SPLIT_VAL]))
    .all()
)
len(cands)

3957

In [3]:
# Tabulate the (id, name) pairs of all label keys and summarise the frame.
df_label_key = pd.DataFrame(
    session.query(LabelKey.id, LabelKey.name).all()
)
df_label_key.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 2 columns):
id      80 non-null int64
name    80 non-null object
dtypes: int64(1), object(1)
memory usage: 1.3+ KB


In [4]:
# Preview the first few label keys (labeling-function names).
df_label_key.head()

Unnamed: 0,id,name
0,1,LF_indck_comp_imexpresso_nonneg
1,2,LF_indck_comp_neg_sec
2,3,LF_indck_comp_xor
3,4,LF_indck_dsup_imexpresso
4,5,LF_indck_heur_closer_ck_to_ct


In [5]:
from snorkel.annotations import load_label_matrix
from snorkel.learning.utils import LabelBalancer

def get_features(candidate_class, split):
    """Load the label matrix for one candidate class and split as a DataFrame.

    Args:
        candidate_class: Candidate class descriptor; its ``index`` attribute
            is passed to ``load_label_matrix`` as ``key_group``.
        split: Split identifier forwarded to the candidate-id query and to
            ``load_label_matrix``.

    Returns:
        pandas.DataFrame holding the dense label matrix. Rows are indexed by
        the values of the matrix's ``row_index`` mapping and columns are named
        after the ``LabelKey`` matching each ``col_index`` entry.
    """
    # Map label-key ids to their names so columns can be labelled readably.
    name_by_key_id = {
        key_id: key_name
        for key_id, key_name in session.query(LabelKey.id, LabelKey.name).all()
    }
    label_matrix = load_label_matrix(
        session,
        split=split,
        load_as_array=False,
        cids_query=supervision.get_cids_query(session, candidate_class, split),
        key_group=candidate_class.index,
    )
    n_rows, n_cols = label_matrix.shape[0], label_matrix.shape[1]
    row_labels = [label_matrix.row_index[pos] for pos in range(n_rows)]
    col_labels = [name_by_key_id[label_matrix.col_index[pos]] for pos in range(n_cols)]
    return pd.DataFrame(label_matrix.toarray(), columns=col_labels, index=row_labels)

def get_labels(candidate_class, split):
    """Return the gold labels for ``candidate_class`` in ``split``.

    Thin wrapper around ``supervision.get_gold_labels`` bound to the
    notebook-level ``session``.
    """
    return supervision.get_gold_labels(session, candidate_class, split)

def get_data(candidate_class, split, balance=.5):
    """Load features and 0/1 labels for one class and split, optionally rebalanced.

    Args:
        candidate_class: Candidate class descriptor passed to ``get_features``
            and ``get_labels``.
        split: Split identifier.
        balance: Value passed as ``rebalance`` to
            ``LabelBalancer.get_train_idxs``; ``None`` skips rebalancing and
            keeps every row.

    Returns:
        Tuple ``(X, y)`` of features and labels with identical indexes.

    Raises:
        AssertionError: If a gold label is something other than -1 or 1 (it
            maps to null), or if the feature and label indexes differ.
    """
    features = get_features(candidate_class, split)
    # Gold labels arrive as -1 / 1; recode to 0 / 1. Any other value becomes null.
    labels = get_labels(candidate_class, split).map({-1: 0, 1: 1})
    assert labels.notnull().all()
    if balance is not None:
        # Seeded generator so the retained subset is the same on every run.
        rng = np.random.RandomState(TCRE_SEED)
        keep_idx = LabelBalancer(labels.values).get_train_idxs(
            rebalance=balance, split=.5, rand_state=rng
        )
        features = features.iloc[keep_idx]
        labels = labels.iloc[keep_idx]
    assert np.all(features.index == labels.index)
    return features, labels

In [6]:
# Sanity check: load the DEV data for one class with the default balance
# setting and inspect the resulting class counts.
X, y = get_data(classes.inducing_cytokine, SPLIT_DEV)
y.value_counts()

1    137
0    137
dtype: int64

In [12]:
import collections
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score, average_precision_score


def run_modeling(X, y, est):
    """Placeholder for a single-estimator modeling routine (not implemented).

    The previous body ignored all three parameters and called
    ``get_data(candidate_class, split, balance=.5)`` with names that are
    neither parameters nor defined in the visible notebook, so every call
    failed with ``NameError``. The intended behaviour cannot be recovered from
    the notebook, so the stub now fails explicitly. Use ``process`` for the
    working end-to-end routine.

    Args:
        X: Feature matrix (unused).
        y: Label vector (unused).
        est: Estimator (unused).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError(
        'run_modeling is an unfinished stub; use process(candidate_class, balance) instead'
    )
    
def get_cv_data(data):
    """Assemble training / evaluation arrays and a single-fold CV splitter.

    Args:
        data: Mapping from split identifier (``SPLIT_DEV``, ``SPLIT_VAL``,
            ``SPLIT_TEST``) to an ``(X, y)`` pair.

    Returns:
        Tuple ``(arrays, cv)`` where ``arrays`` is
        ``(X_train, y_train, X_test, y_test, X_val, y_val, X_dev, y_dev)``,
        with the training pair being DEV followed by VAL concatenated through
        numpy, and ``cv`` is a ``PredefinedSplit`` with exactly one fold.
    """
    dev_pair, val_pair, test_pair = data[SPLIT_DEV], data[SPLIT_VAL], data[SPLIT_TEST]
    X_dev, y_dev = dev_pair[0], dev_pair[1]
    X_val, y_val = val_pair[0], val_pair[1]
    X_test, y_test = test_pair[0], test_pair[1]
    n_dev, n_val = len(X_dev), len(X_val)

    X_train = np.concatenate((X_dev, X_val))
    y_train = np.concatenate((y_dev, y_val))

    # One fold only: -1 keeps the DEV rows in the training set for every split,
    # while 0 places the VAL rows in the single test fold.
    fold = np.zeros(n_dev + n_val, dtype=int)
    fold[:n_dev] = -1
    cv = PredefinedSplit(fold)
    assert cv.get_n_splits() == 1

    arrays = (X_train, y_train, X_test, y_test, X_val, y_val, X_dev, y_dev)
    return arrays, cv
    
def get_estimators(cv):
    """Build the named collection of grid-searched estimators to compare.

    Args:
        cv: Cross-validation splitter handed to every ``GridSearchCV``.

    Returns:
        collections.OrderedDict mapping estimator name to an unfitted
        estimator, in the order ``gbr``, ``xgb``, ``ridge``, ``ridge2``,
        ``lasso``, ``lasso2``. The two tree ensembles are bare grid searches;
        the logistic models are pipelines that standardise the features
        (adding degree-2 polynomial terms for the ``*2`` variants) before a
        grid search over ``C``.
    """
    def scorer(est, X, y_true):
        # F1 on binary predictions; a constant prediction scores NaN instead.
        predicted = np.squeeze(est.predict(X))
        expected = np.squeeze(y_true)
        assert np.all(np.in1d(predicted, [0, 1]))
        assert np.all(np.in1d(expected, [0, 1]))
        if len(np.unique(predicted)) < 2:
            return np.nan
        return f1_score(expected, predicted)

    def get_linear_model_gs(est):
        # Grid search over the regularisation strength C only.
        c_grid = dict(C=np.logspace(-3, 2, 15))
        return GridSearchCV(est, param_grid=c_grid, cv=cv, scoring=scorer)

    def logistic_search(penalty, solver):
        # Grid-searched logistic regression with the given penalty and solver.
        model = LogisticRegression(random_state=TCRE_SEED, penalty=penalty, solver=solver)
        return get_linear_model_gs(model)

    def scaled_pipeline(search, quadratic=False):
        # Standardise, optionally expand to degree-2 terms, then run the search.
        steps = [('normalize', StandardScaler())]
        if quadratic:
            steps.append(('feat', PolynomialFeatures(degree=2)))
        steps.append(('est', search))
        return Pipeline(steps)

    gbr_grid = dict(
        n_estimators=[50, 100],
        learning_rate=[.1, .05, .01],
        max_depth=[1, 3, 5],
        min_samples_leaf=[1, 3]
    )
    xgb_grid = dict(
        n_estimators=[50, 100, 200],
        learning_rate=[.1, .05, .01],
        max_depth=[1, 3, 5]
    )

    ests = collections.OrderedDict()
    ests['gbr'] = GridSearchCV(
        GradientBoostingClassifier(random_state=TCRE_SEED),
        param_grid=gbr_grid, cv=cv, scoring=scorer
    )
    ests['xgb'] = GridSearchCV(
        XGBClassifier(random_state=TCRE_SEED),
        param_grid=xgb_grid, cv=cv, scoring=scorer
    )
    ests['ridge'] = scaled_pipeline(logistic_search('l2', 'lbfgs'))
    ests['ridge2'] = scaled_pipeline(logistic_search('l2', 'lbfgs'), quadratic=True)
    ests['lasso'] = scaled_pipeline(logistic_search('l1', 'liblinear'))
    ests['lasso2'] = scaled_pipeline(logistic_search('l1', 'liblinear'), quadratic=True)
    return ests

def get_scores(y_true, y_pred, y_proba):
    """Compute the classification metrics reported for one evaluation split.

    Args:
        y_true: True binary labels.
        y_pred: Predicted labels, used for f1 / accuracy / precision / recall.
        y_proba: Scores for the positive class, used for the ROC and
            precision-recall summaries.

    Returns:
        dict with keys ``f1``, ``accuracy``, ``precision``, ``recall``,
        ``roc_auc`` and ``pr_auc``, in that order.
    """
    label_metrics = (
        ('f1', f1_score),
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    ranking_metrics = (
        ('roc_auc', roc_auc_score),
        ('pr_auc', average_precision_score),
    )
    scores = {}
    for metric_name, metric_fn in label_metrics:
        scores[metric_name] = metric_fn(y_true, y_pred)
    for metric_name, metric_fn in ranking_metrics:
        scores[metric_name] = metric_fn(y_true, y_proba)
    return scores
    
def process(candidate_class, balance=.5):
    """Fit every estimator from ``get_estimators`` for one class and score it.

    Data for the DEV, VAL and TEST splits is loaded with ``get_data``. Each
    estimator is fit on the combined DEV + VAL rows returned by
    ``get_cv_data`` (the accompanying ``cv`` splitter is what the grid
    searches use internally) and is then scored on DEV, VAL and TEST
    separately.

    Args:
        candidate_class: Candidate class descriptor; its ``field`` attribute
            is used as the task name in progress messages and results.
        balance: Forwarded to ``get_data`` for every split.

    Returns:
        list of dict, one per estimator, with keys ``task``, ``est_name``,
        ``est`` (the fitted estimator), ``dev_scores``, ``validation_scores``
        and ``test_scores`` (each a dict produced by ``get_scores``).
    """
    splits = [SPLIT_DEV, SPLIT_VAL, SPLIT_TEST]
    data = {split: get_data(candidate_class, split, balance=balance) for split in splits}
    (X_train, y_train, X_test, y_test, X_val, y_val, X_dev, y_dev), cv = get_cv_data(data)
    ests = get_estimators(cv)
    res = []

    def score(est, X, y):
        # The estimators are fit on plain ndarrays (get_cv_data concatenates
        # with numpy), but the per-split inputs are still pandas objects.
        # Predict on the same representation used for fitting so estimators
        # that validate feature names do not warn or fail on the mismatch.
        X, y = np.asarray(X), np.asarray(y)
        y_pred = est.predict(X)
        y_proba = est.predict_proba(X)
        assert y_proba.ndim == 2
        assert y_proba.shape[1] == 2
        # Keep only the probability of the positive class (column 1).
        y_proba = y_proba[:, 1]
        return get_scores(y, y_pred, y_proba)

    for k, est in ests.items():
        print(f'Processing model {k} for class {candidate_class.field}')
        with warnings.catch_warnings():
            # Some points on the C grid do not converge; silence only that warning.
            warnings.simplefilter('ignore', category=ConvergenceWarning)
            est = est.fit(X_train, y_train)
        res.append(dict(
            task=candidate_class.field, est_name=k, est=est,
            dev_scores=score(est, X_dev, y_dev),
            validation_scores=score(est, X_val, y_val),
            test_scores=score(est, X_test, y_test)
        ))
    return res

In [None]:
# Run the full model comparison for every candidate class and flatten the
# per-class result lists into one list of result dicts.
# NOTE(review): this cell has no execution count in the saved notebook although
# later cells read `res`; re-run top to bottom before trusting the saved outputs.
# NOTE(review): iterating `classes` is presumed to yield keys usable as
# `classes[c]` -- confirm against supervision.get_candidate_classes().
res = [
    r 
    for c in classes
    for r in process(classes[c])
]

In [9]:
# Reshape the results into one long frame: one row per (task, estimator, split)
# holding the identifying fields followed by that split's metric values.
# pd.concat of the two Series replaces Series.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; the resulting row is the same.
dfs = pd.concat([
    pd.DataFrame([
        pd.concat([
            pd.Series(dict(task=r['task'], est_name=r['est_name'], split=split)),
            pd.Series(r[split + '_scores'])
        ])
        for r in res
    ])
    for split in ['validation', 'test', 'dev']
])
dfs.head()

Unnamed: 0,type,est_name,split,f1,accuracy,precision,recall,roc_auc,pr_auc
0,inducing_cytokine,gbr,validation,0.56,0.607143,0.636364,0.5,0.612245,0.630464
1,inducing_cytokine,ridge,validation,0.592593,0.607143,0.615385,0.571429,0.737245,0.800234
2,inducing_cytokine,ridge2,validation,0.72,0.75,0.818182,0.642857,0.910714,0.916539
3,inducing_cytokine,lasso,validation,0.592593,0.607143,0.615385,0.571429,0.737245,0.800234
4,inducing_cytokine,lasso2,validation,0.814815,0.821429,0.846154,0.785714,0.941327,0.940128


In [10]:
# Display the complete results frame (every task, estimator and split).
dfs

Unnamed: 0,type,est_name,split,f1,accuracy,precision,recall,roc_auc,pr_auc
0,inducing_cytokine,gbr,validation,0.56,0.607143,0.636364,0.5,0.612245,0.630464
1,inducing_cytokine,ridge,validation,0.592593,0.607143,0.615385,0.571429,0.737245,0.800234
2,inducing_cytokine,ridge2,validation,0.72,0.75,0.818182,0.642857,0.910714,0.916539
3,inducing_cytokine,lasso,validation,0.592593,0.607143,0.615385,0.571429,0.737245,0.800234
4,inducing_cytokine,lasso2,validation,0.814815,0.821429,0.846154,0.785714,0.941327,0.940128
5,secreted_cytokine,gbr,validation,0.842105,0.833333,0.8,0.888889,0.895448,0.874389
6,secreted_cytokine,ridge,validation,0.883117,0.875,0.829268,0.944444,0.896605,0.872567
7,secreted_cytokine,ridge2,validation,0.864198,0.847222,0.777778,0.972222,0.911265,0.891907
8,secreted_cytokine,lasso,validation,0.846154,0.833333,0.785714,0.916667,0.894676,0.867076
9,secreted_cytokine,lasso2,validation,0.821918,0.819444,0.810811,0.833333,0.890818,0.863173
