## Dump the model to Kipoi

In [1]:
import numpy as np
import m_kipoi
from copy import deepcopy
import pandas as pd
from plotnine import *
import matplotlib.pyplot as plt
from m_kipoi.config import get_data_dir
from tqdm import tqdm
ddir = get_data_dir()

  from pandas.core import datetools


In [2]:
# Use these Kipoi models
models = ["MaxEntScan/3prime", "MaxEntScan/5prime", "HAL", "labranchor"]

### Data

#### ClinVar

In [3]:
from m_kipoi.exp.splicing.data import get_clinvar_ext_Xy, get_dbscsnv_Xy

In [4]:
X_clinvar, y_clinvar = get_clinvar_ext_Xy()

In [5]:
X_clinvar['early_stop'] = X_clinvar.early_stop.astype(bool)

In [6]:
y_clinvar = y_clinvar[~X_clinvar['early_stop']]
X_clinvar = X_clinvar[~X_clinvar['early_stop']]

#### dbscSNV

In [7]:
X_dbscsnv, y_dbscsnv = get_dbscsnv_Xy()

## Features

In [8]:
kipoi_features = [x for m in models for x in X_dbscsnv.columns[X_dbscsnv.columns.str.startswith(m)]]

In [9]:
clinvar_kipoi_features = [x for m in models for x in X_clinvar.columns[X_clinvar.columns.str.startswith(m)]]

In [63]:
for x in clinvar_kipoi_features:
    print(f"{x}")

MaxEntScan/3prime_ref
MaxEntScan/3prime_alt
MaxEntScan/3prime_isna
MaxEntScan/5prime_ref
MaxEntScan/5prime_alt
MaxEntScan/5prime_isna
HAL_ref
HAL_alt
HAL_isna
labranchor_ref
labranchor_alt
labranchor_isna


In [23]:
kipoi_features

['MaxEntScan/3prime_alt',
 'MaxEntScan/3prime_diff',
 'MaxEntScan/3prime_ref',
 'MaxEntScan/3prime_isna',
 'MaxEntScan/5prime_alt',
 'MaxEntScan/5prime_ref',
 'MaxEntScan/5prime_diff',
 'MaxEntScan/5prime_isna',
 'HAL_ref',
 'HAL_alt',
 'HAL_diff',
 'HAL_isna',
 'labranchor_alt',
 'labranchor_diff',
 'labranchor_logit',
 'labranchor_logit_alt',
 'labranchor_logit_ref',
 'labranchor_ref',
 'labranchor_isna']

In [24]:
dbscsnv_model_features = ['PWM_ref', 'PWM_alt',
                          'MES_ref', 'MES_alt', 
                          'NNSplice_ref', 'NNSplice_alt', 
                          'HSF_ref', 'HSF_alt', 
                          'GeneSplicer_ref', 'GeneSplicer_alt', 
                          'GENSCAN_ref', 'GENSCAN_alt', 
                          'NetGene2_ref', 'NetGene2_alt', 
                          'SplicePredictor_ref', 'SplicePredictor_alt']

In [25]:
conservation_features = ['phyloP46way_placental', 'phyloP46way_primate', 'CADD_raw', 'CADD_phred']

In [26]:
dbscsnv_dbscsnv_feat = ['dbscSNV_rf_score', 'dbscSNV_rf_score_isna']

In [27]:
clinvar_cons_feat = ['CADD_raw', 'CADD_phred', 'phyloP46way_placental', 'phyloP46way_primate']
clinvar_dbscSNV_feat = ['dbscSNV_rf_score', 'dbscSNV_rf_score_isna']
spidex_feat = ['dpsi_max_tissue', 'dpsi_zscore', 'dpsi_max_tissue_isna', 'dpsi_zscore_isna']

In [28]:
dbscsnv_features = dbscsnv_model_features + conservation_features

## Modeling

In [29]:
# Scikit-learn imports
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn_pandas import DataFrameMapper

In [30]:
class ZeroImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return pd.DataFrame(X).fillna(0).copy()

In [31]:
def preproc(features):
    """Pre-processing pipeline
    """
    return DataFrameMapper([
            (features, [ZeroImputer(), 
                        sklearn.preprocessing.StandardScaler(),
                       ]),
        ])

In [32]:
def preproc(features):
    """Pre-processing pipeline
    """
    return DataFrameMapper([
            (features, [ZeroImputer()]),
        ])

In [45]:
X = preproc(clinvar_kipoi_features).fit_transform(X_clinvar)

In [50]:
model = Pipeline([('preproc', sklearn.preprocessing.StandardScaler()),
                  ('model', LogisticRegression())])

In [52]:
model.fit(X, y_clinvar)

Pipeline(memory=None,
     steps=[('preproc', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [58]:
from sklearn.externals import joblib

In [59]:
joblib.dump(model, '/users/avsec/.kipoi/models/KipoiSplice/4/model_files/model.pkl') 

FileNotFoundError: [Errno 2] No such file or directory: '/users/avsec/.kipoi/models/KipoiSplice/4/model_files/model.pkl'

In [62]:
joblib.dump(model, 'model.pkl') 

['model.pkl']

In [133]:
def evaluate(df, y, features, model, model_name):
        ret = cross_validate(Pipeline([('preproc', preproc(features)), 
                                       ('model', model)]), 
                             df, y, scoring=['accuracy', 'roc_auc'], cv=10, n_jobs=10, return_train_score=True)
        means = pd.DataFrame(ret).describe().loc['mean']
        means.index = "mean_" + means.index
        sds = pd.DataFrame(ret).describe().loc['std']
        sds.index = "std_" + sds.index
        return pd.DataFrame([{**dict(means), **dict(sds), "model_name": model_name}])

In [134]:
def run_model_groups(df, y, model_groups,
                     model=LogisticRegressionCV(penalty="l1", solver='liblinear', scoring="roc_auc", cv=3, n_jobs=1)):
    res = []
    use_features = []
    for mg in tqdm(model_groups):
        use_features += [f for f in df.columns if f.startswith(mg)]
        res.append(evaluate(df, y, use_features, model, model_name=mg))
    return pd.concat(res)

In [316]:
res_dbscsnv = run_model_groups(X_dbscsnv, y_dbscsnv, models, model=model)

100%|██████████| 4/4 [00:05<00:00,  1.44s/it]


In [317]:
res_dbscsnv = res_dbscsnv.append(evaluate(X_dbscsnv, y_dbscsnv, kipoi_features + conservation_features, 
                                          model=model,
                                          model_name="Kipoi4 w/ cons."))