# Model Evaluation - Multilabel

In [2]:
import pandas as pd
import numpy as np
import boto3
import s3fs
import os
import sys
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_score, recall_score, accuracy_score, hamming_loss
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, f1_score, precision_recall_curve
import xgboost as xgb
import warnings
import itertools
from scipy import stats
from matplotlib import pyplot as plt
plt.style.use('ggplot')
warnings.filterwarnings(action='ignore')
import jupyterthemes
from jupyterthemes import jtplot
jtplot.style(theme='oceans16')
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from Evaluator import Evaluator

In [3]:
# Init class for model evaluation - detailed in Evaluation.py
ev = Evaluator()

In [4]:
filepath = "s3://voightlab-data/grouped/FeatureSelected/chi2/"

X_train = pd.read_csv(filepath + "X_train.csv", index_col=0)
y_train = pd.read_csv(filepath + "y_train.csv", index_col=0)
X_test = pd.read_csv(filepath + "X_test.csv", index_col=0)
y_test = pd.read_csv(filepath + "y_test.csv", index_col=0)

In [5]:
# If loading in feature selected datasets, need to convert labels back into binarized labels
y_train['is_t2d'] = y_train['label'].apply(lambda x: 1 if (x == 1 or x == 3) else 0)
y_train['is_lipids'] = y_train['label'].apply(lambda x: 1 if (x == 2 or x == 3) else 0)

# Do same for test data
y_test['is_t2d'] = y_test['label'].apply(lambda x: 1 if (x == 1 or x == 3) else 0)
y_test['is_lipids'] = y_test['label'].apply(lambda x: 1 if (x == 2 or x == 3) else 0)

In [6]:
# Drop the label column now that we have binarized labels
y_train = y_train.drop(['label'], axis=1)
y_test = y_test.drop(['label'], axis=1)
y_train.head()

Unnamed: 0,is_t2d,is_lipids
1804,1,0
8603,0,1
9643,0,0
5414,0,0
9989,1,1


In [7]:
# Normalize the snpcount column which is continuous, to fall between 0 and 1
if 'snpcount' in X_train.columns:
    X_train['snpcount'] = (X_train['snpcount'] - X_train['snpcount'].min()) / (X_train['snpcount'].max() - X_train['snpcount'].min())
    X_test['snpcount'] = (X_test['snpcount'] - X_test['snpcount'].min()) / (X_test['snpcount'].max() - X_test['snpcount'].min())

In [8]:
X_train.head()

Unnamed: 0,HepG2_ChIP-seq_SIN3A_ENCFF002CLA.bed,HepG2_ChIP-seq_POLR2A_ENCFF002CKX.bed,HepG2_ChIP-seq_POLR2AphosphoS5_ENCFF002CKW.bed,HepG2_ChIP-seq_POLR2A_ENCFF002DAY.bed,cardiac_mesoderm_ChIP-seq_H3K36me3_ENCSR000DSH_gappedpeak.bed,HepG2_ChIP-seq_MAZ_ENCFF002CUK.bed,HepG2_ChIP-seq_MAX_ENCFF002CUJ.bed,HepG2_ChIP-seq_BHLHE40_ENCFF002CTT.bed,HepG2_ChIP-seq_MXI1_ENCFF002CUL.bed,HepG2_ChIP-seq_EP300_ENCFF002CUN.bed,...,E118-H3K9ac.gappedPeak,HepG2.stretchEnhancers.bed,hepatocyte_p300_DMSO.bed,FAT.ADIP.NUC-EnhG,Hepg2-EnhA,Hepg2-TssA,LIV.ADLT-TssFlnkU,PancIslt-EnhA,hsap_HNF4A_hg19.bed,GSM1208630_batch1_chrom1_LoVo_HES4_PassedQC_peaks_hg19.bed
1804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8603,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,1,1,0,1,1
9643,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5414,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9989,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
'''
For multilabel problems, using subset accuracy can be a harsh metric since it requires an exact match
between the label vectors. The hamming score gives credit for partially matchin label vectors, i.e if the
model predicts some but not all of the labels correctly. 
'''
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    
    for index, value in enumerate(y_true):

        # Return indices where the label == 1
        set_true = set( np.where(y_true[index])[0] )
        set_pred = set( np.where(y_pred[index])[0] )
        row_acc = None
        
        # Hamming score equation for a given sample
        if len(set_true) == 0 and len(set_pred) == 0:
            row_acc = 1
        else:
            row_acc = len(set_true.intersection(set_pred)) / float(len(set_true.union(set_pred)))
        acc_list.append(row_acc)
        
    return np.mean(acc_list)

### ExtraTreesClassifier

The sklearn ExtraTreesClassifier is very similar to a Random Forest, with two major differences:
* When choosing variables at a split, samples are drawn from the entire training set, rather than a bootstrap sample of the training set. 
* When samples are split at a node, the split threshold is chosen at random from the possible range of values at each split.

So why use one over the other? Random forests are generally more compact but in certain situations ETC's generalize better.

In [10]:
# n_jobs = -1 allows training to be done on all available cores
etc_model = ExtraTreesClassifier(n_jobs=-1, class_weight='balanced',bootstrap=True, oob_score=True)

# Parameter grid for grid search
etc_params = {'n_estimators': stats.randint.rvs(100, 1000, size=50),
			 'max_depth': stats.randint.rvs(2, len(X_train.columns) - 2, size=15), 
             'min_samples_leaf': stats.uniform(0, 0.4),
             'class_weight':['balanced', 'balanced_subsample']}

cv = StratifiedShuffleSplit(n_splits=10, test_size=0.3)

In [11]:
# Grid search to iteratively search parameter space
etc_grid = RandomizedSearchCV(etc_model, n_iter=100, param_distributions=etc_params,
                             cv=cv, scoring='f1_micro')
etc_grid.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.2f" % (etc_grid.best_params_, etc_grid.best_score_))

etc_grid_score = etc_grid.score(X_test, y_test)
print ("Grid search's score on new test data was {}".format(etc_grid_score))

The best parameters are {'class_weight': 'balanced_subsample', 'max_depth': 2, 'min_samples_leaf': 0.16654053274230757, 'n_estimators': 610} with a score of 0.17
Grid search's score on new test data was 0.1778565999493286


In [12]:
predictions = etc_grid.predict(X_test)
print("Hamming score is {}".format(hamming_score(y_test, predictions)))
print("Hamming loss is {}".format(hamming_loss(y_test, predictions)))

KeyError: 0

In [None]:
# Use 'average' argument to calculate precision and recall across classes
etc_metrics = ev.summarize_performance(etc_grid, X_test, y_test,
                                       average='micro', 
                                       proba=True, 
                                       return_stats=True)

In [None]:
ev.plot_roc_curve(etc_grid, X_train, y_train, X_test, y_test, classes=[[0,0], [0,1], [1,0], [1,1]])