## Model Evaluation

Fitting models to data after performing feature selection

In [86]:
import pandas as pd
import numpy as np
import os
import sys
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_score, recall_score, accuracy_score
import warnings
from matplotlib import pyplot as plt
plt.style.use('ggplot')
warnings.filterwarnings(action='ignore')
import jupyterthemes
from jupyterthemes import jtplot
jtplot.style(theme='oceans16')
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from Evaluator import Evaluator

Helper functions for evaluation and plotting

In [79]:
'''
Predict class using a model that predicts probabilities with a user 
defined threshold for classification
'''
def predict_class(model, Xtest, threshold):
    logits = model.predict_proba(Xtest)
    predictions = [1 if float(sample[1]) > threshold else 0 for sample in logits]
    return predictions


'''
Summarize model performance with precision and recall statistics
'''
def summarize_performance(model, X, y, threshold=0.5, proba=True):
    if proba:
        predictions = predict_class(model, X, threshold)
    else:
        predictions = model.predict(X)
    
    precision = precision_score(y_true=y, y_pred=predictions)
    recall = recall_score(y_true=y, y_pred=predictions)
    accuracy = accuracy_score(y_true=y, y_pred=predictions)
    print ("Model Performance:\n Precision: {}\n Recall: {}\n Accuracy: {}".format(precision, recall, accuracy))

In [80]:
'''
Pass coefficents and feature names to see most important features 
'''
def feat_importance(scores, names, n=10, one_dim=True):
    imp = scores
    if not one_dim:
        imp,names = zip(*sorted(zip(imp[0],names)))
    else:
        imp,names = zip(*sorted(zip(imp,names)))
    fig = plt.figure(num=None, figsize=(10, 7), dpi=80)
    plt.barh(range(len(names[-n:])), imp[-n:], align='center')
    plt.yticks(range(len(names[-n:])), names[-n:])
    plt.title("Most Important Features \n")
    plt.xlabel("score")
    plt.ylabel("features")
    plt.show()

'''
Display ROC curve and AUC for a given model
'''
def plot_roc_curve(model, Xtest, ytest):
    # calculate the fpr and tpr for all thresholds of the classification
    probs = model.predict_proba(Xtest)
    preds = probs[:,1]
    fpr, tpr, threshold = roc_curve(ytest, preds)
    roc_auc = auc(fpr, tpr)

    # method I: plt
    plt.title('Receiver Operating Characteristic\n')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

In [87]:
ev = Evaluator()

TypeError: __init__() takes 0 positional arguments but 1 was given

In [81]:
filepath = "../data/T2D_data/FeatureSelected/var_threshold/"

X_train = pd.read_csv(filepath + "X_train.csv", index_col=0)
y_train = pd.read_csv(filepath + "y_train.csv", index_col=0, header=None)
X_test = pd.read_csv(filepath + "X_test.csv", index_col=0)
y_test = pd.read_csv(filepath + "y_test.csv", index_col=0, header=None)

In [82]:
X_train.head()

Unnamed: 0,E051-H3K4me3.gappedPeak,E067-H3K27me3.gappedPeak,E008-H3K9me3.gappedPeak,E045-H3K27me3.gappedPeak,E093-H3K27ac.gappedPeak,hepatocyte_H3K27Ac_Rif.bed,E017-H2AK5ac.gappedPeak,E008-H3K9ac.gappedPeak,E078-H3K27ac.gappedPeak,H1-hESC_ChIP-seq_SUZ12_ENCFF001SVQ.bed,...,E003-H3K4me1.gappedPeak,E030-H3K27me3.gappedPeak,E044-H3K27me3.gappedPeak,E127-H3K4me1.gappedPeak,E021-H3K4me1.gappedPeak,E002-H3K36me3.gappedPeak,E114-H3K4me1.gappedPeak,E095-H3K4me1.gappedPeak,E084-H3K9me3.gappedPeak,snpcount
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,1,0,0,0,11
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,5
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,1,0,1,0,1,0,0,1,1,1,...,0,0,0,1,1,1,1,1,1,36
4,0,0,0,0,1,1,0,1,0,1,...,1,1,0,0,0,0,0,1,0,2


### Logistic Regression

In [83]:
log_model = LogisticRegressionCV(solver='liblinear', 
                                 penalty='l1', 
                                 class_weight='balanced',
                                 scoring='roc_auc', 
                                 cv=10,
                                 max_iter=5000,
                                 n_jobs=-1)

log_model.fit(X_train, y_train)

LogisticRegressionCV(Cs=10, class_weight='balanced', cv=10, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=5000,
           multi_class='warn', n_jobs=-1, penalty='l1', random_state=None,
           refit=True, scoring='roc_auc', solver='liblinear', tol=0.0001,
           verbose=0)

In [84]:
print("Training Dataset Logistic", end=" ")
ev.summarize_performance(log_model, X_train, y_train, threshold=0.5)

Training Dataset Logistic 

NameError: name 'predict_class' is not defined

In [None]:
print("Test Dataset Logistic", end=" ")
ev.summarize_performance(log_model, X_test, y_test, threshold=0.5)

In [None]:
ev.plot_roc_curve(log_model, X_test, y_test)

In [None]:
# Plot feature importance
coefs = log_model.coef_
feature_names = X_train.columns
ev.feat_importance(coefs, feature_names, 20, one_dim=False)

In [None]:
dir(ev)