In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.style as style
import numpy as np
import scipy.stats as spstat
import seaborn as sns
import math

def create_classic():
    """Activate matplotlib's 'classic' style with this notebook's figure defaults.

    Reloads the style library, switches to the 'classic' style sheet and then
    overrides three figure settings: a white, fully transparent face colour,
    a 6x4 inch figure size and 100 DPI.
    """
    style.reload_library()
    style.use('classic')
    # Applied after style.use() so they win over the style sheet's own values.
    figure_defaults = {
        'figure.facecolor': (1, 1, 1, 0),
        'figure.figsize': [6.0, 4.0],
        'figure.dpi': 100,
    }
    for key, value in figure_defaults.items():
        mpl.rcParams[key] = value
    
def create_ggplot():
    """Activate matplotlib's 'ggplot' style with this notebook's figure defaults.

    Reloads the style library, switches to the 'ggplot' style sheet and then
    overrides three figure settings: a white, fully transparent face colour,
    a 6x4 inch figure size and 100 DPI.

    The earlier version of this function selected 'classic', which made it
    indistinguishable from ``create_classic``.
    """
    style.reload_library()
    style.use('ggplot')
    # Applied after style.use() so they win over the style sheet's own values.
    mpl.rcParams['figure.facecolor'] = (1, 1, 1, 0)
    mpl.rcParams['figure.figsize'] = [6.0, 4.0]
    mpl.rcParams['figure.dpi'] = 100

In [7]:
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, roc_auc_score
import io
import numpy as np
import numpy.ma as ma
from io import StringIO
import boto3
import re

def write_labels_scores(labels, scores, file):
    """Pickle paired labels and scores as a two-column DataFrame.

    Args:
        labels: iterable of labels.
        scores: iterable of scores, paired positionally with ``labels``
            (pairing stops at the shorter input, as ``zip`` does).
        file: destination handed to ``DataFrame.to_pickle``.
    """
    pairs = [(label, score) for label, score in zip(labels, scores)]
    pd.DataFrame(pairs, columns=['label', 'score']).to_pickle(file)

def read_labels_scores(file):
    """Load the 'label' and 'score' columns from a pickled DataFrame.

    Args:
        file: source handed to ``pandas.read_pickle``. Unpickling can execute
            arbitrary code, so only pass files from a trusted origin.

    Returns:
        A ``(labels, scores)`` tuple holding the two columns' ``.values``.
    """
    frame = pd.read_pickle(file)
    return frame["label"].values, frame["score"].values

def write_tpr_fpr(tpr, fpr, file):
    """Pickle paired TPR/FPR values as a two-column DataFrame.

    Args:
        tpr: iterable of true-positive-rate values.
        fpr: iterable of false-positive-rate values, paired positionally
            with ``tpr`` (pairing stops at the shorter input, as ``zip`` does).
        file: destination handed to ``DataFrame.to_pickle``.
    """
    points = [(true_rate, false_rate) for true_rate, false_rate in zip(tpr, fpr)]
    pd.DataFrame(points, columns=['tpr', 'fpr']).to_pickle(file)
    
def write_tpr_fpr_s3(tpr, fpr, file):
    """Upload paired TPR/FPR values to S3 as CSV text.

    The values are zipped into a DataFrame with columns 'tpr' and 'fpr',
    serialised with ``DataFrame.to_csv`` (default options, so the index is
    written as the first column) and stored with boto3 under key ``file``.

    Args:
        tpr: iterable of true-positive-rate values.
        fpr: iterable of false-positive-rate values, paired with ``tpr``.
        file: S3 object key to write.

    Note:
        The bucket comes from the module-level name ``bucket_name``, which is
        not defined in this cell and must exist before the call.
    """
    frame = pd.DataFrame(list(zip(tpr, fpr)), columns=['tpr', 'fpr'])
    buffer = StringIO()
    frame.to_csv(buffer)
    payload = buffer.getvalue()
    target = boto3.resource('s3').Object(bucket_name, file)
    target.put(Body=payload)

def read_tpr_fpr(file):
    """Load the 'tpr' and 'fpr' columns from a pickled DataFrame.

    Prints the frame's columns before extracting the values.

    Args:
        file: source handed to ``pandas.read_pickle``. Unpickling can execute
            arbitrary code, so only pass files from a trusted origin.

    Returns:
        A ``(tpr, fpr)`` tuple holding the two columns' ``.values``.
    """
    frame = pd.read_pickle(file)
    print(frame.columns)
    return frame["tpr"].values, frame["fpr"].values

def read_tpr_fpr_s3(file, awsAccessKeyId=None, awsSecretKey=None):
    """Read a CSV object from S3 and return its 'tpr' and 'fpr' columns.

    Args:
        file: S3 key of the CSV object to read.
        awsAccessKeyId: optional access key id. When None, ``boto.connect_s3``
            is called with no arguments.
        awsSecretKey: secret key passed along with ``awsAccessKeyId``.

    Returns:
        A ``(tpr, fpr)`` tuple holding the two columns' ``.values``.

    Note:
        The bucket comes from the module-level name ``bucket_name``, which is
        not defined in this cell and must exist before the call.
    """
    import boto
    credentials = () if awsAccessKeyId is None else (awsAccessKeyId, awsSecretKey)
    connection = boto.connect_s3(*credentials)
    frame = pd.read_csv(connection.get_bucket(bucket_name).get_key(file))
    return frame['tpr'].values, frame['fpr'].values

def read_tpr_fpr_s3_files(path, awsAccessKeyId=None, awsSecretKey=None):
    """Build a ROC curve from every label/score CSV under an S3 key prefix.

    All objects in the bucket whose key matches ``path`` at the start are read
    as CSV, stacked into one frame and fed to ``roc_curve``. Progress
    information (frame shapes and array lengths) is printed along the way.

    Args:
        path: key prefix. It is compiled as a regular expression anchored at
            the start of the key, so regex metacharacters in it are active.
        awsAccessKeyId: optional access key id. When None, ``boto.connect_s3``
            is called with no arguments.
        awsSecretKey: secret key passed along with ``awsAccessKeyId``.

    Returns:
        A ``(tpr, fpr)`` tuple as computed by ``roc_curve``.

    Raises:
        KeyError: when no key matches ``path``, or the combined data lacks a
            'label' or 'score' column.

    Note:
        The bucket comes from the module-level name ``bucket_name``, which is
        not defined in this cell and must exist before the call.
    """
    import boto
    if awsAccessKeyId is None:
        conn = boto.connect_s3()
    else:
        conn = boto.connect_s3(awsAccessKeyId, awsSecretKey)
    bucket = conn.get_bucket(bucket_name)
    regex = re.compile('^' + path)
    selected_files = [name for name in (f.name for f in bucket) if regex.match(name)]

    # Read everything first and concatenate once: DataFrame.append no longer
    # exists in pandas 2.x, and appending inside a loop copied the accumulated
    # frame on every iteration.
    frames = [pd.read_csv(bucket.get_key(name)) for name in selected_files]
    if not frames:
        raise KeyError('no S3 keys in bucket %r match %r' % (bucket_name, path))
    print(frames[0].shape)
    df = pd.concat(frames)

    labels = df['label'].values
    scores = df['score'].values
    print('df shape', df.shape, 'labels', len(labels), 'scores', len(scores))
    if 'weight' in df.columns:
        weights = df['weight'].values
        # NOTE(review): unlike the unweighted branch this keeps roc_curve's
        # default drop_intermediate setting - confirm that is intended.
        fpr, tpr, _ = roc_curve(labels, scores, sample_weight=weights)
        print('weights', len(weights), 'tpr', len(tpr), 'fpr', len(fpr))
    else:
        fpr, tpr, _ = roc_curve(labels, scores, drop_intermediate=False)
        print('tpr', len(tpr), 'fpr', len(fpr))
    return tpr, fpr

In [4]:
def plot_precision_recall_curve(labels, scores, **kwargs):
    """Plot and show the precision-recall curve for ``labels`` and ``scores``.

    Draws a shaded step curve of precision against recall on the current
    matplotlib figure, fixes both axes to [0, 1] and calls ``plt.show()``.

    Args:
        labels: ground-truth labels, passed to ``precision_recall_curve``.
        scores: predicted scores, passed to ``precision_recall_curve``.
        **kwargs: accepted for signature compatibility; currently unused.
    """
    # Only the precision/recall pairs are needed; the ROC curve that used to
    # be computed here was never read.
    precision, recall, _ = precision_recall_curve(labels, scores)
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.show()

def plot_roc_curve(labels, scores, **kwargs):
    """Draw the ROC curve for ``labels``/``scores`` on the current figure.

    The figure is not shown; the caller is expected to call ``plt.show()``.

    Args:
        labels: ground-truth labels, passed to ``roc_curve``.
        scores: predicted scores, passed to ``roc_curve``.

    Keyword Args:
        auc: when truthy, ``'AUC:<value>'`` is appended to the curve's legend
            label.
        label: legend label prefix for the curve (default ``''``).
        logx: when truthy, draw with ``plt.semilogx``; otherwise draw a step
            curve. Passing ``logx=False`` selects the step curve.
        xlabel: x axis label (default ``'fpr'``).
        ylabel: y axis label (default ``'tpr'``); also used in the threshold
            legend entry.
        fpr_threshold: when given, draw a red vertical line at this FPR. Its
            legend entry reports the mean TPR of the curve points whose FPR
            lies within a relative window around the threshold; the window
            grows from 10% to 20% until at least one point falls inside. No
            line is drawn if no point is found.
        fpr_label: prefix for the threshold legend entry (default ``''``).
        xlim, ylim: axis limits (default ``[0.0, 1.0]`` each).
        title: plot title (default ``''``).
    """
    fpr, tpr, _ = roc_curve(labels, scores)

    label_auc = ''
    if kwargs.get('auc'):
        label_auc = 'AUC:%0.2f' % roc_auc_score(labels, scores)
    # Both drawing branches use the same legend text; previously the step
    # branch silently dropped the caller's label.
    curve_label = (kwargs.get('label') or '') + label_auc

    # Test the value, not just the presence of the key, so logx=False works.
    if kwargs.get('logx'):
        plt.semilogx(fpr, tpr, label=curve_label, linewidth=3)
    else:
        plt.step(fpr, tpr, color='b', alpha=0.5, where='post', label=curve_label)

    xlabel = kwargs.get('xlabel', 'fpr')
    ylabel = kwargs.get('ylabel', 'tpr')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    if 'fpr_threshold' in kwargs:
        fpr_threshold = kwargs['fpr_threshold']
        fpr_label = kwargs.get('fpr_label', '')
        for fraction in np.linspace(0.1, 0.2, 11):
            delta = fpr_threshold * fraction
            nearby = np.logical_and(fpr < fpr_threshold + delta, fpr > fpr_threshold - delta)
            if nearby.any():
                tpr_mean = tpr[nearby].mean()
                plt.axvline(fpr_threshold, color='r', label=fpr_label + '(threshold:%0.4f, %s:%0.3f)' % (fpr_threshold, ylabel, tpr_mean))
                break

    plt.xlim(kwargs.get('xlim', [0.0, 1.0]))
    plt.ylim(kwargs.get('ylim', [0.0, 1.0]))
    plt.title(kwargs.get('title', ''))
    plt.legend(loc="upper left")

def plot_precision_reach_curve(labels, scores, **kwargs):
    """Plot precision against audience reach and show the figure.

    Items are ranked by descending score. For each cut-off k = 1 .. n-1
    (n = number of items) the curve shows the precision among the top k items
    against the reach, i.e. the top-k fraction scaled to ``users``. After
    showing the plot, two (reach, precision) pairs are printed: the last
    point, and the point at index ``n // 7``.

    Args:
        labels: binary (0/1) ground-truth labels.
        scores: predicted scores, same length as ``labels``.

    Keyword Args:
        users: population size that reach is scaled to (default 7,000,000).
        ylim: y axis limits (default ``[0.0, 0.5]``).
    """
    users = kwargs.get('users', 7000000)
    ordered_labels = np.asarray(labels)[np.argsort(scores)[::-1]]
    n = len(ordered_labels)

    # Cut-offs 1 .. n-1. The running sum of the ranked labels gives the true
    # positives inside each cut-off directly, so no n-by-n mask matrix is
    # needed.
    selected = np.arange(1, n)
    true_positives = np.cumsum(ordered_labels)[selected - 1]
    precision = true_positives / selected
    reach = selected / n * users

    plt.step(reach, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(reach, precision, step='post', alpha=0.2, color='b')
    plt.xlabel('Reach')
    plt.ylabel('Precision')
    plt.ylim(kwargs.get('ylim', [0.0, 0.5]))
    plt.xlim([0.0, users])
    plt.show()
    # With fewer than two items there are no cut-offs to report.
    if reach.size:
        print(reach[-1], precision[-1])
        print(reach[n * 1 // 7], precision[n * 1 // 7])

In [5]:
%run evaluation.ipynb
def plot_tpr_fpr(tpr, fpr, **kwargs):
    """Draw a precomputed TPR/FPR curve on the current figure.

    Besides the curve itself, the area under it is shaded and the diagonal
    from (0, 0) to (1, 1) is drawn as a dashed grey reference line. The
    figure is not shown; the caller is expected to call ``plt.show()``.

    Args:
        tpr: true-positive-rate values.
        fpr: false-positive-rate values, same length as ``tpr``.

    Keyword Args:
        auc: when truthy, ``'AUC:<value>'`` is appended to the curve's legend
            label. The value comes from ``get_auc_from_fpr_tpr``, which is not
            defined in this cell (presumably supplied by the
            ``%run evaluation.ipynb`` line above - verify).
        label: legend label prefix for the curve (default ``''``).
        logx: when truthy, draw with ``plt.semilogx``; otherwise draw a step
            curve. Passing ``logx=False`` selects the step curve.
        xlabel: x axis label (default ``'fpr'``).
        ylabel: y axis label (default ``'tpr'``); also used in the threshold
            legend entry.
        fpr_threshold: when given, draw a red vertical line at this FPR. Its
            legend entry reports the mean TPR of the curve points whose FPR
            lies within a relative window around the threshold; the window
            grows from 10% to 20% until at least one point falls inside. No
            line is drawn if no point is found.
        fpr_label: prefix for the threshold legend entry (default ``''``).
        xlim, ylim: axis limits (default ``[0.0, 1.0]`` each).
        title: plot title (default ``''``).
    """
    label_auc = ''
    if kwargs.get('auc'):
        label_auc = 'AUC:%0.2f' % get_auc_from_fpr_tpr(fpr, tpr, verbose=False)
    curve_label = (kwargs.get('label') or '') + label_auc

    # Test the value, not just the presence of the key, so logx=False works.
    if kwargs.get('logx'):
        plt.semilogx(fpr, tpr, label=curve_label, linewidth=3)
    else:
        plt.step(fpr, tpr, color='b', alpha=0.5, where='post', label=curve_label, linewidth=3)

    plt.fill_between(fpr, tpr, step='post', alpha=0.2, color='b')
    plt.plot([0.0, 1.0], [0.0, 1.0], linestyle='--', c='gray')

    xlabel = kwargs.get('xlabel', 'fpr')
    ylabel = kwargs.get('ylabel', 'tpr')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    if 'fpr_threshold' in kwargs:
        fpr_threshold = kwargs['fpr_threshold']
        fpr_label = kwargs.get('fpr_label', '')
        # Array views so the comparisons below also work for plain lists.
        fpr_values = np.asarray(fpr)
        tpr_values = np.asarray(tpr)
        for fraction in np.linspace(0.1, 0.2, 11):
            delta = fpr_threshold * fraction
            nearby = np.logical_and(fpr_values < fpr_threshold + delta, fpr_values > fpr_threshold - delta)
            if nearby.any():
                tpr_mean = tpr_values[nearby].mean()
                plt.axvline(fpr_threshold, color='r', label=fpr_label + '(threshold:%0.4f, %s:%0.3f)' % (fpr_threshold, ylabel, tpr_mean))
                break

    plt.xlim(kwargs.get('xlim', [0.0, 1.0]))
    plt.ylim(kwargs.get('ylim', [0.0, 1.0]))
    plt.title(kwargs.get('title', ''))
    plt.legend(loc="upper left")

In [6]:
import pydotplus  # you can install pydotplus with: pip install pydotplus 
from IPython.display import Image
from sklearn.tree import DecisionTreeClassifier, export_graphviz

def print_graph(clf, feature_names, class_names=None):
    """Render a fitted decision tree as a PNG image for notebook display.

    Args:
        clf: fitted tree estimator, passed to ``export_graphviz``.
        feature_names: names of the features, in training column order.
        class_names: names of the target classes, in ascending class order.
            Defaults to ``["D", "R"]``.

    Returns:
        An ``IPython.display.Image`` wrapping the rendered PNG bytes.
    """
    if class_names is None:
        # A list, which is what export_graphviz documents for class_names;
        # index 0 -> "D", index 1 -> "R", as the previous dict mapping did.
        class_names = ["D", "R"]
    dot_data = export_graphviz(
        clf,
        label="root",
        proportion=True,
        impurity=False,
        out_file=None,
        feature_names=feature_names,
        class_names=class_names,
        filled=True,
        rounded=True
    )
    graph = pydotplus.graph_from_dot_data(dot_data)
    return Image(graph.create_png())

In [None]:
def plot_ensemble_roc_curve(ytest, P_base_learners, P_ensemble, labels, ens_label):
    """Plot ROC curves for every base learner and for the ensemble, then show.

    Args:
        ytest: ground-truth labels, passed to ``roc_curve``.
        P_base_learners: 2-D array of predictions, one column per base learner.
        P_ensemble: predictions of the ensemble.
        labels: legend labels for the base learners, indexed by column.
        ens_label: legend label for the ensemble curve.
    """
    plt.figure(figsize=(10, 8))
    plt.plot([0, 1], [0, 1], 'k--')

    n_learners = P_base_learners.shape[1]

    # One colour per curve: slot 0 is the ensemble, slots 1.. the base learners.
    palette = []
    for position in np.linspace(0, 1.0, n_learners + 1):
        palette.append(plt.cm.rainbow(position))

    def draw_curve(predictions, name, colour):
        """Compute one ROC curve and add it to the current axes."""
        false_rate, true_rate, _ = roc_curve(ytest, predictions)
        plt.plot(false_rate, true_rate, label=name, c=colour)

    for column in range(n_learners):
        draw_curve(P_base_learners[:, column], labels[column], palette[column + 1])
    draw_curve(P_ensemble, ens_label, palette[0])

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(frameon=False)
    plt.show()