# General Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Default Plot Settings
Sets more readable/presentable matplotlib settings

In [None]:
# base style sheet applied before the individual overrides below
plt.style.use('fivethirtyeight')

# overrides for more readable/presentable figures, applied in one pass
plt.rcParams.update({
    # lineweight
    'lines.linewidth': 3,
    # figure size
    'figure.figsize': (12, 7),
    # title fontsize
    'axes.titlesize': 33,
    # axes label fontsize
    'axes.labelsize': 28,
    # axes values fontsize
    'xtick.labelsize': 18,
    'ytick.labelsize': 18,
    # legend fontsize
    'legend.fontsize': 18,
})

# Model Evaluation Function
Function to create evaluation plots given test input

In [3]:
def mod_eval(model, X, y):
    '''
    Print test r-squared
    Plot model accuracy, residuals and probability plots

    Nothing is returned; the score is printed and the three plots are drawn
    on a newly created figure.

    Parameters
    ----------
    model : fitted model to evaluate (must provide score and predict methods)
    X : test feature data
    y : test target data
    '''

    # imported locally so the function does not rely on a module-level
    # `stats` name having been defined elsewhere
    from scipy import stats

    # output r-squared score for model
    print('R-squared:', model.score(X, y))

    # calculate model residuals
    predictions = model.predict(X)
    residuals = y - predictions

    # evaluation plots (stacked vertically on one figure)
    fig, (ax1, ax2, ax3) = plt.subplots(nrows=3, ncols=1, figsize=(7, 20))

    # actual vs. predicted values
    ax1.scatter(y, predictions)
    ax1.set_title('Accuracy')
    ax1.set_xlabel('Actual Value')
    ax1.set_ylabel('Predicted Value')

    # residuals vs. predicted values
    ax2.scatter(predictions, residuals)
    ax2.set_title('Residual Plot')
    ax2.set_xlabel('Predicted Value')
    ax2.set_ylabel('Residual')

    # normal probability plot of the residuals; target the third axes
    # explicitly instead of relying on it being pyplot's current axes
    stats.probplot(residuals, dist='norm', plot=ax3)

# Feature Weight Sorting
Class of functions to sort feature weights from greatest to least and display bar plot of results.

In [None]:
class FeatSort():
    '''
    Sort (in descending order) feature weights for Linear Regression or Logistic Regression models

    Every sort method starts again from the unsorted weights, stores its
    result on self.df and returns it. plot() draws whatever self.df
    currently holds, so call a sort method first to control what is plotted.

    Parameters
    ----------
    weights : array of feature weights from model
    labels : list of feature labels
    '''

    def __init__(self, weights, labels):

        self.weights = weights
        self.labels = labels
        # create feature weight dataframe
        self.df = pd.DataFrame(weights, index=labels, columns=['feat_wgt'])
        # pristine copy that every sort method restarts from
        self.df_unsort = self.df.copy()

    def wgts(self):
        '''
        Return dataframe of feature weights paired with their labels
        (original, unsorted order)
        '''

        return self.df_unsort

    def _abs_sorted(self):
        '''
        Return a fresh dataframe of absolute weights sorted greatest to least,
        with a 'positive' column recording the sign of the original weight
        '''

        df = self.df_unsort.copy()
        # create column identifying positive weights
        df['positive'] = df['feat_wgt'] > 0
        # transform weights to absolute value
        df['feat_wgt'] = df['feat_wgt'].abs()
        return df.sort_values(by='feat_wgt', ascending=False)

    def _finish(self, df, num_ret, ret_0):
        '''
        Apply the zero-weight filter and top-n slice shared by all sort
        methods, store the result on self.df and return it
        '''

        # 0 values
        if not ret_0:
            # drop rows whose first column (the weight) == 0
            df = df[df.iloc[:, 0] != 0]

        # top number of features to return
        if num_ret != 'all':
            # slice number of rows in dataframe
            df = df.iloc[:num_ret, :]

        self.df = df
        return self.df

    def sort_wgts(self, num_ret='all', ret_0=False):
        '''
        Sort feature weights (greatest to least)

        Parameters
        ----------
        num_ret : number of top features to return
        ret_0 : return weights with value of zero
        '''

        # restart from the unsorted weights and sort on the signed values
        ordered = self.df_unsort.sort_values(by='feat_wgt', ascending=False)
        return self._finish(ordered, num_ret, ret_0)

    def sort_abs(self, num_ret='all', ret_0=False):
        '''
        Sort feature weights (greatest to least) based on absolute values

        The returned dataframe has a second column, 'positive', flagging
        weights that were greater than zero before taking the absolute value.

        Parameters
        ----------
        num_ret : number of top features to return
        ret_0 : return weights with value of zero
        '''

        return self._finish(self._abs_sorted(), num_ret, ret_0)

    def sort_pct(self, num_ret='all', ret_0=False, rnd=1):
        '''
        Sort feature weights (greatest to least) and return as percentage values

        Percentages are each absolute weight's share of the sum of absolute
        weights. The weight column is named 'feat_%' in the result and a
        'positive' column flags the sign of the original weight.

        Parameters
        ----------
        num_ret : number of top features to return
        ret_0 : return weights with value of zero
        rnd : decimal precision for rounding
        '''

        df = self._abs_sorted()
        total = df['feat_wgt'].sum()
        # transform weights to percentages; when the weights sum to zero the
        # values are left at zero rather than becoming NaN from 0 / 0
        if total != 0:
            df['feat_wgt'] = (df['feat_wgt'] / total * 100).round(rnd)
        # rename column (via rename, not by mutating the column index in place)
        df = df.rename(columns={'feat_wgt': 'feat_%'})

        return self._finish(df, num_ret, ret_0)

    def plot(self, feat_lab='df_idx'):
        '''
        Draw bar plot of the feature weights currently held in self.df
        on the current matplotlib axes

        Parameters
        ----------
        feat_lab : labels for feature effects, default is dataframe index
        '''

        # imported here so the method works regardless of whether the
        # module-level import of matplotlib.patches has been executed yet
        import matplotlib.patches as mpatches

        # initialize plot
        ax = plt.gca()

        # define feature labels; the isinstance check avoids an elementwise
        # comparison when an array or Index of labels is passed
        if isinstance(feat_lab, str) and feat_lab == 'df_idx':
            feat_lab = self.df.index

        # plot feature weights with positive and negative differentiation
        # (two columns means a 'positive' flag column is present)
        if len(self.df.columns) == 2:

            # bar plot of feature weight magnitudes with mapped colors for pos/neg
            bar_colors = self.df.iloc[:, 1].map({True: 'g', False: 'r'})
            self.df.iloc[:, 0].plot(kind='bar', ax=ax, color=bar_colors)

            # legend
            # positive label for legend
            pos_patch = mpatches.Patch(color='green', label='Positive')
            # negative label for legend
            neg_patch = mpatches.Patch(color='red', label='Negative')
            # display legend
            legend = plt.legend(title='Correlation', handles=[pos_patch, neg_patch])
            legend.get_title().set_fontsize('18')

        # plot feature weights without pos/neg differentiation
        else:
            # plot this instance's own data (not an external global object)
            self.df.plot(kind='bar', legend=False, ax=ax)

        # labels
        # set x-tick labels to feature labels
        ax.set_xticklabels(feat_lab)
        ax.set_xlabel('Feature')
        ax.set_ylabel('Weight')
        ax.set_title('Model Feature Weights')

# Feature Weight Sorting
Function to sort feature weights from greatest to least.

In [None]:
def feat_sort(values, labels, num_ret='all', ret_abs=False, ret_pct=False, ret_0=False):
    '''
    Return sorted (descending) feature weights

    Parameters
    ----------
    values : feature weight values from analysis
    labels : names of each feature
    num_ret : number of top features to return
    ret_abs : return absolute value of weight values (will track original sign)
    ret_pct : return each weight as percentage of total (based on absolute value)
    ret_0 : return weights with value of zero

    Returns
    -------
    dataframe indexed by label with a 'feat_wgt' column, plus a 'positive'
    column when ret_abs is set
    '''

    # create feature weight dataframe (copied so the input is never altered)
    df = pd.DataFrame(values, index=labels, columns=['feat_wgt']).copy()

    # ret_abs check
    if ret_abs:
        # create column identifying positive weights
        df['positive'] = df['feat_wgt'] > 0
        # transform weights to absolute value
        df['feat_wgt'] = df['feat_wgt'].abs()

    # sort weights (largest to smallest)
    df = df.sort_values(by='feat_wgt', ascending=False)

    # ret_pct check
    if ret_pct:
        if not ret_abs:
            # percentages are based on magnitude, so take absolute values
            # and re-sort even though the sign is not tracked in this mode
            df['feat_wgt'] = df['feat_wgt'].abs()
            df = df.sort_values(by='feat_wgt', ascending=False)
        total = df['feat_wgt'].sum()
        if total == 0:
            # avoid 0 / 0 -> NaN, which cannot be cast to int
            df['feat_wgt'] = 0
        else:
            # transform weights to whole-number percentages
            df['feat_wgt'] = (df['feat_wgt'] / total * 100).round(0).astype(int)

    # ret_0 check
    if not ret_0:
        # drop weights == 0
        df = df[df['feat_wgt'] != 0]

    # return dataframe
    if num_ret == 'all':
        return df
    return df.iloc[:num_ret, :]

# Plot Feature Effects
Bar plot of positive and negative feature weights for a given model.

In [5]:
import matplotlib.patches as mpatches

In [6]:
def feat_plot(df_feats, feat_lab='df_idx'):
    '''
    Draw bar plot of positive and negative feature weights for a given model
    on the current matplotlib axes

    Parameters
    ----------
    df_feats : dataframe of a model's feature weight values (first column)
               and if positive (second column, boolean)
    feat_lab : labels for features, default is dataframe index
    '''

    # define feature labels; the isinstance check avoids an elementwise
    # comparison when an array or Index of labels is passed
    if isinstance(feat_lab, str) and feat_lab == 'df_idx':
        feat_lab = df_feats.index

    # initialize plot
    ax = plt.gca()
    # plot feature weights with positive and negative differentiation
    # (green bars for positive weights, red for the rest)
    bar_colors = df_feats.iloc[:, 1].map({True: 'g', False: 'r'})
    df_feats.iloc[:, 0].plot(kind='bar', ax=ax, color=bar_colors)
    ax.set_title('Feature Weights')
    ax.set_xlabel('Feature')
    ax.set_ylabel('Model Coefficient')
    # set x-tick labels to feature labels
    ax.set_xticklabels(feat_lab)

    # positive label for legend
    pos_patch = mpatches.Patch(color='green', label='Positive')
    # negative label for legend
    neg_patch = mpatches.Patch(color='red', label='Negative')
    # display legend
    legend = plt.legend(title='Correlation', handles=[pos_patch, neg_patch])
    legend.get_title().set_fontsize('18')

# Markdown Table
Prints a markdown-formatted data table from a pandas dataframe.

In [2]:
def mrkdwn_tbl(df):
    '''
    Print markdown code for formatted table
    Prints column names and all dataframe values (the index is not printed)

    Parameters
    ----------
    df : pandas dataframe

    '''

    # column labels are converted to str so non-string labels (e.g. integers)
    # can be joined, matching how the row values are handled below
    top_row = '| ' + ' | '.join(str(col) for col in df.columns) + ' |'
    # markdown header separator, one '---' per column
    sec_row = '| ' + ' | '.join(['---'] * len(df.columns)) + ' |'
    print(top_row)
    print(sec_row)
    # one markdown row per dataframe row, values rendered as strings
    for i in range(len(df)):
        print('| ' + ' | '.join(df.iloc[i, :].astype(str).values) + ' |')

# Table Image
Render a formatted image of a data frame as a table.

In [7]:
import six

In [8]:
def render_mpl_table(data, col_width=3.0, row_height=0.625, font_size=14,
                     header_color='#40466e', row_colors=['#f1f1f2', 'w'], edge_color='w',
                     bbox=[0, 0, 1, 1], header_columns=0,
                     ax=None, **kwargs):
    '''
    Return matplotlib axes with the dataframe drawn as a styled table

    Parameters
    ----------
    data : pandas dataframe to render (values as cells, columns as header)
    col_width : figure width per column, used only when ax is not given
    row_height : figure height per row, used only when ax is not given
    font_size : font size applied to the table text
    header_color : face color of header cells
    row_colors : face colors cycled through by row index for body cells
    edge_color : edge color of every cell
    bbox : bounding box passed to ax.table
    header_columns : number of leading columns styled like the header row
    ax : axes to draw on; a new figure and axes (axis hidden) are created if None
    **kwargs : forwarded to ax.table
    '''
    # NOTE: row_colors and bbox are list defaults; they are only read here,
    # never modified, so sharing them between calls is harmless.
    if ax is None:
        # figure size = (n_cols * col_width, (n_rows + 1) * row_height);
        # the extra row leaves room for the column header
        size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])
        _, ax = plt.subplots(figsize=size)
        ax.axis('off')

    mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, **kwargs)

    # use the requested font size instead of matplotlib's automatic sizing
    mpl_table.auto_set_font_size(False)
    mpl_table.set_fontsize(font_size)

    # cells are keyed by (row, column); iterate through the public accessor
    for (row, col), cell in mpl_table.get_celld().items():
        cell.set_edgecolor(edge_color)
        if row == 0 or col < header_columns:
            # header row / header columns: bold white text on header color
            cell.set_text_props(weight='bold', color='w')
            cell.set_facecolor(header_color)
        else:
            # body rows alternate through row_colors
            cell.set_facecolor(row_colors[row % len(row_colors)])
    return ax