In [None]:
from matplotlib import pyplot as plt
from matplotlib.gridspec import GridSpec

#### Generate charts with a single function

In [2]:
def chart_gen(chart, bucket, col, df, features, target = None):
    """
    Draws one chart per feature, arranged on a grid with `col` charts per row.
    Parameters:
        chart (string): Chart type, options are hist, bar, box, scatter, and scatter-string
        bucket (int): number of bins; only used when chart is hist. For any other chart type,
        any value will work.
        col (int): number of charts to be in a single grid row for presentation purposes. When
        there are more features than col, additional rows are added.
        df (DataFrame): dataframe that the series are derived from
        features (list): list of features (column names of df) to be charted
        target (string): name of target variable; required for scatter and scatter-string
    Returns:
        None. The charts are displayed with plt.show(). If an incorrect chart name is given,
        a message requesting a valid chart name is printed instead and nothing is drawn.
    Raises:
        ValueError: if col is smaller than 1, or if chart is scatter or scatter-string and
        no target is given.
    """
    valid_charts = ('hist', 'bar', 'box', 'scatter', 'scatter-string')
    if chart not in valid_charts:
        print('Please choose a valid chart type: hist, bar, box, scatter, or scatter-string')
        return None

    if col < 1:
        raise ValueError("col must be a positive integer")

    # Fail early with a clear message instead of a KeyError from df[None] mid-plot.
    if chart in ('scatter', 'scatter-string') and target is None:
        raise ValueError("target must be provided for '" + str(chart) + "' charts")

    features = list(features)
    # Ceiling division: enough rows to hold every feature, and at least one row so
    # an empty feature list still produces an (empty) figure as before.
    nrows = max(1, -(-len(features) // col))
    fig = plt.figure(figsize=(25, 5 * nrows))
    gs = GridSpec(nrows=nrows, ncols=col)

    for position, feature in enumerate(features):
        row, column = divmod(position, col)
        ax = fig.add_subplot(gs[row, column])

        if chart == 'hist':
            ax.hist(df[feature], bins=bucket)
            ax.set_title("Histogram of " + str(feature) + " Values")

        elif chart == 'bar':
            counts = df[feature].value_counts().sort_values()
            ax.bar(counts.index, counts)
            ax.tick_params(axis='x', labelrotation=90)
            ax.set_title("Histogram of " + str(feature) + " Values")

        elif chart == 'box':
            ax.boxplot(df[feature])
            ax.set_ylabel("Boxplot of " + str(feature) + " Values")

        elif chart == 'scatter':
            ax.scatter(df[feature], df[target])
            ax.set_title("Scatter plot of " + str(feature) + " and " + str(target))

        else:  # 'scatter-string'
            categorical = df[feature].astype('category')
            # cat.codes index into cat.categories, so the tick labels must come from
            # cat.categories (not from order of first appearance) to line up.
            categories = categorical.cat.categories
            ax.scatter(categorical.cat.codes, df[target])
            ax.set_xticks(range(len(categories)))
            ax.set_xticklabels(list(categories))
            ax.set_title("Scatter plot of " + str(feature) + " and " + str(target))

    fig.tight_layout()
    plt.show()
    return None

#### Generate tables and charts for model analysis

In [1]:
def model_analysis(model, x_input):
    """
    Prints outlier, leverage, and collinearity diagnostics for a fitted model and draws
    its influence plot.
    Parameters:
        model: fitted regression results object providing outlier_test() and get_influence()
        (presumably a statsmodels OLS results object -- confirm against the caller)
        x_input (array-like): predictor values without a constant column, one row per
        observation. May be a 2-D array, a list of rows, a DataFrame, or a 1-D sequence
        for a single predictor.
    Returns:
        None. Four tables are printed in order: observations whose absolute studentized
        residual exceeds 3, observations whose leverage exceeds (2p + 2) / n, observations
        that are in both tables, and the variance inflation factor of every column
        (constant included).
    """
    # Outliers: the first column of outlier_test() is used as the studentized residual.
    # np.asarray keeps this working whether the result is an ndarray or a labelled table.
    student_resid = np.asarray(model.outlier_test())[:, 0]
    df_student_resid = pd.DataFrame({'student_resid': np.abs(student_resid)})
    outliers = df_student_resid[df_student_resid['student_resid'] > 3]
    print(outliers)

    # Leverage: flag observations above the (2p + 2) / n cut-off, where p is the number
    # of predictors and n the number of observations. np.shape is used instead of
    # len(x_input[0]) so DataFrames and 1-D inputs are handled as well.
    input_shape = np.shape(x_input)
    n_obs = input_shape[0]
    n_predictors = input_shape[1] if len(input_shape) > 1 else 1
    leverage_cutoff = (2 * n_predictors + 2) / n_obs

    influence = model.get_influence()
    df_leverage = pd.DataFrame({'Leverage': np.asarray(influence.hat_matrix_diag)})
    leverage_points = df_leverage[df_leverage['Leverage'] > leverage_cutoff]
    print(leverage_points)

    # Combined Leverage and Outliers: both tables share the positional row index,
    # so an inner join on the index keeps observations flagged by both checks.
    combo = leverage_points.merge(outliers, how = 'inner', left_index = True, right_index = True)
    print(combo)

    # Collinearity
    # Apply inverse hyperbolic sine since there are values of 0
    X = np.asarray(np.arcsinh(sm.add_constant(x_input)))
    vif = pd.Series([variance_inflation_factor(X, i)
               for i in range(X.shape[1])])
    print(vif)

    # Chart plotting influence of each observation; the figure size is only changed
    # inside the rc_context so the global matplotlib settings are left untouched.
    with plt.rc_context():
        plt.rc("figure", figsize=(20, 20))
        sm.graphics.influence_plot(model, criterion="cooks")

#### Non-linearity assumption check

In [6]:
def non_linear_assumption(model, num_vars):
    """
    Draws the regression-vs-regressor diagnostic figure for each of the first
    `num_vars` regressors of a fitted model.
    Parameters:
        model: fitted results object passed through to sm.graphics.plot_regress_exog
        num_vars (int): number of regressors to plot; they are looked up by the
        names x1, x2, ..., x<num_vars>
    Returns:
        None. One 12x8 figure is created per regressor.
    """
    regressor_names = ['x' + str(position) for position in range(1, num_vars + 1)]
    for regressor in regressor_names:
        canvas = plt.figure(figsize=(12,8))
        sm.graphics.plot_regress_exog(model, regressor, fig=canvas)
    