## Project Functions

### Description

Below are the different functions used in the analysis.

In [1]:
# Import libraries
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Histogram Plots and Descriptive Stats Function

In [None]:
# Histogram plots for all numeric data minus target value followed by supporting stats
def num_univariate_histogram(df, length, width, rows, col, font):
    """Plot histograms of the numeric columns and print their summary stats.

    Only ``float64`` and ``int64`` columns are considered. When ``df`` has
    more than one column, the last numeric column is left out of both the
    plots and the stats (the notebook treats it as the target). When ``df``
    has a single column, that column is plotted as-is.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.
    length, width : number
        Figure height and width, passed as ``figsize = (width, length)``.
    rows, col : int
        Subplot grid passed as ``layout = (rows, col)``.
    font : number
        Seaborn ``font_scale``. It is applied in every case; the
        single-column path used to ignore it and always use 1.

    Returns
    -------
    None
        The figure is shown and the ``describe()`` table is printed.
    """
    X_num = df.select_dtypes(include = ['float64', 'int64'])
    if len(df.columns) > 1:
        # Drop the last numeric column (assumed to be the target).
        X_num = X_num[X_num.columns[0:-1]]

    sns.set(font_scale = font, style = 'white')
    X_num.hist(bins = 50, figsize = (width, length), layout = (rows, col))
    plt.show()
    print('\n' + 'X continuous descriptive stats:')
    print(X_num.describe())

### Frequency Plots and Descriptive Stats Function

In [None]:
# frequency plot for all categorical data followed by supporting stats
def cat_univariate_freq(df, length, width, index_rows, index_col, font):
    """Draw a horizontal percent-frequency bar chart per categorical column.

    The ``object``-dtype column names of ``df`` are sliced with
    ``[index_rows : index_col]``; for each selected column the share of rows
    per category (count / ``len(df)`` * 100, rounded to 0 decimals) is
    plotted and then printed in descending order.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.
    length, width : number
        Figure height and width, passed as ``figsize = (width, length)``.
    index_rows, index_col : int or None
        Start and stop of the slice over the object-dtype column names.
    font : number
        Seaborn ``font_scale``.

    Returns
    -------
    None
    """
    cat_names = df.select_dtypes(include = ['object']).columns
    total_rows = len(df)

    for name in cat_names[index_rows : index_col]:
        percent = ((df[name].value_counts() / total_rows) * 100).round(0)

        # Ascending order for the chart so the largest bar ends up on top.
        ascending_pct = percent.sort_values(ascending = True)
        sns.set(font_scale = font, style = 'white')
        ascending_pct.plot.barh(figsize = (width, length))
        plt.title(name + ' frequencies')
        plt.xlabel('percent')
        plt.ylabel(name)
        plt.show()

        # Printed table lists the most frequent category first.
        print(ascending_pct.sort_values(ascending = False))

### Target Scatter Plot and Descriptive Stats Function

In [None]:
# Individual scatter plot with set x and y labels followed by supporting stats
def target_univariate_scatter(df, x, y, length, width, font):
    """Scatter-plot column ``y`` against column ``x`` and print ``y``'s stats.

    The index is reset first, so ``x`` (or ``y``) may name what was the
    frame's index as well as a regular column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    x, y : str
        Column names for the horizontal and vertical axes; they are also
        used as axis labels and in the title.
    length, width : number
        Figure height and width, passed as ``figsize = (width, length)``.
    font : number
        Seaborn ``font_scale``.

    Returns
    -------
    None
    """
    flat = df.reset_index()
    heading = 'season ' + y + ' by ' + x

    sns.set(font_scale = font, style = 'white')
    plt.figure(figsize = (width, length))
    ax = sns.scatterplot(data = flat, x = x, y = y)
    ax.set_title(heading)
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    plt.show()

    print(flat[y].describe())

### Scatter Plots By Many X Variables and One y Variable Function

In [None]:
# Scatter plots for numeric data when x is set to an index of columns and the target variable for y
def num_bivariate_scatter(df, y, x, font, length, width):
    """Draw a row of scatter plots of ``y`` against each variable in ``x``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; passed to ``sns.pairplot`` unchanged.
    y : str or list of str
        Column(s) for the vertical axis (``y_vars``).
    x : str or list of str
        Column(s) for the horizontal axes (``x_vars``).
    font : number
        Seaborn ``font_scale``.
    length, width : number
        Height and width, in inches, applied to the whole grid figure.

    Returns
    -------
    None
    """
    # The previous version built a numeric-only copy of ``df`` here and never
    # used it; the plot has always been drawn from ``df`` itself.
    sns.set(font_scale = font, style = 'white')
    plot = sns.pairplot(data = df, y_vars = y, x_vars = x, diag_kind = None)
    plot.fig.set_size_inches(width, length)
    plt.show()

### Correlation Heat Map With Correlation Scores to Target Function

In [None]:
# Correlation heat map for all variables against target variable followed by supporting stats
def num_bivariate_corr_target(df, target, font, length, width):
    """Heat-map and print the Pearson correlation of each column with ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to correlate. Only numeric and boolean columns take part.
    target : str
        Name of the column every other column is correlated against. It must
        be one of the numeric/boolean columns, otherwise a ``KeyError`` is
        raised when it is selected from the correlation matrix.
    font : number
        Seaborn ``font_scale``.
    length, width : number
        Figure height and width in inches.

    Returns
    -------
    None
        The heat map is shown and the correlations are printed in
        descending order.
    """
    # Restrict to numeric/bool columns explicitly: calling ``corr`` on a frame
    # that still holds object columns raises on pandas >= 2.0, where
    # ``numeric_only`` defaults to False.
    numeric = df.select_dtypes(include = ['number', 'bool'])
    X_corr = numeric.corr(method = 'pearson')
    X_corr = X_corr[[target]]

    sns.set(font_scale = font, style = 'white')
    fig, ax = plt.subplots()
    fig.set_size_inches(width, length)
    sns.heatmap(X_corr, ax = ax)
    plt.title('corrolation matrix')
    plt.show()

    X_corr = X_corr.sort_values(by = [target], ascending = False)
    print(X_corr)

### Average Numeric Data Per Categorical Data Bar Chart <br> and Stats Function

In [None]:
# Bar plot to visualize the average of a given numeric target for each categorical variable, followed by supporting stats
def cat_bivariate_avg_target(df, index_rows, index_col, target, length, width, font):
    """Plot and print the mean of ``target`` for each category of each column.

    The ``object``-dtype column names of ``df`` are sliced with
    ``[index_rows : index_col]``. For every selected column the rows are
    grouped by category, ``target`` is averaged and rounded to 0 decimals,
    the result is drawn as a horizontal bar chart, and the table is printed
    in descending order.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.
    index_rows, index_col : int or None
        Start and stop of the slice over the object-dtype column names.
    target : str
        Column whose mean is computed per category.
    length, width : number
        Figure height and width, passed as ``figsize = (width, length)``.
    font : number
        Seaborn ``font_scale``.

    Returns
    -------
    None
    """
    cat_names = df.select_dtypes(include = ['object']).columns

    for name in cat_names[index_rows : index_col]:
        averages = (
            df[[name, target]]
            .sort_values(by = [target], ascending = False)
            .groupby([name])
            .mean()
            .round(0)
            .sort_values(by = [target], ascending = True)
        )

        sns.set(font_scale = font, style = 'white')
        averages[target].plot.barh(figsize = (width, length))
        plt.title('average ' + target + ' per ' + name)
        plt.xlabel('average '+ target)
        plt.ylabel(name)
        plt.show()

        # Printed table lists the highest average first.
        print(averages.sort_values(by = [target], ascending = False))

### Outlier Function

In [None]:
# Provides high and low gate for outliers per given column
def remove_outliers(df, col, multiplier = 1.5):
    """Return the low and high IQR fences for ``df[col]``.

    Despite its name, this function does not remove anything: ``df`` is left
    untouched and only the fence values are reported. The previous version
    also built a filtered copy of ``df`` and then discarded it; that dead
    step has been dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the column to inspect.
    col : str
        Name of a numeric column.
    multiplier : float, optional
        How many IQRs beyond the quartiles the fences sit. Defaults to 1.5,
        the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``('low end outliers:', low, 'high end outliers', high)`` where
        ``low = Q1 - multiplier * IQR`` and ``high = Q3 + multiplier * IQR``.
    """
    p_25 = df[col].quantile(.25)
    p_75 = df[col].quantile(.75)
    fence_width = (p_75 - p_25) * multiplier
    low_outliers = p_25 - fence_width
    high_outliers = p_75 + fence_width
    return ('low end outliers:', low_outliers, 'high end outliers', high_outliers)

### Sum of Categorical Variables Per Categorical Variable Bar Chart <br> and Stats Function

In [None]:
# Counts binary target variable as a percent per given categorical variable followed by supporting stats
def class_cat_bivariate(df, flag, length, width, index_rows, index_col):
    """Plot and print the rate of ``flag`` per category of each column.

    The ``object``-dtype column names of ``df`` are sliced with
    ``[index_rows : index_col]``. For every selected column the rows are
    grouped by category and ``rate = sum(flag) / count(flag) * 100`` (rounded
    to 0 decimals) is drawn as a horizontal bar chart; the sum/count/rate
    table is then printed in descending rate order.

    Every selected column is processed. The previous version returned from
    inside the loop, so only the first column was ever plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.
    flag : str
        Column that is summed and counted per category.
        # NOTE(review): presumably a 0/1 indicator -- confirm with callers.
    length, width : number
        Figure height and width, passed as ``figsize = (width, length)``.
    index_rows, index_col : int or None
        Start and stop of the slice over the object-dtype column names.

    Returns
    -------
    None
    """
    X_cat = df.select_dtypes(include = ['object'])
    X_cat = X_cat.columns[index_rows : index_col]

    for X in X_cat:
        grouped = df[[X, flag]].groupby([X])
        flag_sum = round(grouped.sum(), 0)
        flag_count = round(grouped.count(), 0)

        summary = pd.concat([flag_sum, flag_count], axis = 1)
        summary.columns = ['sum', 'count']
        summary['rate'] = round((summary['sum'] / summary['count']) * 100, 0)
        summary = summary.sort_values(by = ['rate'], ascending = True)

        summary['rate'].plot.barh(figsize = (width, length))
        plt.title('average ' + flag + ' per ' + X)
        plt.xlabel('rate of '+ flag)
        plt.ylabel(X)
        plt.show()

        # Print instead of returning so the loop continues to the next column.
        summary = summary.sort_values(by = ['rate'], ascending = False)
        print(summary)