In [251]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [435]:
# Notebook-wide plot styling: white seaborn background plus shared font sizes.
sns.set_style('white')
plt.rcParams.update({
    'axes.titlesize': 14,
    'axes.labelsize': 13,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'font.size': 12,
})

# Pastel palette referenced by the plotting helpers in this notebook.
colour_palette = [
    '#ff8b94',  # red
    '#ffaaa5',  # pink
    '#ffd3b6',  # orange
    '#dcedc1',  # green
    '#a8e6cf',  # turquoise
    '#bae1ff',  # blue
    '#d9d2e9',  # purple
]

# colour_dict = {'red': '#ff8b94', 'pink': '#ffaaa5', 'orange': '#ffd3b6', 'green': '#dcedc1', 'turquoise': '#a8e6cf', 'blue': '#bae1ff', 'purple': '#d9d2e9'}



In [275]:
# Histograms 

def cont_dist(df, feature):
    """Plot the distribution of one continuous column.

    Draws a slim horizontal box plot on top of a density histogram with a
    KDE overlay; both panels share the x-axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``feature``.
    feature : str
        Name of the column to plot.
    """
    # Scope the "ticks" style to this figure only. The previous
    # ``sns.set(style="ticks")`` call reset every global rcParam, which threw
    # away the notebook-wide font sizes for all later plots.
    with sns.axes_style("ticks"):
        fig, (ax_box, ax_hist) = plt.subplots(
            2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)}
        )

        sns.boxplot(x=df[feature], ax=ax_box, color=colour_palette[0])
        # ``distplot`` is deprecated; ``histplot`` with a density stat and a
        # KDE overlay is its replacement.
        sns.histplot(
            x=df[feature], ax=ax_hist, kde=True, stat="density",
            color=colour_palette[0],
        )

        # The box plot is decoration only: no y ticks and no duplicate x label.
        ax_box.set(yticks=[], xlabel="")
        sns.despine(ax=ax_hist)
        sns.despine(ax=ax_box, left=True)
        # Set the title on an explicit axes rather than on "the current axes".
        ax_hist.set_title(feature)
    



In [277]:
# Load the training data.
# NOTE(review): absolute, machine-specific path -- this cell only runs on the
# author's machine; consider a configurable data directory.
df_train = pd.read_csv(
    filepath_or_buffer="/Users/gracebarringer/Machine Learning Projects/Kaggle/Obesity Risk - Multi-Class/data/train.csv"
)
# Independent working copy so the loaded frame stays untouched.
df_train_copy = df_train.copy(deep=True)

In [357]:
def cat_dist(df, feature):
    """Plot how often each category of ``feature`` occurs.

    Left panel: horizontal bar chart of the counts, each bar annotated with
    its count. Right panel: pie chart of the same counts annotated with
    percentages. A category uses the same colour in both panels.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``feature``.
    feature : str
        Name of the categorical column to plot.
    """
    fig, (ax_bar, ax_pie) = plt.subplots(ncols=2, figsize=(12, 4))

    # One Series drives labels, values and colours. Previously the labels came
    # from ``unique()`` (order of appearance, NaN included) while the values
    # came from ``value_counts()`` (descending frequency, NaN excluded), so
    # bars could carry the wrong category name or fail on a length mismatch.
    counts = df[feature].value_counts()
    labels = [str(level) for level in counts.index]
    # Wrap around the palette instead of raising IndexError when there are
    # more categories than colours.
    colours = [colour_palette[i % len(colour_palette)] for i in range(len(counts))]

    # Pie chart (wedge labels suppressed; only percentages are shown).
    counts.plot.pie(
        ax=ax_pie, autopct='%1.1f%%', labels=None, legend=False,
        ylabel='', xlabel='', colors=colours,
    )

    # Bar chart with the raw count written next to each bar.
    bars = ax_bar.barh(labels, counts.to_numpy(), color=colours)
    ax_bar.bar_label(bars, fmt='{:,.0f}')
    ax_bar.get_xaxis().set_visible(False)
    # Blend the frame into the white background.
    for spine in ax_bar.spines.values():
        spine.set_edgecolor('white')
    ax_bar.set(title=feature)
   


In [432]:
def cat_cont_dist(df, var1, var2):
    """Compare a continuous column across the levels of a categorical one.

    Draws three panels side by side: the mean of ``var2`` per level of
    ``var1``, the median per level, and a box plot per level. All three
    panels list the levels in the same top-to-bottom order (descending mean),
    because only the first panel shows the level names.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both columns.
    var1 : str
        Name of the categorical column (y-axis).
    var2 : str
        Name of the continuous column (x-axis).
    """
    fig, (ax_mean, ax_median, ax_box) = plt.subplots(ncols=3, figsize=(12, 6))

    grouped = df.groupby(var1)[var2]
    df_mean = grouped.mean().reset_index().sort_values(var2, ascending=False)
    df_median = grouped.median().reset_index()

    # Single source of truth for row order and colours. The median panel used
    # to be sorted by median while its tick labels were hidden, so its bars
    # were read against the mean panel's (differently ordered) labels.
    order = df_mean[var1].tolist()
    # Wrap around the palette rather than raising IndexError on >7 levels.
    colours = [colour_palette[i % len(colour_palette)] for i in range(len(order))]

    # The data is already aggregated to one value per level, so there is
    # nothing to compute an error bar from: switch error bars off.
    sns.barplot(ax=ax_mean, y=var1, x=var2, data=df_mean, order=order,
                errorbar=None, palette=colours)
    ax_mean.set(title=f'Mean {var2} by {var1}')

    sns.barplot(ax=ax_median, y=var1, x=var2, data=df_median, order=order,
                errorbar=None, palette=colours)
    ax_median.set(title=f'Median {var2} by {var1}', yticklabels=[], ylabel='')

    # Write the aggregated value next to every bar.
    for ax in (ax_mean, ax_median):
        for container in ax.containers:
            ax.bar_label(container, fmt='{:,.1f}')

    sns.boxplot(ax=ax_box, y=var1, x=var2, data=df, palette=colours, order=order)
    ax_box.set(title=f'Distribution of {var2} by {var1}', yticklabels=[], ylabel='')

    # Blend every frame into the white background.
    for ax in (ax_mean, ax_median, ax_box):
        for spine in ax.spines.values():
            spine.set_edgecolor('white')

In [257]:
def scatter_plot(df, var1, var2):
    """Draw a seaborn joint plot of two columns of ``df``.

    ``var2`` goes on the x-axis and ``var1`` on the y-axis; points use the
    first colour of the notebook palette.
    """
    x_values = df[var2]
    y_values = df[var1]
    point_colour = colour_palette[0]
    sns.jointplot(x=x_values, y=y_values, color=point_colour)


In [258]:
def scatter_matrix(df, hue_var=None, hue=True):
    """Draw a seaborn pair plot of ``df``, optionally coloured by a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    hue_var : str, optional
        Column used to colour the points. When omitted, no colouring is
        applied.
    hue : bool, default True
        Set to a falsy value to ignore ``hue_var`` and draw an uncoloured
        pair plot.
    """
    # Truthiness test instead of ``hue == False``: falsy values such as None
    # now also mean "no hue" rather than silently enabling it.
    if not hue or hue_var is None:
        sns.pairplot(df)
        return

    # Hand seaborn exactly one colour per hue level, wrapping around the
    # palette when there are more levels than colours.
    n_levels = df[hue_var].nunique()
    palette = [colour_palette[i % len(colour_palette)] for i in range(n_levels)]
    sns.pairplot(df, hue=hue_var, palette=palette)

In [428]:
def corr_heat_map(df):
    """Show an annotated heat map of the pairwise correlations in ``df``.

    Only numeric and boolean columns take part; other columns are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose column correlations are plotted.
    """
    # Correlation is undefined for text columns, and pandas >= 2.0 raises
    # instead of silently dropping them, so select the usable columns first.
    corr = df.select_dtypes(include=['number', 'bool']).corr()

    fig, ax = plt.subplots(figsize=(12, 12))
    # The former ``color=colour_palette`` argument is dropped: ``color`` is
    # not a ``heatmap`` option (cell colours come from ``cmap``).
    sns.heatmap(corr, ax=ax, annot=True)
    plt.show()

In [417]:
def cat_heat_map(df, var1, var2):
    """Heat map of the joint frequency of two categorical columns.

    Each cell shows the share of rows (out of all rows where both columns are
    present) that fall into that ``var1`` / ``var2`` combination, formatted as
    a percentage. Combinations that never occur are left blank.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both columns.
    var1 : str
        Column shown on the rows.
    var2 : str
        Column shown on the columns.
    """
    # ``size()`` counts rows per combination directly. The earlier
    # ``groupby(...).value_counts().reset_index()`` route depended on the
    # result column being called "count", which only holds on pandas >= 2.0.
    counts = df.groupby([var1, var2]).size()
    share = counts / counts.sum()
    # Rows: var1 levels, columns: var2 levels (the innermost index level).
    df_pivot = share.unstack(level=-1)

    fig, ax = plt.subplots(figsize=(4, 4))
    # ``color=`` is not a ``heatmap`` option (cell colours come from ``cmap``),
    # so it is no longer passed.
    sns.heatmap(df_pivot, ax=ax, annot=True, fmt='.1%')


In [358]:
def cat_cat_dist(df, var1, var2):
    """Grouped count plot of ``var2`` with one bar colour per level of ``var1``.

    The plot occupies the left half of a wide figure; the legend is anchored
    just outside its right edge.
    """
    sns.set_style('white')
    ordered = df.sort_values(by=[var1, var2])

    plt.figure(figsize=(20, 5))
    ax = plt.subplot(121)
    sns.countplot(data=ordered, x=var2, hue=ordered[var1],
                  palette=colour_palette, ax=ax)

    ax.set_title(var2)
    plt.xticks(rotation=90)
    ax.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')



In [261]:
def cont_cont_target_dist(df, target, var1, var2, alph):
    """Joint plot of two continuous columns coloured by a target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing all three columns.
    target : str
        Column used to colour the points.
    var1 : str
        Column for the x-axis.
    var2 : str
        Column for the y-axis.
    alph : float
        Opacity forwarded to the plot as ``alpha``.
    """
    sns.set_style('white')
    grid = sns.jointplot(
        x=df[var1], y=df[var2], color=colour_palette[0],
        hue=df[target], palette=colour_palette, alpha=alph,
    )
    # Reposition the legend seaborn already built on the joint axes.
    # ``plt.legend(...)`` created a brand-new legend on the current axes,
    # which discarded the hue title.
    sns.move_legend(grid.ax_joint, 'upper left', bbox_to_anchor=(1.05, 1.0))

In [541]:
def cat_target_dist(df, target, var1):
    """Draw one pie chart per level of ``var1`` showing the ``target`` mix.

    Pies are laid out in a two-column grid, in order of each level's first
    appearance in ``df``. A target class keeps the same colour in every pie.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both columns.
    target : str
        Column whose class shares are shown as wedges.
    var1 : str
        Categorical column; one pie is drawn per level.
    """
    # Missing values can never match ``df[var1] == level``, so leave them out
    # instead of drawing an empty pie for them.
    cat_list = df[var1].dropna().unique().tolist()

    # Fix one colour per target class up front. Colouring wedges by position
    # shifted the colours whenever a class was absent from a subset.
    target_levels = df[target].value_counts().sort_index().index
    colour_for = {
        level: colour_palette[i % len(colour_palette)]
        for i, level in enumerate(target_levels)
    }

    # Exactly enough rows for two pies per row (at least one row). The old
    # ``n // 2 + 1`` left a whole empty, framed row for even counts above 2.
    n_cols = 2
    n_rows = max(1, -(-len(cat_list) // n_cols))
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(16, 12), squeeze=False)
    axs = axs.ravel()

    for ax, category in zip(axs, cat_list):
        counts = df.loc[df[var1] == category, target].value_counts().sort_index()
        ax.pie(
            x=counts.values,
            labels=counts.index,
            colors=[colour_for[level] for level in counts.index],
            autopct='%.0f%%',
            textprops={'fontsize': 11},
        )
        # f-string so non-string category values do not break the title.
        ax.set_title(f"{var1}-{category} vs. {target} Distribution")

    # Hide any grid cell that did not receive a pie.
    for ax in axs[len(cat_list):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()
    