In [None]:
# HELPER FUNCTIONS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
from collections import defaultdict

# Master switch read by printD(): debug messages are only printed when this is True.
DEBUG = False
# Change the assignment above to True to turn the debug output on.

# for testing
def hello_world():
    """Print a fixed greeting (a quick way to check that this cell has been run)."""
    greeting = "Hello world!"
    print(greeting)
    
def printD(farg, *args):
    """Print debug output, but only while the module-level DEBUG flag is true.

    farg is printed first, followed by a space (and flushed). Every extra
    argument is then printed on its own line, and a final empty print ends
    the message.
    """
    if not DEBUG:
        return
    print(farg, end=' ', flush=True)
    for extra in args:
        print(extra)
    print('')

In [None]:
# function to create a plot with multiple indexes, with a legend specified
def create_bar_plot(df, title, xticks=[], legend="", ylabel = "",xlabel="", ylim=(0,100), rot = 0, hatches=True,
                   showLegend=True, hidexlabels=False):
    """Draw ``df`` as a white-filled, black-edged bar chart and return the axes.

    Parameters:
        df: frame to plot with ``df.plot.bar``.
        title: title of the plot.
        xticks: replacement x tick labels (drawn rotated by 45 degrees); ``[]`` keeps the defaults.
        legend: custom legend labels; an empty value ("" or []) keeps the automatic labels.
        ylabel, xlabel: axis labels; "" leaves the axis label untouched.
        ylim: (low, high) limits of the y axis.
        rot: rotation passed to the bar plot.
        hatches: when true, fill the bars with hatch patterns.
        showLegend: when false, the legend is hidden.
        hidexlabels: when true, every x tick label is replaced by an empty string.

    Returns:
        The axes object returned by ``df.plot.bar``.
    """
    # `color` is the documented pandas keyword for bar colours and is what
    # plot_group in this notebook already uses.
    plot = df.plot.bar(color=["white", "white", "white", "white"], edgecolor=["black", "black"],
                       linewidth=2, ylim=ylim, rot=rot)

    plot.set_title(title)
    if ylabel != "":
        plot.set_ylabel(ylabel)
    if xlabel != "":
        plot.set_xlabel(xlabel)
    # set the xticks if needed
    if xticks != []:
        plot.set_xticklabels(xticks, rotation=45)

    if hatches:
        bars = plot.patches
        print(len(bars))
        # One hatch character per bar: each of 'x', 'O', '/', '+' is repeated once
        # per row of df. This assumes the patches are ordered one column at a
        # time; bars beyond the pattern length stay unhatched because zip() stops.
        hatch_pattern = ''.join(h * df.shape[0] for h in 'xO/+')
        print(hatch_pattern)
        # The loop belongs inside this branch: `bars` only exists when hatching is on.
        for bar, hatch in zip(bars, hatch_pattern):
            bar.set_hatch(hatch)

    if hidexlabels:
        # One blank label per row of the plotted frame.
        plot.set_xticklabels(["" for _ in range(df.shape[0])])

    if showLegend:
        if legend is not None and len(legend) > 0:
            plot.legend(ncol=1, labels=legend)
        else:
            plot.legend(ncol=1)
    else:
        plot.legend().set_visible(False)
    return plot
 

# function to create a box plot
def plot_box(df, title, legend=[], xticks=[],ylabel="",xlabel=""):
    """Draw a box plot of ``df`` and return the axes.

    Parameters:
        df: frame to plot with ``df.plot(kind='box')``.
        title: title of the plot.
        legend: legend entries; ``[]`` leaves the legend untouched.
        xticks: replacement x tick labels (drawn rotated by 45 degrees); ``[]`` keeps the defaults.
        ylabel, xlabel: axis labels; "" leaves the axis label untouched.

    Returns:
        The axes object returned by ``df.plot``, like the other plot helpers
        in this notebook, so callers can keep customising the figure.
    """
    plot = df.plot(kind='box', title=title)
    plot.set_title(title)
    # set the xticks if needed
    if xticks != []:
        plot.set_xticklabels(xticks, rotation=45)
    # set the legend if needed
    if legend != []:
        plot.legend(legend)
    if ylabel != "":
        plot.set_ylabel(ylabel)
    if xlabel != "":
        plot.set_xlabel(xlabel)
    return plot
  
 
def plot_means(df, categories, title, vals, category_labels=None, legend=[], xticks=[]):
    '''Plots the means of values (vals) in (categories) as a bar chart.

       df: the data set
       categories: column(s) used as the index of the pivot table
       title: title of the plot
       vals: column(s) whose mean is computed for each category
       category_labels: optional dict mapping each category value to a display
           name; the rows are also reordered to follow the dict's order
       legend: custom legend labels; leave empty for the automatic legend
       xticks: replacement x tick labels

       The table of means is printed, and the axes returned by
       create_bar_plot are returned. '''
    printD("getting means")
    # The string "mean" is the spelling pandas recommends over the np.mean callable.
    pivot = pd.pivot_table(df, index=categories, values=vals, aggfunc="mean")
    if category_labels is not None:
        print(category_labels)
        pivot.index = pivot.index.map(lambda code: category_labels[code])
        pivot = pivot.reindex(index=list(category_labels.values()))

    printD("means")
    print(pivot)

    # create_bar_plot treats "" as "no custom legend"; the empty-list default
    # of this function has to be translated to that, otherwise the
    # custom-label branch is taken with no labels.
    has_legend = legend is not None and len(legend) > 0
    # Leave 10% of headroom above the tallest bar.
    upper = 1.1 * pivot.max(numeric_only=True).max()
    return create_bar_plot(pivot, title, legend=legend if has_legend else "", xticks=xticks,
                           ylim=(0, upper))
    
    
# function to create a bar plot of the means
def plot_mean(df, title, legend=[], xticks=[]):
    """Bar plot of the mean of every column of ``df`` with the standard deviation as error bars.

    Parameters:
        df: frame whose columns are aggregated.
        title: title of the plot.
        legend, xticks: accepted but not used by this function.

    Returns:
        The axes object returned by the pandas bar plot.
    """
    printD(df.columns)
    printD(df.head())
    # Aggregating by name keeps the result rows labelled 'mean' and 'std' and
    # pins pandas' own std (sample standard deviation), instead of depending
    # on how pandas interprets the numpy callables.
    summary = df.agg(["mean", "std"])
    printD(summary.head())
    printD(summary.loc['mean'])
    printD(summary.loc['std'])
    return summary.loc['mean'].plot(kind='bar', yerr=summary.loc['std'], title=title)
  
# function to plot semi-transparent histograms of the columns of a frame
def plot_thresholds(df, title, bins=70, legend=[], xticks=[]):
    """Plot ``df`` as a histogram with 50% opacity and return the axes.

    Parameters:
        df: frame to plot with ``df.plot.hist``.
        title: title of the plot.
        bins: number of histogram bins.
        legend, xticks: accepted but not used by this function.

    Returns:
        The axes object returned by ``df.plot.hist``.
    """
    # The title used to be accepted and then silently ignored.
    return df.plot.hist(alpha=0.5, bins=bins, title=title)
       
# function to plot a histogram
def plot_histogram(df, title, bins=50):
    """Open a new figure and draw a grey histogram of the row-to-row differences of ``df``.

    The histogram is built from ``df.diff()``, not from the raw values.
    ``title`` is accepted but not used, and nothing is returned.
    """
    plt.figure()
    deltas = df.diff()
    deltas.hist(color='k', alpha=0.5, bins=bins)
    
# function to extract a pivot table with percentages from the data frame and plot them
def plot_percentages(df, categories, title, vals, xticks=[], agg=lambda x: x, levels=None,
                     max=100, hidexlabels = True, rot = 1, legend = True, legend_labels = "", hatches = True):
    '''Builds a pivot table of (vals) indexed by (categories) and plots it as a bar chart.

       df: the data set
       categories: columns used as the pivot index; the index levels are named
           after these columns with their last four characters removed
       title: title of the plot
       vals: columns holding the values to plot
       xticks: replacement x tick labels (rotated 45 degrees); [] keeps the defaults
       agg: aggregation function handed to pd.pivot_table
       levels: optional replacement index levels (passed to index.set_levels)
       max: upper limit of the y axis
       hidexlabels: when true, blank out the x tick labels
       rot: rotation passed to the plot
       legend: when false, the legend is hidden
       legend_labels: custom legend labels; "" keeps the automatic labels
       hatches: when true, fill the bars with hatch patterns

       Returns the axes of the plot. '''

    printD(categories)
    printD(levels)

    pivot = pd.pivot_table(df, index=categories, values=vals, aggfunc=agg)
    print(pivot)

    pivot.index.names = [x[:-4] for x in categories]
    print(pivot.index)
    if levels is not None:
        pivot.index = pivot.index.set_levels(levels)

    printD(pivot.index)

    # The transposed pivot is what gets drawn: its rows become the x positions.
    transposed = pivot.T
    # `color` is the documented pandas keyword for bar colours and is what
    # plot_group in this notebook already uses.
    ax = transposed.plot(kind="bar", title=title, ylim=(0, max), rot=rot,
                         color=["white", "white", "white", "white"], edgecolor=["black", "black"],
                         linewidth=2)
    if hatches:
        bars = ax.patches
        print(len(bars))
        # One hatch character per bar: each of 'x', 'O', '/', '+' is repeated once
        # per plotted row. Kept in its own name so the `hatches` flag is not
        # overwritten by a string.
        hatch_pattern = ''.join(h * transposed.shape[0] for h in 'xO/+')
        print(hatch_pattern)

        for bar, hatch in zip(bars, hatch_pattern):
            bar.set_hatch(hatch)
    if hidexlabels:
        ax.set_xticklabels(["" for _ in range(transposed.shape[0])])
    if legend:
        if legend_labels != "":
            ax.legend(ncol=1, labels=legend_labels)
        else:
            ax.legend(ncol=1)
    else:
        ax.legend().set_visible(False)

    # set the xticks if needed (this runs last, so it wins over hidexlabels)
    if xticks != []:
        ax.set_xticklabels(xticks, rotation=45)

    return ax



In [None]:
def get_labels(mergeconfig, cols):
    """Build a {code: category name} lookup for each of the given columns.

    For every column, the entry mergeconfig["mergecols"][col] is read as a
    dict of category name -> code (its "Description" key is ignored) and
    inverted. Columns that have no entries besides "Description" are left out
    of the result. mergeconfig itself is not modified.

    Returns:
        dict mapping column name -> {code: category name}.
    """
    labels = {}
    for col in cols:
        # Work on a copy so that dropping "Description" does not touch the config.
        name_to_code = mergeconfig["mergecols"][col].copy()
        name_to_code.pop("Description", None)

        printD(name_to_code)
        names = list(name_to_code.keys())
        codes = list(name_to_code.values())

        if not codes:
            continue

        printD("has labels...", col)
        printD("categories: ", names)

        labels[col] = dict(zip(codes, names))
        printD(labels[col])
    return labels
 
# create a copy and run any setup code
def prep_for_plot(cols, setup, df):
    """Copy ``df``, keep only ``cols``, and return whatever ``setup`` makes of the result.

    Parameters:
        cols: column selection applied to the copy.
        setup: callable that receives the selected copy; its return value is returned.
        df: source frame; it is copied before the selection.
    """
    selected = df.copy()[cols]
    return setup(selected)
 
    
#narrow down on relevant cols, set index, etc
def prep_for_analysis(cols, index, df):
    """Return ``df`` restricted to those of ``cols`` that it actually contains.

    Requested columns that are missing from ``df`` are skipped instead of
    raising, and the columns that remain are printed. ``index`` is accepted
    but not used by this function.
    """
    # Intersect with the real columns so that uncalculated ones are dropped quietly.
    available = df.columns.intersection(cols)
    print("cols remaining, ", available)
    return df.loc[:, available]

def percentage_groupby(df, cols, split, col_labels=[], label_tots=True, min_pple=5, divisor='category'):
    ''' Builds a table of fractions: one row per value of (split), one column per category.

        df is the data set; cols is the list of columns to calculate fractions for.
        split is the grouping column, and col_labels maps a column name to a
        {value: label} dict used to name that column's categories (columns
        without labels are named "<col>_<value>").
        label_tots appends the category total to each column name, e.g. "female(12.0)".
        min_pple is the minimum number of people to include in an entry in the table
        (i.e. the category is dropped otherwise).
        divisor is whether to divide by the total for the category (e.g. total women)
           or the total for that group (e.g. total discriminated against). Default is category.

        For example, if you want to figure out what percentages of men and women
        (possible values in a col about gender) report discrimination or not, you should use
        [gender] for your cols, and 'discrimination' for split. Then you can provide
        labels for the genders (e.g. {0:'female', 1:'male'}). If you choose 'split' as your
        divisor, the fractions will be of all people reporting discrimination (or not). If
        you choose 'category' as your divisor the fractions will be of all women (or men).

        The resulting table is printed and returned; missing combinations are 0. '''

    df2 = df[cols+[split]] # select the relevant columns
    table = df2.groupby(split) # group by the split column
    printD("group by ", split)
    index_items = df2[split].unique() # get the possible values for group
    counts = df2[split].value_counts()
    printD(counts)
    category_counts = pd.DataFrame(index=index_items, columns=[])  # prepare to gather results
    for col in cols: # loop through the columns to be organized
        try:
            labels = col_labels[col] # if the column has labels, use them
        except (KeyError, IndexError, TypeError):
            # no labels for this column (TypeError covers the empty-list default)
            labels = []

        for name, group in table:                 # for each group (e.g. Disc=FALSE or Disc=TRUE)
            printD("labels: ", labels)
            vals = group[col].value_counts()      # count up the values (e.g. men and women for Disc=FALSE)
            # Series.items() replaces iteritems(), which pandas 2.0 removed.
            for label, value in vals.items():
                if not labels:                    # determine a label for this column
                    label = col + "_" + str(label)
                else:
                    label = labels[label]
                category_counts.loc[name, label] = value # store the data
        printD(category_counts)

    tots = category_counts.sum() # calculate the total number of people in a column
    printD(tots)
    # drop columns with too few people
    too_small = [col for col in category_counts.columns if tots[col] < min_pple]
    for col in too_small:
        printD(col, "has too few people")
    category_counts = category_counts.drop(columns=too_small)
    # Drop the same entries from the totals: dividing by totals that still
    # list them would align on the column labels and bring the dropped
    # columns back as all-NaN (later all-zero) columns.
    tots = tots.drop(labels=too_small)

    print(divisor)
    if (divisor=='category'):
        printD(tots)
        category_counts = category_counts/tots  # change to fractions of each category
        printD(category_counts)
        printD("---------------")
    else: # divisor equals split, so divide by the total in each part of the split
        rowtots = df.loc[:, split].value_counts()
        printD(rowtots)
        for col in list(category_counts.columns):
            category_counts[col] = category_counts[col]/rowtots
    printD(category_counts)

    if (label_tots):                    # generate new column names
        mapping = {}                    # create a mapper
        for col in category_counts.columns:
            mapping[col] = col+"("+str(tots[col])+")"
        category_counts = category_counts.rename(mapping, axis='columns')
        printD(mapping)

    category_counts = category_counts.fillna(0)

    print(category_counts)
    return category_counts


# takes in a list of columns
# and a data frame, and generates a table of percentages
# order is the order the new table's columns should be in
def percentage_for_values(dv, cols, order=[]):
    '''takes in a list of columns and a data frame, and generates a table of percentages.
       Each row of the table is one of the values in a column (typically all the columns
       should have values in common), and each column is named 'perc_<col>'.
       Percentages are calculated based on the total number of rows in dv; a value
       that does not occur in a column gets 0.

       order lists the row labels of the new table, in the order they should appear.
       When it is empty, the rows are all the values found in cols, in order of first
       appearance in the per-column value counts.
       '''
    tot = len(dv.index) # the number of people in the data set
    printD(cols)

    # get the number of times each value occurs, per column
    all_counts = {col: dv[col].value_counts() for col in cols}

    # Fix the row labels once, up front. Rebinding `order` to a pandas Index
    # inside the loop made the next `if not order` test raise, because the
    # truth value of an Index is ambiguous.
    if order is not None and len(order) > 0:
        row_labels = list(dict.fromkeys(order))
    else:
        row_labels = list(dict.fromkeys(
            label for col in cols for label in all_counts[col].index))
    printD(row_labels)

    data = {}
    for col in cols:
        counts = all_counts[col]
        printD(counts)
        data['perc_'+col] = [
            # an empty data set has no meaningful percentage; report 0 instead of dividing by zero
            100*float(counts.get(label, 0))/tot if tot else 0.0
            for label in row_labels
        ]
    return pd.DataFrame(data, index=row_labels)

def answer_percentages(df, col, labels=None):
    '''Answer percentages calculates the percentage of each answer to column col in df.
       Missing answers are counted under the value -1.
       Labels (a dict of answer value -> name) converts numeric answers to categorical names.
       If provided, answers are ordered according to the label order, and labelled answers
       that never occur get 0.

       Returns a data frame indexed by answer (or label) whose single column, 'id',
       holds the percentages. df itself is left untouched.'''
    printD(col)
    printD(labels)
    # Work on a private copy: the helper column and the -1 fill must not leak
    # into the caller's frame.
    answers = df[[col]].copy()
    answers['id'] = answers.index
    answers[col] = answers[col].fillna(-1)
    printD(answers.head())

    # count the distinct row ids behind every answer
    vals = answers.groupby(col).agg({'id': 'nunique'})
    printD(vals)
    people = len(answers)
    vals = 100 * (vals / people)
    if labels is not None:
        printD("reindexing")
        printD("values", labels.values())
        printD(vals)
        vals.index = vals.index.map(lambda code: labels[code])
        printD(vals.index)
        printD("duplicates", vals[vals.index.duplicated()])

        vals = vals.reindex(index=list(labels.values()))
    # NOTE(review): the original called vals.rename(index=str, columns={'id': col})
    # here and threw the result away, so the returned column has always been
    # named 'id'. That name is kept for existing callers.
    printD(vals)
    vals = vals.fillna(0)
    return vals

def answer_assign_bins(df, cols, cutoffs, labels):
    ''' Function to organize numerical data into bins.

        Every column in cols is replaced, in df itself, by the result of
        pd.cut(column, bins=cutoffs, labels=labels). The same df is returned. '''

    for col in cols:
        # Replace the whole column. Writing through df.loc[:, col] would try
        # to store the categories inside the existing numeric column instead
        # of swapping in the categorical result.
        df[col] = pd.cut(df[col], bins=cutoffs, labels=labels)
    return df

def percent_matching(condition, seq):
    """Returns the percent (0-100) of items in seq for which condition is true.

    seq may be any iterable, including a one-shot iterator. An empty seq
    yields 0.0 instead of a division by zero.
    """
    printD("percent_matching ", seq)
    # Materialise once so that iterators can be both counted and measured.
    items = list(seq)
    printD(len(items))
    if not items:
        return 0.0
    tot = sum(1 for item in items if condition(item))
    return 100.0*(tot)/float(len(items))

In [None]:
from textwrap import wrap
def describe(mergeconfig, cols, sep):
    '''Joins the "Description" entries of the given columns into one wrapped string.

       The description of each column is read from mergeconfig["mergecols"][col].
       The descriptions are joined with sep, prefixed with a single space, and
       wrapped at 40 characters; the wrapped lines are joined with " \\n".'''
    descriptions = []
    for col in cols:
        labels = mergeconfig["mergecols"][col]
        printD("for description")
        printD(labels)
        descriptions.append(labels["Description"])

    # the separator only goes between items, never before the first one
    res = ' ' + sep.join(descriptions)
    return " \n".join(wrap(res, 40))

In [None]:
from statsmodels.stats import weightstats
from scipy import stats
import matplotlib.pyplot as plt
def plot_group(df, value_cols, index_cols, mergeconfig, plot_type = "box", remove_neg_value = True, ymin = 0, ymax = 7,
               title = "", ylabel = "Score"):
    '''
        Draws one figure per (value column, grouping column) pair and prints group statistics.

        For every pair the rows with a negative grouping value are dropped, the data
        is plotted per group, and the value lists of group 0 and group 1 are compared:
        their sizes, means, a Mann-Whitney U test and the more significant of the two
        one-sided unequal-variance t-tests are printed.

        df: the data set
        value_cols: y values of the plots
        index_cols: group variables, each element of the index_cols will act as one type of grouping,
            corresponding to one plot
        mergeconfig: configuration that holds the descriptions and category labels of the columns
        plot_type: "box" for a boxplot per group, "bar" for mean bars with standard-deviation error bars
        remove_neg_value: flag to remove negative value in value_cols during plotting
        ymin, ymax: limits of the y axis
        title: title prefix; when empty, "<Plot_type>plot of " is used
        ylabel: label of the y axis; "Stress Level" also switches to the named 1-7 tick labels

        Returns the list of axes, one per figure.
    '''
    axes = []
    for value_col in value_cols:
        for index_col in index_cols:
            df2 = df[[value_col,index_col]]
            if (remove_neg_value):
                df2 = df2.loc[(df2[index_col] >= 0) & (df2[value_col] >= 0)]
            else:
                df2 = df2.loc[(df2[index_col] >= 0)]

            fig, ax = plt.subplots()
            if (plot_type == "box"):
                df2.boxplot(by=index_col, ax = ax)
            elif (plot_type == "bar"):
                df2_group = df2.groupby(index_col)
                df2_group_mean = df2_group.mean()
                df2_group_std = df2_group.std()
                df2_group_mean.plot.bar(yerr=df2_group_std, ax = ax, rot=0, grid = True,
                                        color = ["white", "white"], edgecolor = ["black","black"],
                                        linewidth = 2, capsize = 5)
            ax.set_ylim([ymin,ymax])
            ax.xaxis.grid()
            ax.set_xlabel("")
            ax.set_ylabel(ylabel)
            # The default prefix is built once; later iterations reuse it.
            if (title == ""): title = plot_type.capitalize() + "plot of "
            analysis_title = title + describe(mergeconfig, [value_col,index_col], " for ")
            ax.set_title(analysis_title)

            # Compare the two groups coded 0 and 1 in the grouping column.
            left = df2[(df2[index_col] == 0)][value_col].tolist()
            right = df2[(df2[index_col] == 1)][value_col].tolist()
            print("---------------")
            print(analysis_title)
            print(len(left), len(right))
            print(np.mean(left), np.mean(right))
            print(stats.mannwhitneyu(left, right))
            # Run both one-sided tests and report whichever has the smaller p-value.
            a = weightstats.ttest_ind(left, right, alternative='larger', usevar = "unequal")
            b = weightstats.ttest_ind(left, right, alternative='smaller', usevar = "unequal")
            if (a[1] < b[1]):
                print("t-test with unequal variance: t-value =", round(a[0],2), "p-value =", round(a[1],2))
            else:
                print("t-test with unequal variance: t-value =", round(b[0],2), "p-value =", round(b[1],2))

            labels = get_labels(mergeconfig, [index_col])
            ax.set_xticklabels([labels[index_col][0],labels[index_col][1]])
            ax.legend().set_visible(False)
            if (ylabel == "Stress Level"):
                # Pin the ticks to 1..7 first: the seven labels name the scale
                # points 1-7, and without fixed positions they are handed out
                # to whatever ticks the axis happens to have, in order.
                ax.set_yticks(range(1, 8))
                ax.set_yticklabels(["Not at all : 1", "2", "3", "Somewhat : 4", "5", "6", "Extreme : 7"])
            axes.append(ax)
            # remove the automatic super-title that pandas' boxplot adds
            plt.suptitle("")
            print("---------------")
    return axes

In [None]:
from statsmodels.stats import weightstats
from scipy import stats
import matplotlib.pyplot as plt
def plot_group_sample(df, value_cols, index_cols, mergeconfig, plot_type = "box", remove_neg_value = True, ymin = 0, ymax = 7,
               title = "", ylabel = "Score"):
    '''
        Variant of plot_group that first resamples the larger group down to the size of the smaller one.

        For every (value column, grouping column) pair the rows with a negative grouping
        value are dropped and the rows of group 0 and group 1 are separated. The larger
        group is then sampled (with replacement) to the size of the smaller one, and the
        sampling is repeated until a two-sided unequal-variance t-test between the two
        groups gives a p-value of at most 0.05; every p-value tried is printed. The plot
        and the printed statistics are based on the resampled data.

        df: the data set
        value_cols: y values of the plots
        index_cols: group variables, each element of the index_cols will act as one type of grouping,
            corresponding to one plot
        mergeconfig: configuration that holds the descriptions and category labels of the columns
        plot_type: "box" for a boxplot per group, "bar" for mean bars with standard-deviation error bars
        remove_neg_value: flag to remove negative value in value_cols during plotting
        ymin, ymax: limits of the y axis
        title: title prefix; when empty, "<Plot_type>plot of " is used
        ylabel: label of the y axis; "Stress Level" also switches to the named 1-7 tick labels

        Returns the list of axes, one per figure.
    '''
    axes = []
    for value_col in value_cols:
        for index_col in index_cols:
            df2 = df[[value_col,index_col]]
            if (remove_neg_value):
                df2 = df2.loc[(df2[index_col] >= 0) & (df2[value_col] >= 0)]
            else:
                df2 = df2.loc[(df2[index_col] >= 0)]

            df2_left =  df2[(df2[index_col] == 0)]
            df2_right = df2[(df2[index_col] == 1)]
            # DataFrame.copy() makes an independent copy; the `copy` module used
            # here before is not imported by this notebook.
            df2_left_buf = df2_left.copy()
            df2_right_buf = df2_right.copy()
            # NOTE(review): this loop keeps drawing new samples until the test
            # is significant and has no iteration limit, so it can run for a
            # long time; selecting the sample on its p-value also biases the
            # statistics printed below. Confirm this is intended.
            p_value = 1
            while(p_value > 0.05):
                if (df2_left.shape[0] > df2_right.shape[0]):
                    df2_left_buf = df2_left.sample(df2_right.shape[0], replace=True, random_state = np.random.RandomState())
                else:
                    df2_right_buf = df2_right.sample(df2_left.shape[0], replace=True, random_state = np.random.RandomState())
                left = df2_left_buf[value_col].tolist()
                right = df2_right_buf[value_col].tolist()
                p_value = weightstats.ttest_ind(left, right, usevar = "unequal")[1]
                print(p_value)
            df2 = pd.concat([df2_left_buf, df2_right_buf])

            fig, ax = plt.subplots()
            if (plot_type == "box"):
                df2.boxplot(by=index_col, ax = ax)
            elif (plot_type == "bar"):
                df2_group = df2.groupby(index_col)
                df2_group_mean = df2_group.mean()
                df2_group_std = df2_group.std()
                df2_group_mean.plot.bar(yerr=df2_group_std, ax = ax, rot=0, grid = True,
                                        color = ["white", "white"], edgecolor = ["black","black"],
                                        linewidth = 2, capsize = 5)
            ax.set_ylim([ymin,ymax])
            ax.xaxis.grid()
            ax.set_xlabel("")
            ax.set_ylabel(ylabel)
            # The default prefix is built once; later iterations reuse it.
            if (title == ""): title = plot_type.capitalize() + "plot of "
            analysis_title = title + describe(mergeconfig, [value_col,index_col], " for ")
            ax.set_title(analysis_title)

            # Compare the two (resampled) groups coded 0 and 1 in the grouping column.
            left = df2[(df2[index_col] == 0)][value_col].tolist()
            right = df2[(df2[index_col] == 1)][value_col].tolist()
            print("---------------")
            print(analysis_title)
            print(len(left), len(right))
            print(np.mean(left), np.mean(right))
            print(stats.mannwhitneyu(left, right))
            # Run both one-sided tests and report whichever has the smaller p-value.
            a = weightstats.ttest_ind(left, right, alternative='larger', usevar = "unequal")
            b = weightstats.ttest_ind(left, right, alternative='smaller', usevar = "unequal")
            if (a[1] < b[1]):
                print("t-test with unequal variance: t-value =", round(a[0],2), "p-value =", round(a[1],2))
            else:
                print("t-test with unequal variance: t-value =", round(b[0],2), "p-value =", round(b[1],2))

            labels = get_labels(mergeconfig, [index_col])
            ax.set_xticklabels([labels[index_col][0],labels[index_col][1]])
            ax.legend().set_visible(False)
            if (ylabel == "Stress Level"):
                # Pin the ticks to 1..7 first: the seven labels name the scale
                # points 1-7, and without fixed positions they are handed out
                # to whatever ticks the axis happens to have, in order.
                ax.set_yticks(range(1, 8))
                ax.set_yticklabels(["Not at all : 1", "2", "3", "Somewhat : 4", "5", "6", "Extreme : 7"])
            axes.append(ax)
            # remove the automatic super-title that pandas' boxplot adds
            plt.suptitle("")
            print("---------------")
    return axes