<a href="https://colab.research.google.com/github/jirvingphd/my_data_science_notes/blob/master/collection_of_functions_JMI%2BMVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mod1Project EDA Functions

### def multiplot

In [0]:
# MULTIPLOT
from string import ascii_letters
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


def multiplot(df):
    """Plot the lower triangle of df.corr() as an annotated heat map.

    Intended for spotting multicollinearity between features.

    Parameters:
    df (DataFrame): data whose pairwise column correlations are plotted.

    Returns:
    (f, ax): the matplotlib Figure and Axes that hold the heat map.
    """
    sns.set(style="white")

    # Compute the correlation matrix
    corr = df.corr()

    # Mask the upper triangle (diagonal included) so each pair shows once.
    # The builtin bool is used because the np.bool alias no longer exists
    # in current NumPy releases.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(16, 16))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap on the axes created above, with the mask and a
    # square aspect ratio
    sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
    return f, ax

### def plot_hist_scat_sns [USE ME!!!!]

In [0]:
# Plots histogram and scatter (vs. target column) side by side
def plot_hist_scat_sns(df, target='index'):
    """Plot a seaborn distplot and regplot for each described column vs target.

    Parameters:
    df (DataFrame): the columns reported by DataFrame.describe() are plotted.
    target (str): name of the column containing the target variable; it is
        drawn on the y-axis of every regression plot. Defaults to 'index'.

    Returns:
    None. One figure is created per column, with the histogram + KDE on the
    left and the scatter + regression line against target on the right.
    """
    import matplotlib.ticker as mtick
    import matplotlib.pyplot as plt
    import seaborn as sns

    with plt.style.context(('dark_background')):
        ###  DEFINE AESTHETIC CUSTOMIZATIONS  -------------------------------##
        figsize = (9, 7)

        # Axis Label fonts
        fontTitle = {'fontsize': 14,
                     'fontweight': 'bold',
                     'fontfamily': 'serif'}

        fontAxis = {'fontsize': 12,
                    'fontweight': 'medium',
                    'fontfamily': 'serif'}

        fontTicks = {'fontsize': 8,
                     'fontweight': 'medium',
                     'fontfamily': 'serif'}

        # Dollar-sign tick formatter; only needed if the commented-out
        # price formatting further down is re-enabled
        fmtPrice = '${x:,.0f}'
        tickPrice = mtick.StrMethodFormatter(fmtPrice)

        ###  PLOTTING ----------------------------- ------------------------ ##

        # Loop through the described columns of the dataframe
        for column in df.describe():

            # A 2x2 grid is created; the bottom row is deleted below so the
            # two plots occupy the top half of the figure
            fig, ax = plt.subplots(figsize=figsize, ncols=2, nrows=2)

            ##  SUBPLOT 1 --------------------------------------------------##
            i, j = 0, 0
            ax[i, j].set_title(column.capitalize(), fontdict=fontTitle)

            # Keyword dictionaries for the distplot (Subplot 1)
            hist_kws = {"linewidth": 1, "alpha": 1, "color": 'blue', 'edgecolor': 'w'}
            kde_kws = {"color": "white", "linewidth": 1, "label": "KDE"}

            # Plot distplot on ax[i,j] using hist_kws and kde_kws
            sns.distplot(df[column], norm_hist=True, kde=True,
                         hist_kws=hist_kws, kde_kws=kde_kws,
                         label=column+' histogram', ax=ax[i, j])

            # Set x axis label
            ax[i, j].set_xlabel(column.title(), fontdict=fontAxis)

            # Apply the tick font to the x tick labels, then put a scalar
            # formatter back so the label text follows the data
            xticklab1 = ax[i, j].get_xticklabels(which='both')
            ax[i, j].set_xticklabels(labels=xticklab1, fontdict=fontTicks, rotation=0)
            ax[i, j].xaxis.set_major_formatter(mtick.ScalarFormatter())

            # Set y-label and apply the same tick treatment to the y-axis
            ax[i, j].set_ylabel('Density', fontdict=fontAxis)
            yticklab1 = ax[i, j].get_yticklabels(which='both')
            ax[i, j].set_yticklabels(labels=yticklab1, fontdict=fontTicks)
            ax[i, j].yaxis.set_major_formatter(mtick.ScalarFormatter())

            # Set y-grid (drawn below the plotted data)
            ax[i, j].set_axisbelow(True)
            ax[i, j].grid(axis='y', ls='--')

            ##  SUBPLOT 2-------------------------------------------------- ##
            i, j = 0, 1
            ax[i, j].set_title(column.capitalize(), fontdict=fontTitle)

            # Keyword dictionaries for scatter and regression line (subplot 2)
            line_kws = {"color": "white", "alpha": 0.5, "lw": 4, "ls": ":"}
            scatter_kws = {'s': 2, 'alpha': 0.5, 'marker': '.', 'color': 'blue'}

            # x and y are passed by keyword: newer seaborn releases no longer
            # accept them positionally
            sns.regplot(x=df[column], y=df[target],
                        line_kws=line_kws,
                        scatter_kws=scatter_kws,
                        ax=ax[i, j])

            # Set x-axis label
            ax[i, j].set_xlabel(column.title(), fontdict=fontAxis)

            # Apply the tick font to the x tick labels
            xticklab2 = ax[i, j].get_xticklabels(which='both')
            ax[i, j].set_xticklabels(labels=xticklab2, fontdict=fontTicks, rotation=0)
            ax[i, j].xaxis.set_major_formatter(mtick.ScalarFormatter())

            # Set y-axis label
            ax[i, j].set_ylabel(target, fontdict=fontAxis)

            # Apply the tick font to the y tick labels
            yticklab = ax[i, j].get_yticklabels()
            ax[i, j].set_yticklabels(yticklab, fontdict=fontTicks)
            ax[i, j].yaxis.set_major_formatter(mtick.ScalarFormatter())

            # ax[i,j].get_yaxis().set_major_formatter(tickPrice)

            # Set y-grid
            ax[i, j].set_axisbelow(True)
            ax[i, j].grid(axis='y', ls='--')

            ## ---------- Final layout adjustments ----------- ##
            # Delete the unused bottom-row subplots
            fig.delaxes(ax[1, 1])
            fig.delaxes(ax[1, 0])

            # Optimizing spatial layout
            fig.tight_layout()
            figtitle = column+'_dist_regr_plots.png'
            # plt.savefig(figtitle)
    return

# Outliers & Data Transformation


In [0]:
# NOTE(review): detect_outliers is defined later in this file with the
# parameters (df, n, features); this bare call passes none of them and looks
# like a leftover scratch cell -- confirm before running.
detect_outliers()

### def detect_outliers

In [0]:
# Tukey's method using IQR to eliminate 
def detect_outliers(df, n=0, features=None):
    """Return index labels of rows that are Tukey (IQR) outliers.

    A value is an outlier for a column when it lies more than 1.5 * IQR
    below the first quartile or above the third quartile of that column.

    Parameters:
    df (DataFrame): DataFrame containing the feature columns.
    n (int): a row is returned only when it is an outlier in MORE than n of
        the inspected columns. Default is 0 (an outlier in any column).
    features (list of str): column names to inspect. Default is every
        numeric column of df.

    Returns:
    list: index labels of the outlier rows, usable with .loc / .drop.

    Examples:
    Outliers_to_drop = detect_outliers(data, 2, ["col1", "col2"])
    df.loc[Outliers_to_drop]  # Show the outlier rows
    data = data.drop(Outliers_to_drop, axis=0).reset_index(drop=True)
    """
    from collections import Counter

    if features is None:
        features = list(df.select_dtypes(include='number').columns)

    outlier_indices = []
    # iterate over features (columns)
    for col in features:
        # 1st and 3rd quartiles (25% / 75%)
        Q1 = np.percentile(df[col], 25)
        Q3 = np.percentile(df[col], 75)

        # Tukey fence distance: 1.5 times the interquartile range
        outlier_step = 1.5 * (Q3 - Q1)

        # Index labels of the rows outside the fences for this column
        is_outlier = (df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)
        outlier_indices.extend(df[is_outlier].index)

    # Count once, after all columns were scanned: counting inside the loop
    # replaced the list with a Counter and broke the next extend() call
    outlier_counts = Counter(outlier_indices)
    return [idx for idx, count in outlier_counts.items() if count > n]


In [0]:
# describe_outliers -- calls detect_outliers
def describe_outliers(df):
    """Summarize the outliers found by detect_outliers, column by column.

    Parameters:
    df (DataFrame): every column of df is checked on its own.

    Returns:
    DataFrame indexed by column name with 'total_outliers' (number of
    outlier rows) and 'percent_total' (that count as a percentage of the
    rows in df), plus a final 'grand_total' row holding the column sums.
    A row that is an outlier in several columns is counted once per column
    in 'grand_total'.
    """
    n_rows = len(df.index)
    new_df = pd.DataFrame(columns=['total_outliers', 'percent_total'])
    for col in df.columns:
        # detect_outliers expects (df, n, features): inspect one column at a
        # time and keep every row flagged at least once (n=0)
        outies = detect_outliers(df, 0, [col])
        new_df.loc[col] = [len(outies), round((len(outies) / n_rows) * 100, 2)]
    new_df.loc['grand_total'] = [sum(new_df['total_outliers']), sum(new_df['percent_total'])]
    return new_df

# Mod 2 Project-Specific Functions

## SQL

### def get_col_info

In [0]:
def  get_col_info(col_name):
    """Get the column names and data types of one table from the SQLAlchemy inspector.

    Parameters:
    col_name (str): name of the table to describe (passed to
        inspector.get_columns).

    Returns:
    DataFrame with the columns 'table', 'column' and 'dtype', one row per
    column of the table.

    Relies on the module-level `inspector` object and on list2df.
    """
    col_list = inspector.get_columns(col_name)

    # First row is the header row consumed by list2df
    column_info = [['table','column','dtype']]
    print(f'Table Name: {col_name}\n')

    for col in col_list:
        column_info.append([str(col_name), col['name'], col['type']])

    # Return the DataFrame (it used to be built and then discarded in favour
    # of the raw list, contradicting the documented return value)
    return list2df(column_info)

### def get_full_table_info

In [0]:
def  get_full_table_info(engine):
    """Get every table's column names and data types from a database engine.

    Parameters:
    engine: SQLAlchemy engine connected to the database to describe.

    Returns:
    DataFrame with the columns 'table', 'column' and 'dtype', one row per
    column of every table.

    Relies on list2df to turn the collected rows into a DataFrame.
    """
    from sqlalchemy import inspect

    # Inspect the engine that was passed in rather than a module-level
    # inspector, so the function describes the database it is given
    table_inspector = inspect(engine)

    # First row is the header row consumed by list2df
    column_info = [['table','column','dtype']]

    for table in table_inspector.get_table_names():
        for col in table_inspector.get_columns(table):
            # Exactly three values per row to match the three headers; the
            # previous fourth value, col[''], raised KeyError
            column_info.append([str(table), col['name'], col['type']])

    df = list2df(column_info)
    return df

### How to: use sqlalchemy on Google Colab

In [0]:
# Using sqlalchemy to import sql tables stored in Google Drive
import sqlalchemy
from sqlalchemy import create_engine, inspect  # inspect is needed for the inspector below
from sqlalchemy.orm import Session, sessionmaker

from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

# Open the SQLite file; echo=True logs every SQL statement that is issued
filepath = '/content/drive/My Drive/Colab Notebooks/datasets/Northwind_small.sqlite'
engine = create_engine('sqlite:///'+filepath,echo=True)
inspector = inspect(engine)

# df_employee = pd.read_sql_query("SELECT Id, Title, LastName, HireDate , BirthDate  FROM [EMPLOYEE]", engine )
# df_cust_ord = pd.read_sql_query("SELECT *FROM [Order] JOIN [Customer] ON [Customer].Id = [Order].CustomerId", engine)
print(inspector.get_table_names())



#IMPORTING TABLES
DB_Order = pd.read_sql_table('Order',engine)
DB_OrderDetail = pd.read_sql_table('OrderDetail',engine)

#  Mod 2 Data Processing / Production

### making month_dict for month # to name mapping

In [0]:
# Three-letter month labels in calendar order
months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
          'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

# Number the labels from 1 so each key is the calendar month number
month_dict = dict(enumerate(months, start=1))

# Translate the numeric 'month' column into names, then count rows per month
df_price_geo['month_name'] = df_price_geo['month'].map(month_dict)
df_price_geo['month_name'].value_counts()

### Separating df into new vars based on discounts [Mike]

In [0]:
# From Mike: split the quantities into groups by discount percentage.
# samples_dict and full_array are the inputs for calc_effect_sizes.
# Every bin excludes its lower edge and includes its upper edge.
disc_5 = np.array(df.loc[(df['Discount'] > 0) & (df['Discount'] <= 0.05), 'Quantity'])
disc_10 = np.array(df.loc[(df['Discount'] > 0.05) & (df['Discount'] <= 0.10), 'Quantity'])
disc_15 = np.array(df.loc[(df['Discount'] > 0.10) & (df['Discount'] <= 0.15), 'Quantity'])
disc_20 = np.array(df.loc[(df['Discount'] > 0.15) & (df['Discount'] <= 0.20), 'Quantity'])
disc_25 = np.array(df.loc[(df['Discount'] > 0.20) & (df['Discount'] <= 0.25), 'Quantity'])
samples_dict = dict(disc_5=disc_5, disc_10=disc_10, disc_15=disc_15,
                    disc_20=disc_20, disc_25=disc_25)

# Quantities of the rows sold without any discount
full_array = np.array(df.loc[df['Discount'] == 0, 'Quantity'])

### def calc_product_price & def calc_order_total

In [0]:
# James: 04/02/19 
# Source of df = pd.read_sql_query("SELECT * FROM OrderDetail",  engine)

# calc_product_price adds the line price and an on-sale flag to a row
def calc_product_price(row):
    """Add 'price' and 'OnSale' entries to an order-detail row.

    'price' is UnitPrice * (1 - Discount) * Quantity and 'OnSale' is True
    when the Discount is greater than zero. The row is modified and returned.
    """
    row['price'] = row['UnitPrice']*(1-row['Discount'])*row['Quantity']
    row['OnSale'] = True if row['Discount']>0 else False
    return row

# calc_order_total fills in the per-order columns of a row
def calc_order_total(row,df):
    """Add 'discounted_order' and 'order_total' entries to a row.

    All rows of df sharing the row's OrderId are looked up; the order counts
    as discounted when any of them is on sale, and the order total is the sum
    of their 'price' values. The row is modified and returned.
    """
    same_order = df.groupby('OrderId').get_group(row['OrderId'])

    row['discounted_order'] = any(same_order['OnSale'])
    row['order_total'] = same_order['price'].sum()

    return row
# Row by row, add the 'price' and 'OnSale' columns
df_price = df.apply(calc_product_price, axis=1)
# df_price['order_total'] = None


# Row by row, add 'discounted_order' and 'order_total'; every row is
# compared against the full priced table
df_price = df_price.apply(calc_order_total, axis=1, args=(df_price,))
df_price.head()

### Saving hypothesis 3 dataframes to csv for Tableau

In [0]:
# Save df_price_geo (one row per product line) and a one-row-per-order
# version of it as .csv files for Tableau.
# The prompt text is passed positionally: the builtin input() accepts no
# keyword arguments, so input(prompt=...) only worked inside a notebook kernel.
save = input('Would you like to export the dataframe above? (y/n)\n')
if save.strip().lower()=='y':
    filename ='df_H3_price_w_dates_products.csv'
    df_H3_price_w_dates_products = df_price_geo.copy()
    df_H3_price_w_dates_products.to_csv(filename)
    print(f'df_price exported and saved as {filename}...')
    print(f'if you are running this on Colab:...\nOpen File sidebar, click Refresh, right click on {filename} to Download.')

    # Keep only the first row of every order for the order-level export
    filename ='df_H3_price_w_dates_orders.csv'
    df_H3_price_w_dates_orders = df_H3_price_w_dates_products.drop_duplicates(subset=['OrderId'])
    df_H3_price_w_dates_orders.to_csv(filename)
    print(f'if you are running this on Colab:...\nOpen File sidebar, click Refresh, right click on {filename} to Download.')

else:
    print('No .csv exported.')

# Hypothesis Testing Statistics

In [0]:
#### Cohen's d
def Cohen_d(group1, group2):
    '''Return Cohen's d, the standardized difference between two group means.

    group1, group2: Series or NumPy arrays of observations.
    The variances come from each object's own .var() method and are pooled
    with weights equal to the group sizes.
    Returns a floating point number.
    '''
    size1, size2 = len(group1), len(group2)

    # Size-weighted average of the two group variances
    pooled_var = (size1 * group1.var() + size2 * group2.var()) / (size1 + size2)

    # Mean difference expressed in pooled standard deviations
    mean_gap = group1.mean() - group2.mean()
    return mean_gap / np.sqrt(pooled_var)


def plot_pdfs(cohen_d=2):
    """Plot PDFs for two unit-variance normals that differ by some number of stds.

    cohen_d: number of standard deviations between the means

    Prints the overlap and superiority computed by overlap_superiority from
    1000 random draws of each distribution, so those two numbers vary a
    little between calls.
    """
    # Imported locally so the names scipy and pyplot are bound no matter
    # which notebook cells ran before this one
    import scipy.stats
    from matplotlib import pyplot

    group1 = scipy.stats.norm(0, 1)
    group2 = scipy.stats.norm(cohen_d, 1)

    # NOTE(review): evaluate_PDF is not defined in this file; it is assumed
    # to return (xs, ys) arrays for a frozen distribution -- confirm.
    xs, ys = evaluate_PDF(group1)
    pyplot.fill_between(xs, ys, label='Group1', color='#ff2289', alpha=0.7)

    xs, ys = evaluate_PDF(group2)
    pyplot.fill_between(xs, ys, label='Group2', color='#376cb0', alpha=0.7)

    # overlap_superiority takes len() of its inputs and compares them
    # element-wise, so it needs sample arrays, not frozen distributions
    o, s = overlap_superiority(group1.rvs(1000), group2.rvs(1000))
    print('overlap', o)
    print('superiority', s)
   

In [0]:
# DEFINING FUNCTION 
#data_in={'name':data}
def normtest_results(dict_data):
    """Run the Shapiro and D'Agostino normality tests on every dataset.

    Parameters:
    dict_data (dict): maps a name to the data to test, e.g. {'name': data}.

    Returns:
    (combined, shapiro_df, dagostino_df): three DataFrames with the columns
    'DataIn', 'Test', 'stat' and 'p'. The first is the Shapiro rows followed
    by the D'Agostino rows; the other two hold each test on its own.
    """
    # Imported here because shapiro is not imported anywhere else in the file
    from scipy.stats import normaltest, shapiro

    results_normtest_shap = [['DataIn','Test','stat','p']]
    results_normtest_dagp = [['DataIn','Test','stat','p']]

    for name, data_in in dict_data.items():
        stat, p = shapiro(data_in)
        results_normtest_shap.append([name, 'Shapiro', stat, p])

        stat, p = normaltest(data_in)
        results_normtest_dagp.append([name, 'D’Agostino’s', stat, p])

    # Build each table once and reuse it for the combined frame
    df_shap = list2df(results_normtest_shap)
    df_dagp = list2df(results_normtest_dagp)
    results_normtest = pd.concat([df_shap, df_dagp])

    return results_normtest, df_shap, df_dagp


In [0]:
# from Mike: 04/02/19
def overlap_superiority(group1, group2, prints=False):
  """Return (overlap, superiority) for two 1-dimensional np.arrays.

    When the samples differ in length, the longer one is randomly resampled
    (np.random.choice) down to the length of the shorter one so the two can
    be compared pairwise. With prints=True the threshold and both results
    are printed as well.
  """
  verbose = prints == True

  # Equalize the sample sizes so the groups can be zipped
  size1, size2 = len(group1), len(group2)
  if size1 < size2:
    group2 = np.random.choice(group2, size1)
  elif size1 > size2:
    group1 = np.random.choice(group1, size2)
  n_pairs = len(group1)

  # The threshold is the midpoint between the two sample means
  thresh = (group1.mean() + group2.mean()) / 2
  if verbose:
    print('Threshold:',thresh)

  # group1 values under the threshold plus group2 values over it,
  # relative to the common sample size
  above = sum(group1 < thresh)
  below = sum(group2> thresh)
  overlap = (above + below) / n_pairs

  # Share of positional pairs in which the group1 value is the larger one
  wins = sum(a > b for a, b in zip(group1, group2))
  superiority = wins / n_pairs

  if verbose:
    print('Overlap:',overlap,'\n''Superiority:', superiority)

  return overlap, superiority

In [0]:
def calc_effect_sizes(target, samples, prints=False):
  """Compare every sample against target with overlap_superiority.

     target is the np.array every sample is compared against
     samples is a dict of np.arrays, keyed by sample name
     if prints = True, the results are also printed for each sample

     Returns a dict mapping each sample name to
     {'overlap': ..., 'superiority': ...}.
  """
  verbose = prints == True
  effect_dict = {}

  for label, sample in samples.items():
    if verbose:
      print(f'Effect of sample size for {label}, target:')

    overlap, superiority = overlap_superiority(sample, target, prints=verbose)

    if verbose:
      print('\n')

    effect_dict[label] = {'overlap':overlap, 'superiority':superiority}

  return effect_dict

#### How to Run Tukey's tests and turn results into a dataframe

In [0]:
# Tukey's HSD pairwise comparison from statsmodels
from statsmodels.stats.multicomp import pairwise_tukeyhsd as tukey

# Keep the measured value together with the candidate grouping columns
df_test_hypothesis = df_year_orders[['order_total', 'month_name', 'week_day']]
grp_labels = df_test_hypothesis['month_name']

# Compare the order totals of every pair of months (0.05 is the third
# positional argument of pairwise_tukeyhsd)
tukey_results = tukey(df_test_hypothesis['order_total'], grp_labels, 0.05)

# The first row of the results table holds the column names,
# the remaining rows hold the data
dfH_tukey = pd.DataFrame(tukey_results._results_table.data[1:],
                         columns=tukey_results._results_table.data[0])
dfH_tukey
# dfH_tukey.loc[dfH_tukey['reject']==True] # To show just significant results.

## SHORT PIPELINE FOR HYPOTHESIS TESTING

#### Aim 2.1: Test for Normality

In [0]:
# Running log of hypothesis-3 test results; the first row is the header
H3_tests = [
    ['Group:', 'TestName', 'Test Purpose', 'stat', 'p', 'sig?'],
]

In [0]:
from scipy.stats import normaltest

# Labels recorded with every row of this section. They are set before the
# loop so the whole-population test below works even if dict_to_test is empty.
test_purpose = 'Normality'
test_to_run = 'normaltest'

# The loop variable is not called df, so the notebook's global df survives
for month, df_month in dict_to_test.items(): #month = key, df_month = values

    #1. Test for normality (normaltest is called directly, no eval needed)
    arrA = np.array(df_month['order_total'])
    statA, pA = normaltest(arrA)

    H3_tests.append([month, test_to_run, test_purpose ,statA, pA, pA<0.05])

# Same test on the order totals of the whole population
arrB = np.array(df_year_orders['order_total'])
stat, p = normaltest(arrB)
H3_tests.append(['Total Pop', test_to_run, test_purpose,stat, p,p<0.05])


H3_results_norm = list2df(H3_tests)
H3_results_norm

#### Aim 2.2: Test for Homogeneity of Variance
- Levene's Test

In [0]:
from scipy.stats import levene

# Labels recorded with every row of this section
test_to_run = 'levene'
test_purpose = 'Equal Variance'

# The loop variable is not called df, so the notebook's global df survives
for month, df_month in dict_to_test.items(): #month = key, df_month = values

    # This month's order totals vs. the order totals of all other months
    arrA = np.array(df_month['order_total'])
    arrB = np.array(df_year_orders[df_year_orders['month_name']!= month]['order_total'])

    # Test for equal variance (levene is called directly, no eval needed)
    stat, p = levene(arrA,arrB,center='median')

    H3_tests.append([f'{month} vs. Other Months', test_to_run, test_purpose ,stat, p, p<0.05])


list2df(H3_tests)

#### Use Tukey's Pairwise Multiple Comparison test.
```statsmodels.stats.multicomp.pairwise_tukeyhsd```

In [0]:
# statsmodels' pairwise Tukey HSD test, imported under a short alias
from statsmodels.stats.multicomp import pairwise_tukeyhsd as tukey

# Dataframe with the column of interest plus the possible group labels
df_test_hypothesis = df_year_orders[['order_total', 'month_name', 'week_day']]
grp_labels = df_test_hypothesis['month_name']

# Run Tukey's test on the order totals, grouped by month name
tukey_results = tukey(df_test_hypothesis['order_total'], grp_labels, 0.05)

# Turn the results table into a dataframe: its row 0 is the header,
# everything after it is data
dfH_tukey = pd.DataFrame(tukey_results._results_table.data[1:],
                         columns=tukey_results._results_table.data[0])
dfH_tukey
# dfH_tukey.loc[dfH_tukey['reject']==True] # To show just significant results.

Unnamed: 0,group1,group2,meandiff,lower,upper,reject
0,apr,aug,-429.4049,-1421.7047,562.8950,False
1,apr,dec,-207.6797,-1111.0347,695.6754,False
2,apr,feb,-22.6780,-913.5247,868.1687,False
3,apr,jan,82.7094,-793.8885,959.3073,False
4,apr,jul,-249.8789,-1259.4613,759.7034,False
5,apr,jun,-472.0174,-1727.6696,783.6348,False
6,apr,mar,-291.8645,-1133.0197,549.2908,False
7,apr,may,-116.3951,-1188.8343,956.0440,False
8,apr,nov,-173.3675,-1160.2354,813.5004,False
9,apr,oct,-54.9710,-1016.8412,906.8993,False


# Pandas Tricks /  Data Filtering and Selection

### def list2df [USE ME!!!]

In [0]:
def list2df(list):#, sort_values='index'):
    """Convert a list of rows into a DataFrame.

    The first row, list[0], supplies the column names; every following row
    becomes one row of data.
    """
    header = list[0]
    rows = list[1:]
    return pd.DataFrame(rows, columns=header)

### def df_drop_regex

In [0]:
def df_drop_regex(DF, regex_list):
    '''Drop every column whose name matches one of the given patterns.

    Parameters:
        DF -- input dataframe; it is copied, not modified.
        regex_list -- list of string patterns or regexp; the patterns are
                      applied one after another with DataFrame.filter.

    Returns:
        df_cut -- copy of the input df without the matching columns.
        '''
    df_cut = DF.copy()

    for pattern in regex_list:
        # Names of the columns hit by the current pattern
        matched = set(df_cut.filter(regex=pattern))
        remaining = [name for name in df_cut.columns if name not in matched]
        df_cut = df_cut[remaining]
        print(f'Removed {pattern}\n')

    return df_cut

# Plotting in Pandas

## Plotting bar graph with mean + standard error of the mean


In [0]:
## Bar graph with mean + standard error of the mean - Cell 1 of 2

# Build the per-month mean and SEM of the order totals for the bar plot.
from scipy.stats import sem

d_plot = {
    'mean': df_year_orders.groupby(['month'])['order_total'].mean(),
    'sem': df_year_orders.groupby(['month'])['order_total'].sem(),
}
df_plot = pd.DataFrame.from_dict(d_plot)

# Copy the month-number index into its own column
df_plot['month_num'] = df_plot.index

# Label each month by name via month_dict and index the frame by that name,
# keeping month_name as a regular column too
df_plot['month_name'] = df_plot['month_num'].map(month_dict)
df_plot = df_plot.set_index('month_name', drop=False)

In [0]:
## Bar graph with mean + standard error of the mean - Cell 2 of 2
# Keyword arguments forwarded to df_plot.plot; yerr names the 'sem' column
plt.style.use('default')
bar_kws = dict(figsize=[6,4],
               title='Order Totals By Month',
               grid=False,
               legend=False,
               rot=45,
               yerr='sem',
               ylim=[0,2500])

fig = df_plot.plot(kind='bar', x=df_plot.index.str.title(), y='mean', **bar_kws)#,table=True)
fig.set(xlabel='Month', ylabel='Order Total($)')

# Mod 2 Project Plotting - For Presentation or to Make into Functions

## Mike's Figure Functions


### def make_violinplot

In [0]:
# plotting order totals per month in violin plots

def make_violinplot(x,y, title=None, hue=None, ticklabels=None):

  '''Plots a violin plot with horizontal mean line, inner stick lines.

  x, y: data for the categorical axis and the value axis; y.mean() is drawn
        as a dotted horizontal line
  title: title of the plot
  hue: optional grouping used to split each violin
  ticklabels: optional replacement labels for the x ticks
  '''

  plt.style.use('dark_background')
  fig,ax =plt.subplots(figsize=(12,10))

  # x, y and hue are passed by keyword: newer seaborn releases no longer
  # accept them positionally
  sns.violinplot(x=x, y=y, hue=hue, cut=2, split=True, scale='count', scale_hue=True,
                 saturation=.5, alpha=.9, bw=.25, palette='Dark2', inner='stick',
                 ax=ax).set_title(title)

  ax.axhline(y.mean(),label='total mean', ls=':', alpha=.5, color='xkcd:yellow')

  # Only relabel the ticks when labels were actually supplied
  if ticklabels is not None:
    ax.set_xticklabels(ticklabels)

  plt.legend()
  plt.show()
    
    
### Example usage
# #First, declare variables to be plotted
# x = df_year_orders['month']
# y = df_year_orders['order_total']
# ticks = [v for v in month_dict.values()] 
# title = 'Order totals per month with or without discounts'
# hue = df_year_orders['Discount']>0

### Then call function
# make_violinplot(x,y,title,hue, ticks), 

### def make_stripplot

In [0]:
def make_stripplot(x, y, title=None, hue=None, ticklabels=None):
  '''Plots a jittered strip plot with a horizontal mean line.

  x, y: data for the categorical axis and the value axis; y.mean() is drawn
        as a dotted horizontal line
  title: title of the plot
  hue: optional grouping used to color the markers
  ticklabels: optional replacement labels for the x ticks
  '''

  plt.style.use('dark_background')
  fig,ax =plt.subplots(figsize=(8,6))

  # x, y and hue are passed by keyword: newer seaborn releases no longer
  # accept them positionally
  sns.stripplot(x=x, y=y, hue=hue, jitter=True, size=12, edgecolor='gray',
                linewidth=1.5, alpha=.5, palette='Dark2', marker='d',
                ax=ax).set_title(title)

  ax.axhline(y.mean(),label='total mean', ls=':', alpha=.5, color='xkcd:yellow')

  # Only relabel the ticks when labels were actually supplied
  if ticklabels is not None:
    ax.set_xticklabels(ticklabels)

  plt.legend()
  plt.show()


### def draw_histograms

In [0]:
plt.style.use('default')
def draw_histograms(df, variable, sample_dict, n_rows, n_cols):

  '''Plots one histogram subplot per group, each against the rest of the data.

  df: dataframe with an 'order_total' column and the grouping column
  variable: name of the grouping column (e.g. 'month')
  sample_dict: maps each group value to its display name; the group value
               is also used as the subplot position, so it must be an
               integer from 1 to n_rows*n_cols
  n_rows, n_cols: shape of the subplot grid
  '''

  with plt.style.context('seaborn-paper'):

    fig = plt.figure(figsize=(10,10))

    for k,v in sample_dict.items():

      # Order totals of the current group
      month = df[df[variable] == k]['order_total']
      month_mean = round(np.mean(month),2)

      # Order totals of every other group. The filter uses `variable`, the
      # same column as above, instead of a hard-coded 'month' column.
      year =  df[df[variable] != k]['order_total']
      year_mean = round(np.mean(year),2)

      ax = fig.add_subplot(n_rows,n_cols,k)
      ax.tick_params(labelsize=8)

      plt.hist(year, bins=90,alpha=.7, label='Rest of Year')
      plt.hist(month, alpha=.6,label= v.title())

      ax.set_title(v.title(),fontsize=14)

      plt.axvline(month_mean, color='xkcd:fuchsia',linestyle='--',
                  label='Sample Mean \n'+str(month_mean))

      plt.axvline(year_mean,color='xkcd:green',linestyle='-',
                  label='Pop. Mean \n'+str(year_mean))

      plt.legend(fontsize=6, frameon=False, ncol = 2 )

    fig.tight_layout()
    plt.show()

### draw_histograms_from_sample

In [0]:
def draw_histograms_from_sample(population,sample, sample_dict, n_rows, n_cols):
  '''Plots one histogram subplot per sample, each against the population.

  population: values plotted in every subplot as 'All Months'
  sample: not used by this function; kept so existing calls keep working
  sample_dict: maps a sample name (str) to that sample's values
  n_rows, n_cols: shape of the subplot grid; subplots are filled in the
                  iteration order of sample_dict
  '''

  fig = plt.figure(figsize=(8.5,7.5))

  # The population is the same for every subplot
  year = population
  year_mean = round(np.mean(population),2)

  for count, (k, v) in enumerate(sample_dict.items(), start=1):

    month = v
    month_mean = round(np.mean(month),2)

    ax = fig.add_subplot(n_rows,n_cols, count)
    ax.tick_params(labelsize=8)

    plt.hist(year, alpha=.8, label='All Months')
    # The label comes from the key (the sample's name); the values have no
    # .title() method
    plt.hist(month, alpha=.6, label = k.title())

    ax.set_title(k.title(),fontsize=14)

    plt.axvline(month_mean, color='xkcd:fuchsia',linestyle='--',
                label='Sample Mean \n'+str(month_mean))
    plt.axvline(year_mean,color='xkcd:green',linestyle='-',
                label='Pop. Mean \n'+str(year_mean))

    plt.legend(fontsize=6, frameon=False)

  fig.tight_layout()
  plt.show()

## BOOKMARK <img src="https://www.dropbox.com/s/6xqzendi1iyzls8/bookmark.png?raw=1" width=25> 


### Using gridspec to plot KDE and bar plots Hypothesis 1

In [0]:
# Plotting Histogram/kde for most of width of figure, then bar graph on right. 
## ADDING add_gridspec usage
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import numpy as np
from scipy.stats import sem


from matplotlib import rcParams
from matplotlib import rc
# Global matplotlib font settings: the default family is 'serif', and
# 'Tahoma' is the font used whenever the 'sans-serif' family is requested
rcParams['font.family'] = 'serif'
rcParams['font.sans-serif'] = ['Tahoma']
# rcParams[]

# import matplotlib.pyplot as plt


%matplotlib inline

# Plot distributions of discounted vs full price groups.
# One figure with two axes: a wide histogram + KDE of 'Quantity' for both
# groups on the left and a narrow bar chart of the group means (with SEM
# error bars) on the right. The figure is saved as
# 'H1_EDA_using_gridspec.png' and then shown.
# Reads df_fullprice and df_discounted, which are defined elsewhere.
plt.style.use('default')
# with plt.style.context(('tableau-colorblind10')):
with plt.style.context(('seaborn-notebook')):
    
    ## ----------- DEFINE AESTHETIC CUSTOMIZATIONS ----------- ##
   # Font dictionaries for the figure title, subplot titles, axis labels
   # and tick labels
    fontSuptitle ={'fontsize': 24,
               'fontweight': 'bold',
                'fontfamily':'serif'}
    
    fontTitle = {'fontsize': 12,
               'fontweight': 'medium',
                'fontfamily':'serif'}

    fontAxis = {'fontsize': 12,
               'fontweight': 'medium',
                'fontfamily':'serif'}

    fontTicks = {'fontsize': 10,
               'fontweight':'medium', 
                'fontfamily':'sans-serif'}
 



    # A 1 x 10 grid: ax0 takes columns 0-6 (gs[0, 0:-3]) and ax1 takes
    # cells 7-8 (gs[7:9]); the last column is left empty
    fig = plt.figure(constrained_layout=True, figsize=(10,4))
    gs = fig.add_gridspec(nrows=1,ncols=10)
    
    ax0 = fig.add_subplot(gs[0, 0:-3])
    ax1 = fig.add_subplot(gs[7:9])
    
    ax = [ax0,ax1]
    
    plt.suptitle('Quantity of Units Sold', fontdict=fontSuptitle)
    

     ## ----------- DEFINE SUBPLOT GROUPS DATA, LABELS, AND STYLE ----------- ##

    # Group 1: data, label, hist_kws and kde_kws
    
    plot_me1 = {'data': df_fullprice['Quantity'], 
                'label': 'full price'.title(),
                
               'hist_kws' :
                {'edgecolor': 'black','color':'darkgray','alpha': 0.8, 'lw':0.5},
                
               'kde_kws':
                {'color':'gray','linestyle': '--', 'linewidth':2,'label':'kde'}}

    
    # Draw group 1 on the left axes
    sns.distplot(plot_me1['data'], label=plot_me1['label'],
                 hist_kws = plot_me1['hist_kws'],
                 kde_kws = plot_me1['kde_kws'], ax=ax[0])   
    
    
    
    
    # Group 2: data, label, hist_kws and kde_kws
    
    plot_me2 = {'data': df_discounted['Quantity'],
                'label': 'discounted'.title(), 
                
                'hist_kws' :
                {'edgecolor': 'black','color':'green','alpha':0.8 ,'lw':0.5},
                

                'kde_kws':
                {'color':'darkgreen','linestyle':':','linewidth':3,'label':'kde'}}
    
    
    # Draw group 2 on the same left axes so the two distributions overlap
    sns.distplot(plot_me2['data'], label=plot_me2['label'],
                 hist_kws=plot_me2['hist_kws'],
                 kde_kws = plot_me2['kde_kws'],ax=ax[0])
    
    
    ax[0].set_title('Histogram + KDE',fontdict=fontTitle)
    #ax[0].set_xlabel(fontAxis)
    ax[0].set_ylabel('Kernel Density Estimation',fontdict=fontAxis)
                      
    ax[0].tick_params(axis='both',labelsize=fontTicks['fontsize'])   
    ax[0].legend()


    # SUBPLOT 2 
    # Import scipy for error bars
    from scipy import stats

    # One bar per group: the height is the mean quantity, the error bar is
    # the standard error of the mean
    x = [plot_me1 ['label'], plot_me2['label']]
    y = [np.mean(plot_me1['data']),np.mean(plot_me2['data'])]

    yerr = [stats.sem(plot_me1['data']),  stats.sem(plot_me2['data'])]
    err_kws = {'ecolor':'black','capsize':10,'capthick':1,'elinewidth':1,'barsabove':False}

    ax[1].bar(x,y,align='center', edgecolor='black', yerr=yerr,error_kw=err_kws)

    # Customize subplot 2
    ax[1].set_title('Average Quantities Sold',fontdict=fontTitle)
    ax[1].set_xlabel('Sales Price', fontdict=fontAxis)
    ax[1].set_ylabel('Mean +/- SEM ',fontdict=fontAxis)
    ax[1].tick_params(axis='both',labelsize=fontTicks['fontsize'])
    # Save before plt.show() is called
    fig.savefig('H1_EDA_using_gridspec.png')
#     plt.tight_layout()
#     print(f')

    plt.show()


### Summary distribution with annotated mean


In [0]:
## Hypothesis 2: summary distributions with annotated means
with plt.style.context('tableau-colorblind10'):
    fig = plt.figure()

    plt.title('Order Total  of Orders with Discounted Items vs Full Price')

    # Overlapping histograms of the two groups
    plt.hist(arrA, bins=30, alpha=0.5, label='Discounted')
    plt.hist(arrB, bins=30, alpha=0.5, color='black', label='Full Price')

    # Group means, rounded for display
    meanD = round(np.mean(arrA), 3)
    meanS = round(np.mean(arrB), 3)

    # A dashed vertical line and a rotated text label for each mean
    plt.axvline(meanD, label='Discounted Mean', linestyle='--')
    plt.text(meanD - 1000, 165, f'Mean:{meanD}', rotation=90)

    plt.axvline(meanS, label='Full Price Mean', linestyle='--', color='k')
    plt.text(meanS + 500, 165, f'Mean:{meanS}', rotation=90)

    plt.legend()
    plt.show()

# Normality test on pop_samp_full
# NOTE(review): this tests pop_samp_full, not the arrA / arrB plotted
# above -- confirm this is the intended sample.
stat, p = normaltest(pop_samp_full)
print(f'Normality: stat ={stat}, p = {p}')


### Resampled data distributions with annotated mean [as example for annotating?]

In [0]:
# Plot the quantity distributions of discounted vs full price rows

# # To plot a smaller sample from the population instead:

# pop_samp_disc = np.random.choice(pop_samp_disc,1000)
# pop_samp_full = np.random.choice(pop_samp_full, 1000)
with plt.style.context('dark_background'):

    fig = plt.figure()
    plt.title('Distribution of Quanitities for Discounted vs Full Price ')

    disc = df_discounted['Quantity']
    full = df_fullprice['Quantity']

    # Full price is drawn first, discounted on top of it
    plt.hist(full, bins=30, alpha=0.5, color='red', label='Full Price')
    plt.hist(disc, bins=30, alpha=0.8, label='Discounted')

    # Group means, rounded for display
    meanD = round(np.mean(disc), 3)
    meanF = round(np.mean(full), 3)

    # A vertical line and a rotated text label for each mean
    plt.axvline(meanD, label='Discounted Mean', linestyle='--', color='green')
    plt.text(meanD, 190, f'Mean:{meanD}', rotation=270, fontweight='medium')

    plt.axvline(meanF, label='Full Price Mean', linestyle='-', color='white')
    plt.text(meanF, 190, f'Mean:{meanF}', rotation=270, fontweight='medium')

    plt.xlabel('Quantity')
    plt.ylabel('Counts')
    plt.legend()
    plt.show()


### Making overlapping population histograms + mean annotations

In [0]:
# Resample our data: i samples of n = 50 observations each, keeping the
# mean of every sample
# Data source
data_discounted = df['Quantity'].loc[df['Discount']>0].copy()
data_fullprice = df['Quantity'].loc[df['Discount']==0].copy()


n = 50      # observations per sample
i = 10000   # number of samples
pop_samp_disc = []
pop_samp_full = []
# The loop counter gets its own name so it no longer overwrites the sample
# count i. The two groups are sampled alternately, as before.
for _draw in range(i):

    pop_samp_disc.append(data_discounted.sample(n).mean())
    pop_samp_full.append(data_fullprice.sample(n).mean())
    #   test_results.append(normaltest())
    # plt.hist([pop_samp_disc,pop_samp_full])
    
    
# Plot the distributions of the resampled means

# pop_samp_disc = np.random.choice(pop_samp_disc,1000)
# pop_samp_full = np.random.choice(pop_samp_full, 1000)
with plt.style.context('tableau-colorblind10'):
    fig = plt.figure()

    plt.title('Quantity of Discounted vs Full Price Products Purchased ')

    # Overlapping histograms of the two sets of sample means
    plt.hist(pop_samp_disc, bins=30, alpha=0.5, label='Discounted')
    plt.hist(pop_samp_full, bins=30, alpha=0.5, color='black', label='Full Price')

    # Mean of each set of sample means, rounded for display
    meanD = round(np.mean(pop_samp_disc), 3)
    meanS = round(np.mean(pop_samp_full), 3)

    # A dashed vertical line and a rotated text label for each mean
    plt.axvline(meanD, label='Discounted Mean', linestyle='--')
    plt.text(meanD - 1, 700, f'Mean:{meanD}', rotation=90)

    plt.axvline(meanS, label='Full Price Mean', linestyle='--', color='k')
    plt.text(meanS - 1, 700, f'Mean:{meanS}', rotation=90)

    plt.legend()
    plt.show()

# Normality test on the full-price sample means
stat, p = normaltest(pop_samp_full)
print(f'Normality: stat ={stat}, p = {p}')

# Two-sample t-test: discounted vs full-price sample means
tstat, tp = stats.ttest_ind(pop_samp_disc, pop_samp_full)
# print(f'T-test: stat ={tstat}, p = {tp}')#tstat,tp
# ax.text(