# Data Compare

Compare data to derive insights.



## The basics

In [None]:
#chi squared
from scipy.stats import chi2_contingency, chi2

def chi_test(arr):

    """
    chi_test function runs a chi-squared test of independence on a contingency
    table of counts and prints the verdict at a fixed 5% significance level.

    The test is run without the continuity correction (correction = False) and
    the statistic is compared against the chi-squared critical value for the
    degrees of freedom reported by scipy.

    input:
    arr - numpy array of count data (the contingency table)

    output:
    nothing is returned; the statistics, the verdict and the table are printed
    """

    alpha = 0.05

    #array is struct of (pna, non-pna)
    stat, p, dof, expected = chi2_contingency(arr, correction = False)

    prob = 1 - alpha

    # interpret test statistic against the critical value at the 1 - alpha quantile
    critical = chi2.ppf(prob, dof)
    print("probability ={}, critical ={}, stat={}, p value = {}".format(prob, critical, stat, p))

    if abs(stat) >= critical:
        print('accept alternative hypothesis; counts are dependent on categories')
    else:
        print('maintain null hypothesis; counts are not dependent on categories')

    # show the tested table for either outcome, not only when the null is kept
    print('contingency table:\n', arr)

#z test on proportions where yes proportion is smaller than no
import numpy as np
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.proportion import proportion_effectsize

def z_test(yes_count, yes_target, yes_label, no_count, no_target, no_label, alpha = 0.05):

    """
    z test function runs a one-sided z test to see if the "no" class has a
    significantly higher proportion than the "yes" class, and prints the effect size.

    yes proportion must be smaller than (or equal to) the no proportion, otherwise
    the alternative passed to the statsmodels function would need to be altered;
    in that case a message is printed and the function returns early.

    inputs:
    yes_count = sum count of observations for the "yes" class
    yes_target = sum count of successful observations for the "yes" class
    yes_label = display name of the "yes" class, used only in printed messages
    no_count = sum count of observations for the "no" class
    no_target = sum count of successful observations for the "no" class
    no_label = display name of the "no" class, used only in printed messages
    alpha = significance level the p value is compared against

    output:
    None in every case; all results are printed
    """

    # guard: the 'smaller' alternative below only makes sense when yes <= no
    if yes_target/yes_count > no_target/no_count:
        print('proportions are not aligned properly')
        return None

    nobs = np.array([yes_count, no_count])
    count = np.array([yes_target, no_target])
    print("observations" , nobs)
    print("counts", count)

    print("proportion ", yes_label, ": ", round(yes_target/yes_count,2))
    print("proportion ", no_label, ": ", round(no_target/no_count,2))

    stat, pval = proportions_ztest(count, nobs, alternative = 'smaller')
    #smaller is when elements 0 have smaller proportion than elements 1

    # p value is shown rounded first, then at full precision
    print('{0:0.3f}'.format(pval))

    print(pval)

    if pval <= alpha:
        print("accept alternative hypothesis that ", no_label, "  have a higher proportion")
    else:
        print("maintain null hypothesis there is no significance that ", no_label, " have a higher proportion")

    #put smaller proportion thru last to get a positive effect size
    effect = proportion_effectsize(no_target/no_count, yes_target/yes_count)
    print("proportion effect size: ", effect)


#Shapiro-Wilk normality test
from scipy.stats import shapiro

def normality_test(data, alpha = 0.05):
    """
    normality_test function runs a Shapiro-Wilk test on data and reports whether
    the sample looks normally distributed.

    inputs:
    data = sample to test
    alpha = significance level; a p value at or above alpha is treated as normal

    output:
    1 when the data is judged normally distributed, 0 when it is not
    """
    stat, p = shapiro(data)
    print('Statistics=%.3f, p=%.3f' % (stat, p))

    if p >= alpha:
        print("data is normally distributed")
        return 1

    # the flag is only cleared on the non-normal branch
    print("data is not normally distributed")
    return 0

#mann whitney u test for non-normal data
import numpy as np
from scipy.stats import mannwhitneyu, tiecorrect, rankdata, norm

def mann_whitney(data1, data2, alpha=.05):
    """
    mann_whitney function runs a Mann-Whitney U test on two samples and prints
    the verdict together with an effect size.

    The scipy test is called with alternative="less" (one-sided). The effect size
    is computed by hand from the rank sums as 1 - 2*min(U1, U2)/(n1*n2).

    inputs:
    data1 = first sample
    data2 = second sample
    alpha = significance level the p value is compared against

    output:
    nothing is returned; all results are printed
    """
    # significance test
    u_stat, p_value = mannwhitneyu(data1, data2, alternative="less")
    print('Statistics=%.3f, p=%.3f' % (u_stat, p_value))

    if p_value > alpha:
        verdict_lines = (
            'Same distribution (fail to reject H0)',
            'In terms of this analysis this means that the difference in means is NOT due to the different categories!',
        )
    else:
        verdict_lines = (
            'Different distribution (reject H0)',
            'In terms of this analysis this means that the difference in means is due to the different categories!',
        )
    for line in verdict_lines:
        print(line)

    # effect size, adapted from
    # https://github.com/Hatchin/Mann-Whitney-U-Test/blob/master/mannwhitney.py
    size1 = len(data1)
    size2 = len(data2)
    pooled_ranks = rankdata(np.concatenate((data1, data2)))
    first_rank_sum = np.sum(pooled_ranks[0:size1], axis=0)  # ranks belonging to data1
    u_first = size1*size2 + (size1*(size1+1))/2.0 - first_rank_sum
    u_second = size1*size2 - u_first

    # the smaller of the two U values drives the effect size
    smaller_u = u_first if u_first <= u_second else u_second
    effect = 1 - (2*smaller_u)/(size1*size2)

    if effect < .3:
        template = 'This has a small effect; the effect size for this test is : %.3f'
    elif effect > .5:
        template = 'This has a large effect; the effect size for this test is : %.3f'
    else:
        template = 'This has a medium effect; the effect size for this test is : %.3f'
    print(template % effect)
    
#t-test for normal data
from scipy.stats import ttest_ind
from numpy import mean, var, sqrt

def t_test(data1, data2, alpha = 0.05):
    """
    t test function to measure if mean of data1 is less than mean of data2
    (one-sided independent two-sample test), followed by a Cohen's d effect size.

    Sample variances (ddof = 1) are used throughout so that the pooled standard
    deviation, which weights each variance by (n - 1), is computed consistently.

    inputs:
    data1 = first sample, expected to have the smaller mean
    data2 = second sample
    alpha = significance level the p value is compared against

    output:
    nothing is returned; all results are printed
    """
    # ddof = 1 gives the unbiased sample variance; numpy's default is the population variance
    d1_var = var(data1, ddof = 1)
    d2_var = var(data2, ddof = 1)

    equal_var = False

    # variances are treated as equal only when they match to 4 decimal places
    if round(d1_var,4) == round(d2_var,4):
        equal_var = True

    print("Are variances equal?: ", equal_var)

    stat, p = ttest_ind(data1, data2, axis = 0, equal_var = equal_var, alternative = 'less')

    print('Statistics=%.3f, p=%.3f' % (stat, p))

    if p > alpha:
        print("maintain null hypothesis that means are not significantly different")
    else:
        print("accept alternative hypothesis that means are significantly different")

    #effect size (cohen's d calc)
    #machine learning mastery formula
    n1, n2 = len(data1), len(data2)

    u1, u2 = mean(data1), mean(data2)

    #pooled standard deviation
    s = sqrt((((n1 -1) * d1_var) + ((n2 - 1) * d2_var))/(n1 + n2 -2))

    #flipped u so as to show a positive effect measure
    d = (u2 - u1)/s

    if d < .2:
        print('Cohens d shows a small effect (if your data is nor normal/gaussian this is junk!): %.3f' % d)
    elif d > .8:
        print('Cohens d shows a large effect (if your data is nor normal/gaussian this is junk!): %.3f' % d)
    else:
        print('Cohens d shows a medium effect (if your data is nor normal/gaussian this is junk!): %.3f' % d)

    

## Less Basic

Feature correlation to output: run the binary output against each feature (point-biserial correlation) to get the general direction of the correlation.

https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pointbiserialr.html

In [None]:
import numpy as np
from scipy.stats import pointbiserialr, ks_2samp
from scipy.spatial.distance import jensenshannon

import matplotlib.pyplot as plt

def feature_correlation(df, feat_list, target):
    """
    feature_correlation function computes the point-biserial correlation between
    each listed feature and the target column, prints each value, and draws a
    horizontal bar chart of the results.

    Missing feature values are filled with 0 before the correlation is computed.
    Bars are green for positive, red for negative and blue for exactly zero.

    inputs:
    df = data frame holding the feature and target columns
    feat_list = names of the feature columns to correlate with the target
    target = name of the target column (presumably binary; verify against caller)

    output:
    nothing is returned; values are printed and the chart is shown
    """

    feature_names = []
    correlations = []
    bar_colors = []

    for feat in feat_list:
        feature_names.append(feat)

        correlation = pointbiserialr(df[feat].fillna(0), df[target])[0]
        print(correlation)
        correlations.append(correlation)

        if correlation == 0:
            color = 'b'
        elif correlation > 0:
            color = 'g'
        else:
            color = 'r'

        bar_colors.append(color)

    plt.rcParams.update({'figure.figsize':[7,5]}) #(w,h)
    # plot the lists built above, one bar per feature in feat_list order
    positions = range(len(correlations))
    plt.barh(positions, correlations, color=bar_colors, align='center')
    plt.yticks(positions, feature_names)
    plt.xlabel('Point-biserial correlation')
    plt.show()

def js_divergence(p, q, base=None):
    """
    js_divergence function is a thin wrapper around scipy's jensenshannon.

    references:
    https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.jensenshannon.html

    No interpretation of the value is attempted here; there is no
    hypothesis-test version of this measure in this module.

    input:
    p: distribution to test
    q: base distribution (validation data)
    base: base of the logarithm used to compute the distance

    output:
    the value returned by jensenshannon(p, q, base=base)
    """
    return jensenshannon(p, q, base=base)

def ks_testing(p, q, ks_alternative="two-sided", mode="asymp", alpha=0.05):
    """
    test function for two sample ks testing; prints the statistic, the p value
    and the verdict at the given significance level.

    references:
    https://www.sciencedirect.com/topics/earth-and-planetary-sciences/kolmogorov-smirnov-test
    The null hypothesis (Ho) is that the two dataset values are from the same continuous distribution.
    The alternative hypothesis (Ha) is that these two datasets are from different continuous distributions.

    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html

    input:
    p: distrib to test
    q: base distribution (validation data)
    ks_alternative{'two-sided', 'less', 'greater'}, optional
        we use two sided where the null hyp is the distribs are the same.
    mode: how scipy computes the p value; forwarded to ks_2samp
    alpha: significance level the p value is compared against

    output:
    nothing is returned; all results are printed
    """

    # alternative and mode are passed positionally because newer SciPy releases
    # name the fourth parameter `method` instead of `mode`
    ks_stat, ks_pval = ks_2samp(p, q, ks_alternative, mode)

    print('Statistics=%.3f, p=%.3f' % (ks_stat, ks_pval))

    # the decision is made on the p value, not on the sample p
    if ks_pval > alpha:
        print("maintain null hypothesis that distribs are not significantly different")
    else:
        print("accept alternative hypothesis that distribs are significantly different")