In [None]:
import numpy as np
import scipy as sp
from scipy import stats

### Correlation Metrics

In [None]:
# Pearson correlation between the columns `x` and `y` of `df`:
# `pearsonr` yields the correlation coefficient and the associated p-value.
# NOTE(review): `df`, `x` and `y` are not defined in this part of the notebook;
# presumably they are set in an earlier cell -- confirm before running.
corr_coeff, p_value = sp.stats.pearsonr(df[x], df[y])

In [None]:
def compute_corr_CI(r, sample_size, alpha):
    '''
    Compute the ((1-alpha)*100)% confidence interval of the correlation
    coefficient "r" using the Fisher z-transformation.

    The coefficient is mapped to z-space with arctanh, a normal interval of
    half-width z_(1-alpha/2) / sqrt(sample_size - 3) is built around it, and
    the two bounds are mapped back with tanh.

    Implemented following these two links as reference:
        -http://onlinestatbook.com/2/estimation/correlation_ci.html
        -https://zhiyzuo.github.io/Pearson-Correlation-CI-in-Python/
    According to the following link
    (https://stats.stackexchange.com/questions/18887/how-to-calculate-a-confidence-interval-for-spearmans-rank-correlation),
    this calculation can also be applied to obtain a CI for a Spearman
    correlation coefficient.

    Parameters
    ----------
    r : float
        Correlation coefficient.
    sample_size : int
        Number of observations used to compute "r".
    alpha : float
        Significance level (e.g. 0.05 for a 95% interval).

    Returns
    -------
    A pair (lower, upper). In the regular case this is a numpy array of two
    floats; when the interval cannot be computed it is the tuple (nan, nan).
    '''

    # arctanh diverges at |r| == 1 (and is undefined beyond it, which can
    # happen through floating-point rounding), and the standard error
    # 1/sqrt(n - 3) only exists for more than 3 observations.
    if abs(r) >= 1 or sample_size <= 3:
        # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
        return np.nan, np.nan

    r_z = np.arctanh(r)
    se = 1 / np.sqrt(sample_size - 3)
    z = stats.norm.ppf(1 - alpha / 2)
    return np.tanh((r_z - z * se, r_z + z * se))

In [None]:
def display_ps_corr(df):
    '''
    Print the Spearman and Pearson correlations of the given dataframe.

    The number of rows is printed first. Then, for each correlation type, the
    coefficient read from position [1, 0] of the correlation matrix is printed
    together with its 95% confidence interval (as computed by
    compute_corr_CI), or a note when that interval cannot be computed.
    NOTE(review): reading position [1, 0] suggests a two-column dataframe is
    expected -- confirm against callers.
    '''
    alpha = 0.05
    ci_level = (1 - alpha) * 100
    n_rows = len(df)
    print(f'Sample size : {n_rows}')

    for corr_type in ('spearman', 'pearson'):
        label = corr_type.capitalize()
        corr = float(df.corr(corr_type).iloc[1, 0])
        lo, hi = compute_corr_CI(corr, n_rows, alpha)

        print('%s correlation: %.4f' % (label, corr))

        # A NaN bound signals that no interval could be derived.
        if not (np.isnan(lo) or np.isnan(hi)):
            print('with the following %d%% confidence interval for the %s correlation "r": %.4f ≤ r ≤ %.4f ' % (ci_level, label, lo, hi))
            continue

        print('Note: It is not possible to calculate the  %d%% confidence interval here.' % ci_level)


### Bootstrapping & Hypothesis Testing

In [None]:
def bootstrap(data, sample_nb, funs=None):
    """Generate statistics for data using bootstrapping.

    Draws ``sample_nb`` resamples of ``data`` (same size, with replacement,
    via ``np.random.choice``) and evaluates every function in ``funs`` on each
    resample.

    Parameters
    ----------
    data : one-dimensional array-like
        Observations to resample; converted with ``np.asarray`` so plain
        sequences are accepted as well as arrays.
    sample_nb : int
        Number of bootstrap resamples to draw.
    funs : dict, optional
        Mapping ``name -> callable`` where each callable reduces a resample to
        a scalar. Defaults to the mean, standard deviation and median.

    Returns
    -------
    dict
        ``name -> (mean, std, values)`` where ``values`` is the array of the
        ``sample_nb`` bootstrap results for that function and ``mean`` / ``std``
        are computed over ``values``.
    """
    # Build the default mapping per call instead of sharing one mutable dict
    # object across all calls through the signature.
    if funs is None:
        funs = {'mean': np.mean, 'std': np.std, 'median': np.median}

    data = np.asarray(data)
    n_obs = data.shape[0]

    # One result slot per function and per resample.
    draws = {key: np.zeros(sample_nb) for key in funs}

    for i in range(sample_nb):
        sample = np.random.choice(data, n_obs, replace=True)
        for key, fun in funs.items():
            draws[key][i] = fun(sample)

    # Summarise the bootstrap distribution of every function.
    return {
        key: (np.mean(values), np.std(values), values)
        for key, values in draws.items()
    }

In [None]:
def CI_intervals_overlapping(tag1_data, tag2_data, alpha=0.05):
    """Build a quantile interval for each dataset and tell whether they overlap.

    For each dataset the interval runs from its ``alpha/2`` quantile to its
    ``1 - alpha/2`` quantile (a 95% interval with the default ``alpha``).

    Returns ``(tag1_CI, tag2_CI, overlapping)`` where the first two items are
    ``(lower, upper)`` tuples and ``overlapping`` is a bool.
    """
    tail = alpha / 2

    # Interval bounds of the first dataset
    lower_tag1 = np.quantile(tag1_data, tail)
    upper_tag1 = np.quantile(tag1_data, 1 - tail)

    # Interval bounds of the second dataset
    lower_tag2 = np.quantile(tag2_data, tail)
    upper_tag2 = np.quantile(tag2_data, 1 - tail)

    # The intervals are disjoint only when one of them lies strictly below
    # the lower bound of the other.
    tag2_entirely_below = lower_tag2 < lower_tag1 and upper_tag2 < lower_tag1
    tag1_entirely_below = lower_tag1 < lower_tag2 and upper_tag1 < lower_tag2
    overlapping = not (tag2_entirely_below or tag1_entirely_below)

    return (lower_tag1, upper_tag1), (lower_tag2, upper_tag2), overlapping