In [1]:
import pandas as pd
from scipy import stats
import scikit_posthocs as sp
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.stats.diagnostic import kstest_normal
import statsmodels.api as sm
import pyreadstat
import pingouin as pg


In [2]:
def test_normality(data, significance_level = 0.05, test_type='s-w'):
    '''
    Normality testing is crucial because a lot of statistical tests assume normality of data.
    2 tests are most commonly used:
    - Kolmogorov-Smirnov test
    - Shapiro-Wilk test 
    
    Differences:
    -K-S test can be used in various types of data, 
    S-W test is suitable only for continous data
    -S-W is more accurate for smaller samples (under 50)
    
    '''
    if test_type == 'k-s':
        stat,p = kstest_normal(data,'norm')
    elif test_type == 's-w':
        stat,p = stats.shapiro(data)
    else:
        raise ValueError("Podano błędną nazwę testu normalności rozkładu")
    print(f"Wartość statystyki testu: {stat} \n" 
          f"Wartość p: {p} \n" 
    return stat,p

In [None]:
def test_var_homogeneity(group1,group2):
    
    
    statistic, p_value = stats.levene(group1, group2)
    print(f"Wartość statystyki Levene'a: {statistic} \n"
          f"Wartość p: {p_value}")
    return statistic, p_value

In [4]:
def correlation(x, y, corr_type = 'spearman'):
    '''
    Compute a correlation coefficient between two samples and report it.

    Parameters:
        x, y: the two samples; passed unchanged to the SciPy routine.
        corr_type: 'spearman' (default, rank based - the usual choice when
            the data do not meet parametric assumptions) or 'pearson'.

    Returns:
        (corr_coef, p_value): the coefficient and its p-value.

    Raises:
        ValueError: if corr_type is neither 'pearson' nor 'spearman'.
    '''
    if corr_type == 'pearson':
        corr_coef,p_value = stats.pearsonr(x,y)
    elif corr_type == 'spearman':
        corr_coef,p_value = stats.spearmanr(x,y)
    else:
        raise ValueError('Upewnij się, że poprawnie wprowadzono typ korelacji')
    # The " \n" keeps the coefficient and the p-value on separate lines,
    # in the same format as the other reporting helpers.
    print(f"Wartość współczynnika: {corr_coef} \n"
          f"Wartość p: {p_value}")
    return corr_coef,p_value

In [None]:
def mann_whitney(x1,x2):
    '''
    Mann-Whitney U test for two independent samples.

    This is the rank-based alternative to the independent-samples t test.
    scipy.stats.mannwhitneyu is called with its default options; the result
    is printed and returned as a (statistic, p_value) tuple.
    '''
    u_outcome = stats.mannwhitneyu(x1, x2)
    statistic, p_value = u_outcome
    report = (f"Wartość statystyki U: {statistic} \n"
              f"Wartość p: {p_value}")
    print(report)
    return (statistic, p_value)
    

In [None]:
def kruskal_wallis(*args):
    '''
    Kruskal-Wallis H test for two or more independent groups.

    This is the rank-based, non-parametric counterpart of one-way ANOVA.

    Assumptions:
        - similar shapes of the group distributions
        - independence of groups
        - ordinal or continuous variables

    Parameters:
        *args: one array-like of observations per group.

    Returns:
        (statistic, p_value): the H statistic and its p-value.
    '''
    # scipy.stats.kruskal takes every group as its own positional argument,
    # so the groups are unpacked rather than wrapped in a single list.
    statistic, p_value = stats.kruskal(*args)
    print(f"Wartość statystyki H: {statistic} \n"
          f"Wartość p: {p_value}")
    return statistic, p_value

In [None]:
def oneway_annova(*args):
    '''
    One-way ANOVA for differences between the means of two or more groups.

    Assumptions:
        - normality of distributions
        - homogeneity of variances
        - independence of groups

    Parameters:
        *args: one array-like of observations per group.

    Returns:
        (statistic, p_value): the F statistic and its p-value.
    '''
    # scipy.stats.f_oneway takes every group as its own positional argument,
    # so the groups are unpacked rather than wrapped in a single list.
    statistic, p_value = stats.f_oneway(*args)
    print(f"Wartość statystyki F: {statistic} \n"
          f"Wartość p: {p_value}")
    return statistic,p_value

In [None]:
def t_test(x1,x2):
    '''
    Independent-samples t test for a difference between two groups.

    Assumptions:
        - normality of distributions
        - homogeneity of variances
        - independence of groups

    scipy.stats.ttest_ind is called with its default options; the result is
    printed and returned as a (statistic, p_value) tuple.
    '''
    t_outcome = stats.ttest_ind(x1, x2)
    statistic, p_value = t_outcome
    report = (f"Wartość statystyki t: {statistic} \n"
              f"Wartość p: {p_value}")
    print(report)
    return (statistic, p_value)
    

In [1]:
def posthoc_test(is_parametric = False,*args):
    '''
    Run a pairwise post-hoc comparison between groups and print the result.

    Parameters:
        is_parametric: when True, Tukey's HSD is run (pairwise_tukeyhsd with
            alpha=0.05); otherwise Dunn's test with Bonferroni adjustment
            (scikit_posthocs.posthoc_dunn).
        *args: one array-like of observations per group. In the Tukey
            output a group is labelled with its ``name`` attribute when it
            has one (e.g. a pandas Series), otherwise with its 1-based
            position; positions are also used when the names are not unique.

    Returns:
        The result object of the test that was run (it is also printed).

    Raises:
        ValueError: if is_parametric is True and exactly one group is given.
    '''
    if is_parametric:
        if not args:
            # Legacy call style `posthoc_test(True)`: kept so existing cells
            # keep working. It reads columns 0 (values) and 3 (group labels)
            # of the notebook-level `data` frame.
            result = pairwise_tukeyhsd(data.iloc[:,0], data.iloc[:,3], alpha=0.05)
        else:
            if len(args) < 2:
                raise ValueError("Test Tukeya wymaga co najmniej dwóch grup")
            positions = [str(position) for position in range(1, len(args) + 1)]
            labels = []
            for position, group in zip(positions, args):
                name = getattr(group, 'name', None)
                labels.append(position if name is None else str(name))
            if len(set(labels)) != len(labels):
                # Duplicate names would merge distinct groups into one.
                labels = positions
            # pairwise_tukeyhsd expects one flat vector of observations and a
            # parallel vector of group labels, so the groups are stacked.
            sizes = [len(group) for group in args]
            observations = np.concatenate([np.asarray(group) for group in args])
            result = pairwise_tukeyhsd(observations, np.repeat(labels, sizes), alpha=0.05)
    else:
        result = sp.posthoc_dunn(list(args), p_adjust = 'bonferroni')
    print(result)
    return result

In [231]:
def plot_distribution(data,bins,x_title):
    '''
    Show a histogram of one variable.

    Parameters:
        data: the values to bin.
        bins: forwarded to plt.hist as ``bins``.
        x_title: variable name, used in the plot title and as the x label.
    '''
    plt.hist(data, alpha=0.9, bins=bins)
    # Labels first, title last - the calls are independent of each other.
    plt.ylabel('Częstość')
    plt.xlabel(f'{x_title}')
    plt.title(f'Rozkład Zmiennej {x_title}')
    plt.show()

In [None]:
def plot_scatter(x,y):
    '''
    Scatter plot of two named series with a fitted straight line on top.

    A first-degree polynomial is fitted with np.polyfit and drawn in red over
    the blue data points. Both inputs must expose a ``name`` attribute, which
    is used for the title and the axis labels.
    '''
    coefficients = np.polyfit(x, y, deg=1)
    slope, intercept = coefficients
    # Evaluate the fitted line at every observed x value.
    regression_line = []
    for value in x:
        regression_line.append(slope * value + intercept)
    plt.scatter(x, y, color='blue', label='Data')
    plt.plot(x, regression_line, color='red')
    plt.ylabel(f'{y.name}')
    plt.xlabel(f'{x.name}')
    plt.title(f'Wykres rozrzutu zmiennej {x.name} i zmiennej {y.name} z linią regresji')
    plt.show()

In [None]:
def freq_table(data,headers):
    '''
    Print a frequency table (counts and percentages) for the given data.

    Parameters:
        data: object with a ``value_counts()`` method (e.g. a pandas Series).
        headers: column headers, forwarded to tabulate.

    The table is rendered with tabulate's "fancy_grid" format and printed;
    nothing is returned.
    '''
    # Imported here because the function needs tabulate at call time.
    from tabulate import tabulate

    # Naming the counts explicitly keeps the 'count' column available on
    # pandas versions where value_counts() does not name its result that way.
    table = data.value_counts().rename('count').reset_index()
    share = table['count'] / table['count'].sum()
    table['Percentage'] = (share * 100).round(2)
    print(tabulate(table, headers=headers, tablefmt="fancy_grid"))

In [6]:
def boxplot(*args, headers = None, title = None):
    '''
    Draw side-by-side box plots, one box per group.

    Parameters:
        *args: one array-like of observations per group.
        headers: optional tick labels, one per group. When omitted, each
            group's ``name`` attribute is used if it has one, otherwise its
            1-based position.
        title: optional plot title.

    Raises:
        ValueError: if headers is given but its length differs from the
            number of groups.
    '''
    if headers is None:
        tick_labels = []
        for position, group in enumerate(args, start=1):
            name = getattr(group, 'name', None)
            tick_labels.append(position if name is None else name)
    else:
        tick_labels = list(headers)
        if len(tick_labels) != len(args):
            raise ValueError("Liczba etykiet musi być równa liczbie grup")
    plt.boxplot(list(args))
    # Boxes are drawn at positions 1..n, so the ticks use the same range.
    plt.xticks(range(1,len(args)+1), tick_labels)
    plt.title(title)
    plt.show()

In [7]:
def simple_lreg(y_col,*args, df=None):
    '''
    Fit an ordinary least squares regression on columns picked by position.

    Parameters:
        y_col: positional index of the dependent-variable column.
        *args: positional indices of the predictor columns.
        df: DataFrame to take the columns from. When omitted, the
            notebook-level variable ``dane`` is used, so existing calls
            keep working.

    Returns:
        The fitted results object, i.e. ``sm.OLS(y, X).fit()``.

    Raises:
        NameError: if df is not given and no notebook-level ``dane`` exists.

    NOTE(review): X is passed to OLS as selected, so the model has no
    intercept unless one of the chosen columns is a constant - confirm that
    this is intended.
    '''
    if df is None:
        try:
            df = dane
        except NameError:
            raise NameError(
                "Nie przekazano argumentu df, a zmienna globalna 'dane' nie istnieje"
            ) from None
    X = df.iloc[:, list(args)]
    y = df.iloc[:, y_col]
    model = sm.OLS(y, X).fit()
    return model

In [8]:
def draw_pie(data,labels,colors):
    '''
    Show a pie chart of how often each value occurs in ``data``.

    Parameters:
        data: object providing ``value_counts()`` and a ``name`` attribute
            (the name appears in the title).
        labels: legend entries, passed to plt.legend as given.
        colors: wedge colours, passed to plt.pie as given.
    '''
    counts = data.value_counts()
    pie_options = dict(autopct='%1.1f%%', startangle=90, colors=colors)
    plt.pie(counts, **pie_options)
    plt.title(f'Rozkład zmiennej {data.name}')
    # The legend is anchored outside the axes, to the right of the pie.
    plt.legend(labels, bbox_to_anchor=(1.3, 0.9))
    plt.show()

In [9]:
def describe_data(data):
    '''
    Build a descriptive-statistics table with one row per variable.

    Parameters:
        data: DataFrame to summarise with ``describe()``.

    Returns:
        A new DataFrame: the transposed ``describe()`` output with the
        statistic columns renamed to their Polish labels and numeric values
        rounded to two decimal places.
    '''
    new_column_names = {
        'count': 'Liczba obserwacji',
        'mean': 'Średnia',
        'std': 'Odchylenie standardowe',
        'min': 'Minimum',
        '25%': 'Kwartyl dolny',
        '50%': 'Mediana',
        '75%': 'Kwartyl górny',
        'max': 'Maksimum'
    }
    # Transposing puts variables in rows and statistics in columns. The
    # chain returns a fresh frame, and the local name avoids hiding the
    # module-level `stats` import inside this function.
    summary = data.describe().transpose()
    return summary.rename(columns=new_column_names).round(2)

In [None]:
# Path of the .sav file to load. It is an empty string here, so this looks
# like a placeholder - presumably it has to be filled in before running the cell.
file_path = ''
# read_sav is unpacked into two notebook-level names: the loaded data set
# and the metadata returned alongside it.
data,metadata = pyreadstat.read_sav(file_path)