# Критерий хи-квадрат

In [1]:
import pandas as pd
import numpy as np
import scipy.stats

# Read Data/heart.csv into a DataFrame and preview its first rows.
csv_path = 'Data/heart.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,target
0,63,1,3,145,233,1,0,150,0,1
1,37,1,2,130,250,0,1,187,0,1
2,41,0,1,130,204,0,0,172,0,1
3,56,1,1,120,236,0,1,178,0,1
4,57,0,0,120,354,0,1,163,1,1


## Вычисление вручную

In [19]:
# Observed contingency table: one row per `sex` value, one column per
# `target` value, each cell holding the number of matching rows in df.
# pd.crosstab puts 0 in any sex/target combination that never occurs;
# a groupby/value_counts/unstack chain would leave NaN there and turn
# the counts into floats.
ct_o = pd.crosstab(df['sex'], df['target'])
ct_o

target,0,1
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
0,24,72
1,114,93


$
dof = (R - 1)(C - 1)
$

In [36]:
# Degrees of freedom of an R x C contingency table: (R - 1) * (C - 1).
nrows = len(ct_o.index)
ncols = len(ct_o.columns)
dof = (nrows - 1) * (ncols - 1)
dof

1

In [37]:
# Significance level of the test.
alpha = 0.01
# Critical value: the (1 - alpha) quantile of the chi-square distribution
# with `dof` degrees of freedom. Deriving it keeps the threshold in sync
# with alpha and dof; for alpha = 0.01 and dof = 1 it is about 6.635,
# i.e. the 6.63 found in printed tables.
critical_value = scipy.stats.chi2.ppf(1 - alpha, dof)

In [43]:
# Marginal totals of the observed table.
col_totals = ct_o.sum(axis=0)  # one total per target value
row_totals = ct_o.sum(axis=1)  # one total per sex value

n_total = col_totals.sum()
n_target_0 = col_totals.loc[0]
n_target_1 = col_totals.loc[1]

# Rows are taken by position: first row, then second row of ct_o.
n_women = row_totals.iloc[0]
n_men = row_totals.iloc[1]

print(n_total, n_target_0, n_target_1)
print(n_women, n_men)

303 138 165
96 207


In [41]:
# Share of each target value in the whole sample.
p_target_0, p_target_1 = (
    count / n_total for count in (n_target_0, n_target_1)
)
print(p_target_0, p_target_1)

0.45544554455445546 0.5445544554455446


In [44]:
# Expected counts if sex and target were independent:
# row total multiplied by the overall share of each target value.
f_e_target_0_women, f_e_target_1_women = n_women * p_target_0, n_women * p_target_1
f_e_target_0_men, f_e_target_1_men = n_men * p_target_0, n_men * p_target_1

for expected_row in (
    (f_e_target_0_women, f_e_target_1_women),
    (f_e_target_0_men, f_e_target_1_men),
):
    print(*expected_row)

43.722772277227726 52.27722772277228
94.27722772277228 112.72277227722773


In [46]:
# Sanity check: the four expected counts add up to the sample size.
expected_total = (
    f_e_target_0_women
    + f_e_target_1_women
    + f_e_target_0_men
    + f_e_target_1_men
)
print(expected_total)

303.0


In [47]:
# Expected-frequency table laid out like ct_o:
# one row per sex value, one column per target value.
expected_women = [f_e_target_0_women, f_e_target_1_women]
expected_men = [f_e_target_0_men, f_e_target_1_men]
ct_e = np.array([expected_women, expected_men])
ct_e

array([[ 43.72277228,  52.27722772],
       [ 94.27722772, 112.72277228]])

$
f_e = \frac{f_c \cdot f_r}{n}
$

In [49]:
# Apply the formula to one cell (column 0, first row):
# column total times row total divided by the sample size.
f_c = ct_o[0].sum()
f_r = ct_o.iloc[0].sum()
round(f_c * f_r / n_total)

44

$
\chi^2 = \sum{\frac{(f_o - f_e)^2}{f_e}}
$

In [55]:
# Pearson statistic: squared deviation of observed from expected counts,
# scaled by the expected count, summed over every cell of the table.
squared_deviation = (ct_o - ct_e) ** 2
cell_contributions = squared_deviation / ct_e
chi_square = cell_contributions.to_numpy().sum()
print(chi_square)

23.914383914761984


## Вычисление с помощью `scipy.stats`

`chi2_contingency` — критерий хи-квадрат для таблицы сопряжённости (проверка независимости признаков)

```python
def chi2_contingency(observed, correction=True, lambda_=None):
    """
    Chi-square test of independence of variables in a contingency table.
    
    This function computes the chi-square statistic and p-value for the
    hypothesis test of independence of the observed frequencies in the
    contingency table [1]_ `observed`.  The expected frequencies are computed
    based on the marginal sums under the assumption of independence; see
    `scipy.stats.contingency.expected_freq`.  The number of degrees of
    freedom is (expressed using numpy functions and attributes)::
    
        dof = observed.size - sum(observed.shape) + observed.ndim - 1
    
    
    Parameters
    ----------
    observed : array_like
        The contingency table. The table contains the observed frequencies
        (i.e. number of occurrences) in each category.  In the two-dimensional
        case, the table is often described as an "R x C table".
    correction : bool, optional
        If True, *and* the degrees of freedom is 1, apply Yates' correction
        for continuity.  The effect of the correction is to adjust each
        observed value by 0.5 towards the corresponding expected value.
    lambda_ : float or str, optional
        By default, the statistic computed in this test is Pearson's
        chi-squared statistic [2]_.  `lambda_` allows a statistic from the
        Cressie-Read power divergence family [3]_ to be used instead.  See
        `scipy.stats.power_divergence` for details.
    
    Returns
    -------
    chi2 : float
        The test statistic.
    p : float
        The p-value of the test
    dof : int
        Degrees of freedom
    expected : ndarray, same shape as `observed`
        The expected frequencies, based on the marginal sums of the table.
    """
    # Stub: only the signature and docstring are shown here for reference;
    # this body does nothing. The notebook calls scipy.stats.chi2_contingency.
    pass
```

In [59]:
# Run the same test with SciPy on the matrix of observed frequencies.
# Yates' continuity correction (applied by default when dof is 1) is
# switched off; the manual calculation above applies no correction either.
scipy.stats.chi2_contingency(ct_o, correction=False)

(23.914383914761988,
 1.0071642033238865e-06,
 1,
 array([[ 43.72277228,  52.27722772],
        [ 94.27722772, 112.72277228]]))

In [212]:
# Decision rule: a statistic above the critical value means the hypothesis
# of independence is rejected at the chosen significance level.
reject_h0 = chi_square > critical_value
reject_h0

True