In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from scipy.stats import chi2
from scipy.stats import chisquare


%matplotlib inline

Let $Z_1, Z_2,\ldots Z_k$ be independent $\mathcal{N} (0, 1)$
distributed random variables. The sum of their squares

$$
Q = \sum_{i=1}^{k}Z_i^2
$$

is chi-squared distributed $Q \sim \chi^2(k)$. Where $k$ is the
number of degrees of freedom (the only parameter of the distribution).

In [16]:
def chi2_(k, size):
    
    return np.array([(np.random.randn(k) ** 2).sum() for _ in range(size)])

## Chi-2 Test (Pearson)
### Fairnes of Dice
https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test#Examples

A dice is thrown 60 times. The null hypothesis is that the dice is fair
and each result (side of the dice) $i$ is expected to occur 10 times
(10 = 60 / 6). Observed values are the actual results. Is the dice fair?

In [14]:
data = pd.DataFrame({
    'i': [1, 2, 3, 4, 5, 6],
    'observed': [5, 8, 9, 8, 10, 20],
    'expected': [10] * 6})

data

Unnamed: 0,i,observed,expected
0,1,5,10
1,2,8,10
2,3,9,10
3,4,8,10
4,5,10,10
5,6,20,10


The test statistics $T$ is computed as

$$
T = \sum_{i=1}^{6}\frac{(O_i-E_i)^2}{E_i}
$$

where $E_i$ is the expected result and $O_i$ is the observed value.

In [15]:
T = np.sum((data['observed'] - data['expected']) ** 2 / data['expected'])
T

13.4

In [22]:
# PPF is the inverse CDF (we want to know the value at 95 % percentile)
# k = 6 - 1 = 5 (number of degrees of freedom)

chi2.ppf(0.95, 5)

11.070497693516351

In [70]:
# We reject our null hypothesis at 95 % significance level
T < 11.07

False

In [21]:
chi2.cdf(11.070497693516351, 5)

0.95

In [29]:
# alternatively, we can check the percentile for our statistics (p-value)
# it is lower than 0.05, we reject the null hypothesis at 95 % significance level
1 - chi2.cdf(13.4, 5)

0.019905220334774376

In [26]:
chisquare(data['observed'], data['expected'])

Power_divergenceResult(statistic=13.4, pvalue=0.01990522033477438)

### Workers in Neighborhood
https://en.wikipedia.org/wiki/Chi-squared_test#Example_chi-squared_test_for_categorical_data

City of 1M people, sample of 650 workers. The null hypothesis
is that each persons' place of residence (neighborhood A, B, C, D) is
independent of their working class.

In [56]:
data = pd.DataFrame(
    {
        'A': [90, 30, 30],
        'B': [60, 50, 40],
        'C': [104, 51, 45],
        'D': [95, 20, 35]
    },
    index=['white_collar', 'blue_collar', 'no_colar']
)

data = pd.DataFrame(data.stack()).reset_index()
data.columns = ['class', 'neighborhood', 'observed']

n = data['observed'].sum()

display(data)

Unnamed: 0,class,neighborhood,observed
0,white_collar,A,90
1,white_collar,B,60
2,white_collar,C,104
3,white_collar,D,95
4,blue_collar,A,30
5,blue_collar,B,50
6,blue_collar,C,51
7,blue_collar,D,20
8,no_colar,A,30
9,no_colar,B,40


This is our initial data, we have the observed number of people from each
class living in each neighborhood. The first step is to calculate the totals
for each class, neighboord.

In [58]:
class_totals = (
    data.groupby(['class'])
        .agg({'observed': sum})
        .reset_index()
        .rename(columns={'observed': 'class_total'})
)

neighborhood_totals = (
    data.groupby(['neighborhood'])
        .agg({'observed': sum})
        .reset_index()
        .rename(columns={'observed': 'neighborhood_total'})
)

display(class_totals)
display(neighborhood_totals)

Unnamed: 0,class,class_total
0,blue_collar,151
1,no_colar,150
2,white_collar,349


Unnamed: 0,neighborhood,neighborhood_total
0,A,150
1,B,150
2,C,200
3,D,150


We have $n = 650$ observations with the following class share:

$$ 
blue = \frac{151}{650} \approx 0.232 \\
no = \frac{150}{650} \approx 0.231 \\
white = \frac{349}{650} \approx 0.537 \\
$$

Under the null hypothesis, share of each class in each neighborhood is
the same as the global class share.
For example, the expected number of $white$ collar workers in neighborhood $C$
is $200 \times \frac{349}{650} \approx 0.537 \times 200 \approx 107$.

We get all the expected values by joining the `class_totals` and
`neighborhood totals` with the original data and computing
`(class_total / 650) * neighborhood_total` for each row.

In [63]:
final = (
    data.merge(class_totals, on='class')
        .merge(neighborhood_totals, on='neighborhood')    
)

final['expected'] = final.apply(
    lambda r: (r['class_total'] / n) * r['neighborhood_total'], axis=1)

In [64]:
final

Unnamed: 0,class,neighborhood,observed,class_total,neighborhood_total,expected
0,white_collar,A,90,349,150,80.538462
1,blue_collar,A,30,151,150,34.846154
2,no_colar,A,30,150,150,34.615385
3,white_collar,B,60,349,150,80.538462
4,blue_collar,B,50,151,150,34.846154
5,no_colar,B,40,150,150,34.615385
6,white_collar,C,104,349,200,107.384615
7,blue_collar,C,51,151,200,46.461538
8,no_colar,C,45,150,200,46.153846
9,white_collar,D,95,349,150,80.538462


We compute the same test statistics $T$ as in the previous example.

In [65]:
T = np.sum((final['observed'] - final['expected']) ** 2 / final['expected'])
T

24.571202858582595

The number of degress of freedom $k$ of our $\chi^2(k)$ distribution is

$$
k = uv - 1 = 12 - 1 = 11
$$

where $u$ is the number of working classes and $v$ is the number of neighborhoods.

In [72]:
# PPF is the inverse CDF (we want to know the value at 95 % percentile)
# k = 6

chi2.ppf(0.95, 11)

19.67513757268249

In [73]:
# we reject our null hypothesis at 95 % significance level
T < 19.68

False

In [74]:
# alternatively, we can check the percentile for our statistics (p-value)
# it is lower than 0.05, we reject the null hypothesis at 95 % significance level
1 - chi2.cdf(24.57, 11)

0.010532857023036746

In [75]:
chisquare(final['observed'], final['expected'])

Power_divergenceResult(statistic=24.571202858582595, pvalue=0.010528619368354175)