The contingency table below has two rows and five columns (not counting header rows/columns) and shows the results of a random sample of 2200 adults classified by two variables, namely gender and favorite way to eat ice cream (Larson and Farber 2014).

In [None]:
import pandas as pd

# Observed counts: one row per gender, one column per favorite way to eat
# ice cream. The 'gender' label column comes first, then the count columns.
flavor_counts = {
    'cup': [592, 410],
    'cone': [300, 335],
    'sundae': [204, 180],
    'sandwich': [24, 20],
    'other': [80, 55],
}
ice_cream = pd.DataFrame({'gender': ['male', 'female'], **flavor_counts})

ice_cream

Unnamed: 0,gender,cup,cone,sundae,sandwich,other
0,male,592,300,204,24,80
1,female,410,335,180,20,55


In [None]:
# Add a 'Row Total' column and a 'Column Total' row to the observed counts.
# Totals are always computed from the frame with any earlier totals stripped,
# so re-running this cell gives the same result instead of folding the old
# totals back into the new sums.
observed = ice_cream.drop(index='Column Total', columns='Row Total', errors='ignore')

ice_cream['Row Total'] = observed.sum(numeric_only=True, axis=1)

# Column sums include 'Row Total', so the bottom-right cell is the grand total.
# The non-numeric 'gender' cell of the totals row is left as NaN.
ice_cream.loc['Column Total'] = (
    ice_cream.drop(index='Column Total', errors='ignore').sum(numeric_only=True, axis=0)
)

ice_cream

Unnamed: 0,gender,cup,cone,sundae,sandwich,other,Row Total
0,male,592.0,300.0,204.0,24.0,80.0,1200.0
1,female,410.0,335.0,180.0,20.0,55.0,1000.0
Column Total,,1002.0,635.0,384.0,44.0,135.0,2200.0


In [None]:
# Expected counts for every (gender, flavor) cell:
#   expected = (row total * column total) / grand total
# The totals are read from the 'Row Total' column and 'Column Total' row of
# ice_cream; rows 0 and 1 are the 'male' and 'female' rows.
flavors = ['cup', 'cone', 'sundae', 'sandwich', 'other']
grand_total = ice_cream.loc['Column Total', 'Row Total']

ice_cream_expected = pd.DataFrame({
    'gender': ['male', 'female'],
    **{
        f'{flavor}_expected': [
            (ice_cream.loc[row, 'Row Total'] * ice_cream.loc['Column Total', flavor]) / grand_total
            for row in (0, 1)
        ]
        for flavor in flavors
    },
})

ice_cream_expected


Unnamed: 0,gender,cup_expected,cone_expected,sundae_expected,sandwich_expected,other_expected
0,male,546.545455,346.363636,209.454545,24.0,73.636364
1,female,455.454545,288.636364,174.545455,20.0,61.363636


In [None]:
# Per-cell chi-square contributions:
#   (observed - expected) ** 2 / expected
# Observed counts come from ice_cream, expected counts from the matching
# '<flavor>_expected' column of ice_cream_expected; rows 0 and 1 are the
# 'male' and 'female' rows.
flavors = ['cup', 'cone', 'sundae', 'sandwich', 'other']

ice_cream_chi_sq = pd.DataFrame({
    'gender': ['male', 'female'],
    **{
        f'{flavor}_chi_sq': [
            ((ice_cream.loc[row, flavor] - ice_cream_expected.loc[row, f'{flavor}_expected']) ** 2)
            / ice_cream_expected.loc[row, f'{flavor}_expected']
            for row in (0, 1)
        ]
        for flavor in flavors
    },
})

ice_cream_chi_sq

Unnamed: 0,gender,cup_chi_sq,cone_chi_sq,sundae_chi_sq,sandwich_chi_sq,other_chi_sq
0,male,3.780318,6.206156,0.142045,0.0,0.549944
1,female,4.536382,7.447387,0.170455,0.0,0.659933


In [None]:
# Chi-square test statistic: the sum of every per-cell contribution.
# numeric_only=True skips the text 'gender' column; without it the column sum
# of 'gender' is a concatenated string that cannot be added to the numeric
# column sums. This also avoids dropping the column in place, which would
# fail on a second run of the cell.
ice_cream_chi_sq.sum(numeric_only=True).sum()

23.4926197837629

In [None]:
# Dependencies for this cell
import numpy as np
from scipy.stats import chi2_contingency

# 2x2 table of observed counts, held as a NumPy array
table = np.array([
    [200, 290],
    [400, 910],
])

# Echo the observed counts as a sanity check
print('Contingency table: \n', table)

# Chi-square test on the table. correction=False turns off Yates' continuity
# correction, which SciPy would otherwise apply when there is 1 degree of freedom.
stat, p, dof, expected = chi2_contingency(table, correction=False)

# Report the expected counts, the test statistic and the p value
print('Expected values: \n ', expected.round(2))
print('The chi square statistics is: {:.3f}'.format(stat))
print('The p value is: {:.6f}'.format(p))

Contingency table: 
 [[200 290]
 [400 910]]
Expected values: 
  [[163.33 326.67]
 [436.67 873.33]]
The chi square statistics is: 16.965
The p value is: 0.000038


In [None]:
# Dependencies for this cell
import numpy as np
from scipy.stats import chi2_contingency

# Observed counts as a pandas DataFrame: two rows, one column per category.
# There is no label column, so every column is numeric.
ice_cream2 = pd.DataFrame(
    {
        'cup': [592, 410],
        'cone': [300, 335],
        'sundae': [204, 180],
        'sandwich': [24, 20],
        'other': [80, 55],
    }
)

# Echo the observed counts as a sanity check
print('Contingency table: \n', ice_cream2)

# Chi-square test on the table (continuity correction explicitly disabled)
stat, p, dof, expected = chi2_contingency(ice_cream2, correction=False)

# Report the expected counts, the test statistic and the p value
print('Expected values: \n ', expected.round(2))
print('The chi square statistics is: {:.3f}'.format(stat))
print('The p value is: {:.6f}'.format(p))

Contingency table: 
    cup  cone  sundae  sandwich  other
0  592   300     204        24     80
1  410   335     180        20     55
Expected values: 
  [[546.55 346.36 209.45  24.    73.64]
 [455.45 288.64 174.55  20.    61.36]]
The chi square statistics is: 23.493
The p value is: 0.000101


In [None]:
# Dependencies for this cell
import numpy as np
from scipy.stats import chi2_contingency

# 3x3 table of observed counts, held as a NumPy array
table1 = np.array([
    [49, 50, 69],
    [24, 36, 38],
    [19, 22, 28],
])

# Echo the observed counts as a sanity check
print('Contingency table: \n', table1)

# Chi-square test on the table (continuity correction explicitly disabled)
stat, p, dof, expected = chi2_contingency(table1, correction=False)

# Report the expected counts, the test statistic and the p value
print('Expected values: \n ', expected.round(2))
print('The chi square statistics is: {:.3f}'.format(stat))
print('The p value is: {:.6f}'.format(p))

Contingency table: 
 [[49 50 69]
 [24 36 38]
 [19 22 28]]
Expected values: 
  [[46.14 54.16 67.7 ]
 [26.91 31.59 39.49]
 [18.95 22.24 27.81]]
The chi square statistics is: 1.513
The p value is: 0.824399


In [None]:
import numpy as np

# Observed counts: one list of five values for each of the six die values
counts_by_value = [
    [13, 7, 10, 5, 13],
    [5, 7, 4, 12, 9],
    [5, 9, 14, 0, 10],
    [12, 13, 8, 7, 7],
    [7, 10, 9, 13, 6],
    [8, 4, 5, 13, 5],
]

# Keep the individual per-value lists available under their short names
a1, a2, a3, a4, a5, a6 = counts_by_value

# Stack the six lists into a (6, 5) array
dice = np.array(counts_by_value)

In [None]:
# Chi-square test for contingency tables
from scipy.stats import chi2_contingency

# Run the test on the (6, 5) dice table (continuity correction explicitly disabled)
stat, p, dof, expected = chi2_contingency(dice, correction=False)

# Report the expected counts, degrees of freedom, test statistic and p value
print('Expected values: \n ', expected.round(2))
print('The degrees of freedom: ', dof)
print('The chi square statistics is: {:.3f}'.format(stat))
print('The p value is: {:.6f}'.format(p))

Expected values: 
  [[9.6 9.6 9.6 9.6 9.6]
 [7.4 7.4 7.4 7.4 7.4]
 [7.6 7.6 7.6 7.6 7.6]
 [9.4 9.4 9.4 9.4 9.4]
 [9.  9.  9.  9.  9. ]
 [7.  7.  7.  7.  7. ]]
The degrees of freedom:  20
The chi square statistics is: 40.375
The p value is: 0.004477


In [5]:
import numpy as np
from scipy.stats import chisquare # One-way chi square test

# chisquare() is the one-way (goodness-of-fit) chi-square test. Called with
# axis=None and no expected frequencies, it treats every cell of the table as
# one data set and compares each cell against equal expected frequencies.
# A low statistic (high p value) means the cells look alike; a high statistic
# (low p value) means they do not.
# NOTE(review): this is not a test of independence between the rows and
# columns of a crosstab - scipy.stats.chi2_contingency is the function for
# that. Confirm which hypothesis this demo is meant to illustrate.
# Be aware! Chi square does *not* tell you direction/causation

# Each inner list becomes one column of the table after the transpose
ind_obs = np.transpose(np.array([[1, 1], [2, 2]]))
dep_obs = np.transpose(np.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]))

# Show each table followed by its test result
for obs_table in (ind_obs, dep_obs):
    print(obs_table)
    print(chisquare(obs_table, axis=None))

[[1 2]
 [1 2]]
Power_divergenceResult(statistic=0.6666666666666666, pvalue=0.8810148425137847)
[[16 32]
 [18 24]
 [16 16]
 [14 28]
 [12 20]
 [12 24]]
Power_divergenceResult(statistic=23.31034482758621, pvalue=0.015975692534127565)


In [6]:
# Distribution Tests:
# We often assume that something is normal, but it can be important to *check*

# For example, later on with predictive modeling, a typical assumption is that
# residuals (prediction errors) are normal - checking is a good diagnostic

from scipy.stats import normaltest

# Seeded generator, so the sample (and therefore the test result printed
# below) is the same on every run of the notebook
rng = np.random.default_rng(42)

# Poisson models the number of arrivals in a fixed interval and is related to
# the binomial (coinflip)
sample = rng.poisson(5, 1000)
print(normaltest(sample)) # Pretty clearly not normal

NormaltestResult(statistic=21.175068996638082, pvalue=2.5228544195117452e-05)
