In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats

In [2]:
# Generate data
#
# Build one label per individual for the reference population and for the
# sample, then tabulate how many times each label occurs.

population_counts = {"white": 100000, "hispanic": 60000, "black": 50000,
                     "asian": 15000, "other": 35000}
sample_counts = {"white": 600, "hispanic": 300, "black": 250,
                 "asian": 75, "other": 150}

# A single unnamed column (label 0) holding one category string per row.
rep = pd.DataFrame(
    [label for label, n in population_counts.items() for _ in range(n)]
)
rep_table = pd.crosstab(index=rep[0], columns="count")

sample = pd.DataFrame(
    [label for label, n in sample_counts.items() for _ in range(n)]
)
sample_table = pd.crosstab(index=sample[0], columns="count")

# Show both frequency tables, separated by a line holding a single space.
print("National_rep", rep_table, " ", "Sample", sample_table, sep="\n")

National_rep
col_0      count
0               
asian      15000
black      50000
hispanic   60000
other      35000
white     100000
 
Sample
col_0     count
0              
asian        75
black       250
hispanic    300
other       150
white       600


In [3]:
# -----------1. Chi-Squared Goodness-of-Fit Test----------------
# To check whether the sample's distribution across categories matches that of the reference population

In [4]:
# Chi-squared goodness-of-fit test using scipy.stats.chisquare()

# Share of the reference population that falls into each category.
rep_ratios = rep_table / len(rep)
print(rep_ratios)

# Observed counts come straight from the sample table; expected counts are
# the reference shares scaled up to the sample size.
observed = sample_table
expected = len(sample) * rep_ratios

stats.chisquare(observed, f_exp=expected)

# Reading the result:
#   chi-squared test statistic ~ 18.19
#   p-value ~ 0.001 -> reject the null hypothesis: the two distributions differ

col_0        count
0                 
asian     0.057692
black     0.192308
hispanic  0.230769
other     0.134615
white     0.384615


Power_divergenceResult(statistic=array([18.19480519]), pvalue=array([0.00113047]))

In [5]:
# -----------2. Chi-Squared Test of Independence----------------
# To check whether two categorical variables are independent

In [6]:
# Generate data

np.random.seed(10)  # seed the global NumPy RNG so the draws below are repeatable

# Race and party are drawn by two separate calls, each with its own fixed
# category probabilities, so neither draw looks at the other's values.
voter_race = np.random.choice(a=["asian", "black", "hispanic", "other", "white"],
                              p=[0.05, 0.15, 0.25, 0.05, 0.5],
                              size=1000)

voter_party = np.random.choice(a=["democrat", "independent", "republican"],
                               p=[0.4, 0.2, 0.4],
                               size=1000)

voters = pd.DataFrame({"race": voter_race,
                       "party": voter_party})

# Cross-tabulate race against party, including row/column totals.
# Only the margin labels ("All") are renamed, and they are renamed by name:
# assigning full replacement label lists positionally would raise if any
# category failed to appear in the draw and would silently mislabel rows or
# columns if the category order ever differed from the hard-coded lists.
voter_tab = (
    pd.crosstab(voters.race, voters.party, margins=True)
    .rename(index={"All": "col_totals"}, columns={"All": "row_total"})
)

# Drop the axis names ("race"/"party") so the table has plain, unnamed labels.
voter_tab.index.name = None
voter_tab.columns.name = None

# Actual counts: everything except the totals row and totals column, selected
# by label so the totals can never leak into the observed table.
observed = voter_tab.drop(index="col_totals", columns="row_total")
voter_tab

Unnamed: 0,democrat,independent,republican,row_total
asian,21,7,32,60
black,65,25,64,154
hispanic,107,50,94,251
other,15,8,15,38
white,189,96,212,497
col_totals,397,186,417,1000


In [7]:
# Chi-squared test of independence using stats.chi2_contingency()

# Run the test on the table of actual counts and display the result.
independence_result = stats.chi2_contingency(observed)
independence_result

# Reading the result:
#   chi-squared statistic ~ 7.17
#   p-value ~ 0.52 -> fail to reject the null hypothesis
#   df = (5 - 1) * (3 - 1) = 4 * 2 = 8
#   the array holds the expected counts

(7.169321280162059,
 0.518479392948842,
 8,
 array([[ 23.82 ,  11.16 ,  25.02 ],
        [ 61.138,  28.644,  64.218],
        [ 99.647,  46.686, 104.667],
        [ 15.086,   7.068,  15.846],
        [197.309,  92.442, 207.249]]))