In [43]:
import numpy as np
from scipy.stats import norm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [44]:
fpath = "./outputs/simulation_100_4000/results.csv"
sim_results = pd.read_csv(filepath_or_buffer=fpath,
                          header=0,
                          index_col=False,
                          usecols=['iteration', 'gini', 'theil_l', 'theil_t', 'atkinson_25', 'atkinson_50', 'atkinson_75'])

# Fit Gini to Normal distribution
To start with, let's fit our Gini results to a Normal distribution and measure the error.

In [51]:
def plot_mle_gaussian_fit(data, plt_title='', plt_xlabel='', plt_ylabel='', filename=''):
    sns.distplot(data['x'],
                 fit=norm,
                 kde=False,
                 kde_kws={'linewidth': 3},
                 color='darkblue')
    
    plt.title(plt_title)
    plt.xlabel(plt_xlabel)
    plt.ylabel(plt_ylabel)
    
    figname=filename+".png"
    plt.savefig(figname)
    
    plt.show()
    return

In [52]:
sim_results_gini = sim_results.rename(columns={'gini':'x'})
plot_mle_gaussian_fit(sim_results_gini, plt_title="MLE Gaussian Fit to Gini on 100 Simulated Partitions\n(Pop. criteria 4,000)", plt_xlabel="Empirical Gini", plt_ylabel="Density", filename="Norm_Gini_100_4000")

In [53]:
sim_results_theil_l = sim_results.rename(columns={'theil_l':'x'})
plot_mle_gaussian_fit(sim_results_theil_l, plt_title="MLE Gaussian Fit to Theil's L on 100 Simulated Partitions\n(Pop. criteria 4,000)", plt_xlabel="Empirical Theil's L", plt_ylabel="Density", filename="Norm_Theil_L_100_4000")

In [54]:
sim_results_theil_t = sim_results.rename(columns={'theil_t':'x'})
plot_mle_gaussian_fit(sim_results_theil_t, plt_title="MLE Gaussian Fit to Theil's T on 100 Simulated Partitions\n(Pop. criteria 4,000)", plt_xlabel="Empirical Theil's T", plt_ylabel="Density", filename="Norm_Theil_T_100_4000")

In [55]:
sim_results_atkinson_25 = sim_results.rename(columns={'atkinson_25':'x'})
plot_mle_gaussian_fit(sim_results_atkinson_25, plt_title="MLE Gaussian Fit to Atkinson's Index on 100 Simulated Partitions\n(Pop. criteria 4,000)", plt_xlabel="Empirical Atkinson's (e=0.25)", plt_ylabel="Density", filename="Norm_Atkinson_25_100_4000")

In [56]:
sim_results_atkinson_50 = sim_results.rename(columns={'atkinson_50':'x'})
plot_mle_gaussian_fit(sim_results_atkinson_50, plt_title="MLE Gaussian Fit to Atkinson's Index on 100 Simulated Partitions\n(Pop. criteria 4,000)", plt_xlabel="Empirical Atkinson's (e=0.5)", plt_ylabel="Density", filename="Norm_Atkinson_50_100_4000")

In [58]:
sim_results_atkinson_75 = sim_results.rename(columns={'atkinson_75':'x'})
plot_mle_gaussian_fit(sim_results_atkinson_75, plt_title="MLE Gaussian Fit to Atkinson's Index on 100 Simulated Partitions\n(Pop. criteria 4,000)", plt_xlabel="Empirical Atkinson's (e=0.75)", plt_ylabel="Density", filename="Norm_Atkinson_75_100_4000")

# Compare sampled results with actual partitions

The quantification of how abnormal census tracts are in terms of Gini outcomes will be explored next, by fitting a distribution to the simulated results. Similar tests will be performed on the other income statistics. In addition to showing that the census tracts of today are unlikely, our simulation distribution shows that under the population constraint Gini is relatively stable, with a standard deviation of less than 0.01. This variation is inconsequential when compared to the difference between Gini on ungrouped incomes and block averages, underlining the findings in the previous section. In addition, we will attempt to gain a better mathematical understanding of the Theil and Atkinson indexes based on their behavior in simulation.

In [28]:
# Confidence intervals using method 1 - normal approximation
bootstrap_mean = sim_results['gini'].mean()
bootstrap_std = sim_results['gini'].std()

# 95% confidence interval is z value of 1.96
# 99% confidence interval is z value of 2.58
low=bootstrap_mean - (2.58 * bootstrap_std)
high=bootstrap_mean + (2.58 * bootstrap_std)
print("Normal approximation 99% confidence interval: ({0}, {1})".format(low, high))

Normal approximation 99% confidence interval: (0.19818876981013106, 0.20707067116756317)


In [29]:
# Confidence intervals using method 2 - quantiles
low_quantile = sim_results['gini'].quantile(q=0.005)
high_quantile = sim_results['gini'].quantile(q=0.995)

print("Pivotal approximation 99% confidence interval: ({0}, {1})".format(low_quantile, high_quantile))

Pivotal approximation 99% confidence interval: (0.19851567326552616, 0.20590112414641554)


Null hypothesis: The Hennepin county census tract Gini of **0.218685** was sampled from the distribution of possible Gini values on partitions where population is optimized to be 4,000. Or, 
```
H0: bootstrap_mean = 0.218685
H1: bootstrap_mean != 0.218685
```

We assume that the bootstrap mean is asymptotically Normal. Then,
```(bootstrap_mean - 0.218685) / (bootstrap_std) ~> N(0, 1)```
We reject H0 when
```((bootstrap_mean - 0.218685) / (bootstrap_std)) > z_score```

We select z_score according to our desired confidence interval. Assuming ```Z ~ N(0, 1)```,
```
p-value = 2 * Probability(Z < z_score)
```
We want to select p-value so that we require very strong evidence against H0 in order to reject it. We will use a p-value of 0.001

In [37]:
bootstrap_mean = sim_results['gini'].mean()
bootstrap_std = sim_results['gini'].std()

z = (bootstrap_mean - 0.218685) / bootstrap_std
P = norm()
z_score = P.cdf(z)
p_value = 2 * z_score
print("Bootstrap mean\t{0}\nBootstrap std\t{1}\nDifference\t{2}\nz-score\t\t{3}\np-value\t\t{4}".format(bootstrap_mean, bootstrap_std, bootstrap_mean-0.218685, z_score, p_value))

Bootstrap mean	0.20262972048884711
Bootstrap std	0.0017212987126806463
Difference	-0.016055279511152876
z-score		5.423860420866147e-21
p-value		1.0847720841732294e-20


p-value is much smaller than our acceptance criteria, so we easily reject the null hypothesis. We are left with the result that census tracts were not drawn by the same process as our population-optimizing algorithm.

In [62]:
def hypothesis_test_simulation(data, observation, name):
    bootstrap_mean = data['x'].mean()
    bootstrap_std = data['x'].std()

    z = (bootstrap_mean - observation) / bootstrap_std
    P = norm()
    z_score = P.cdf(z)
    p_value = 2 * z_score
    print("{0}\n\tBootstrap mean\t{1}\n\tBootstrap std\t{2}\n\tDifference\t{3}\n\tz-score\t\t{4}\n\tp-value\t\t{5}".format(name, bootstrap_mean, bootstrap_std, bootstrap_mean-0.218685, z_score, p_value))
    return

In [63]:
hypothesis_test_simulation(sim_results_gini, 0.21868478998420604, "Gini")
hypothesis_test_simulation(sim_results_theil_l, 0.08007582708892766, "Theil's L")
hypothesis_test_simulation(sim_results_theil_t, 0.07446742355287116, "Theil's T")
hypothesis_test_simulation(sim_results_atkinson_25, 0.01878933454551135, "Atkinson (e=0.25)")
hypothesis_test_simulation(sim_results_atkinson_50, 0.03790244775072271, "Atkinson (e=0.50)")
hypothesis_test_simulation(sim_results_atkinson_75, 0.057303961132517234, "Atkinson (e=0.75)")

Gini
	Bootstrap mean	0.20262972048884711
	Bootstrap std	0.0017212987126806463
	Difference	-0.016055279511152876
	z-score		5.430105945398945e-21
	p-value		1.086021189079789e-20
Theil's L
	Bootstrap mean	0.06938825156603992
	Bootstrap std	0.001322770059825865
	Difference	-0.14929674843396007
	z-score		3.2465106671452165e-16
	p-value		6.493021334290433e-16
Theil's T
	Bootstrap mean	0.06414205338977479
	Bootstrap std	0.0011061350639359647
	Difference	-0.15454294661022522
	z-score		5.066857415500019e-21
	p-value		1.0133714831000038e-20
Atkinson (e=0.25)
	Bootstrap mean	0.04977592233421279
	Bootstrap std	0.0008940205693372583
	Difference	-0.1689090776657872
	z-score		1.0
	p-value		2.0
Atkinson (e=0.50)
	Bootstrap mean	0.03282976635430674
	Bootstrap std	0.0005785515289009558
	Difference	-0.18585523364569326
	z-score		9.101546882157557e-19
	p-value		1.8203093764315115e-18
Atkinson (e=0.75)
	Bootstrap mean	0.04977592233421279
	Bootstrap std	0.0008940205693372583
	Difference	-0.1689090776657872
