In [1]:
import os
import sys
import numpy as np
import pylab as pl
import json


%pylab inline

from IPython.display import Image

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Add Table 2.1 here

## NULL HYPOTHESIS: the % of former prisoners employed after release is the same or lower for candidates who participated in the program as for the control group, significance level p=0.05

 H_0: P_0 - P_1 >= 0

 H_a: P_0 - P_1 < 0

 $\alpha$ = 0.05

this is a TEST OF PROPORTIONS. we use the Binomial distribution since it is a yes/no (bernulli) test for each subject: the former inmate was or was not ever employed in a CEO transitional job (second row in the table above):

 $P_0=0.035, P_1=0.701$

In [3]:
# This is the significance threshold, or p-value
alpha=0.05



# Converting the test and control group values to percentages

P_0 = 3.5 * 0.01   # Control Group
P_1 = 70.1 * 0.01  # Test Group

if P_0 - P_1 >= 0:
    # we are done
    print ("the Null holds")
else:
    print ("we must assess the statistical significance")

#sample sizes 

n_0 = 409   # Control Group
n_1 = 564   # Test Group
    
# Calculate the actual integers values that correspond to each percentage

Nt_0 = P_0 * n_0
Nt_1 = P_1 * n_1

we must assess the statistical significance


## The first test will be the Z-test

In [4]:
# First, calculate the rate of "successes"

sp = (P_0 * n_0 + P_1 * n_1) / (n_1 + n_0)
print (sp)

0.4210472764645426


In [5]:
p = lambda p0, p1, n0, n1: (p0 * n0 + p1 * n1) / (n0 + n1)
#standard error
se = lambda p, n0, n1: np.sqrt(p * (1 - p) * (1.0 / n0 + 1.0 / n1))

In [6]:
zscore = lambda p0, p1, s : (p0 - p1) / s
z_2y = zscore(P_1, P_0, se(p(P_0, P_1, n_0, n_1), n_0, n_1))
print (z_2y)

20.7697865408


In [7]:
# Add Z-table here

In [8]:
# p-value for employment after 2 years: 
# since the largest number we read off the table for is (way) smaller
# than the value for our statistic 
# our p-value will be smaller than it would be if calculated using
# (e.g.) .9998 (and in fact using 1.0000 which is the largest number
# in the table). Using 0.9998 is a **conservative** approach. 

p_2y = 1 - 0.9984


def report_result(p,a):
    print ('is the p value ' + 
           '{0:.2f} smaller than the critical value {1:.2f}?'.format(p,a))
    if p < a:
        print ("YES!")
    else: 
        print ("NO!")
    
    print ('the Null hypothesis is {}'.format(\
                            'rejected' if p < a  else 'not rejected') )

    
report_result(p_2y, alpha)

is the p value 0.00 smaller than the critical value 0.05?
YES!
the Null hypothesis is rejected


## Next Test

## Null Hypothesis: the felony recidivism rate (% of felony convictions after release from prison) is greater or equal among those who participated in the program versus those who did not, significance level = 0.05

H_0: P_1_rc >= P_0_rc

H_a: P_1_rc < P_0_rc

In [17]:
P_0_rc = 10.0 * 0.01
P_1_rc = 11.7 * 0.01
n_0_rc = 409
n_1_rc = 568


Nt_0_rc = P_0_rc * n_0_rc 
Nt_1_rc = P_1_rc * n_1_rc 

p_rc = p(P_0_rc, P_1_rc, n_0_rc, n_1_rc)
se_rc = se(p_rc, n_0_rc, n_1_rc)
z_3y = zscore(P_1_rc, P_0_rc, se(p(P_0_rc, P_1_rc, n_0_rc, n_1_rc), n_0_rc, n_1_rc))

print("p-value        : %.3f"%p_rc)
print("standard error : %.3f"%se_rc)
print("z score        : %.3f"%z_3y)

p-value        : 0.110
standard error : 0.020
z score        : 0.838


In [20]:
# Using the z-score we use the following equation to determine the p-value

p_rc_pop = 1 - 0.7995

print("Using the Z table we calculate a p-value of %.4f, which is larger than 0.05. \nSo we cannot reject the null hypothesis."%p_rc_pop)

Using the Z table we calculate a p-value of 0.2005, which is larger than 0.05. 
So we cannot reject the null hypothesis.


# Chi-Square Test for Employment Rate

In [21]:
# First, copy the following function, which defined the Chi-Square test

def evalChisq(values):
    values = np.array(values)
    E = np.empty_like(values)
    for j in range(len(values[0])):
        for i in range(2):
            
            E[i][j] = ((values[i,:].sum() * values[:,j].sum()) / 
                        (values).sum())
    return ((values - E)**2 / E).sum()


In [22]:
Ntot = 973

sample_values = np.array([[0.701 * 564, 0.299 * 564], [0.0305 * 409, 0.965 * 409]])

print (evalChisq(sample_values))

436.223462575


With an alpha of 0.05, the chi-square value is 3.84, which is significantly smaller than the calculated statistic.  Thus the null hypothesis can be rejected.  There is a statistically significant result related to higher employment rates among those who participated in the program.

# Chi-Square Test for Recidivism


|convicted of a fellony     |     yes   | no        |                   
|---------------------------|-----------|-----------|----------------|
| test sample               | 0.1 * 568 | 0.9 * 568 |     568        |
| control sample            |0.117 * 409|0.883 * 409|     409        |
|                           |           |           |                |
| total                     |  104.653  |  872.347  |     977        |

In [23]:
sample_values_recid = [[0.100 * 568, 0.90 * 568],[0.117 * 409, 0.883 * 409]]

print(sample_values_recid)

[[56.800000000000004, 511.2], [47.853, 361.147]]


In [27]:
chisq_recid = evalChisq(sample_values_recid)

print(chisq_recid)

0.718493917505


In [25]:
DOF = len(sample_values_recid) - 1
print ('chi sq statistics for "recidivism 1-3 years": '+
       'chisq = {:.3f}, DOF = {:d}'.format(chisq_recid, DOF))

chi sq statistics for "recidivism 1-3 years": chisq = 0.718, DOF = 1


In [28]:
# We check the Chi-Square table for a value to compare

chimin_alpha5pc = 3.84  # For DOF = 1, alpha = 0.05


print ("The Null hypothesis that the program is ineffective at controlling recidivism")
if chisq_recid > chimin_alpha5pc :
    print ("can be rejected at alpha = 0.05")
else: 
    print ("cannot be rejected (p<0.05)")
print ("with a chi square statistics of %.2f"%chisq_recid)

The Null hypothesis that the program is ineffective at controlling recidivism
cannot be rejected (p<0.05)
with a chi square statistics of 0.72
