In [31]:
import pandas as pd
import numpy as np

from scipy.stats import chi2_contingency
import scipy

In [2]:
# Read the tab-separated file, treating its first row as the header, then
# replace the column labels with four fixed names.
water_data = pd.read_csv(
    'water.txt',
    sep='\t',
    header=0,
)
water_data.columns = [
    'location',
    'town',
    'mortality',
    'hardness',
]

In [4]:
# Preview the first five rows of the loaded table.
water_data.head(n=5)

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


In [10]:
# Pearson correlation between the two numeric columns. They are selected
# explicitly because pandas >= 2.0 no longer drops non-numeric columns
# ('location', 'town') silently and raises on them instead.
water_data[['mortality', 'hardness']].corr()

Unnamed: 0,mortality,hardness
mortality,1.0,-0.654849
hardness,-0.654849,1.0


In [11]:
# Spearman rank correlation between the two numeric columns. They are selected
# explicitly because pandas >= 2.0 no longer drops non-numeric columns
# ('location', 'town') silently and raises on them instead.
water_data[['mortality', 'hardness']].corr(method='spearman')

Unnamed: 0,mortality,hardness
mortality,1.0,-0.631665
hardness,-0.631665,1.0


In [12]:
# Split the rows into two subsets by the value of the 'location' column.
water_data_north = water_data.loc[water_data['location'].eq('North')]
water_data_south = water_data.loc[water_data['location'].eq('South')]

In [14]:
# Pearson correlation within the 'South' subset only. The numeric columns are
# selected explicitly because pandas >= 2.0 raises on the string columns
# ('location', 'town') instead of dropping them silently.
water_data_south[['mortality', 'hardness']].corr()

Unnamed: 0,mortality,hardness
mortality,1.0,-0.602153
hardness,-0.602153,1.0


In [19]:
# 2x2 table of observed counts.
conf = pd.DataFrame(
    data=[
        [203, 239],
        [718, 515],
    ]
)

In [23]:
# N is the total of the four counts; P is the share of the 203 and 718 counts,
# S is the share of the 203 and 239 counts.
N = sum((515.0, 203.0, 239.0, 718.0))
P = (203.0 + 718.0) / N
S = (203.0 + 239.0) / N

In [24]:
# Correlation coefficient for the 2x2 table (this has the form of the
# phi / Matthews coefficient): the share of the 203 count minus the product of
# the two marginal shares, scaled by the square root of the marginal variances.
C = (203.0 / N - S * P) / np.sqrt(P * S * (1 - S) * (1 - P))

In [25]:
# Show the value of C.
C

-0.10900237458678959

In [27]:
# Chi-square test of independence on the 2x2 table. correction=True is scipy's
# default (Yates' continuity correction, applied when there is one degree of
# freedom); it is spelled out here so the reader sees it is in effect.
chi2_contingency(conf, correction=True)

(19.40753078854304,
 1.0558987006638725e-05,
 1,
 array([[243.03402985, 198.96597015],
        [677.96597015, 555.03402985]]))

In [40]:
def proportions_diff_confint_ind(count1, count2, len1, len2, alpha = 0.05):
    """Normal-approximation confidence interval for a difference of two shares.

    Parameters
    ----------
    count1, count2 : number
        Numerators of the two shares (each is converted with ``float``).
    len1, len2 : number
        Denominators (sample sizes) of the two shares.
    alpha : float, optional
        Significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)`` for ``count1/len1 - count2/len2``.
    """
    quantile = scipy.stats.norm.ppf(1 - alpha / 2.)
    share1 = float(count1) / len1
    share2 = float(count2) / len2

    # Point estimate and the half-width of the interval around it.
    diff = share1 - share2
    half_width = quantile * np.sqrt(share1 * (1 - share1) / len1 + share2 * (1 - share2) / len2)

    return (diff - half_width, diff + half_width)

In [41]:
def proportions_diff_z_stat_ind(count1, count2, len1, len2):
    """Z statistic for the difference of two shares using a pooled share.

    Parameters
    ----------
    count1, count2 : number
        Numerators of the two shares.
    len1, len2 : number
        Denominators (sample sizes) of the two shares.

    Returns
    -------
    float
        ``(p1 - p2) / sqrt(P * (1 - P) * (1/len1 + 1/len2))`` where ``P`` is
        the pooled share over both samples.
    """
    share1 = count1 / len1
    share2 = count2 / len2

    # Pooled share across both samples, then the standard error built from it.
    total = len1 + len2
    pooled = float(share1 * len1 + share2 * len2) / total
    std_err = np.sqrt(pooled * (1 - pooled) * (1. / len1 + 1. / len2))

    return (share1 - share2) / std_err

In [42]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """Return the p-value of a z statistic under the standard normal law.

    Parameters
    ----------
    z_stat : float
        Observed z statistic.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Which tail(s) of the standard normal distribution to use.

    Returns
    -------
    float
        The p-value for the requested alternative.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three recognised values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # Upper tails use the survival function rather than ``1 - cdf``: the
    # subtraction cancels to exactly 0.0 once cdf rounds to 1.0 (|z| around 8
    # and above), whereas ``sf`` keeps the tiny tail probability.
    if alternative == 'two-sided':
        return 2 * scipy.stats.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    # Only 'greater' remains after the validation above.
    return scipy.stats.norm.sf(z_stat)

In [43]:
# Interval (default alpha) for the difference between the shares
# 203 / (718 + 203) and 239 / (515 + 239).
proportions_diff_confint_ind(
    count1=203.0,
    count2=239.0,
    len1=718.0 + 203.0,
    len2=515.0 + 239.0,
)

(-0.13922183141523897, -0.053905233215813156)

In [44]:
# Two-sided p-value for the z statistic of the same two shares.
print("p-value: %f" % proportions_diff_z_test(
    proportions_diff_z_stat_ind(203.0, 239.0, 718.0 + 203.0, 515.0 + 239.0)
))

p-value: 0.000008


In [47]:
# A float literal entered by hand so that its repr is shown in scientific notation.
0.000008

8e-06

In [66]:
def proportions_diff_confint_rel(alpha = 0.05, f = 239, g = 718, n = 718.0+203.0+515.0+239.0):
    """Normal-approximation interval for ``(f - g) / n``.

    The formula is ``(f - g)/n +/- z * sqrt((f + g)/n**2 - (f - g)**2/n**3)``,
    which has the form of the interval for a difference of shares in related
    samples. The counts used to be hard-coded in the body; they are now
    keyword parameters whose defaults reproduce the original values, so
    existing calls return the same result.

    Parameters
    ----------
    alpha : float, optional
        Significance level; the interval has nominal coverage ``1 - alpha``.
    f, g : number, optional
        The two counts whose difference is estimated (defaults 239 and 718).
    n : number, optional
        Total number of observations (default 1675.0).

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)``.
    """
    z = scipy.stats.norm.ppf(1 - alpha / 2.)
    n = float(n)  # keeps the powers and divisions below in floating point

    # Point estimate and the half-width of the interval around it.
    diff = float(f - g) / n
    half_width = z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
    return (diff - half_width, diff + half_width)

In [67]:
# Interval at the default significance level.
proportions_diff_confint_rel(alpha=0.05)

(-0.31947792352399607, -0.2524623749834666)

In [48]:
# 3x3 table of observed counts.
conf1 = pd.DataFrame(
    data=[
        [197, 111, 33],
        [382, 685, 331],
        [110, 342, 333],
    ]
)

In [54]:
# Keep only the first element of the result: the chi-square statistic.
chi2 = chi2_contingency(observed=conf1)[0]

In [59]:
# Intermediate quantities for a corrected association measure: phi2 is the
# statistic scaled by the total count n, and phi2corr / rcorr / kcorr each
# subtract a term in 1 / (n - 1) (phi2corr is floored at zero).
n = conf1.values.sum()
r, k = conf1.shape
phi2 = chi2 / n
phi2corr = max(0, phi2 - (k - 1) * (r - 1) / (n - 1))
rcorr = r - (r - 1) ** 2 / (n - 1)
kcorr = k - (k - 1) ** 2 / (n - 1)

In [60]:
# Corrected statistic: square root of phi2corr over the smaller corrected dimension minus one.
np.sqrt(phi2corr / min(kcorr - 1, rcorr - 1))

0.23964751409931453

In [70]:
def cramers_stat(confusion_matrix):
    """Compute Cramer's V from a contingency table of observed counts.

    Parameters
    ----------
    confusion_matrix : array-like, 2-D
        Observed counts. A DataFrame, a numpy array or a nested list is
        accepted; the input is converted with ``np.asarray`` first, so plain
        lists no longer fail on the missing ``.sum()`` / ``.shape``.

    Returns
    -------
    float
        ``sqrt(chi2 / (n * (min(rows, cols) - 1)))`` where ``chi2`` is the
        statistic returned by ``chi2_contingency`` and ``n`` is the total count.
    """
    table = np.asarray(confusion_matrix)
    chi2 = chi2_contingency(table)[0]
    n = table.sum()
    return np.sqrt(chi2 / (n * (min(table.shape) - 1)))

# Statistic for the 3x3 table, without the 1 / (n - 1) adjustments.
result = cramers_stat(confusion_matrix=conf1)

In [71]:
# Show the value returned by cramers_stat.
result

0.2412013934500338