In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the tab-delimited water dataset and preview its first five rows
# (columns shown in the output: location, town, mortality, hardness).
df = pd.read_csv('water.txt', delimiter='\t')
df.head()

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


In [3]:
# Pearson (linear) correlation between mortality and water hardness, all towns.
df.loc[:, ['mortality', 'hardness']].corr(method='pearson')

Unnamed: 0,mortality,hardness
mortality,1.0,-0.654849
hardness,-0.654849,1.0


In [4]:
# Spearman (rank) correlation between mortality and water hardness, all towns.
df.loc[:, ['mortality', 'hardness']].corr(method='spearman')

Unnamed: 0,mortality,hardness
mortality,1.0,-0.631665
hardness,-0.631665,1.0


In [7]:
# Split the towns by region, then show the Pearson correlation matrix of
# mortality vs. hardness for each region as a (south, north) tuple.
south = df.loc[df['location'] == 'South']
north = df.loc[df['location'] == 'North']
tuple(region[['mortality', 'hardness']].corr() for region in (south, north))

(           mortality  hardness
 mortality   1.000000 -0.602153
 hardness   -0.602153  1.000000,            mortality  hardness
 mortality   1.000000 -0.368598
 hardness   -0.368598  1.000000)

In [8]:
# Cells of a 2x2 contingency table. Later cells build the samples as
# women = a ones + b zeros and men = c ones + d zeros.
a, b = 203, 718
c, d = 239, 515

# Matthews correlation coefficient:
# (ad - bc) / sqrt((a + b)(a + c)(c + d)(b + d)).
mattews = (a * d - b * c) / np.sqrt((a + b) * (a + c) * (d + c) * (d + b))
mattews

-0.10900237458678963

In [11]:
import scipy.stats

# Chi-square test of independence on the 2x2 table [[a, b], [c, d]].
# With scipy's default `correction=True`, Yates' continuity correction is
# applied because the table has one degree of freedom.
scipy.stats.chi2_contingency(observed=np.array([[a, b], [c, d]]))

(19.40753078854304,
 1.0558987006638725e-05,
 1,
 array([[243.03402985, 677.96597015],
        [198.96597015, 555.03402985]]))

In [20]:
def proportions_confint_diff_rel(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for p1 - p2 on paired samples.

    The samples are matched element by element. With ``f`` the number of
    pairs equal to (1, 0), ``g`` the number of pairs equal to (0, 1) and
    ``n`` the number of pairs, the interval is::

        (f - g) / n  +/-  z * sqrt((f + g) / n**2 - (f - g)**2 / n**3)

    where ``z`` is the ``1 - alpha / 2`` quantile of the standard normal.

    Parameters
    ----------
    sample1, sample2 : iterables of 0/1 values
        Paired observations; element ``i`` of each must describe the same unit.
    alpha : float, default 0.05
        Significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)``.

    Raises
    ------
    ZeroDivisionError
        If no pairs can be formed (either sample is empty).

    Warns
    -----
    UserWarning
        If the samples differ in length. Pairing then drops the tail of the
        longer sample, which usually means the data are not paired at all and
        an independent-samples interval should be used instead.
    """
    import warnings

    z = scipy.stats.norm.ppf(1 - alpha / 2.)

    # Materialize first so that generators are supported and lengths can be compared.
    first, second = list(sample1), list(sample2)
    if len(first) != len(second):
        warnings.warn(
            "paired samples differ in length (%d vs %d); the longer sample is "
            "truncated to the shorter one" % (len(first), len(second)),
            stacklevel=2,
        )

    pairs = list(zip(first, second))
    n = len(pairs)

    # Discordant pairs: only these contribute to the difference of proportions.
    f = sum(1 for x, y in pairs if x == 1 and y == 0)
    g = sum(1 for x, y in pairs if x == 0 and y == 1)

    center = (f - g) / n
    margin = z * np.sqrt((f + g) / n**2 - (f - g)**2 / n**3)
    return (center - margin, center + margin)

In [21]:
# Expand the 2x2 counts into 0/1 indicator samples (ones first, then zeros).
women = a * [1] + b * [0]
men = c * [1] + d * [0]

# NOTE(review): `women` has a + b = 921 entries and `men` has c + d = 754, so
# these look like independent samples. The paired helper zips them together and
# uses only the first 754 positions, so this interval is probably not the
# intended one; an independent-samples interval looks appropriate -- confirm.
proportions_confint_diff_rel(sample1=men, sample2=women)

(0.03252571231439902, 0.06296500386597233)

In [22]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):
    """Wald confidence interval for p1 - p2 on two independent 0/1 samples.

    Parameters
    ----------
    sample1, sample2 : sequences of 0/1 values
        Independent samples; they may have different lengths.
    alpha : float, default 0.05
        Significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)`` computed as
        ``(p1 - p2) -/+ z * sqrt(p1 (1 - p1) / n1 + p2 (1 - p2) / n2)``.
    """
    quantile = scipy.stats.norm.ppf(1 - alpha / 2.)

    share1 = sum(sample1) / len(sample1)
    share2 = sum(sample2) / len(sample2)

    # Half-width of the interval: quantile times the unpooled standard error.
    margin = quantile * np.sqrt(
        share1 * (1 - share1) / len(sample1) + share2 * (1 - share2) / len(sample2)
    )
    diff = share1 - share2

    return (diff - margin, diff + margin)

def proportions_diff_z_stat_ind(sample1, sample2):
    """Z statistic for the difference of proportions of two independent samples.

    Uses the pooled proportion ``P`` of both samples for the standard error:
    ``(p1 - p2) / sqrt(P (1 - P) (1 / n1 + 1 / n2))``.

    Parameters
    ----------
    sample1, sample2 : sequences of 0/1 values
        Independent samples; they may have different lengths.

    Returns
    -------
    float
        The z statistic; positive when the first sample's proportion is larger.
    """
    size1, size2 = len(sample1), len(sample2)

    share1 = sum(sample1) / size1
    share2 = sum(sample2) / size2

    # Proportion of ones across both samples combined.
    pooled = (share1 * size1 + share2 * size2) / (size1 + size2)
    std_err = np.sqrt(pooled * (1 - pooled) * (1. / size1 + 1. / size2))

    return (share1 - share2) / std_err

def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """P-value of a z statistic under the standard normal null distribution.

    Parameters
    ----------
    z_stat : float
        Observed z statistic.
    alternative : {'two-sided', 'less', 'greater'}, default 'two-sided'
        Which tail(s) of the distribution count as evidence against the null.

    Returns
    -------
    float
        The p-value for the chosen alternative.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three recognized values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # Upper tails use the survival function rather than `1 - cdf`: the
    # subtraction cancels to exactly 0.0 once cdf rounds to 1.0, whereas `sf`
    # keeps full precision for very small p-values.
    if alternative == 'two-sided':
        return 2 * scipy.stats.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    return scipy.stats.norm.sf(z_stat)

In [23]:
# Two-sided z-test for the difference between the women's and men's proportions.
proportions_diff_z_test(
    z_stat=proportions_diff_z_stat_ind(sample1=women, sample2=men),
    alternative='two-sided',
)

8.153453089576601e-06

In [27]:
# 3x3 table of observed counts, one inner list per row.
matrix = np.array([
    [197, 111, 33],
    [382, 685, 331],
    [110, 342, 333],
])

# Chi-square test of independence: statistic, p-value, dof, expected counts.
chi2 = scipy.stats.chi2_contingency(observed=matrix)
chi2

(293.68311039689746,
 2.4964299580093467e-62,
 4,
 array([[ 93.08597464, 153.74722662,  94.16679873],
        [381.6251981 , 630.318542  , 386.0562599 ],
        [214.28882726, 353.93423138, 216.77694136]]))

In [28]:
# Cramer's V: sqrt(chi2 / (n * (min(rows, cols) - 1))), where n is the total
# number of observations in the table and chi2[0] is the test statistic.
k1, k2 = matrix.shape
phi = np.sqrt(chi2[0] / (matrix.sum() * (min(k1, k2) - 1)))
phi

0.2412013934500338