In [37]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, chi2_contingency
import scipy

In [2]:
data = pd.read_csv('water.txt', sep = '\t', header = 0)

In [3]:
data.head()

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


## 1 Кореляция Пирсона

In [4]:
data_correlation = data.corr()

In [6]:
data_correlation

Unnamed: 0,mortality,hardness
mortality,1.0,-0.654849
hardness,-0.654849,1.0


## 2 Корреляция Спирмена

In [7]:
spearmanr(data.mortality, data.hardness)

SpearmanrResult(correlation=-0.6316646189166502, pvalue=4.79546153722838e-08)

## 3 Пирсон для южных и северных

In [13]:
data_correlation_south = data[data.location == 'South'].corr()

In [14]:
data_correlation_north = data[data.location == 'North'].corr()

In [15]:
print('South')
data_correlation_south.head()

South


Unnamed: 0,mortality,hardness
mortality,1.0,-0.602153
hardness,-0.602153,1.0


In [16]:
print('North')
data_correlation_north.head()

North


Unnamed: 0,mortality,hardness
mortality,1.0,-0.368598
hardness,-0.368598,1.0


## 4 Мэтьюс

In [20]:
a = 203. # weman often
b = 239. # mans often
c = 718. # weman rarely
d = 515. # man rarely

In [23]:
print((a * b - c * d)/np.sqrt((a+b)*(a+c)*(b+d)*(c+d)))

-0.5222026014156602


## 5 Значимость

In [24]:
obs = np.array([[a, b], [c, d]])

In [28]:
chi2_contingency(obs)[1]

1.0558987006638725e-05

## 6 Интервал для долей

In [31]:
fem = np.append(np.ones(203), np.zeros(718))
mal = np.append(np.ones(239), np.zeros(515))

In [36]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):    
    z = scipy.stats.norm.ppf(1 - alpha / 2.)
    
    p1 = float(sum(sample1)) / len(sample1)
    p2 = float(sum(sample2)) / len(sample2)
    
    left_boundary = (p1 - p2) - z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    right_boundary = (p1 - p2) + z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    
    return (left_boundary, right_boundary)

In [39]:
proportions_diff_confint_ind(mal, fem, alpha = 0.05)

(0.053905233215813156, 0.13922183141523897)

## 7 Гипотеза о равенсве долей

In [40]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")
    
    if alternative == 'two-sided':
        return 2 * (1 - scipy.stats.norm.cdf(np.abs(z_stat)))
    
    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    if alternative == 'greater':
        return 1 - scipy.stats.norm.cdf(z_stat)

In [41]:
def proportions_diff_z_stat_ind(sample1, sample2):
    n1 = len(sample1)
    n2 = len(sample2)
    
    p1 = float(sum(sample1)) / n1
    p2 = float(sum(sample2)) / n2 
    P = float(p1*n1 + p2*n2) / (n1 + n2)
    
    return (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))

In [43]:
print("p-value: %f" % proportions_diff_z_test(proportions_diff_z_stat_ind(fem, mal), 'two-sided'))

p-value: 0.000008


## 8 Хи квадрат для таблицы

In [44]:
table = np.array([[197, 111, 33],
                  [382, 685, 331],
                  [110, 342, 333]
                 ]
                )

In [47]:
chi2_contingency(table)[0]

293.68311039689746

## 9 p-value

In [48]:
chi2_contingency(table)[1]

2.4964299580093467e-62

1: -0.6548

2: -0.6317

3: -0.3686

4: -

    wrong: -0.522

5: 5

6: 0.0539

7: 6

8: 293.6831

9: 62