In [36]:
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
# Load the dataset; the file is tab-delimited.
water = pd.read_csv('water.txt', sep='\t')

In [3]:
# Preview the first rows to check that the file parsed into the expected columns.
water.head()

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


In [4]:
# Summary statistics of the numeric columns (mortality, hardness).
water.describe()

Unnamed: 0,mortality,hardness
count,61.0,61.0
mean,1524.147541,47.180328
std,187.668754,38.093966
min,1096.0,5.0
25%,1379.0,14.0
50%,1555.0,39.0
75%,1668.0,75.0
max,1987.0,138.0


In [24]:
# Keep only the two numeric columns whose relationship is analysed below.
mort_hard = water.loc[:, ['mortality', 'hardness']]
mort_hard.shape

(61, 2)

In [9]:
# Pearson correlation matrix between mortality and water hardness (all towns).
mort_hard_correlation = mort_hard.corr(method='pearson')
mort_hard_correlation

Unnamed: 0,mortality,hardness
mortality,1.0,-0.654849
hardness,-0.654849,1.0


In [12]:
# Off-diagonal entry of the Pearson matrix, rounded to 4 decimal places.
pearson_coefficient = mort_hard_correlation.loc['hardness', 'mortality']
print(pearson_coefficient.round(4))

-0.6548


In [13]:
# Spearman (rank) correlation matrix between mortality and water hardness (all towns).
mort_hard_correlation_spearman = mort_hard.corr(method='spearman')
mort_hard_correlation_spearman

Unnamed: 0,mortality,hardness
mortality,1.0,-0.631665
hardness,-0.631665,1.0


In [15]:
# Off-diagonal entry of the Spearman matrix, rounded to 4 decimal places.
spearman_coefficient = mort_hard_correlation_spearman.loc['hardness', 'mortality']
print(spearman_coefficient.round(4))

-0.6317


In [25]:
# Mortality and hardness for the towns labelled 'South'.
is_south = water['location'] == 'South'
mort_hard_south = water.loc[is_south, ['mortality', 'hardness']]
mort_hard_south.shape

(26, 2)

In [27]:
# Mortality and hardness for the towns labelled 'North'.
is_north = water['location'] == 'North'
mort_hard_north = water.loc[is_north, ['mortality', 'hardness']]
mort_hard_north.shape

(35, 2)

In [28]:
# Pearson correlation matrices computed separately for each region.
mort_hard_north_correlation_pearson = mort_hard_north.corr(method='pearson')
mort_hard_south_correlation_pearson = mort_hard_south.corr(method='pearson')

In [29]:
# Report the mortality/hardness coefficient for North first, then South.
for region_correlation in (mort_hard_north_correlation_pearson,
                           mort_hard_south_correlation_pearson):
    print(region_correlation.loc['hardness', 'mortality'].round(4))

-0.3686
-0.6022


## Задание 4

Обозначим значения признаков **Пол:** 0 - женщина, 1 - мужчина; **Частота похода в бары:** 0 - реже, чем раз в месяц, 1 - хотя бы раз в месяц

| Пол \ Частота похода в бары |  0  |  1  |
|:---------------------------:|:---:|:---:|
|              0              | 718 | 203 |
|              1              | 515 | 239 |

In [33]:
# Matthews correlation coefficient between gender and bar-visit frequency,
# computed from the 2x2 contingency table (rows: gender, columns: frequency).
n00, n01 = 718, 203  # women: less than monthly / at least monthly
n10, n11 = 515, 239  # men:   less than monthly / at least monthly

mcc_numerator = n00 * n11 - n01 * n10
mcc_denominator = np.sqrt((n00 + n01) * (n00 + n10) * (n01 + n11) * (n10 + n11))
MCC = mcc_numerator / mcc_denominator
MCC.round(3)

0.109

## Задание 5

In [37]:
# Contingency table: rows are gender (0, 1), columns are bar-visit frequency (0, 1).
observed = [
    [718, 203],
    [515, 239],
]

In [38]:
# Chi-square test of independence on the 2x2 table; the result holds the
# statistic, the p-value, the degrees of freedom and the expected frequencies.
# NOTE: with the default `correction=True`, scipy applies Yates' continuity
# correction when dof == 1, so the statistic shown is the corrected one.
stats.chi2_contingency(observed)

(19.40753078854304,
 1.0558987006638725e-05,
 1,
 array([[677.96597015, 243.03402985],
        [555.03402985, 198.96597015]]))

## Задание 6

In [50]:
# Sample sizes: group 1 is men, group 2 is women.
n1 = 515 + 239
n2 = 718 + 203

# Share of each group that goes to bars at least once a month.
p1 = 239 / n1
p2 = 203 / n2

In [51]:
# 95% normal-approximation confidence interval for the difference p1 - p2.
z = stats.norm.ppf(1 - 0.05 / 2)
interval_margin = z * np.sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2)
left_boundary = (p1 - p2) - interval_margin
right_boundary = (p1 - p2) + interval_margin

In [53]:
# Show the confidence interval for p1 - p2 as [lower, upper].
print('[%f, %f]' % (left_boundary, right_boundary))

[0.053905, 0.139222]


In [54]:
# Lower bound of the interval, rounded to 4 decimal places.
left_boundary.round(4)

0.0539

## Задание 7

In [55]:
# Z-statistic for the difference of two independent proportions,
# using the pooled proportion P under the null hypothesis p1 == p2.
P = (p1 * n1 + p2 * n2) / (n1 + n2)
pooled_std_error = np.sqrt(P * (1 - P) * (1 / n1 + 1 / n2))
Z = (p1 - p2) / pooled_std_error
Z

4.46111444482329

In [56]:
def proportions_diff_z_test(z_stat, alternative='two-sided'):
    """Return the p-value for an already computed z-statistic.

    The statistic is compared against the standard normal distribution.

    Parameters
    ----------
    z_stat : float or array-like
        Value of the z-statistic.
    alternative : {'two-sided', 'less', 'greater'}, default 'two-sided'
        Form of the alternative hypothesis.

    Returns
    -------
    float or numpy.ndarray
        The p-value: both tails for 'two-sided', the lower tail for 'less',
        the upper tail for 'greater'.

    Raises
    ------
    ValueError
        If `alternative` is not one of the three supported values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # Use the `stats` alias (`from scipy import stats`): the bare name `scipy`
    # is never imported, so `scipy.stats.norm` raised NameError on a fresh kernel.
    if alternative == 'two-sided':
        return 2 * (1 - stats.norm.cdf(np.abs(z_stat)))

    if alternative == 'less':
        return stats.norm.cdf(z_stat)

    # Only 'greater' remains after the validation above.
    return 1 - stats.norm.cdf(z_stat)

In [57]:
# Two-sided p-value for the difference between the two proportions.
proportions_diff_z_test(Z, alternative='two-sided')

8.153453089576601e-06

## Задание 8

In [58]:
# 3x3 contingency table of observed counts for this task.
observed = [
    [197, 111, 33],
    [382, 685, 331],
    [110, 342, 333],
]

In [65]:
# Chi-square test of independence on the 3x3 table; show the statistic
# (rounded to 4 decimal places) and then the p-value.
statistics_chi2 = stats.chi2_contingency(observed)
chi2_value = statistics_chi2[0]
chi2_p_value = statistics_chi2[1]
print(chi2_value.round(4))
print(chi2_p_value)

293.6831
2.4964299580093467e-62


## Задание 10

In [69]:
# Cramér's V: sqrt(chi2 / (n * (min(rows, cols) - 1))).
# The table dimensions are taken from `observed` instead of hard-coding 2, so
# the cell stays correct if the contingency table changes shape.
n_observations = np.sum(observed)
min_dimension = min(np.shape(observed)) - 1
coeff_v_kramer = np.sqrt(statistics_chi2[0] / (n_observations * min_dimension))
coeff_v_kramer.round(4)

0.2412