In [None]:
import pandas as pd
import numpy as np
from scipy import stats

### 1

In [None]:
# Load the tab-separated water dataset and preview its first five rows.
water = pd.read_csv('water.txt', delimiter='\t')
water.head(n=5)

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


In [None]:
# Pearson (linear) correlation between mortality and water hardness over all towns.
round(water.mortality.corr(water.hardness, method='pearson'), 4)

-0.6548

### 2

In [None]:
# Spearman rank correlation between mortality and water hardness over all towns.
round(water.mortality.corr(other=water.hardness, method='spearman'), 4)

-0.6317

### 3

In [None]:
# Partition the towns by the `location` column so each region can be analysed separately.
water_south = water.loc[water['location'].eq('South')]
water_north = water.loc[water['location'].eq('North')]

In [None]:
# Pearson correlation of mortality vs. hardness within each region,
# printed one per line: South first, then North.
print(
    round(water_south.mortality.corr(water_south.hardness), 4),
    round(water_north.mortality.corr(water_north.hardness), 4),
    sep='\n',
)

-0.6022
-0.3686


### 4

In [None]:
# Expand aggregated survey counts into one row per respondent.
# Each [Sex, At least once a month] pattern is repeated as many times as it
# was observed; the patterns and their counts are listed in matching order.
columns = ['Sex', 'At least once a month']
data = np.repeat(
    [[1, 1], [0, 1], [1, 0], [0, 0]],
    [203, 239, 718, 515],
    axis=0,
)
df = pd.DataFrame(data=data, columns=columns)

In [None]:
# Contingency table: rows are `Sex`, columns are the 'At least once a month' indicator.
ct = pd.crosstab(index=df['Sex'], columns=df['At least once a month'])
ct

At least once a month,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
0,515,239
1,718,203


In [None]:
# Matthews correlation coefficient for the 2x2 contingency table `ct`.
#
# Cells are read with explicit (row label, column label) lookups instead of
# chained `ct[col][row]` indexing, which is easy to misread as row-first.
# They are converted to float because the denominator multiplies four
# marginal totals, and that product can silently overflow int64 when the
# counts are large.
a = float(ct.loc[0, 0])  # row 0, column 0
b = float(ct.loc[0, 1])  # row 0, column 1
c = float(ct.loc[1, 0])  # row 1, column 0
d = float(ct.loc[1, 1])  # row 1, column 1
MCC = (a*d - b*c) / np.sqrt((a + b) * (a + c) * (b + d) * (c + d))
round(MCC, 3)

-0.109

### 5

In [None]:
# Chi-square test of independence on the contingency table `ct`.
# `correction=True` is SciPy's default (Yates' continuity correction, applied
# when there is one degree of freedom); it is spelled out here for clarity.
chi2, p, dof, expected = stats.chi2_contingency(observed=ct, correction=True)
print(p)

1.0558987006638725e-05


### 6

In [None]:
# For each row of `ct`: the row total (group size) and the share of that row
# falling in column 1.
n1 = ct.loc[0].sum()
p1 = ct.loc[0, 1] / n1
n2 = ct.loc[1].sum()
p2 = ct.loc[1, 1] / n2

In [None]:
# Normal-approximation confidence interval for the difference p1 - p2
# at confidence level 1 - alpha.
alpha = 0.05
z = stats.norm.ppf(1 - alpha / 2.)
# Half-width of the interval: z times the unpooled standard error of p1 - p2.
diff_margin = z * np.sqrt(p1 * (1 - p1)/n1 + p2 * (1 - p2)/n2)
left_boundary = (p1 - p2) - diff_margin
right_boundary = (p1 - p2) + diff_margin
print((left_boundary, right_boundary))
print(round(left_boundary, 4))

(0.053905233215813156, 0.13922183141523897)
0.0539


### 7

In [None]:
# Two-sided z-test for the difference of two proportions, using the pooled
# proportion P to estimate the standard error.
P = (p1*n1 + p2*n2) / (n1 + n2)
z_stat = (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))
# Use the survival function rather than `1 - cdf`: for large |z| the cdf is
# so close to 1 that the subtraction loses most significant digits (and
# eventually returns exactly 0), whereas `sf` evaluates the tail directly.
2 * stats.norm.sf(np.abs(z_stat))

8.153453089576601e-06

### 8

In [None]:
# Expand the aggregated (happiness, financial situation) counts into one row
# per respondent, then cross-tabulate the two answers.
# The label pairs and their counts are listed in matching order.
columns = ['Happines', 'Financial situation']
data = np.repeat(
    [
        ['Не очень счастлив', 'Не доволен'],
        ['Не очень счастлив', 'Более или менее'],
        ['Не очень счастлив', 'Доволен'],
        ['Достаточно счастлив', 'Не доволен'],
        ['Достаточно счастлив', 'Более или менее'],
        ['Достаточно счастлив', 'Доволен'],
        ['Очень счастлив', 'Не доволен'],
        ['Очень счастлив', 'Более или менее'],
        ['Очень счастлив', 'Доволен'],
    ],
    [197, 111, 33, 382, 685, 331, 110, 342, 333],
    axis=0,
)
df = pd.DataFrame(data=data, columns=columns)
ct = pd.crosstab(index=df['Happines'], columns=df['Financial situation'])
ct

Financial situation,Более или менее,Доволен,Не доволен
Happines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Достаточно счастлив,685,331,382
Не очень счастлив,111,33,197
Очень счастлив,342,333,110


In [None]:
# Chi-square test of independence on the contingency table `ct`;
# only the test statistic is reported here, rounded to four decimals.
chi2, p, dof, expected = stats.chi2_contingency(observed=ct)
print(round(chi2, 4))

293.6831


### 9

In [None]:
# Display the p-value unpacked from the most recent stats.chi2_contingency(ct) call.
p

2.4964299580093467e-62

### 10

In [None]:
# Cramér's V for the contingency table `ct`:
#     V = sqrt(chi2 / (n * (min(rows, cols) - 1)))
# where n is the total number of observations. The `min(rows, cols) - 1`
# factor is derived from the table's shape rather than hard-coded, so the
# cell stays correct if `ct` has different dimensions.
n = np.sum(ct.values)
round(np.sqrt(chi2 / (n * (min(ct.shape) - 1))), 4)

0.2412