In [1]:
import numpy as np
import pandas as pd
import scipy
from scipy import stats

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Задание 3

In [4]:
def proportions_diff_z_stat_ind(n1, s1, n2, s2):
    """Z statistic for the difference of two proportions, independent samples.

    Note the argument order: the sample size comes before the success count
    for each sample.

    Parameters
    ----------
    n1, n2 : sizes of the first and second sample.
    s1, s2 : number of successes in the first and second sample.

    Returns
    -------
    (p1 - p2) divided by the standard error built from the pooled proportion.
    """
    p1 = s1 / n1
    p2 = s2 / n2
    total = n1 + n2

    # Pooled success proportion across both samples.
    pooled = float(p1 * n1 + p2 * n2) / total
    std_err = np.sqrt(pooled * (1 - pooled) * (1. / n1 + 1. / n2))

    return (p1 - p2) / std_err

In [6]:
# Arguments are (n1, s1, n2, s2): sample size first, then success count, per sample.
Z = proportions_diff_z_stat_ind(34, 10, 16, 4)
print(Z)

0.32410186177608225


In [7]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """P-value of a Z statistic under the standard normal distribution.

    Parameters
    ----------
    z_stat : Z statistic (scalar or array-like).
    alternative : 'two-sided', 'less' or 'greater'.

    Returns
    -------
    The p-value for the requested alternative.

    Raises
    ------
    ValueError
        If `alternative` is not one of the three recognized values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # Upper-tail probabilities use the survival function rather than
    # 1 - cdf: the subtraction cancels to exactly 0.0 once cdf rounds to 1.0,
    # which loses every significant digit for large |z|.
    if alternative == 'two-sided':
        return 2 * scipy.stats.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    # Only 'greater' remains after the validation above.
    return scipy.stats.norm.sf(z_stat)

In [10]:
# One-sided p-value for the 'greater' alternative (upper tail of the standard normal).
p = proportions_diff_z_test(Z, 'greater')
print(p.round(4))

0.3729


## Задание 4

In [11]:
# Load the tab-separated banknotes table; the path is relative to the working directory.
banknotes = pd.read_csv('banknotes.txt', sep = '\t')

In [12]:
# Preview the first rows to check how the columns were parsed.
banknotes.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,real
0,214.8,131.0,131.1,9.0,9.7,141.0,1
1,214.6,129.7,129.7,8.1,9.5,141.7,1
2,214.8,129.7,129.7,8.7,9.6,142.2,1
3,214.8,129.7,129.6,7.5,10.4,142.0,1
4,215.0,129.6,129.7,10.4,7.7,141.8,1


In [14]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
banknotes.describe()

Unnamed: 0,X1,X2,X3,X4,X5,X6,real
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,214.896,130.1215,129.9565,9.4175,10.6505,140.4835,0.5
std,0.376554,0.361026,0.404072,1.444603,0.802947,1.152266,0.501255
min,213.8,129.0,129.0,7.2,7.7,137.8,0.0
25%,214.6,129.9,129.7,8.2,10.1,139.5,0.0
50%,214.9,130.2,130.0,9.1,10.6,140.45,0.5
75%,215.1,130.4,130.225,10.6,11.2,141.5,1.0
max,216.3,131.0,131.1,12.7,12.3,142.4,1.0


In [15]:
from sklearn.model_selection import train_test_split

In [23]:
# Target is the `real` column; every remaining column is a feature.
y = banknotes['real']
X = banknotes.drop(columns='real')

In [24]:
# Hold out 25% of the rows for testing; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1)

In [25]:
from sklearn.linear_model import LogisticRegression

In [32]:
# Two disjoint feature subsets: model 1 sees X1-X3, model 2 sees X4-X6.
# Each column list is written once and applied to both the train and test parts.
X_train1, X_test1 = (part[['X1', 'X2', 'X3']] for part in (X_train, X_test))
X_train2, X_test2 = (part[['X4', 'X5', 'X6']] for part in (X_train, X_test))

In [47]:
# Fit one liblinear logistic regression per feature subset
# (fit() returns the estimator itself, so construction and fitting chain).
model1 = LogisticRegression(solver = 'liblinear').fit(X_train1, y_train)
model2 = LogisticRegression(solver = 'liblinear').fit(X_train2, y_train)

# Hard class predictions on the held-out rows.
y_predicted1, y_predicted2 = model1.predict(X_test1), model2.predict(X_test2)

# score() is the mean accuracy on the test rows; the error rate is its complement.
score1, score2 = model1.score(X_test1, y_test), model2.score(X_test2, y_test)
error1, error2 = 1 - score1, 1 - score2

In [53]:
def proportions_diff_z_stat_rel(sample1, sample2):
    """Z statistic for the difference of two proportions on paired samples.

    The two samples are walked in lockstep with zip, so elements of the
    longer one beyond the length of the shorter one are ignored.

    Parameters
    ----------
    sample1, sample2 : paired sequences; only pairs equal to (1, 0) or (0, 1)
        contribute to the counts.

    Returns
    -------
    (f - g) / sqrt(f + g - (f - g)**2 / n), where f counts the (1, 0) pairs,
    g counts the (0, 1) pairs and n is the number of pairs.
    """
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    def count_pairs(first, second):
        # Number of pairs equal to (first, second).
        return sum([1 if (a == first and b == second) else 0 for a, b in pairs])

    f = count_pairs(1, 0)
    g = count_pairs(0, 1)

    numerator = float(f - g)
    denominator = np.sqrt(f + g - float((f - g)**2) / n)
    return numerator / denominator

In [55]:
# abs(y_test - y_predicted) is passed as a per-row error indicator for each model
# (1 where prediction and label differ, assuming 0/1 labels as describe() shows for `real`).
Z = proportions_diff_z_stat_rel(abs(y_test - y_predicted1), abs(y_test - y_predicted2))
Z

2.9386041680175268

In [56]:
# Two-sided p-value for the paired Z statistic computed above.
p = proportions_diff_z_test(Z, 'two-sided')
print(p)

0.0032969384555543435


In [57]:
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Normal-approximation interval for the difference of paired proportions.

    The two samples are walked in lockstep with zip, so elements of the
    longer one beyond the length of the shorter one are ignored.

    Parameters
    ----------
    sample1, sample2 : paired sequences; only pairs equal to (1, 0) or (0, 1)
        contribute to the counts.
    alpha : significance level; the normal quantile is taken at 1 - alpha / 2.

    Returns
    -------
    (left_boundary, right_boundary), symmetric around (f - g) / n, where f
    counts the (1, 0) pairs, g counts the (0, 1) pairs and n is the number
    of pairs.
    """
    z = scipy.stats.norm.ppf(1 - alpha / 2.)
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    f = sum([1 if (a == 1 and b == 0) else 0 for a, b in pairs])
    g = sum([1 if (a == 0 and b == 1) else 0 for a, b in pairs])

    # Both boundaries share the same centre and half-width.
    center = float(f - g) / n
    half_width = z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)

    return (center - half_width, center + half_width)

In [58]:
# Interval for the difference between the two models' error proportions,
# built from the same per-row error indicators as the paired Z statistic.
interval = proportions_diff_confint_rel(abs(y_test - y_predicted1), abs(y_test - y_predicted2), alpha = 0.05)

In [59]:
# Left boundary of the interval, rounded to 4 decimals.
interval[0].round(4)

0.0599

## Задание 6

In [60]:
# Inputs for the Z statistic computed in the next cell:
# (X_mean - mu_0) / (sigma / sqrt(n)).
mu_0 = 525
sigma = 100
n = 100
X_mean = 541.4

In [61]:
# Standardize X_mean against mu_0 using sigma / sqrt(n) as the scale.
Z = (X_mean - mu_0) / (sigma / np.sqrt(n))
print(Z)

1.6399999999999977


In [62]:
# Upper-tail probability of the standard normal at Z, rounded to 4 decimals for display.
p = 1 - stats.norm.cdf(Z)
print(p.round(4))

0.0505


In [63]:
# Same inputs as the previous computation except X_mean, which is 541.5 instead of 541.4.
mu_0 = 525
sigma = 100
n = 100
X_mean = 541.5

In [64]:
# Standardize X_mean against mu_0 using sigma / sqrt(n) as the scale.
Z = (X_mean - mu_0) / (sigma / np.sqrt(n))
print(Z)

1.65


In [65]:
# Upper-tail probability of the standard normal at Z, rounded to 4 decimals for display.
p = 1 - stats.norm.cdf(Z)
print(p.round(4))

0.0495
