In [29]:
import numpy as np
from scipy import stats
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### 3

In [7]:
def proportions_diff_z_stat_ind(a, b, n1, n2):
    """Z-statistic for the difference of two proportions, independent samples.

    Args:
        a: numerator of the first proportion (p1 = a / n1).
        b: numerator of the second proportion (p2 = b / n2).
        n1: size of the first sample.
        n2: size of the second sample.

    Returns:
        (p1 - p2) divided by its standard error, where the standard error
        is computed from the proportion pooled over both samples.
    """
    share_first = a / n1
    share_second = b / n2
    total_size = n1 + n2

    # Pooled proportion: size-weighted average of the two sample shares.
    pooled = (share_first * n1 + share_second * n2) / total_size
    std_error = np.sqrt(pooled * (1 - pooled) * (1 / n1 + 1 / n2))

    return (share_first - share_second) / std_error
  
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """Turn a z-statistic into a p-value under the standard normal distribution.

    Args:
        z_stat: value of the z-statistic.
        alternative: 'two-sided' (default), 'less' or 'greater'.

    Returns:
        The p-value for the requested alternative:
        two-sided -> 2 * P(Z > |z_stat|), less -> P(Z < z_stat),
        greater -> P(Z > z_stat).

    Raises:
        ValueError: if `alternative` is not one of the three accepted values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # Upper tails use the survival function instead of `1 - cdf`: for large
    # statistics cdf rounds to 1.0 and the subtraction collapses to exactly 0,
    # while sf keeps the (tiny) tail probability.
    if alternative == 'two-sided':
        return 2 * stats.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return stats.norm.cdf(z_stat)

    # Only 'greater' can reach this point after the validation above.
    return stats.norm.sf(z_stat)

In [11]:
# Inputs for the independent-samples test: a / n1 is compared against b / n2.
a, n1 = 10, 34
b, n2 = 4, 16

# One-sided ('greater') p-value, rounded to four decimals for display.
z_stat_ind = proportions_diff_z_stat_ind(a, b, n1, n2)
round(proportions_diff_z_test(z_stat_ind, 'greater'), 4)

0.3729

### 4

In [16]:
# Read the tab-separated banknote table and preview its first five rows.
banknotes = pd.read_csv('banknotes.txt', delimiter='\t')
banknotes.head(5)

Unnamed: 0,X1,X2,X3,X4,X5,X6,real
0,214.8,131.0,131.1,9.0,9.7,141.0,1
1,214.6,129.7,129.7,8.1,9.5,141.7,1
2,214.8,129.7,129.7,8.7,9.6,142.2,1
3,214.8,129.7,129.6,7.5,10.4,142.0,1
4,215.0,129.6,129.7,10.4,7.7,141.8,1


In [71]:
# Separate the target column from the remaining columns.
y = banknotes['real']
X = banknotes.drop(columns=['real'])

# An integer test_size holds out that many rows; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=50, random_state=1
)

# Two disjoint feature subsets, one per classifier.
features1 = ['X1', 'X2', 'X3']
features2 = ['X4', 'X5', 'X6']


def _fit_and_predict(columns):
    """Fit a liblinear logistic regression on `columns` of the training
    split and return the fitted model with its test-split predictions."""
    model = LogisticRegression(solver='liblinear')
    model.fit(X_train[columns], y_train)
    return model, model.predict(X_test[columns])


lr1, pred1 = _fit_and_predict(features1)
lr2, pred2 = _fit_and_predict(features2)

In [72]:
# Absolute difference between true labels and each classifier's predictions
# (presumably a 0/1 error indicator, given binary labels -- confirm).
err_pred1 = (y_test - pred1).abs()
err_pred2 = (y_test - pred2).abs()

In [73]:
def proportions_diff_z_stat_rel(sample1, sample2):
    """Z-statistic for the difference of two proportions, paired samples.

    The i-th element of `sample1` is paired with the i-th element of
    `sample2`. Only discordant pairs contribute to the statistic:
    f counts pairs equal to (1, 0) and g counts pairs equal to (0, 1).

    Args:
        sample1: iterable of observations for the first condition.
        sample2: iterable of observations for the second condition.

    Returns:
        (f - g) / sqrt(f + g - (f - g)**2 / n), with n the number of pairs.
        The denominator is zero when no pair is discordant, so the result
        is not a usable statistic in that case.

    Raises:
        ValueError: if the samples differ in length. Pairing them would
            otherwise silently drop the unmatched tail of the longer one.
        ZeroDivisionError: if both samples are empty.
    """
    first, second = list(sample1), list(sample2)
    if len(first) != len(second):
        raise ValueError("sample1 and sample2 must have the same length")
    n = len(first)

    # Discordant pair counts in each direction.
    f = sum(1 for x, y in zip(first, second) if x == 1 and y == 0)
    g = sum(1 for x, y in zip(first, second) if x == 0 and y == 1)

    return (f - g) / np.sqrt(f + g - (f - g)**2 / n )

In [74]:
# Two-sided p-value for the paired comparison of the two error vectors.
z_stat_rel = proportions_diff_z_stat_rel(err_pred1, err_pred2)
p_value = proportions_diff_z_test(z_stat_rel)
p_value

0.0032969384555543435

### 5

In [75]:
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for the difference of two
    proportions measured on paired samples.

    The i-th element of `sample1` is paired with the i-th element of
    `sample2`. f counts pairs equal to (1, 0) and g counts pairs equal
    to (0, 1).

    Args:
        sample1: iterable of observations for the first condition.
        sample2: iterable of observations for the second condition.
        alpha: significance level; the interval has level 1 - alpha.

    Returns:
        Tuple (left_boundary, right_boundary) centred on (f - g) / n with
        half-width z * sqrt((f + g) / n**2 - (f - g)**2 / n**3), where z is
        the standard normal quantile at 1 - alpha / 2.

    Raises:
        ValueError: if the samples differ in length. Pairing them would
            otherwise silently drop the unmatched tail of the longer one.
        ZeroDivisionError: if both samples are empty.
    """
    first, second = list(sample1), list(sample2)
    if len(first) != len(second):
        raise ValueError("sample1 and sample2 must have the same length")
    n = len(first)

    z = stats.norm.ppf(1 - alpha / 2.)

    # Discordant pair counts in each direction.
    f = sum(1 for x, y in zip(first, second) if x == 1 and y == 0)
    g = sum(1 for x, y in zip(first, second) if x == 0 and y == 1)

    center = (f - g) / n
    half_width = z * np.sqrt((f + g) / n**2 - (f - g)**2 / n**3)
    return (center - half_width, center + half_width)

In [76]:
# Confidence interval (default alpha) for the paired difference in errors;
# show the full interval, then its lower bound rounded to four decimals.
interval = proportions_diff_confint_rel(err_pred1, err_pred2)
lower_bound = interval[0]
print(interval)
print(round(lower_bound, 4))

(0.059945206279614305, 0.3000547937203857)
0.0599


### 6

In [69]:
# One-sample z-test with known sigma: null mean and standard deviation,
# followed by the observed sample mean and sample size.
mu_0, sigma = 525, 100
X_mean, n = 541.4, 100

# Standardise the sample mean by the standard error of the mean.
std_error = sigma / np.sqrt(n)
z = (X_mean - mu_0) / std_error

# Upper-tail p-value, rounded to four decimals for display.
round(1 - stats.norm.cdf(z), 4)

0.0505

### 7

In [70]:
# Same z-test as the previous cell with a slightly larger sample mean;
# mu_0 and sigma are reused from the cell above.
X_mean, n = 541.5, 100

std_error = sigma / np.sqrt(n)
z = (X_mean - mu_0) / std_error

# Upper-tail p-value, rounded to four decimals for display.
round(1 - stats.norm.cdf(z), 4)

0.0495