In [1]:
import numpy as np
import pandas as pd

import scipy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import  LogisticRegression

In [2]:
def proportions_diff_z_stat_ind(sample1, sample2):
    """Z-statistic for the difference between the proportions of two samples.

    Each sample is a sized collection of 0/1 (or boolean) values; its
    proportion is ``sum(sample) / len(sample)``. The standard error is built
    from the pooled proportion of both samples.

    Parameters
    ----------
    sample1, sample2 : sized iterables of 0/1 values

    Returns
    -------
    float
        ``(p1 - p2) / sqrt(P * (1 - P) * (1/n1 + 1/n2))`` where ``P`` is the
        pooled proportion.
    """
    size_a, size_b = len(sample1), len(sample2)

    share_a = float(sum(sample1)) / size_a
    share_b = float(sum(sample2)) / size_b

    # Pooled proportion: size-weighted average of the two sample proportions.
    pooled = float(share_a * size_a + share_b * size_b) / (size_a + size_b)
    std_err = np.sqrt(pooled * (1 - pooled) * (1. / size_a + 1. / size_b))

    return (share_a - share_b) / std_err

In [3]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """P-value of a z-statistic under the standard normal distribution.

    Parameters
    ----------
    z_stat : float or array-like
        Observed z-statistic(s).
    alternative : {'two-sided', 'less', 'greater'}
        'two-sided' returns the probability of a value at least as extreme as
        ``|z_stat|`` in either tail, 'less' the lower-tail probability and
        'greater' the upper-tail probability.

    Returns
    -------
    float or ndarray
        The p-value(s).

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three recognised names.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # Upper tails use the survival function instead of ``1 - cdf``: the
    # subtraction cancels to exactly 0.0 once cdf rounds to 1.0 (large |z|),
    # whereas ``sf`` keeps the small tail probability.
    if alternative == 'two-sided':
        return 2 * scipy.stats.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    return scipy.stats.norm.sf(z_stat)

In [4]:
def proportions_diff_z_stat_rel(sample1, sample2):
    """Z-statistic for the difference of proportions on paired 0/1 samples.

    The samples are matched element by element. Only discordant pairs
    contribute: ``f`` counts pairs ``(1, 0)`` and ``g`` counts pairs
    ``(0, 1)``.

    Parameters
    ----------
    sample1, sample2 : iterables of 0/1 (or boolean) values, equal length

    Returns
    -------
    float
        ``(f - g) / sqrt(f + g - (f - g)**2 / n)`` with ``n`` the number of
        pairs. When there are no discordant pairs the denominator is zero and
        the statistic is undefined.

    Raises
    ------
    ValueError
        If the two samples differ in length. ``zip`` would otherwise drop the
        unmatched tail silently and return a statistic for truncated data.
    """
    sample1, sample2 = list(sample1), list(sample2)
    if len(sample1) != len(sample2):
        raise ValueError("paired samples must have the same length")
    n = len(sample1)

    pairs = list(zip(sample1, sample2))
    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    return float(f - g) / np.sqrt(f + g - float((f - g)**2) / n)

In [5]:
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for the difference of
    proportions on paired 0/1 samples.

    The samples are matched element by element; ``f`` counts pairs ``(1, 0)``
    and ``g`` counts pairs ``(0, 1)``. The interval is centred on
    ``(f - g) / n``.

    Parameters
    ----------
    sample1, sample2 : iterables of 0/1 (or boolean) values, equal length
    alpha : float
        Significance level; the interval uses the ``1 - alpha / 2`` standard
        normal quantile.

    Returns
    -------
    tuple of float
        ``(left_boundary, right_boundary)``.

    Raises
    ------
    ValueError
        If the two samples differ in length. ``zip`` would otherwise drop the
        unmatched tail silently and return an interval for truncated data.
    """
    sample1, sample2 = list(sample1), list(sample2)
    if len(sample1) != len(sample2):
        raise ValueError("paired samples must have the same length")
    n = len(sample1)

    z = scipy.stats.norm.ppf(1 - alpha / 2.)

    pairs = list(zip(sample1, sample2))
    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    # Both boundaries share the same centre and half-width; compute each once.
    center = float(f - g) / n
    half_width = z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)

    return (center - half_width, center + half_width)

In [6]:
# Two 0/1 samples: 4 ones among 16 observations and 10 ones among 34.
data1 = [1] * 4 + [0] * 12
data2 = [1] * 10 + [0] * 24

# P-value for the 'less' alternative on the difference of the two sample
# proportions, rounded to 4 decimals.
print(round(proportions_diff_z_test(proportions_diff_z_stat_ind(data1, data2), 'less'), 4))

0.3729


In [7]:
# Load the tab-separated banknotes table and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably a row
# index saved into the file; consider index_col=0 — confirm against the file.
data = pd.read_csv("banknotes.txt", delimiter="\t")
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,real
0,214.8,131.0,131.1,9.0,9.7,141.0,1
1,214.6,129.7,129.7,8.1,9.5,141.7,1
2,214.8,129.7,129.7,8.7,9.6,142.2,1
3,214.8,129.7,129.6,7.5,10.4,142.0,1
4,215.0,129.6,129.7,10.4,7.7,141.8,1


In [8]:
# Target column plus two feature sets selected by inclusive label slices:
# columns X1..X3 and columns X4..X6.
y = data['real']
X1 = data.loc[:, 'X1':'X3']
X2 = data.loc[:, 'X4':'X6']

# Both splits use the same test_size and random_state on frames with the same
# rows, so they should hold out the same observations; the paired comparison
# of the two models below depends on that.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y, test_size=0.25, random_state=1)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y, test_size=0.25, random_state=1)

In [9]:
# Logistic regression with default parameters on the X1..X3 features;
# predict labels for the held-out rows.
model1 = LogisticRegression()
model1.fit(X_train1, y_train1)
predict1 = model1.predict(X_test1)

In [10]:
# Logistic regression with default parameters on the X4..X6 features;
# predict labels for the held-out rows.
model2 = LogisticRegression()
model2.fit(X_train2, y_train2)
predict2 = model2.predict(X_test2)

In [11]:
# Two-sided p-value (the default alternative) for the paired comparison of the
# two models' misclassification indicators on the test rows.
proportions_diff_z_test(proportions_diff_z_stat_rel(y_test1 != predict1, y_test2 != predict2))

0.0032969384555543435

In [12]:
# NOTE(review): prints a hard-coded literal; presumably a manually entered
# answer — confirm, or replace with a computed value.
print(3)

3


In [13]:
# Confidence interval (default alpha = 0.05) for the paired difference between
# the two models' misclassification proportions; print the lower boundary
# rounded to 4 decimals.
interval = proportions_diff_confint_rel(y_test1 != predict1, y_test2 != predict2)
print(round(interval[0], 4))

0.0599


In [14]:
# z-statistic of the form (x - mu) / (sigma / sqrt(n)) with x = 541.4,
# mu = 525, sigma = 100 and n = 100.
z=(541.4 - 525)/(100/np.sqrt(100))
# Upper-tail standard normal probability beyond |z|, rounded to 4 decimals.
print(round((1-scipy.stats.norm.cdf(abs(z))), 4))

0.0505


In [15]:
# z-statistic of the form (x - mu) / (sigma / sqrt(n)) with x = 541.5,
# mu = 525, sigma = 100 and n = 100.
z=(541.5 - 525)/(100/np.sqrt(100))
# Upper-tail standard normal probability beyond |z|, rounded to 4 decimals.
print(round(1-scipy.stats.norm.cdf(abs(z)), 4))

0.0495
