In [36]:
import numpy as np
from scipy import stats
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
# Two binary samples stored as column vectors (34 and 16 observations),
# initialised to all zeros; the successes are filled in by the next cell.
p1 = np.zeros((34, 1))
p2 = np.zeros((16, 1))

In [6]:
# Mark the first 10 observations of p1 and the first 4 of p2 as successes (1).
# Slice assignment writes the same rows as an element-by-element loop, but in a
# single vectorised step and without leaving a loop index behind in the namespace.
p1[:10] = 1
p2[:4] = 1

In [8]:
def proportions_diff_z_stat_ind(sample1, sample2):
    """Z-statistic for the difference of two independent sample proportions.

    Each sample is a collection of 0/1 outcomes. The statistic is
    ``(p1 - p2) / sqrt(P * (1 - P) * (1/n1 + 1/n2))`` where ``p1`` and ``p2``
    are the sample proportions and ``P`` is the pooled proportion.

    Parameters
    ----------
    sample1, sample2 : array-like
        Binary observations. Any shape is accepted (flat sequence, ``(n, 1)``
        column vector, nested list); every element counts as one observation.

    Returns
    -------
    float
        The z-statistic. If the pooled proportion is exactly 0 or 1 the
        standard error is zero and no finite statistic is produced.

    Raises
    ------
    ZeroDivisionError
        If either sample is empty.
    """
    # Flatten first: with an (n, 1) column vector the built-in sum() returns a
    # 1-element array, and float() on such an array is deprecated by NumPy.
    obs1 = np.asarray(sample1).ravel()
    obs2 = np.asarray(sample2).ravel()
    n1 = obs1.size
    n2 = obs2.size

    p1 = float(obs1.sum()) / n1
    p2 = float(obs2.sum()) / n2
    # Pooled proportion of successes across both samples.
    P = (p1 * n1 + p2 * n2) / (n1 + n2)

    return (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))

In [9]:
# z-statistic for the difference between the proportions of successes in p1 and p2.
z = proportions_diff_z_stat_ind(p1, p2)

In [18]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """P-value of a z-statistic under the standard normal distribution.

    Parameters
    ----------
    z_stat : float or array-like
        Observed z-statistic(s).
    alternative : {'two-sided', 'less', 'greater'}
        Which tail(s) to evaluate: both tails beyond ``|z_stat|``, the lower
        tail up to ``z_stat``, or the upper tail from ``z_stat``.

    Returns
    -------
    float or ndarray
        The p-value(s).

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three recognised values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # Upper-tail probabilities use the survival function rather than 1 - cdf:
    # for large z the cdf rounds to 1.0 and the subtraction would return exactly 0.
    if alternative == 'two-sided':
        return 2 * stats.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return stats.norm.cdf(z_stat)

    # Only 'greater' remains after the validation above.
    return stats.norm.sf(z_stat)

In [23]:
# One-sided p-value: upper-tail probability of z under the standard normal.
proportions_diff_z_test(z, alternative='greater')

0.37293045872523534

In [27]:
# Load the tab-separated banknote measurements (path is relative to the working
# directory) and preview the first rows.
data = pd.read_csv('banknotes.txt', sep='\t')
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,real
0,214.8,131.0,131.1,9.0,9.7,141.0,1
1,214.6,129.7,129.7,8.1,9.5,141.7,1
2,214.8,129.7,129.7,8.7,9.6,142.2,1
3,214.8,129.7,129.6,7.5,10.4,142.0,1
4,215.0,129.6,129.7,10.4,7.7,141.8,1


In [28]:
# Split into train and test sets: 'real' is the target, every other column is a
# feature. An integer test_size holds out that many rows (50); random_state=1
# fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(data.drop('real', axis=1), data['real'],
                                                    random_state=1, test_size=50)

In [32]:
# Inspect the first feature subset (X1-X3) of the training rows.
X_train[['X1', 'X2', 'X3']]

Unnamed: 0,X1,X2,X3
98,215.1,130.0,129.8
123,215.1,130.7,130.4
119,214.4,130.2,129.9
53,215.4,130.2,130.2
33,215.6,130.4,130.1
...,...,...,...
133,214.6,130.2,130.4
137,214.9,130.7,130.3
72,215.0,129.6,130.2
140,214.8,130.2,130.3


In [72]:
# Fit two logistic-regression classifiers on the same training rows but on
# disjoint feature subsets: logit1 sees X1-X3, logit2 sees X4-X6. Both use the
# same solver and random_state so the feature subset is the only difference.
logit1 = LogisticRegression(random_state=1, solver='liblinear')
logit2 = LogisticRegression(random_state=1, solver='liblinear')
logit1.fit(X_train[['X1', 'X2', 'X3']], y_train)
logit2.fit(X_train[['X4', 'X5', 'X6']], y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [71]:
LogisticRegression?

In [73]:
# Predict the held-out rows with each model, using the same columns it was fitted on.
y_test_1 = logit1.predict(X_test[['X1', 'X2', 'X3']])
y_test_2 = logit2.predict(X_test[['X4', 'X5', 'X6']])

In [74]:
# Test-set accuracy of each model, displayed as a (logit1, logit2) pair.
accuracy_score(y_test, y_test_1), accuracy_score(y_test, y_test_2)

(0.8, 0.98)

In [75]:
# Absolute difference between the true label and each model's prediction, per test row.
# Presumably labels are coded 0/1, which makes these per-row error indicators
# (1 = misclassified) — confirm against the 'real' column.
y_1 = abs(y_test - y_test_1)
y_2 = abs(y_test - y_test_2)

In [76]:
# NOTE(review): proportions_diff_z_stat_rel is only defined in the following cell
# (In [77]); run top-to-bottom on a fresh kernel, this call raises NameError.
# The definition should be moved above this cell.
# z-statistic for the paired comparison of the two models' per-row results.
y_z = proportions_diff_z_stat_rel(y_1, y_2)
y_z

2.9386041680175268

In [77]:
def proportions_diff_z_stat_rel(sample1, sample2):
    """Z-statistic for the difference of two proportions on paired samples.

    The two samples are matched element by element. Only discordant pairs
    contribute: ``f`` counts pairs coded (1, 0) and ``g`` counts pairs coded
    (0, 1). The statistic is ``(f - g) / sqrt(f + g - (f - g)**2 / n)`` with
    ``n`` the number of pairs.

    Parameters
    ----------
    sample1, sample2 : iterable
        Paired 0/1 observations; pairing stops at the shorter input.

    Returns
    -------
    float
        The z-statistic.

    Raises
    ------
    ZeroDivisionError
        If there are no pairs.
    """
    pairs = list(zip(sample1, sample2))
    n_pairs = len(pairs)

    first_only = 0   # pairs coded (1, 0)
    second_only = 0  # pairs coded (0, 1)
    for left, right in pairs:
        if left == 1 and right == 0:
            first_only += 1
        elif left == 0 and right == 1:
            second_only += 1

    diff = first_only - second_only
    variance_term = first_only + second_only - float(diff**2) / n_pairs
    return float(diff) / np.sqrt(variance_term)

In [78]:
# Two-sided p-value (the default alternative) for the paired z-statistic.
proportions_diff_z_test(y_z)

0.0032969384555543435

In [79]:
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Confidence interval for the difference of two proportions on paired samples.

    The two samples are matched element by element. With ``f`` the number of
    pairs coded (1, 0), ``g`` the number coded (0, 1) and ``n`` the number of
    pairs, the interval is
    ``(f - g)/n  +/-  z * sqrt((f + g)/n**2 - (f - g)**2/n**3)``
    where ``z`` is the ``1 - alpha/2`` quantile of the standard normal.

    Parameters
    ----------
    sample1, sample2 : iterable
        Paired 0/1 observations; pairing stops at the shorter input.
    alpha : float
        Significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple of float
        ``(left_boundary, right_boundary)``.

    Raises
    ------
    ZeroDivisionError
        If there are no pairs.
    """
    # This notebook imports `from scipy import stats`; the bare name `scipy`
    # is never bound, so the quantile must be looked up through `stats`.
    z = stats.norm.ppf(1 - alpha / 2.)
    sample = list(zip(sample1, sample2))
    n = len(sample)

    f = sum(1 for a, b in sample if a == 1 and b == 0)
    g = sum(1 for a, b in sample if a == 0 and b == 1)

    # Both bounds share the same centre and half-width; compute each once.
    center = float(f - g) / n
    half_width = z * np.sqrt(float(f + g) / n**2 - float((f - g)**2) / n**3)
    return (center - half_width, center + half_width)

In [80]:
# Confidence interval for the difference between the two paired proportions,
# at the function's default alpha = 0.05.
proportions_diff_confint_rel(y_1, y_2)

(0.059945206279614305, 0.3000547937203857)

In [81]:
# Inputs for the z-test on a mean computed in the following cells.
# Reference mean that the sample mean is compared against.
gmat_mean = 525
# Standard deviation used to form the standard error.
gmat_sigma = 100
# NOTE(review): `n` is not referenced by any cell visible here — presumably the
# size of the reference population; confirm or remove.
n = 200000
# Sample size used in the standard error.
n_pr = 100
# Observed sample mean under test.
gmat_pr_mean = 541.4

In [82]:
# z-statistic for a sample mean with a given sigma:
# (sample mean - reference mean) / (sigma / sqrt(sample size)).
# NOTE: this rebinds `z`, which earlier held the two-proportion statistic.
z = (gmat_pr_mean - gmat_mean)/(gmat_sigma/np.sqrt(n_pr))
z

1.6399999999999977

In [87]:
# Upper-tail p-value for the mean z-statistic. The helper only evaluates the
# standard normal distribution, so it applies here despite its proportions-specific name.
proportions_diff_z_test(z, alternative='greater')

0.05050258347410397

In [93]:
# Same statistic for a sample mean of 541.5 instead of gmat_pr_mean (541.4),
# to see how the p-value moves around the 0.05 level. Rebinds `z` again.
z = (541.5 - gmat_mean)/(gmat_sigma/np.sqrt(n_pr))
z

1.65

In [97]:
# Upper-tail p-value for the 541.5 sample-mean case.
proportions_diff_z_test(z, alternative='greater')

0.0494714680336481