In [219]:
import numpy as np
import pandas as pd

import scipy
from statsmodels.stats.weightstats import *
from statsmodels.stats.proportion import proportion_confint

In [220]:
# Binary outcome vectors (1 = positive outcome, 0 = none):
# 4 ones out of 16 in the experiment group, 10 ones out of 34 in the control group.
experimentGroup = [1] * 4 + [0] * 12
controlGroup = [1] * 10 + [0] * 24

In [221]:
# Wilson score intervals for the share of ones in each group.
# No alpha is passed, so the library default significance level applies.
conf_interval_experimentGroup = proportion_confint(
    count=sum(experimentGroup),
    nobs=len(experimentGroup),
    method='wilson',
)
conf_interval_controlGroup = proportion_confint(
    count=sum(controlGroup),
    nobs=len(controlGroup),
    method='wilson',
)

In [222]:
# Each interval is a (lower, upper) tuple that fills the two %f placeholders.
print('95%% confidence interval for a yawn, experimentGroup: [%f, %f]'
      % conf_interval_experimentGroup)
print('95%% confidence interval for a yawn, controlGroup: [%f, %f]'
      % conf_interval_controlGroup)

95% confidence interval for a yawn, experimentGroup: [0.101821, 0.494983]
95% confidence interval for a yawn, controlGroup: [0.168346, 0.461689]


In [223]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for p1 - p2 (independent samples).

    Parameters
    ----------
    sample1, sample2 : sized iterables of 0/1 values
        Each sample's proportion is taken as ``sum(sample) / len(sample)``.
    alpha : float, default 0.05
        Significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)`` of the interval for ``p1 - p2``.

    Raises
    ------
    ZeroDivisionError
        If either sample is empty.
    """
    quantile = scipy.stats.norm.ppf(1 - alpha / 2.)

    size1, size2 = len(sample1), len(sample2)
    share1 = float(sum(sample1)) / size1
    share2 = float(sum(sample2)) / size2

    # Unpooled standard error of the difference; shared by both boundaries.
    std_err = np.sqrt(share1 * (1 - share1)/ size1 + share2 * (1 - share2)/ size2)
    diff = share1 - share2
    half_width = quantile * std_err

    return (diff - half_width, diff + half_width)

In [224]:
def proportions_diff_z_stat_ind(sample1, sample2):
    """Z statistic for the difference of two proportions (independent samples).

    Parameters
    ----------
    sample1, sample2 : sized iterables of 0/1 values

    Returns
    -------
    float
        ``(p1 - p2)`` divided by the standard error computed from the
        pooled proportion of both samples.

    Raises
    ------
    ZeroDivisionError
        If either sample is empty.
    """
    size1, size2 = len(sample1), len(sample2)

    share1 = float(sum(sample1)) / size1
    share2 = float(sum(sample2)) / size2

    # Pooled proportion: size-weighted average of the two sample proportions.
    pooled = float(share1*size1 + share2*size2) / (size1 + size2)
    std_err = np.sqrt(pooled * (1 - pooled) * (1. / size1 + 1. / size2))

    return (share1 - share2) / std_err

In [225]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """P-value of a Z statistic under the standard normal distribution.

    Parameters
    ----------
    z_stat : float or array-like
        Observed Z statistic(s).
    alternative : {'two-sided', 'less', 'greater'}, default 'two-sided'
        'two-sided' returns ``2 * P(Z >= |z_stat|)``,
        'less' returns ``P(Z <= z_stat)``,
        'greater' returns ``P(Z >= z_stat)``.

    Returns
    -------
    float or ndarray
        The p-value(s) for the requested alternative.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three recognised values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # Use the survival function rather than ``1 - cdf``: the subtraction
    # cancels to exactly 0.0 for large |z|, whereas ``sf`` keeps the tail mass.
    if alternative == 'two-sided':
        return 2 * scipy.stats.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    return scipy.stats.norm.sf(z_stat)

In [226]:
# Interval for (control proportion - experiment proportion), independent samples.
print("95%% confidence interval for a difference between proportions: [%f, %f]"
      % proportions_diff_confint_ind(controlGroup, experimentGroup))

95% confidence interval for a difference between proportions: [-0.217558, 0.305793]


In [230]:
# One-sided test. With arguments ordered (controlGroup, experimentGroup),
# 'greater' tests whether the control proportion exceeds the experiment one.
# NOTE(review): confirm the group labels match the intended study design.
p = proportions_diff_z_test(
    proportions_diff_z_stat_ind(controlGroup, experimentGroup),
    alternative='greater',
)
print("p-value: %f" % p)

p-value: 0.372930


In [149]:
# Load the tab-separated banknote table (first row is the header) and
# rename the seven columns to X1..X6 plus the 'real' label.
data = pd.read_csv('banknotes.txt', sep='\t', header=0)
data.columns = ['X%d' % idx for idx in range(1, 7)] + ['real']

In [150]:
# Preview the first rows to check the renamed columns.
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,real
0,214.8,131.0,131.1,9.0,9.7,141.0,1
1,214.6,129.7,129.7,8.1,9.5,141.7,1
2,214.8,129.7,129.7,8.7,9.6,142.2,1
3,214.8,129.7,129.6,7.5,10.4,142.0,1
4,215.0,129.6,129.7,10.4,7.7,141.8,1


In [151]:
# Summary statistics for every column, including the 'real' label.
data.describe()

Unnamed: 0,X1,X2,X3,X4,X5,X6,real
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,214.896,130.1215,129.9565,9.4175,10.6505,140.4835,0.5
std,0.376554,0.361026,0.404072,1.444603,0.802947,1.152266,0.501255
min,213.8,129.0,129.0,7.2,7.7,137.8,0.0
25%,214.6,129.9,129.7,8.2,10.1,139.5,0.0
50%,214.9,130.2,130.0,9.1,10.6,140.45,0.5
75%,215.1,130.4,130.225,10.6,11.2,141.5,1.0
max,216.3,131.0,131.1,12.7,12.3,142.4,1.0


In [152]:
# Column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
X1      200 non-null float64
X2      200 non-null float64
X3      200 non-null float64
X4      200 non-null float64
X5      200 non-null float64
X6      200 non-null float64
real    200 non-null int64
dtypes: float64(6), int64(1)
memory usage: 11.0 KB


In [153]:
# Split the frame into the feature matrix (everything except 'real')
# and the target column.
X = data.drop('real', axis=1)
y = data['real']

In [154]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `train_test_split` now lives in `sklearn.model_selection`.
# Fall back to the old module only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression

In [155]:
# Hold out 25% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=1,
)

In [156]:
# Feature subset for the first model: columns X1-X3.
X_train1 = X_train.loc[:, ['X1', 'X2', 'X3']]
X_test1 = X_test.loc[:, ['X1', 'X2', 'X3']]

In [157]:
# Feature subset for the second model: columns X4-X6.
X_train2 = X_train.loc[:, ['X4', 'X5', 'X6']]
X_test2 = X_test.loc[:, ['X4', 'X5', 'X6']]

In [158]:
# Logistic regression with default settings on the X1-X3 features;
# `fit` returns the estimator itself, so it can be chained.
model1 = LogisticRegression().fit(X_train1, y_train)
y_predict1 = model1.predict(X_test1)

In [159]:
# Logistic regression with default settings on the X4-X6 features;
# `fit` returns the estimator itself, so it can be chained.
model2 = LogisticRegression().fit(X_train2, y_train)
y_predict2 = model2.predict(X_test2)

In [160]:
# Share of test rows that model1 predicts as class 1.
np.sum(y_predict1) / len(y_predict1)

0.46

In [161]:
# Share of test rows that model2 predicts as class 1.
np.sum(y_predict2) / len(y_predict2)

0.56

In [183]:
# Wilson intervals for each model's test error rate.
# |prediction - truth| is 1 for a misclassified row and 0 otherwise
# (assumes labels are coded 0/1), so the sum is the number of errors.
conf_interval_y1 = proportion_confint(
    count=sum(np.abs(y_predict1 - y_test)),
    nobs=len(y_predict1),
    method='wilson',
)
conf_interval_y2 = proportion_confint(
    count=sum(np.abs(y_predict2 - y_test)),
    nobs=len(y_predict2),
    method='wilson',
)

In [184]:
# Each interval is a (lower, upper) tuple that fills the two %f placeholders.
print('95%% confidence interval, y_predict1: [%f, %f]'
      % conf_interval_y1)
print('95%% confidence interval, y_predict2: [%f, %f]'
      % conf_interval_y2)

95% confidence interval, y_predict1: [0.112438, 0.330371]
95% confidence interval, y_predict2: [0.003539, 0.104954]


In [185]:
# Interval for (model1 error rate - model2 error rate) using the
# independent-samples formula.
# NOTE(review): both error vectors are computed on the same y_test rows.
print("95%% confidence interval for a difference between proportions: [%f, %f]"
      % proportions_diff_confint_ind(np.abs(y_predict1 - y_test),
                                     np.abs(y_predict2 - y_test)))

95% confidence interval for a difference between proportions: [0.062533, 0.297467]


In [232]:
# Two-sided test (the default alternative) on the independent-samples
# Z statistic for the two models' error indicators.
p = proportions_diff_z_test(
    proportions_diff_z_stat_ind(
        np.abs(y_predict1 - y_test),
        np.abs(y_predict2 - y_test),
    )
)
print("p-value: %f" % p)

p-value: 0.004022


In [202]:
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Confidence interval for p1 - p2 from paired (related) 0/1 samples.

    Parameters
    ----------
    sample1, sample2 : iterables of 0/1 values
        Observations are paired positionally with ``zip``, so any extra
        elements of the longer input are ignored.
    alpha : float, default 0.05
        Significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)`` of the interval.

    Raises
    ------
    ZeroDivisionError
        If there are no pairs.
    """
    quantile = scipy.stats.norm.ppf(1 - alpha / 2.)
    pairs = list(zip(sample1, sample2))
    total = len(pairs)

    # Discordant pairs: (1, 0) counts towards f, (0, 1) counts towards g.
    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    center = float(f - g) / total
    half_width = quantile * np.sqrt(float((f + g)) / total**2 - float((f - g)**2) / total**3)
    return (center - half_width, center + half_width)

In [203]:
def proportions_diff_z_stat_rel(sample1, sample2):
    """Z statistic for the difference of proportions in paired 0/1 samples.

    Parameters
    ----------
    sample1, sample2 : iterables of 0/1 values
        Observations are paired positionally with ``zip``, so any extra
        elements of the longer input are ignored.

    Returns
    -------
    float
        ``(f - g) / sqrt(f + g - (f - g)**2 / n)`` where ``f`` counts
        (1, 0) pairs, ``g`` counts (0, 1) pairs and ``n`` is the pair count.
    """
    pairs = list(zip(sample1, sample2))
    total = len(pairs)

    # Only discordant pairs contribute to the statistic.
    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    numerator = float(f - g)
    denominator = np.sqrt(f + g - float((f - g)**2) / total )
    return numerator / denominator

In [204]:
# Interval for (model1 error rate - model2 error rate) using the
# related-samples formula: both error vectors are paired row by row.
print("95%% confidence interval for a difference between proportions: [%f, %f]"
      % proportions_diff_confint_rel(np.abs(y_predict1 - y_test),
                                     np.abs(y_predict2 - y_test)))

95% confidence interval for a difference between proportions: [0.059945, 0.300055]


In [206]:
# Two-sided test (the default alternative) on the related-samples Z statistic.
print("p-value: %f" % proportions_diff_z_test(
    proportions_diff_z_stat_rel(
        np.abs(y_predict1 - y_test),
        np.abs(y_predict2 - y_test),
    )
))

p-value: 0.003297


In [211]:
# Reference mean and standard deviation used for the Z statistic below.
mu, sigma = 525, 100

In [216]:
# Observed mean and the sample size it is based on.
mu_test, n = 541.5, 100

In [217]:
# Z statistic: difference of the two means divided by sigma / sqrt(n).
# NOTE(review): the numerator is (mu - mu_test), so the value is negative when
# mu_test exceeds mu. The usual convention is (observed - reference); confirm
# the intended sign. The next cell takes abs(), so the p-value is unaffected.
Z_stat = (mu - mu_test)/(sigma/np.sqrt(n))
Z_stat

-1.65

In [218]:
# One-sided p-value: upper-tail probability of |Z_stat| under the standard
# normal distribution (sf is the survival function, 1 - cdf).
p_values = scipy.stats.norm.sf(abs(Z_stat))
p_values

0.0494714680336481