In [1]:
import numpy as np
import pandas as pd

import scipy
from statsmodels.stats.weightstats import *

In [28]:
z = (9.57 - 9.5) / 0.4 * np.sqrt(160)
scipy.stats.norm.sf(z)*2
stats.norm.sf(z)*2

0.026856695507523776

In [3]:
df = pd.read_csv('diamonds.txt', sep='\t')
df.head()

Unnamed: 0,carat,depth,table,price,x,y,z
0,0.23,61.5,55.0,326,3.95,3.98,2.43
1,0.21,59.8,61.0,326,3.89,3.84,2.31
2,0.23,56.9,65.0,327,4.05,4.07,2.31
3,0.29,62.4,58.0,334,4.2,4.23,2.63
4,0.31,63.3,58.0,335,4.34,4.35,2.75


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [5]:
X = df.drop('price', axis='columns')
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((40455, 6), (13485, 6), (40455,), (13485,))

In [6]:
lr = LinearRegression()
lr.fit(X_train, y_train)
rf = RandomForestRegressor(n_estimators=100, random_state=1)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=1, verbose=0,
                      warm_start=False)

In [7]:
lr_diff = np.abs(lr.predict(X_test) - y_test)
rf_diff = np.abs(rf.predict(X_test) - y_test)
lr_diff.mean(), rf_diff.mean()

(890.3764004285589, 779.711758355821)

In [14]:
stats.ttest_rel(lr_diff, rf_diff)

Ttest_relResult(statistic=18.037259744511335, pvalue=6.936823477539177e-72)

In [23]:
DescrStatsW(lr_diff - rf_diff).zconfint_mean()

(98.63960523805434, 122.68967890743124)

In [27]:
diff = lr_diff - rf_diff
mean = diff.mean()
std = diff.std(ddof=1)
mean - 1.96*std/np.sqrt(len(diff)), mean + 1.96*std/np.sqrt(len(diff))

(98.63938427112015, 122.68989987436507)

In [29]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):    
    z = scipy.stats.norm.ppf(1 - alpha / 2.)
    
    p1 = float(sum(sample1)) / len(sample1)
    p2 = float(sum(sample2)) / len(sample2)
    
    left_boundary = (p1 - p2) - z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    right_boundary = (p1 - p2) + z * np.sqrt(p1 * (1 - p1)/ len(sample1) + p2 * (1 - p2)/ len(sample2))
    
    return (left_boundary, right_boundary)

In [30]:
proportions_diff_confint_ind([1]*10 + [0]*24,[1]*4 + [0]*12)

(-0.2175577216559601, 0.3057930157736072)

In [31]:
def proportions_diff_z_stat_ind(sample1, sample2):
    n1 = len(sample1)
    n2 = len(sample2)
    
    p1 = float(sum(sample1)) / n1
    p2 = float(sum(sample2)) / n2 
    P = float(p1*n1 + p2*n2) / (n1 + n2)
    
    return (p1 - p2) / np.sqrt(P * (1 - P) * (1. / n1 + 1. / n2))

In [32]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")
    
    if alternative == 'two-sided':
        return 2 * (1 - scipy.stats.norm.cdf(np.abs(z_stat)))
    
    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    if alternative == 'greater':
        return 1 - scipy.stats.norm.cdf(z_stat)

In [87]:
proportions_diff_z_test(proportions_diff_z_stat_ind([1]*4 + [0]*12, [1]*10 + [0]*24), alternative='less')

0.37293045872523534

In [34]:
df = pd.read_csv('banknotes.txt', sep='\t')
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,real
0,214.8,131.0,131.1,9.0,9.7,141.0,1
1,214.6,129.7,129.7,8.1,9.5,141.7,1
2,214.8,129.7,129.7,8.7,9.6,142.2,1
3,214.8,129.7,129.6,7.5,10.4,142.0,1
4,215.0,129.6,129.7,10.4,7.7,141.8,1


In [36]:
X1 = df[['X1', 'X2', 'X3']]
X2 = df[['X4', 'X5', 'X6']]
y = df['real']

X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(X1, X2, y, test_size=0.25)
X1_train.shape, X1_test.shape, X2_train.shape, X2_test.shape, y_train.shape, y_test.shape

((150, 3), (50, 3), (150, 3), (50, 3), (150,), (50,))

In [82]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr1 = LogisticRegression(multi_class='ovr', n_jobs=1, solver='liblinear')
lr1.fit(X1_train, y_train)
lr1_accuracy = lr1.predict(X1_test) == y_test

lr2 = LogisticRegression(multi_class='ovr', n_jobs=1, solver='liblinear')
lr2.fit(X2_train, y_train)
lr2_accuracy = lr2.predict(X2_test) == y_test

In [83]:
np.sum(lr1_accuracy) / len(lr1_accuracy), np.sum(lr2_accuracy) / len(lr2_accuracy)

(0.86, 0.98)

In [84]:
def proportions_diff_z_stat_rel(sample1, sample2):
    sample = list(zip(sample1, sample2))
    n = len(sample)
    
    f = sum([1 if (x[0] == 1 and x[1] == 0) else 0 for x in sample])
    g = sum([1 if (x[0] == 0 and x[1] == 1) else 0 for x in sample])
    
    return float(f - g) / np.sqrt(f + g - float((f - g)**2) / n )

In [86]:
proportions_diff_z_test(proportions_diff_z_stat_rel(lr2_accuracy, lr1_accuracy))

0.026165407141421015

In [67]:
proportions_diff_confint_ind(lr1_accuracy, lr2_accuracy)

(-0.22371154563472453, -0.016288454365275457)

In [55]:
z = (541.5 - 525) / (100 / np.sqrt(100))
z

1.65

In [56]:
1-stats.norm.cdf(z)

0.0494714680336481