# Week 3
## Week 3-1  
Comparison of population means for paired data (i.e., when the observations in the two samples correspond one-to-one)

In [6]:
import numpy as np
from scipy import stats
import pandas as pd
from sympy import *

In [19]:
# Worked example 3-1: paired t statistic and confidence interval for the mean
# of the within-pair differences D = A - B.
# The DataFrame gets its own name so that `df` below can hold the degrees of
# freedom without clobbering the loaded data.
example_pairs = pd.read_excel('3-1_Paired Test and Estimation on the Difference of Population Means.xlsx', index_col=0)

# Assumes the first two columns are the two paired measurements -- TODO confirm
# against the workbook layout.
A = example_pairs.iloc[:, 0].to_numpy(dtype=float)
B = example_pairs.iloc[:, 1].to_numpy(dtype=float)

D = A - B
n = len(D)
if n < 2 or np.isnan(D).any():
    # With ddof=1 the variance is undefined for n < 2, and NaNs would silently
    # propagate into every statistic printed below.
    raise ValueError("paired t-test needs at least two complete (non-missing) pairs")

df = n - 1                 # degrees of freedom of the t distribution
D_mean = np.mean(D)
D_var = np.var(D, ddof=1)  # sample variance with the n - 1 denominator
D_se = np.sqrt(D_var / n)  # standard error of the mean difference

alpha = 0.01
t0 = stats.t.ppf(alpha, df)  # lower-tail quantile at level alpha
t_stat = D_mean / D_se

# Quantiles for the two-sided (1 - alpha) confidence interval.
lower_t = stats.t.ppf(alpha / 2, df)
upper_t = stats.t.ppf(1 - alpha / 2, df)

lower_intvl = D_mean - upper_t * D_se
upper_intvl = D_mean - lower_t * D_se

print("Sample mean={:.3f}, sample variance={:.6f}".format(D_mean, D_var))
print("Degree of freedom={}, t0={:.3f}, t-stat={:.3f}".format(df, t0, t_stat))
print("Confidence interval=[{:.3f}, {:.3f}]".format(lower_intvl, upper_intvl))

Sample mean=-0.290, sample variance=0.016556
Degree of freedom=9, t0=-2.821, t-stat=-7.127
Confidence interval=[-0.422, -0.158]


In [21]:
# Exercise 3-1
# Paired t statistic and confidence interval for the mean of D = A - B.
# The DataFrame gets its own name so that `df` below can hold the degrees of
# freedom without clobbering the loaded data.
practice_pairs = pd.read_excel('practice3-1.xlsx')

# Assumes the first two columns are the two paired measurements -- TODO confirm
# against the workbook layout.
A = practice_pairs.iloc[:, 0].to_numpy(dtype=float)
B = practice_pairs.iloc[:, 1].to_numpy(dtype=float)

D = A - B
n = len(D)
if n < 2 or np.isnan(D).any():
    # With ddof=1 the variance is undefined for n < 2, and NaNs would silently
    # propagate into every statistic printed below.
    raise ValueError("paired t-test needs at least two complete (non-missing) pairs")

df = n - 1                 # degrees of freedom of the t distribution
D_mean = np.mean(D)
D_var = np.var(D, ddof=1)  # sample variance with the n - 1 denominator
D_se = np.sqrt(D_var / n)  # standard error of the mean difference

alpha = 0.05
t0 = stats.t.ppf(alpha, df)  # lower-tail quantile at level alpha
t_stat = D_mean / D_se

# Quantiles for the two-sided (1 - alpha) confidence interval.
lower_t = stats.t.ppf(alpha / 2, df)
upper_t = stats.t.ppf(1 - alpha / 2, df)

lower_intvl = D_mean - upper_t * D_se
upper_intvl = D_mean - lower_t * D_se

print("Sample mean={:.3f}, sample variance={:.6f}".format(D_mean, D_var))
print("Degree of freedom={}, t0={:.3f}, t-stat={:.3f}".format(df, t0, t_stat))
print("Confidence interval=[{:.3f}, {:.3f}]".format(lower_intvl, upper_intvl))

Sample mean=1.362, sample variance=1.171250
Degree of freedom=7, t0=-1.895, t-stat=3.561
Confidence interval=[0.458, 2.267]


In [5]:
# Mini-Exam 3-1
# Paired t statistic and confidence interval for the mean of D = A - B.
# The DataFrame gets its own name so that `df` below can hold the degrees of
# freedom without clobbering the loaded data.
exam_pairs = pd.read_excel('mini-exam3-1.xlsx')

# Assumes the first two columns are the two paired measurements -- TODO confirm
# against the workbook layout.
A = exam_pairs.iloc[:, 0].to_numpy(dtype=float)
B = exam_pairs.iloc[:, 1].to_numpy(dtype=float)

D = A - B
n = len(D)
if n < 2 or np.isnan(D).any():
    # With ddof=1 the variance is undefined for n < 2, and NaNs would silently
    # propagate into every statistic printed below.
    raise ValueError("paired t-test needs at least two complete (non-missing) pairs")

df = n - 1                 # degrees of freedom of the t distribution
D_mean = np.mean(D)
D_var = np.var(D, ddof=1)  # sample variance with the n - 1 denominator
D_se = np.sqrt(D_var / n)  # standard error of the mean difference

alpha = 0.05
t0 = stats.t.ppf(alpha / 2, df)  # lower-tail quantile at level alpha / 2
t_stat = D_mean / D_se

# Quantiles for the two-sided (1 - alpha) confidence interval.
lower_t = stats.t.ppf(alpha / 2, df)
upper_t = stats.t.ppf(1 - alpha / 2, df)

lower_intvl = D_mean - upper_t * D_se
upper_intvl = D_mean - lower_t * D_se

print("Sample mean={:.3f}, sample variance={:.6f}".format(D_mean, D_var))
print("Degree of freedom={}, t0={:.3f}, t-stat={:.3f}".format(df, t0, t_stat))
print("Confidence interval=[{:.3f}, {:.3f}]".format(lower_intvl, upper_intvl))

Sample mean=-17.875, sample variance=3546.982143
Degree of freedom=7, t0=-2.365, t-stat=-0.849
Confidence interval=[-67.666, 31.916]


## Week 3-2
Tests and estimation of the difference between parameters for two populations assuming binomial distributions.
- Approximation method 1  
When the random variables $X_1$, $X_2$ independently follow $\mathcal{B}(n_1,p_1)$, $\mathcal{B}(n_2,p_2)$, then $\hat{p}^*_1 − \hat{p}^*_2 = \frac{X_1+0.5}{n_1+1} − \frac{X_2+0.5}{n_2+1}$ can be approximated by  
$$ \mathcal{N}\left(p_1−p_2, \frac{p_1(1−p_1)}{n_1} + \frac{p_2(1−p_2)}{n_2}\right)$$  
-  Approximation method 2  
 When the random variables $X_1$, $X_2$ independently follow $\mathcal{B}(n_1,p_1)$, $\mathcal{B}(n_2,p_2)$, then $L(\hat{p}^*_1) - L(\hat{p}^*_2)$, with $\hat{p}^*_i = \frac{X_i+0.5}{n_i+1}$, can be approximated by  
$$ \mathcal{N}\left(L(p_1)−L(p_2), \frac{1}{n_1p_1(1−p_1)} + \frac{1}{n_2p_2(1−p_2)}\right).$$  
&emsp; Here, $L$ is a logit transformation defined by $L(x)={\rm ln}\frac{x}{1−x}$.   

&emsp; When testing $H_0: p_1 = p_2 = p$, the pooled estimate $\hat{p}^* = \frac{X_1+X_2+0.5}{n_1+n_2+1}$ is substituted for the unknown common $p$ in the variance.

In [16]:
def ln(p):
    """Return the logit of p, log(p / (1 - p)).

    Despite the name this is not a plain natural log. Defining it here also
    overrides the `ln` brought in by `from sympy import *`.
    """
    return np.log(p / (1 - p))


# Sample sizes and observed counts for the two groups.
n1, n2 = 1000, 1000
X1, X2 = 60, 50

# Adjusted proportion estimates (X + 0.5) / (n + 1): per group, then pooled.
p1_hs, p2_hs = (X1 + 0.5) / (n1 + 1), (X2 + 0.5) / (n2 + 1)
p_hs = (X1 + X2 + 0.5) / (n1 + n2 + 1)

# Pieces shared by both test statistics.
pooled_pq = p_hs * (1 - p_hs)
inv_n_sum = 1/n1 + 1/n2

# Method 1: difference of the adjusted proportions, standardized with the pooled estimate.
U0_m1 = (p1_hs - p2_hs) / np.sqrt(pooled_pq * inv_n_sum)
# Method 2: difference of the logits, standardized with the pooled estimate.
U0_m2 = (ln(p1_hs) - ln(p2_hs)) * np.sqrt(pooled_pq) / np.sqrt(inv_n_sum)

print("p1*={:.4f}, p2*={:.4f}, p*={:.4f}".format(p1_hs, p2_hs, p_hs))
print("Approximations method 1 U0={:.3f}, method 2 U0={:.3f}".format(U0_m1, U0_m2))

# Standard-normal quantiles at alpha/2 and 1 - alpha/2.
alpha = 0.05
half_alpha = alpha / 2
lower_r = stats.norm.ppf(half_alpha)
upper_r = stats.norm.ppf(1 - half_alpha)

# NOTE: despite its name, `var` holds the square root of the estimated
# variance of p1* - p2* (i.e. a standard error), built from the unpooled estimates.
mean = p1_hs - p2_hs
var = np.sqrt(p1_hs * (1 - p1_hs) / n1 + p2_hs * (1 - p2_hs) / n2)
lower_intvl, upper_intvl = mean - upper_r * var, mean - lower_r * var

print("Region={:.3f}, {:.3f}".format(lower_r, upper_r))
print("Confidence interval={:.3f}, {:.3f}".format(lower_intvl, upper_intvl))

p1*=0.0604, p2*=0.0504, p*=0.0552
Approximations method 1 U0=0.978, method 2 U0=0.977
Region=-1.960, 1.960
Confidence interval=-0.010, 0.030


Estimation of the difference of the parameters of binomial distribution  
For point estimation, $\hat{p}_1 - \hat{p}_2$ (with $\hat{p}_i = X_i / n_i$) or $\hat{p}^*_1 - \hat{p}^*_2$ are used as estimators.

In [18]:
# Exercise 3-2
def ln(p):
    """Return the logit of p, log(p / (1 - p)).

    Despite the name this is not a plain natural log. Defining it here also
    overrides the `ln` brought in by `from sympy import *`.
    """
    return np.log(p / (1 - p))


# Sample sizes and observed counts for the two groups.
n1, n2 = 200, 200
X1, X2 = 35, 47

# Adjusted proportion estimates (X + 0.5) / (n + 1): per group, then pooled.
p1_hs, p2_hs = (X1 + 0.5) / (n1 + 1), (X2 + 0.5) / (n2 + 1)
p_hs = (X1 + X2 + 0.5) / (n1 + n2 + 1)

# Pieces shared by both test statistics.
pooled_pq = p_hs * (1 - p_hs)
inv_n_sum = 1/n1 + 1/n2

# Method 1: difference of the adjusted proportions, standardized with the pooled estimate.
U0_m1 = (p1_hs - p2_hs) / np.sqrt(pooled_pq * inv_n_sum)
# Method 2: difference of the logits, standardized with the pooled estimate.
U0_m2 = (ln(p1_hs) - ln(p2_hs)) * np.sqrt(pooled_pq) / np.sqrt(inv_n_sum)

print("p1*={:.4f}, p2*={:.4f}, p*={:.4f}".format(p1_hs, p2_hs, p_hs))
print("Approximations method 1 U0={:.3f}, method 2 U0={:.3f}".format(U0_m1, U0_m2))

# Lower-tail standard-normal quantile at level alpha.
alpha = 0.05
n_stat = stats.norm.ppf(alpha)
print("Region={:.3f}".format(n_stat))

# Standard-normal quantiles at alpha/2 and 1 - alpha/2.
half_alpha = alpha / 2
lower_r = stats.norm.ppf(half_alpha)
upper_r = stats.norm.ppf(1 - half_alpha)

# NOTE: despite its name, `var` holds the square root of the estimated
# variance of p1* - p2* (i.e. a standard error), built from the unpooled estimates.
mean = p1_hs - p2_hs
var = np.sqrt(p1_hs * (1 - p1_hs) / n1 + p2_hs * (1 - p2_hs) / n2)
lower_intvl, upper_intvl = mean - upper_r * var, mean - lower_r * var

print("Norm interval={:.3f}, {:.3f}".format(lower_r, upper_r))
print("Confidence interval={:.3f}, {:.3f}".format(lower_intvl, upper_intvl))

p1*=0.1766, p2*=0.2363, p*=0.2057
Approximations method 1 U0=-1.477, method 2 U0=-1.481
Region=-1.645
Region=-1.960, 1.960
Confidence interval=-0.139, 0.019


In [19]:
# Mini-Exam 3-2
def ln(p):
    """Return the logit of p, log(p / (1 - p)).

    Despite the name this is not a plain natural log. Defining it here also
    overrides the `ln` brought in by `from sympy import *`.
    """
    return np.log(p / (1 - p))


# Sample sizes and observed counts for the two groups.
n1, n2 = 50, 50
X1, X2 = 27, 18

# Adjusted proportion estimates (X + 0.5) / (n + 1): per group, then pooled.
p1_hs, p2_hs = (X1 + 0.5) / (n1 + 1), (X2 + 0.5) / (n2 + 1)
p_hs = (X1 + X2 + 0.5) / (n1 + n2 + 1)

# Pieces shared by both test statistics.
pooled_pq = p_hs * (1 - p_hs)
inv_n_sum = 1/n1 + 1/n2

# Method 1: difference of the adjusted proportions, standardized with the pooled estimate.
U0_m1 = (p1_hs - p2_hs) / np.sqrt(pooled_pq * inv_n_sum)
# Method 2: difference of the logits, standardized with the pooled estimate.
U0_m2 = (ln(p1_hs) - ln(p2_hs)) * np.sqrt(pooled_pq) / np.sqrt(inv_n_sum)

print("p1*={:.4f}, p2*={:.4f}, p*={:.4f}".format(p1_hs, p2_hs, p_hs))
print("Approximations method 1 U0={:.3f}, method 2 U0={:.3f}".format(U0_m1, U0_m2))

# Lower-tail standard-normal quantile at level alpha / 2.
alpha = 0.05
n_stat = stats.norm.ppf(alpha / 2)
print("Region={:.3f}".format(n_stat))

# Standard-normal quantiles at alpha/2 and 1 - alpha/2.
half_alpha = alpha / 2
lower_r = stats.norm.ppf(half_alpha)
upper_r = stats.norm.ppf(1 - half_alpha)

# NOTE: despite its name, `var` holds the square root of the estimated
# variance of p1* - p2* (i.e. a standard error), built from the unpooled estimates.
mean = p1_hs - p2_hs
var = np.sqrt(p1_hs * (1 - p1_hs) / n1 + p2_hs * (1 - p2_hs) / n2)
lower_intvl, upper_intvl = mean - upper_r * var, mean - lower_r * var

print("Norm interval={:.3f}, {:.3f}".format(lower_r, upper_r))
print("Confidence interval={:.3f}, {:.3f}".format(lower_intvl, upper_intvl))

p1*=0.5392, p2*=0.3627, p*=0.4505
Approximations method 1 U0=1.773, method 2 U0=1.793
Region=-1.960
Norm interval=-1.960, 1.960
Confidence interval=-0.015, 0.368
