<h3>More Sample and Population (Chp 7)</h3>

<h6>Definitions</h6>
$$\begin{array}{cc}
\mu_\bar{X} = \mu_X & \text{Mean of all possible sample means of samples of size n from population} \\
\end{array}$$

<h6>Equations</h6>
$$\begin{array}{cc}
\sigma_{\bar{X}}^2 = \dfrac{\sigma_X^2}{n} & \text{Variance of all possible means of samples of size n from population} \\ % Variance divides by n; only the standard deviation divides by \sqrt{n}
\sigma_\bar{X} = \dfrac{\sigma_X}{\sqrt{n}} & \text{Standard deviation of all possible means  of samples of size n from population} \\
SE(\bar{X}) = \dfrac{S}{\sqrt{n}}  & \text{Standard error of all possible sample means of samples of size n from population} \\
Z = \dfrac{\bar{X}-\mu}{\sigma/\sqrt{n}} & \text{Standardization formula for calculating probabilities about $\bar{X}$} \\
\end{array}$$

<h3>Confidence Level (Chp 8)</h3>

<h6>Definitions</h6>
$$\begin{array}{cc}
\bar{X} \pm Z_{\alpha/2} \dfrac{\sigma}{\sqrt{n}} & \text{Two-sided CI for the population mean, when $\sigma$ is known} \\
\bar{X} + Z_\alpha \dfrac{\sigma}{\sqrt{n}} & \text{Upper bound for the population mean, when $\sigma$ is known} \\
\bar{X} - Z_\alpha \dfrac{\sigma}{\sqrt{n}} & \text{Lower bound for the population mean, when $\sigma$ is known} \\
\bar{X} \pm T_{\alpha/2} \dfrac{S}{\sqrt{n}} & \text{Two-sided CI for the population mean, when $\sigma$ is NOT known} \\
\bar{X} + T_\alpha \dfrac{S}{\sqrt{n}} & \text{Upper bound for the population mean, when $\sigma$ is NOT known} \\
\bar{X} - T_\alpha \dfrac{S}{\sqrt{n}} & \text{Lower bound for the population mean, when $\sigma$ is NOT known} \\
\end{array}$$

<h6>Equations</h6>
$$\begin{array}{cc}
C = 1 - \alpha & \text{Confidence level} \\
E = Z_{\alpha/2} \dfrac{\sigma}{\sqrt{n}} & \text{Margin of error for a two-sided CI} \\
E = Z_{\alpha} \dfrac{\sigma}{\sqrt{n}} & \text{Margin of error for a one-sided CI} \\
n = \left( \dfrac{Z_{\alpha/2}\cdot\sigma}{E} \right)^2 & \text{Sample size for a two-sided CI, given C and E} \\
n = \left( \dfrac{Z_\alpha\cdot\sigma}{E} \right)^2 & \text{Sample size for a one-sided CI, given C and E}
\end{array}$$

<h3>Hypothesis Testing (Chp 9)</h3>

<h6>Definitions</h6>
$$\begin{array}{cc}
\alpha & \begin{array}{c}\text{Significance level, and} \\ \text{Probability of Type I error (False positive)}\end{array} \\
\beta & \text{Probability of Type II error (False negative)} \\
Z_\alpha & \text{$Z$ value with upper-tail probability $\alpha$, i.e. $P(Z > Z_\alpha) = \alpha$} \\
Z_\beta & \text{$Z$ value with upper-tail probability $\beta$, i.e. $P(Z > Z_\beta) = \beta$} \\
Z_{\alpha/2} & \text{$Z$ value with upper-tail probability $\alpha/2$, i.e. $P(Z > Z_{\alpha/2}) = \alpha/2$} \\
\end{array}$$

<h6>Equations</h6>
$$\begin{array}{cc}
Z_0 = \dfrac{\bar{X}-\mu_0}{\sigma/\sqrt{n}} & \text{Test statistic for a One-sample Z test, where $Z_0 \sim N(0,1)$}\\
T_0 = \dfrac{\bar{X}-\mu_0}{S/\sqrt{n}} & \text{Test statistic for a One-sample T test, where $T_0 \sim t(n-1)$} \\
\delta = \mu_a - \mu_0 & \text{Difference between centers of $\bar{X}$ under $H_0$ and $H_a$} \\
\beta = 1 -P\left( Z \le -Z_\alpha - \dfrac{\delta\sqrt{n}}{\sigma} \right) & H_0: \mu = \mu_0 \text{ vs. } H_a: \mu < \mu_0 \\
\beta = P\left( Z \le Z_\alpha - \dfrac{\delta\sqrt{n}}{\sigma} \right) & H_0: \mu = \mu_0 \text{ vs. } H_a: \mu > \mu_0 \\
\beta = P\left( Z \le Z_{\alpha/2} - \dfrac{\delta\sqrt{n}}{\sigma} \right) - P\left( Z \le -Z_{\alpha/2} - \dfrac{\delta\sqrt{n}}{\sigma} \right) & H_0: \mu = \mu_0 \text{ vs. } H_a: \mu \ne \mu_0 \\
\text{Power} = 1-\beta & \text{Power is the probability of rejecting $H_0$ when $H_a$ is true} \\
n = \left(\dfrac{(Z_\alpha+Z_\beta)\sigma}{\delta}\right)^2 & \text{Sample size calculation for a one-sided hypothesis test} \\
n = \left(\dfrac{(Z_{\alpha/2}+Z_\beta)\sigma}{\delta}\right)^2 & \text{Sample size calculation for a two-sided hypothesis test} \\
\end{array}$$

In [1]:
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def P(eq,**kwargs):
    """Evaluate a probability statement such as ``"Z <= z"`` as a Z-/T-table lookup.

    ``eq`` must have the form ``"<LHS> <operator> <RHS>"`` where the operator is
    one of ``U | I < > == <= >=`` and is surrounded by single spaces. ``RHS`` is
    the *name* of a keyword argument that carries the numeric value.

    Table lookups (operator is not ``U``, ``I`` or ``|``):

    * ``LHS == 'Z'``: ``kwargs[RHS]`` is a probability; returns the standard
      normal score (rounded to 3 places) with that probability below it
      (``<``/``<=``) or above it (``>``/``>=``).
    * ``LHS == 'T'``: same, for Student's t with ``n - 1`` degrees of freedom.
    * ``LHS in ('Z0', 'Z_0')``: ``kwargs[RHS]`` is an observed statistic;
      returns the lower-tail (``<``/``<=``) or upper-tail (``>``/``>=``)
      standard normal probability, rounded to 6 places.
    * ``LHS in ('T0', 'T_0')``: same, for Student's t with ``n - 1`` degrees
      of freedom.

    Anything else is treated as a sample-space problem: the keyword arguments
    are validated, a notice is printed that only table lookups are available,
    and ``None`` is returned. ``==`` in a table lookup also ends in the error
    notice and ``None``.

    Parameters
    ----------
    eq : str
        The probability statement.
    **kwargs
        The value named by ``RHS``; ``n`` (sample size) for t lookups; ``S``
        (dict of dicts), ``disjoint`` and ``independent`` for sample-space
        problems.

    Raises
    ------
    ValueError
        No operator found in ``eq``, or a required keyword argument is missing.
    KeyError
        ``LHS``/``RHS`` are not keys of the sample space ``S``.
    """
    relational_operators = [' U ',' | ',' I ',' < ',' > ',' == ',' <= ',' >= ']
    # When several operators occur in eq, the one listed last wins.
    rel_op = None
    for operator in relational_operators:
        if operator in eq:
            rel_op = operator

    if rel_op is None:
        raise ValueError("No recognized relational operator in equation: '{}'".format(eq))

    LHS,RHS = eq.split(rel_op)
    LHS = LHS.strip()
    RHS = RHS.strip()
    op = rel_op.strip()

    if LHS in ('Z','T','T0','Z0','T_0','Z_0') and op not in ('U','I','|'):
        if RHS not in kwargs:
            raise ValueError("Variable '{}' not provided; pass its numeric value as a keyword argument.".format(RHS))
        value = kwargs[RHS]

        if LHS in ('Z','Z0','Z_0'):
            dist, shape = stats.norm, ()
        else:
            if 'n' not in kwargs:
                raise ValueError("Sample size 'n' not provided.")
            dist, shape = stats.t, (kwargs['n']-1,)

        lower_tail = '<' in op
        upper_tail = '>' in op
        if LHS in ('Z','T'):
            # Inverse lookup: tail probability -> critical score.
            if lower_tail:
                return round(dist.ppf(value,*shape),3)
            elif upper_tail:
                return round(dist.ppf(1-value,*shape),3)
        else:
            # Forward lookup: observed statistic -> tail probability.
            if lower_tail:
                return round(dist.cdf(value,*shape),6)
            elif upper_tail:
                # Survival function; previously the lower-tail CDF was returned here too.
                return round(dist.sf(value,*shape),6)
        # '==' has no table lookup; fall through to the error notice below.
    else:
        if 'S' not in kwargs:
            raise ValueError("Sample space 'S' not provided.")
        S = kwargs['S']

        print("Only Z- and T-table lookups are available at this time. Apologies for the inconvenience.")

        if LHS not in S.keys():
            raise KeyError("Sample Space setup up improperly. No key '{}' in keys '{}'".format(LHS,S.keys()))
        # RHS is checked against the keys of the first inner mapping only.
        first_row = S[list(S.keys())[0]]
        if RHS not in first_row.keys():
            raise KeyError("Sample Space setup up improperly. No key '{}' in keys '{}'".format(RHS,first_row.keys()))
        if op == 'U' and 'disjoint' not in kwargs:
            raise ValueError("Disjointedness ('disjoint=') not defined.")
        if op == 'I' and 'independent' not in kwargs:
            raise ValueError("Independence ('independent=') not defined.")

    # Reached only when no result could be computed; returns None implicitly.
    print("It appears. There's an error. My apologies.")

In [8]:
# Critical z-score: lower-tail lookup at the confidence level C.
C = 0.95
alpha = 1 - C
z_alpha = P("Z <= z", z=C)
print("z_value: {}".format(z_alpha))

# Critical t-score: same lookup on the t distribution (P derives n - 1 degrees of freedom from n).
n = 25
df = n - 1
t_value = P("T <= t", n=n, t=C)
print("t-value: {}".format(t_value))

# Lower-tail p-value for the emissions example.
x_bar, mu_0 = 1.21, 1.45
s, n = 0.4, 28
t_0 = (x_bar - mu_0) / (s / n ** 0.5)
p_value = P("T_0 <= t_0", n=n, t_0=t_0)
print("p-value: {}".format(p_value))

z_value: 1.645
t-value: 1.711
p-value: 0.001863


In [6]:
# Critical z-scores for a one-sided (alpha) and a two-sided (alpha/2) tail area.
alpha = 0.05
for tail_area in (alpha, alpha / 2):
    print(P("Z <= z", z=1 - tail_area))

1.645
1.96


In [9]:
# Two-sided confidence interval for the mean using the t distribution
# (sample standard deviation s stands in for the unknown sigma).
C = 0.90
alpha = 1 - C

n, x_bar, s = 25, 15, 4

t_alphaOn2 = P("T <= z", n=n, z=1 - alpha / 2)
spread = (
    x_bar - t_alphaOn2 * s / n ** 0.5,
    x_bar + t_alphaOn2 * s / n ** 0.5,
)
print(spread)

(13.6312, 16.3688)


In [13]:
# t critical value with 0.10/2 in the upper tail, for a sample of size 21.
P("T <= t", n=21, t=1 - 0.10 / 2)

1.725

In [19]:
# Power of an upper-tailed one-sample Z test when the true mean is mu_true.
alpha = 0.05
n, std = 35, 25
mu_0, mu_true = 350, 362

# Shift between the centers of X-bar under H0 and under the alternative.
delta = mu_true - mu_0
print("delta = {}".format(delta))

z_alpha = P("Z <= z", z=1 - alpha)
print("z_alpha = {}".format(z_alpha))

# beta = P(Z <= z_alpha - delta*sqrt(n)/sigma)
beta = P("Z0 <= Z_0", Z_0=z_alpha - delta * n ** 0.5 / std)
print("beta = {}".format(beta))

power = 1 - beta
print("power = {}".format(power))

delta = 12
z_alpha = 1.645
beta = 0.116099
power = 0.883901


In [21]:
# z-score with 10% of the standard normal distribution below it.
P("Z <= z", z=0.10)

-1.282

In [23]:
# Sample size for a one-sided test with the target Type II error rate beta.
# NOTE: z_alpha, std and delta are not set here; they must already exist in the kernel.
beta = 0.10
z_beta = P("Z <= z", z=1 - beta)
np.ceil((z_alpha + z_beta) ** 2 * std ** 2 / delta ** 2)

38.0

In [26]:
# Two-sided one-sample t test of H0: mu = mu_0.
mu_0 = 400000
n = 15
x_bar = 380000
s = 30000
alpha = 0.05

t_0 = (x_bar-mu_0)/(s/(n)**0.5)
print("Test statistic: {}".format(t_0))
# Look up the lower tail at -|t_0| so that doubling it yields a valid two-sided
# p-value whatever the sign of t_0 (doubling P(T <= t_0) exceeds 1 when t_0 > 0).
p_value = P("T_0 <= t_0",t_0=-abs(t_0),n=n)
print(2*p_value)

Test statistic: -2.5819888974716116
0.021726
