# Hypothesis Testing

In [56]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_1samp, ttest_ind, ttest_rel, sem, norm, ttest_ind_from_stats
from statsmodels.stats import weightstats as stests

## Test for mean, population vairance known - unlikely

salary data

In [6]:
data = np.genfromtxt('data/excercise37.csv')
data

array([117313., 104002., 113038., 101936.,  84560., 113136.,  80740.,
       100536., 105052.,  87201.,  91986.,  94868.,  90745., 102848.,
        85927., 112276., 108637.,  96818.,  92307., 114564., 109714.,
       108833., 115295.,  89279.,  81720.,  89344., 114426.,  90410.,
        95118., 113382.])

null hypothesis: $H_0:\mu=113,000$; alternative hypothesis  $H_1:\mu \neq 113,000$. This is a 2 sided test

The formula:
$Z=\frac{x - \mu_0}{\frac{\sigma}{\sqrt{n}}}$

In [21]:
mean = np.mean(data)
pop_mean = 113000
pop_std = 15000
size = len(data)

In [22]:
z_stat = (mean - pop_mean)/(pop_std/size**0.5)
z_stat

-4.673765269641137

In [26]:
crit_val = norm.ppf(1 - 0.025) # 95% ci

In [27]:
if abs(z_stat) > crit_val:
    print('reject null hypothesis')
else:
    print('accept the null hypothesis')

reject null hypothesis


at a 5% significance, $\alpha = 0.05$ we reject that the mean salary is 113,000

## Test for mean, population vairance unknown

null hypothesis: $H_0:\mu<=40\%$; alternative hypothesis  $H_1:\mu>40\%$ and $\alpha = 0.05$ This is a one sided test

In [34]:
data =  np.array([0.26, 0.23, 0.42, 0.49, 0.23, 0.59, 0.29, 0.29, 0.57, 0.40])

In [41]:
t_stat, p_val = ttest_1samp(data, 0.4)
t_stat, p_val/2 #one sided

(-0.5295018231059438, 0.3046341429137719)

In [42]:
if p_val/2 < 0.05:
    print('reject null hypothesis')
else:
    print('accept the null hypothesis')

accept the null hypothesis


## Test for mean, Dependant Samples

In [48]:
data = np.genfromtxt('data/excercise44.csv', delimiter=',')
data

array([[2. , 1.7],
       [1.4, 1.7],
       [1.3, 1.8],
       [1.1, 1.3],
       [1.8, 1.7],
       [1.6, 1.5],
       [1.5, 1.6],
       [0.7, 1.7],
       [0.9, 1.7],
       [1.5, 2.4]])

null hypothesis: $H_0:\mu_b>=\mu_a$; alternative hypothesis  $H_1:\mu_b<\mu_a$ and $\alpha = 0.01$ This is a one sided test. 

null hypothesis is that there is mg level will the stay the same or decrease, alternative hypothesis is that it will increase

In [49]:
data = np.column_stack((data, np.subtract(data[:,0], data[:,1]))) #calc before - after

In [53]:
t_stat, p_val = ttest_rel(data[:,0], data[:,1])
t_stat, p_val/2 #one sided

(-2.294890710802983, 0.02369696819869895)

In [54]:
if p_val/2 < 0.01:
    print('reject null hypothesis')
else:
    print('accept the null hypothesis')

accept the null hypothesis


## Independant samples, known variance

In [62]:
df = pd.DataFrame(
    data = [
        [100, 70],
        [58, 65],
        [10, 6]
    ],
    columns=['Engineering', 'Management'],
    index=['Size', 'Sample mean', 'population std']
)
df

Unnamed: 0,Engineering,Management
Size,100,70
Sample mean,58,65
population std,10,6


null hypothesis: $H_0:\mu_e - \mu_m = -4\% $; alternative hypothesis  $H_1:\mu_e - \mu_m \neq -4\% $ and $\alpha = 0.05$ This is a two sided test. 

In [63]:
df['Difference'] = np.nan
df.loc['Sample mean', 'Difference'] =  df.loc['Sample mean', 'Engineering'] -  df.loc['Sample mean','Management']
df.loc['population std', 'Difference'] =  (df.loc['population std','Engineering']**2/df.loc['Size','Engineering'] +  df.loc['population std','Management']**2/df.loc['Size','Management'])**0.5

In [70]:
z_stat = (df.loc['Sample mean', 'Difference'] - -4)/df.loc['population std', 'Difference']
z_stat #lower than -4

-2.4379081661160034

In [68]:
p_stat = norm.sf(abs(z_stat))*2

In [69]:
if p_stat < 0.05:
    print('reject null hypothesis')
else:
    print('accept the null hypothesis')

reject null hypothesis


## Independant samples, unknown variance that are equal

In [71]:
df = pd.DataFrame(
    data = [
        [10, 8],
        [3.94, 3.25],
        [0.18, 0.27]
    ],
    columns=['NY', 'LA'],
    index=['Size', 'sample mean', 'sample std']
)

null hypothesis: $H_0:\mu_{ny} - \mu_{la} = 0 $; alternative hypothesis  $H_1:\mu_{ny} - \mu_{la} \neq -0 $ and $\alpha = 0.05$ This is a two sided test. 

In [73]:
t_stat, p_val = ttest_ind_from_stats(
    df.loc['sample mean', 'NY'], df.loc['sample std', 'NY'], df.loc['Size', 'NY'], 
    df.loc['sample mean', 'LA'], df.loc['sample std', 'LA'], df.loc['Size', 'LA'],
    equal_var=True
)

In [75]:
if p_val < 0.05:
    print('reject null hypothesis')
else:
    print('accept the null hypothesis')

reject null hypothesis


In [77]:
if t_stat > 4:
    print('Extremely significant, potentially pointless test')

Extremely significant, potentially pointless test


as the means are quite different and there is little sample std, it is clear that the prices in ny and la are different. hypothesis test not really needed