# Confidence Intervals & Margin of Error

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import sem, norm, t, ttest_ind, ttest_ind_from_stats

## CI when Population std known

Calculate the sample mean with confidence interval of 95%

In [2]:
data = np.genfromtxt('data/excercise37.csv',delimiter=',')
data

array([117313., 104002., 113038., 101936.,  84560., 113136.,  80740.,
       100536., 105052.,  87201.,  91986.,  94868.,  90745., 102848.,
        85927., 112276., 108637.,  96818.,  92307., 114564., 109714.,
       108833., 115295.,  89279.,  81720.,  89344., 114426.,  90410.,
        95118., 113382.])

In [3]:
sample_mean = np.mean(data)
sample_mean

100200.36666666667

In [4]:
pop_std = 15000

In [5]:
std_error = pop_std/(len(data)**0.5)

In [6]:
rf = abs(norm.ppf(1 - 0.025))

In [7]:
m_error =  rf*std_error 

The sample mean is

In [8]:
sample_mean - m_error, sample_mean + m_error

(94832.7842355152, 105567.94909781814)

## CI when Population std unknown

Calculate the sample mean with confidence interval of 99%

In [9]:
data = np.array(
    [78000, 90000,75000,117000,105000,96000,89500,102300,80000]
)

In [10]:
sample_mean = np.mean(data)
sample_mean

92533.33333333333

In [11]:
sample_std = np.std(data, ddof=1) 
sample_std

13931.887883556916

In [12]:
std_error = sem(data)

In [13]:
rf = abs(t(len(data)-1).ppf(1 - 0.005))

In [14]:
m_error =  rf*std_error 

The sample mean to 99% confidence

In [15]:
sample_mean - m_error, sample_mean + m_error

(76951.03996465197, 108115.62670201469)

## CI for two means with dependant samples

Each row is a patient, first column is magnesium levels before treatment, second is magnesium levels after treatment

In [16]:
data = np.genfromtxt('data/excercise44.csv', delimiter=',')
print(data[:3,])

[[2.  1.7]
 [1.4 1.7]
 [1.3 1.8]]


Calculate the difference

In [17]:
data = np.column_stack((data, np.subtract(data[:,1], data[:,0])))
print(data[:3,])

[[ 2.   1.7 -0.3]
 [ 1.4  1.7  0.3]
 [ 1.3  1.8  0.5]]


In [18]:
sample_mean = np.mean(data[:,2])
sample_std = np.std(data[:,2], ddof=1)
sample_mean, sample_std

(0.32999999999999996, 0.45472824607426554)

In [19]:
m_error = abs(t(len(data)-1).ppf(1-0.025))*sem(data[:,2])

With 95% confidence we can say the mean of the sample is in the above range and as the whole range is positive we can say that after the treatment the magnesium levels are higher

In [20]:
sample_mean - m_error, sample_mean + m_error

(0.00470700883907349, 0.6552929911609264)

## CI for two means with independant samples
### Part I) Known population vairance

descriptive statistice for university grades for students in different departments

In [21]:
df = pd.DataFrame(
    data = [
        [100, 70],
        [58, 65],
        [10, 5]
    ],
    columns=['Engineering', 'Management'],
    index=['Size', 'Sample mean', 'population std']
)

In [22]:
df['Difference'] = np.nan
df.loc['Sample mean', 'Difference'] =  df.loc['Sample mean', 'Engineering'] -  df.loc['Sample mean','Management']
df.loc['population std', 'Difference'] =  (df.loc['population std','Engineering']**2/df.loc['Size','Engineering'] +  df.loc['population std','Management']**2/df.loc['Size','Management'])**0.5

In [23]:
df

Unnamed: 0,Engineering,Management,Difference
Size,100,70,
Sample mean,58,65,-7.0
population std,10,5,1.164965


In [24]:
rf = abs(norm.ppf(1 - 0.025))

In [25]:
m_error = rf*df.loc['population std', 'Difference']

95% confidence that the students in management did worse that the students in Engineering

In [26]:
df.loc['Sample mean', 'Difference'] - m_error, df.loc['Sample mean', 'Difference'] + m_error

(-9.2832889435009, -4.716711056499101)

### Part II)  unknown population difference, assuming population variance is equal

In [27]:
df = pd.DataFrame(
    data = [
        [10, 8],
        [3.94, 3.25],
        [0.18, 0.27]
    ],
    columns=['NY', 'LA'],
    index=['Size', 'sample mean', 'sample std']
)

In [28]:
df

Unnamed: 0,NY,LA
Size,10.0,8.0
sample mean,3.94,3.25
sample std,0.18,0.27


In [29]:
t_stat = ttest_ind_from_stats(
    df.loc['sample mean', 'NY'], df.loc['sample std', 'NY'], df.loc['Size', 'NY'], 
    df.loc['sample mean', 'LA'], df.loc['sample std', 'LA'], df.loc['Size', 'LA'],
).statistic

In [30]:
rf = t.ppf(1-0.025,16)

In [31]:
std_error = (df.loc['sample mean', 'NY'] - df.loc['sample mean', 'LA'])/t_stat #derived from t statistics

In [32]:
m_error= rf*std_error

In [33]:
mean_diff = df.loc['sample mean', 'NY'] - df.loc['sample mean', 'LA']

In [34]:
mean_diff - m_error, mean_diff +m_error

(0.4648832371955388, 0.9151167628044611)

### Part III)  unknown population difference, assuming population variance is different

In [35]:
t_stat = ttest_ind_from_stats(
    df.loc['sample mean', 'NY'], df.loc['sample std', 'NY'], df.loc['Size', 'NY'], 
    df.loc['sample mean', 'LA'], df.loc['sample std', 'LA'], df.loc['Size', 'LA'],
    equal_var=False
).statistic

In [36]:
std_error = (df.loc['sample mean', 'NY'] - df.loc['sample mean', 'LA'])/t_stat #derived from t statistics

In [37]:
m_error= rf*std_error

In [38]:
mean_diff - m_error, mean_diff +m_error

(0.4543899064230037, 0.9256100935769962)