In [1]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import random
import math

The t-distribution is available in `scipy.stats` with the nickname "t" so we can get t-critical values with `stats.t.ppf()`.

We can see that the sample mean differs from the population mean by 1.13 years. We can calculate a confidence interval without knowing the population standard deviation by using the t-distribution and the `stats.t.ppf(q, df)` function. This function takes the cumulative probability `q` at which the quantile is evaluated, together with the "degrees of freedom" (`df`). For a two-sided 95% confidence interval, 2.5% is left in each tail, so `q = 0.975`.

> In this case, the number of degrees of freedom, `df`, is equal to the sample size minus 1, or `df = sample_size - 1`. 



In [None]:
# Calculate the t-critical value for a 95% confidence level for the sample taken above.
# A two-sided 95% interval leaves 2.5% in each tail, so the critical value is the
# 97.5th percentile (q=.975) of the t-distribution with sample_size - 1 degrees of freedom.
# `sample_size` must already be defined by an earlier cell.
t_critical = stats.t.ppf(q=.975, df=sample_size-1)
print("t-critical value:")
print(t_critical)   # Show the value itself, not only the label

# Expected output:
# t-critical value:
# 2.0638985616280205

Confidence Interval using t-distribution

In [None]:
# Calculate the sample standard deviation.
# ddof=1 applies Bessel's correction (divide by n - 1), which is the estimator a
# t-based interval with n - 1 degrees of freedom calls for. NumPy arrays default to
# ddof=0 while pandas Series default to ddof=1, so being explicit gives the same
# answer for either container type.
sample_stdev = sample.std(ddof=1)

# Standard error of the mean: s / sqrt(n).
# The variable keeps the name `sigma` because other cells may refer to it, but note
# that it holds the standard error, not a standard deviation.
sigma = sample_stdev/math.sqrt(sample_size)

# Margin of error = t-critical value * standard error
margin_of_error = t_critical*sigma

# Confidence interval = sample mean -/+ margin of error
confidence_interval = (sample_mean-margin_of_error, sample_mean+margin_of_error)


print("Confidence interval:")
print(confidence_interval)

# Example output from the original run (exact numbers depend on the sample drawn):
# Confidence interval:
# (18.4609156900928, 21.280661568850913)

one sample t test

In [3]:
from scipy import stats
import numpy as np
import seaborn as sns
def one_sample_ttest(sample, popmean, alpha, plot=True):
    """Run an upper-tailed one-sample t-test and print the verdict.

    Tests H0: mean == popmean against H1: mean > popmean at significance
    level ``alpha``. The critical value and the p-value are both taken from
    the upper tail of the t-distribution so that the two rejection criteria
    agree with each other.

    Parameters
    ----------
    sample : array-like
        One-dimensional collection of observations.
    popmean : float
        Population mean assumed under the null hypothesis.
    alpha : float
        Significance level, e.g. 0.05.
    plot : bool, optional
        When True (the default) the sample distribution is drawn with seaborn
        so normality can be inspected visually. Pass False to skip plotting.

    Returns
    -------
    None
        The outcome is printed, not returned.
    """
    if plot:
        # Visualize sample distribution for normality.
        # NOTE: distplot is deprecated since seaborn 0.11; histplot(kde=True) is
        # the suggested replacement once older seaborn no longer needs support.
        sns.set(color_codes=True, rc={'figure.figsize': (12, 10)})
        sns.distplot(sample)

    # Degrees of freedom
    df = len(sample) - 1

    # Upper-tail critical value: H0 is rejected when the t statistic exceeds it.
    t_crit = stats.t.ppf(1 - alpha, df=df)

    # ttest_1samp reports a two-sided p-value by default, which does not match
    # the one-tailed critical value above. Keep its t statistic and derive the
    # matching upper-tail p-value from the survival function instead.
    t_stat = stats.ttest_1samp(a=sample, popmean=popmean)[0]
    p_value = stats.t.sf(t_stat, df)

    if (t_stat > t_crit) and (p_value < alpha):
        print('Null hypothesis rejected. Results are statistically significant with t-value =',
               round(t_stat, 2), 'critical t-value=', t_crit, 'and p-value =', np.round(p_value, 10))
    else:
        # Not rejecting H0 does not prove it true, hence "failed to reject".
        print('Failed to reject the null hypothesis with t-value =',
               round(t_stat, 2), ", critical t-value=", t_crit, 'and p-value =', np.round(p_value, 10))
    

two sample t test

In [None]:
# Independent two-sample t-test on the means of `experimental` and `control`.
#
# The test is two-sided: the null hypothesis states that the two independent
# samples share the same average (expected) value. equal_var=True is SciPy's
# default and is written out here to make the equal-population-variance
# assumption visible.
stats.ttest_ind(experimental, control, equal_var=True)