# Day 11: Hypothesis Testing
## Author: Hau Nguyen

In [21]:
import os

# Show the folder the kernel is running in
path = os.getcwd()
print(path)

C:\Users\HNGUY40\OneDrive - Emory University\Python\Notes (2024 Spring)


In [22]:
# Import necessary packages
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind
from statsmodels.stats.proportion import proportions_ztest

In [23]:
# Import data
# The CSV sits in the 'Data' folder next to the notebook's own folder (see the
# working directory printed in the first cell), so it is addressed relative to
# the working directory instead of through a user-specific absolute path.
# NOTE(review): this assumes the kernel's working directory is the notebook folder.
DATA_PATH = os.path.join(os.pardir, 'Data', 'BirthdataNC.csv')
birth = pd.read_csv(DATA_PATH)
# Data's information: column names, non-null counts and dtypes
birth.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   fage            829 non-null    float64
 1   mage            1000 non-null   int64  
 2   mature          1000 non-null   object 
 3   weeks           998 non-null    float64
 4   premie          998 non-null    object 
 5   visits          991 non-null    float64
 6   marital         999 non-null    object 
 7   gained          973 non-null    float64
 8   weight          1000 non-null   float64
 9   lowbirthweight  1000 non-null   object 
 10  gender          1000 non-null   object 
 11  habit           999 non-null    object 
 12  whitemom        998 non-null    object 
dtypes: float64(5), int64(1), object(7)
memory usage: 101.7+ KB


## Hypothesis Testing for Difference in Means

In [24]:
# Split the birth weights into two samples, one per baby gender,
# to be compared with a test for difference in means
female = birth.loc[birth['gender'] == 'female', 'weight']
male = birth.loc[birth['gender'] == 'male', 'weight']

In [25]:
# Two-sided independent-samples t-test by gender
# H0: female and male babies have the same mean birth weight
ttest_ind(a=female, b=male)

Ttest_indResult(statistic=-4.211995513148301, pvalue=2.760111651635877e-05)

In [26]:
# Round the result
# round() cannot be applied to the whole result object
# (round(ttest_ind(female, male), 4) does not work), so round each field on its own
test_gen = ttest_ind(female, male)
print('test-stat: ', round(test_gen.statistic, 4))
print('p-value: ', round(test_gen.pvalue, 4))
# The unrounded p-value is about 2.76e-05; it only displays as 0.0 because of the 4-decimal rounding
# Statistically significant at 5% level of significance: reject null hypothesis
# -> There is a difference in birth weight between female and male babies

test-stat:  -4.212
p-value:  0.0


In [27]:
# Note: if either sample contains missing values (NaN), add nan_policy='omit' to the t-test;
# with the default policy the test statistic and p-value come back as NaN:
# test_gen = ttest_ind(female, male, nan_policy='omit')

In [28]:
# Lower-tailed test:
# H1: mean birth weight of female babies < mean birth weight of male babies
test_gen = ttest_ind(female, male, alternative='less')
print('test-stat: ', round(test_gen.statistic, 4))
print('p-value: ', round(test_gen.pvalue, 4))
# A p-value near 0 -> reject null hypothesis in favour of female babies weighing less on average

test-stat:  -4.212
p-value:  0.0


In [29]:
# Upper-tailed test:
# H1: mean birth weight of female babies > mean birth weight of male babies
test_gen = ttest_ind(female, male, alternative='greater')
print('test-stat: ', round(test_gen.statistic, 4))
print('p-value: ', round(test_gen.pvalue, 4))
# A p-value near 1 -> fail to reject null hypothesis: no evidence that female babies weigh more

test-stat:  -4.212
p-value:  1.0


In [30]:
# PRACTICE:
# Compare mean birth weight of babies born to smokers vs. nonsmokers (two-sided t-test)
smoker = birth.loc[birth['habit'] == 'smoker', 'weight']
nonsmoker = birth.loc[birth['habit'] == 'nonsmoker', 'weight']
test_habit = ttest_ind(smoker, nonsmoker)
print('test-stat: ', round(test_habit.statistic, 4))
print('p-value: ', round(test_habit.pvalue, 4))
# Statistically significant at 5% level of significance: reject null hypothesis
# -> There is a difference in birth weight between babies born to smokers and nonsmokers

test-stat:  -2.2034
p-value:  0.0278


## Hypothesis Testing for Difference in Proportions 

In [34]:
# Counts of low / not-low birth weight babies per gender, with row and column totals
pd.crosstab(index=birth['gender'], columns=birth['lowbirthweight'], margins=True)

lowbirthweight,low,not low,All
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,59,444,503
male,52,445,497
All,111,889,1000


In [35]:
# Test by gender (two-sided two-sample z-test for proportions)
# The counts are taken from the data instead of being retyped from the crosstab,
# so the test inputs cannot drift out of sync with the DataFrame.
low_by_gender = pd.crosstab(birth['gender'], birth['lowbirthweight'])
gender_groups = ['female', 'male']
# Number of low-birth-weight babies in each gender group
low_counts = low_by_gender.loc[gender_groups, 'low'].tolist()
# Total number of babies in each gender group (low + not low)
group_sizes = low_by_gender.loc[gender_groups].sum(axis=1).tolist()
test_pro_gen = proportions_ztest(low_counts, group_sizes)
# NOTE(review): proportions_ztest returns a z-statistic; the 'T-Stat' label is kept
# unchanged so the printed output matches the recorded one.
print('T-Stat: ', round(test_pro_gen[0], 4))
print('P-Value: ', round(test_pro_gen[1], 4))
# Not significant at 5% level of significance: fail to reject null
# -> No evidence of difference

T-Stat:  0.6376
P-Value:  0.5237


In [36]:
# Lower-tailed test:
# H1: proportion of low-birth-weight babies among females < among males
# Counts (low, group total) are female 59 of 503 and male 52 of 497, from the crosstab by gender
test_pro_gen = proportions_ztest(
    count=[59, 52],
    nobs=[503, 497],
    alternative='smaller',
)
print('T-Stat: ', round(test_pro_gen[0], 4))
print('P-Value: ', round(test_pro_gen[1], 4))

T-Stat:  0.6376
P-Value:  0.7381


In [37]:
# Upper-tailed test:
# H1: proportion of low-birth-weight babies among females > among males
# Counts (low, group total) are female 59 of 503 and male 52 of 497, from the crosstab by gender
test_pro_gen = proportions_ztest(
    count=[59, 52],
    nobs=[503, 497],
    alternative='larger',
)
print('T-Stat: ', round(test_pro_gen[0], 4))
print('P-Value: ', round(test_pro_gen[1], 4))

T-Stat:  0.6376
P-Value:  0.2619


In [38]:
# PRACTICE:
# Difference in proportions of low-birth-weight babies by mom's habit (smoker vs nonsmoker):
# start with the counts per habit, including row and column totals
pd.crosstab(index=birth['habit'], columns=birth['lowbirthweight'], margins=True)

lowbirthweight,low,not low,All
habit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nonsmoker,92,781,873
smoker,18,108,126
All,110,889,999


In [39]:
# Test by mom's habit (two-sided two-sample z-test for proportions)
# The counts are taken from the data instead of being retyped from the crosstab,
# so the test inputs cannot drift out of sync with the DataFrame.
low_by_habit = pd.crosstab(birth['habit'], birth['lowbirthweight'])
habit_groups = ['nonsmoker', 'smoker']
# Number of low-birth-weight babies in each habit group
low_counts_habit = low_by_habit.loc[habit_groups, 'low'].tolist()
# Total number of babies in each habit group (low + not low)
group_sizes_habit = low_by_habit.loc[habit_groups].sum(axis=1).tolist()
test_pro_habit = proportions_ztest(low_counts_habit, group_sizes_habit)
# NOTE(review): proportions_ztest returns a z-statistic; the 'T-Stat' label is kept
# unchanged so the printed output matches the recorded one.
print('T-Stat: ', round(test_pro_habit[0], 4))
print('P-Value: ', round(test_pro_habit[1], 4))
# Not significant at 5% level of significance: fail to reject null
# -> No evidence of difference

T-Stat:  -1.2562
P-Value:  0.2091


In [40]:
# Export this notebook to HTML with nbconvert (shell command; the notebook name is
# relative, so it is presumably resolved against the kernel's working directory)
!jupyter nbconvert --to html Day_11_Notes.ipynb

[NbConvertApp] Converting notebook Day_11_Notes.ipynb to html
[NbConvertApp] Writing 607931 bytes to Day_11_Notes.html
