Using the 'prestige' dataset, perform a hypothesis test on the claim that there is an income difference between white-collar (wc, white-collar) and blue-collar workers (bc, blue-collar) at a significance level of 0.05

In [11]:
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

df = pd.read_csv('../../data/prestige.csv')
df

Unnamed: 0,job,education,income,women,prestige,census,type
0,gov.administrators,13.11,12351,11.16,68.8,1113,prof
1,general.managers,12.26,25879,4.02,69.1,1130,prof
2,accountants,12.77,9271,15.70,63.4,1171,prof
3,purchasing.officers,11.42,8865,9.11,56.8,1175,prof
4,chemists,14.62,8403,11.68,73.5,2111,prof
...,...,...,...,...,...,...,...
97,bus.drivers,7.58,5562,9.47,35.9,9171,bc
98,taxi.drivers,7.93,4224,3.59,25.1,9173,bc
99,longshoremen,8.37,4753,0.00,26.1,9313,bc
100,typesetters,10.00,6462,13.58,42.2,9511,bc


In [12]:
df.size

714

In [18]:
# check the sample size and the average income for each group only bc and wc
df[df['type'].isin(['bc', 'wc'])].groupby('type')['income'].agg(['count', 'mean'])

Unnamed: 0_level_0,count,mean
type,Unnamed: 1_level_1,Unnamed: 2_level_1
bc,44,5374.136364
wc,23,5052.304348


In [23]:
# test the normality in each group
bc = stats.shapiro(df[df['type'] == 'bc']['income']) 
wc = stats.shapiro(df[df['type'] == 'wc']['income'])

print("bc:",bc)
print("wc:",wc)

bc: ShapiroResult(statistic=np.float64(0.9662506459293128), pvalue=np.float64(0.22181723139990506))
wc: ShapiroResult(statistic=np.float64(0.9260685765517781), pvalue=np.float64(0.08995567654171997))


In [25]:
stats.levene(bc, wc)

LeveneResult(statistic=np.float64(3.409525000868724e+29), pvalue=np.float64(2.932959869029284e-30))

In [26]:
# test whether there is a difference in the average income of the two groups
stats.ttest_ind(df[df['type'] == 'bc']['income'], df[df['type'] == 'wc']['income'], equal_var=False)

TtestResult(statistic=np.float64(0.6364887967154291), pvalue=np.float64(0.5276144694057663), df=np.float64(45.97230158495311))