In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pingouin

# Testing sample size

In [16]:
# Widen the display limit so every column is shown when the frame is rendered
pd.set_option('display.max_columns', 100)

# Load the shipments sample and rename the plural column to the singular
# name that the later cells refer to
late_shipments = (
    pd.read_feather('data/late_shipments.feather')
    .rename(columns={'freight_cost_groups': "freight_cost_group"})
)

In [17]:
# Show the (rows, columns) dimensions of the loaded frame
late_shipments.shape

(1000, 27)

In [18]:
# Preview the first five rows to see which columns are available
late_shipments.head()

Unnamed: 0,id,country,managed_by,fulfill_via,vendor_inco_term,shipment_mode,late_delivery,late,product_group,sub_classification,vendor,item_description,molecule_test_type,brand,dosage,dosage_form,unit_of_measure_per_pack,line_item_quantity,line_item_value,pack_price,unit_price,manufacturing_site,first_line_designation,weight_kilograms,freight_cost_usd,freight_cost_group,line_item_insurance_usd
0,36203.0,Nigeria,PMO - US,Direct Drop,EXW,Air,1.0,Yes,HRDT,HIV test,"Orgenics, Ltd","HIV 1/2, Determine Complete HIV Kit, 100 Tests","HIV 1/2, Determine Complete HIV Kit",Determine,,Test kit,100.0,2996.0,266644.0,89.0,0.89,"Alere Medical Co., Ltd.",Yes,1426.0,33279.83,expensive,373.83
1,30998.0,Botswana,PMO - US,Direct Drop,EXW,Air,0.0,No,HRDT,HIV test,"Trinity Biotech, Plc","HIV 1/2, Uni-Gold HIV Kit, 20 Tests","HIV 1/2, Uni-Gold HIV Kit",Uni-Gold,,Test kit,20.0,25.0,800.0,32.0,1.6,"Trinity Biotech, Plc",Yes,10.0,559.89,reasonable,1.72
2,69871.0,Vietnam,PMO - US,Direct Drop,EXW,Air,0.0,No,ARV,Adult,HETERO LABS LIMITED,"Lamivudine/Nevirapine/Stavudine 150/200/30mg, ...",Lamivudine/Nevirapine/Stavudine,Generic,150/200/30mg,Tablet - FDC,60.0,22925.0,110040.0,4.8,0.08,Hetero Unit III Hyderabad IN,Yes,3723.0,19056.13,expensive,181.57
3,17648.0,South Africa,PMO - US,Direct Drop,DDP,Ocean,0.0,No,ARV,Adult,Aurobindo Pharma Limited,"Lamivudine 150mg, tablets, 60 Tabs",Lamivudine,Generic,150mg,Tablet,60.0,152535.0,361507.95,2.37,0.04,"Aurobindo Unit III, India",Yes,7698.0,11372.23,expensive,779.41
4,5647.0,Uganda,PMO - US,Direct Drop,EXW,Air,0.0,No,HRDT,HIV test - Ancillary,"Orgenics, Ltd","HIV, Lancet, Safety, for HIV Test kits, 100 Pcs","HIV, Lancet, Safety, for HIV Test kits, 100 Pcs",Generic,,Test kit - Ancillary,100.0,850.0,8.5,0.01,0.0,Inverness Japan,Yes,56.0,360.0,reasonable,0.01


#### For a hypothesis test to give a valid result, the sample must meet three requirements: it is a random sample of the population, the observations are independent, and there are enough observations. Of these, only the last condition can easily be checked with code.

In [19]:
# Column-wise check of whether every value in the column is truthy; columns
# that contain a zero (for example late_delivery) come back False.
# NOTE(review): this does not measure sample size — confirm whether this cell
# is still needed here.
late_shipments.all()

id                           True
country                      True
managed_by                   True
fulfill_via                  True
vendor_inco_term             True
shipment_mode                True
late_delivery               False
late                         True
product_group                True
sub_classification           True
vendor                       True
item_description             True
molecule_test_type           True
brand                        True
dosage                       True
dosage_form                  True
unit_of_measure_per_pack     True
line_item_quantity           True
line_item_value             False
pack_price                  False
unit_price                  False
manufacturing_site           True
first_line_designation       True
weight_kilograms             True
freight_cost_usd             True
freight_cost_group           True
line_item_insurance_usd     False
dtype: bool

In [20]:
# Tally how many shipments fall into each freight_cost_group category
freight_cost_col = late_shipments.loc[:, "freight_cost_group"]
counts = freight_cost_col.value_counts()

counts

expensive     531
reasonable    455
Name: freight_cost_group, dtype: int64

In [21]:
# A two sample t-test is checked here against a minimum of 30 observations
# in every group.
min_obs_t_test = 30
counts.ge(min_obs_t_test).all()

True

In [22]:
# Tally the number of late ("Yes") and on-time ("No") shipments
late_col = late_shipments.loc[:, 'late']
counts = late_col.value_counts()

counts

No     939
Yes     61
Name: late, dtype: int64

In [23]:
# A one sample proportion test is checked here against a minimum of 10
# observations in each of the two outcome categories.
min_obs_proportion = 10

counts.ge(min_obs_proportion).all()

True

In [24]:
# Tally freight_cost_group within each vendor_inco_term
by_inco_term = late_shipments.groupby("vendor_inco_term")
counts = by_inco_term["freight_cost_group"].value_counts()

counts

vendor_inco_term  freight_cost_group
CIP               reasonable             34
                  expensive              16
DDP               expensive              55
                  reasonable             45
DDU               reasonable              1
EXW               expensive             423
                  reasonable            302
FCA               reasonable             73
                  expensive              37
Name: freight_cost_group, dtype: int64

In [25]:
# A chi-square independence test is checked here against a minimum of 5
# observations in every listed group combination.
min_obs_chi_square = 5

counts.ge(min_obs_chi_square).all()

False

In [26]:
# Tally how many shipments used each shipment_mode
shipment_mode_col = late_shipments.loc[:, 'shipment_mode']
counts = shipment_mode_col.value_counts()

counts

Air            906
Ocean           88
Air Charter      6
Name: shipment_mode, dtype: int64

In [27]:
# An ANOVA test is checked here against a minimum of 30 observations in
# every group.
min_obs_anova = 30

counts.ge(min_obs_anova).all()

False

# Assumptions not met

## Wilcoxon signed-rank test

In [29]:
# Load the sample used for the paired tests below
dem_votes_path = 'data/dem_votes_potus_12_16.feather'
sample_dem_data = pd.read_feather(dem_votes_path)

In [31]:
# Show the (rows, columns) dimensions of sample_dem_data
sample_dem_data.shape

(500, 4)

In [32]:
# Preview the first five rows of sample_dem_data
sample_dem_data.head()

Unnamed: 0,state,county,dem_percent_12,dem_percent_16
0,Alabama,Bullock,76.3059,74.946921
1,Alabama,Chilton,19.453671,15.847352
2,Alabama,Clay,26.673672,18.674517
3,Alabama,Cullman,14.661752,10.028252
4,Alabama,Escambia,36.915731,31.020546


In [34]:
# Paired t-test comparing dem_percent_12 with dem_percent_16 row by row
dem_12 = sample_dem_data['dem_percent_12']
dem_16 = sample_dem_data['dem_percent_16']
paired_test_results = pingouin.ttest(x=dem_12, y=dem_16, paired=True)

paired_test_results

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,30.298384,499,two-sided,3.600634e-115,"[6.39, 7.27]",0.454202,2.246e+111,1.0


In [35]:
# Wilcoxon test on the same pair of columns, dem_percent_12 and dem_percent_16
pct_12 = sample_dem_data['dem_percent_12']
pct_16 = sample_dem_data['dem_percent_16']
wilcoxon_test_results = pingouin.wilcoxon(x=pct_12, y=pct_16)

wilcoxon_test_results

Unnamed: 0,W-val,alternative,p-val,RBC,CLES
Wilcoxon,2401.0,two-sided,1.7803959999999998e-77,0.961661,0.644816


## Wilcoxon-Mann-Whitney test

In [36]:
# Show the first five rows of late_shipments again for reference in this section
late_shipments.head()

Unnamed: 0,id,country,managed_by,fulfill_via,vendor_inco_term,shipment_mode,late_delivery,late,product_group,sub_classification,vendor,item_description,molecule_test_type,brand,dosage,dosage_form,unit_of_measure_per_pack,line_item_quantity,line_item_value,pack_price,unit_price,manufacturing_site,first_line_designation,weight_kilograms,freight_cost_usd,freight_cost_group,line_item_insurance_usd
0,36203.0,Nigeria,PMO - US,Direct Drop,EXW,Air,1.0,Yes,HRDT,HIV test,"Orgenics, Ltd","HIV 1/2, Determine Complete HIV Kit, 100 Tests","HIV 1/2, Determine Complete HIV Kit",Determine,,Test kit,100.0,2996.0,266644.0,89.0,0.89,"Alere Medical Co., Ltd.",Yes,1426.0,33279.83,expensive,373.83
1,30998.0,Botswana,PMO - US,Direct Drop,EXW,Air,0.0,No,HRDT,HIV test,"Trinity Biotech, Plc","HIV 1/2, Uni-Gold HIV Kit, 20 Tests","HIV 1/2, Uni-Gold HIV Kit",Uni-Gold,,Test kit,20.0,25.0,800.0,32.0,1.6,"Trinity Biotech, Plc",Yes,10.0,559.89,reasonable,1.72
2,69871.0,Vietnam,PMO - US,Direct Drop,EXW,Air,0.0,No,ARV,Adult,HETERO LABS LIMITED,"Lamivudine/Nevirapine/Stavudine 150/200/30mg, ...",Lamivudine/Nevirapine/Stavudine,Generic,150/200/30mg,Tablet - FDC,60.0,22925.0,110040.0,4.8,0.08,Hetero Unit III Hyderabad IN,Yes,3723.0,19056.13,expensive,181.57
3,17648.0,South Africa,PMO - US,Direct Drop,DDP,Ocean,0.0,No,ARV,Adult,Aurobindo Pharma Limited,"Lamivudine 150mg, tablets, 60 Tabs",Lamivudine,Generic,150mg,Tablet,60.0,152535.0,361507.95,2.37,0.04,"Aurobindo Unit III, India",Yes,7698.0,11372.23,expensive,779.41
4,5647.0,Uganda,PMO - US,Direct Drop,EXW,Air,0.0,No,HRDT,HIV test - Ancillary,"Orgenics, Ltd","HIV, Lancet, Safety, for HIV Test kits, 100 Pcs","HIV, Lancet, Safety, for HIV Test kits, 100 Pcs",Generic,,Test kit - Ancillary,100.0,850.0,8.5,0.01,0.0,Inverness Japan,Yes,56.0,360.0,reasonable,0.01


In [38]:
# Keep only the two columns needed for the test: weight_kilograms and late
wmw_columns = ['weight_kilograms', 'late']
weight_vs_late = late_shipments.loc[:, wmw_columns]
weight_vs_late.head()

Unnamed: 0,weight_kilograms,late
0,1426.0,Yes
1,10.0,No
2,3723.0,No
3,7698.0,No
4,56.0,No


In [39]:
# Reshape to wide format: one column per value of late ("No" / "Yes"),
# holding weight_kilograms. Each row keeps its weight in the column matching
# its own late value and is NaN in the other column.
weight_vs_late_wide = weight_vs_late.pivot(
    columns='late',
    values='weight_kilograms',
)
weight_vs_late_wide

late,No,Yes
0,,1426.0
1,10.0,
2,3723.0,
3,7698.0,
4,56.0,
...,...,...
995,43.0,
996,99.0,
997,,881.0
998,16234.0,


In [40]:
# Two-sided Wilcoxon-Mann-Whitney test of weight_kilograms between the
# "No" and "Yes" groups of late.
# NOTE(review): both wide columns contain NaN for rows of the other group;
# pingouin.mwu is expected to drop these before ranking — TODO confirm.
on_time_weights = weight_vs_late_wide['No']
late_weights = weight_vs_late_wide['Yes']
wmw_test = pingouin.mwu(x=on_time_weights, y=late_weights, alternative='two-sided')

wmw_test

Unnamed: 0,U-val,alternative,p-val,RBC,CLES
MWU,19134.0,two-sided,1.4e-05,0.331902,0.334049


## Kruskal-Wallis test

In [41]:
# Show the (rows, columns) dimensions of late_shipments again before the test
late_shipments.shape

(1000, 27)

In [42]:
# Show the first five rows of late_shipments again for reference in this section
late_shipments.head()

Unnamed: 0,id,country,managed_by,fulfill_via,vendor_inco_term,shipment_mode,late_delivery,late,product_group,sub_classification,vendor,item_description,molecule_test_type,brand,dosage,dosage_form,unit_of_measure_per_pack,line_item_quantity,line_item_value,pack_price,unit_price,manufacturing_site,first_line_designation,weight_kilograms,freight_cost_usd,freight_cost_group,line_item_insurance_usd
0,36203.0,Nigeria,PMO - US,Direct Drop,EXW,Air,1.0,Yes,HRDT,HIV test,"Orgenics, Ltd","HIV 1/2, Determine Complete HIV Kit, 100 Tests","HIV 1/2, Determine Complete HIV Kit",Determine,,Test kit,100.0,2996.0,266644.0,89.0,0.89,"Alere Medical Co., Ltd.",Yes,1426.0,33279.83,expensive,373.83
1,30998.0,Botswana,PMO - US,Direct Drop,EXW,Air,0.0,No,HRDT,HIV test,"Trinity Biotech, Plc","HIV 1/2, Uni-Gold HIV Kit, 20 Tests","HIV 1/2, Uni-Gold HIV Kit",Uni-Gold,,Test kit,20.0,25.0,800.0,32.0,1.6,"Trinity Biotech, Plc",Yes,10.0,559.89,reasonable,1.72
2,69871.0,Vietnam,PMO - US,Direct Drop,EXW,Air,0.0,No,ARV,Adult,HETERO LABS LIMITED,"Lamivudine/Nevirapine/Stavudine 150/200/30mg, ...",Lamivudine/Nevirapine/Stavudine,Generic,150/200/30mg,Tablet - FDC,60.0,22925.0,110040.0,4.8,0.08,Hetero Unit III Hyderabad IN,Yes,3723.0,19056.13,expensive,181.57
3,17648.0,South Africa,PMO - US,Direct Drop,DDP,Ocean,0.0,No,ARV,Adult,Aurobindo Pharma Limited,"Lamivudine 150mg, tablets, 60 Tabs",Lamivudine,Generic,150mg,Tablet,60.0,152535.0,361507.95,2.37,0.04,"Aurobindo Unit III, India",Yes,7698.0,11372.23,expensive,779.41
4,5647.0,Uganda,PMO - US,Direct Drop,EXW,Air,0.0,No,HRDT,HIV test - Ancillary,"Orgenics, Ltd","HIV, Lancet, Safety, for HIV Test kits, 100 Pcs","HIV, Lancet, Safety, for HIV Test kits, 100 Pcs",Generic,,Test kit - Ancillary,100.0,850.0,8.5,0.01,0.0,Inverness Japan,Yes,56.0,360.0,reasonable,0.01


In [43]:
# Kruskal-Wallis test: compare weight_kilograms across the shipment_mode groups
kw_test = pingouin.kruskal(
    data=late_shipments,
    dv='weight_kilograms',
    between='shipment_mode',
)

kw_test

Unnamed: 0,Source,ddof1,H,p-unc
Kruskal,shipment_mode,2,125.096618,6.848799e-28
