# Non-Parametric Tests

## Imports

In [35]:
import pandas as pd
import pingouin

In [14]:
# Read the shipments table and rename the plural 'freight_cost_groups' column
# to the singular 'freight_cost_group' used by the cells that follow.
late_shipments = (
    pd.read_feather('late_shipments.feather')
    .rename(columns={'freight_cost_groups': 'freight_cost_group'})
)
# Preview five randomly drawn rows
late_shipments.sample(n=5)

Unnamed: 0,id,country,managed_by,fulfill_via,vendor_inco_term,shipment_mode,late_delivery,late,product_group,sub_classification,...,line_item_quantity,line_item_value,pack_price,unit_price,manufacturing_site,first_line_designation,weight_kilograms,freight_cost_usd,freight_cost_group,line_item_insurance_usd
503,46844.0,Cote d'Ivoire,PMO - US,Direct Drop,EXW,Air,0.0,No,HRDT,HIV test,...,3049.0,82323.0,27.0,1.35,Chembio Diagnostics Sys. Inc.,Yes,904.0,7208.0,expensive,177.49
693,9749.0,Vietnam,PMO - US,Direct Drop,FCA,Air,0.0,No,ARV,Adult,...,1480.0,121656.0,82.2,0.68,ABBVIE Ludwigshafen Germany,Yes,370.0,2652.33,reasonable,194.65
711,30220.0,Haiti,PMO - US,Direct Drop,EXW,Air,0.0,No,ARV,Adult,...,1440.0,2592.0,1.8,0.03,Hetero Unit III Hyderabad IN,Yes,99.0,1497.53,reasonable,2.66
912,66762.0,Cote d'Ivoire,PMO - US,Direct Drop,EXW,Air,0.0,No,HRDT,HIV test,...,3000.0,240000.0,80.0,0.8,"Alere Medical Co., Ltd.",Yes,1428.0,28142.92,expensive,336.48
774,73335.0,Nigeria,PMO - US,Direct Drop,EXW,Air,0.0,No,HRDT,HIV test,...,6704.0,214528.0,32.0,1.6,"Trinity Biotech, Plc",Yes,2134.0,13771.63,expensive,252.5


In [25]:
# Read the vote-share table; the paired tests later in the notebook use its
# dem_percent_12 and dem_percent_16 columns.
sample_dem_data = pd.read_feather('dem_votes_potus_12_16.feather')
# Preview five randomly drawn rows
sample_dem_data.sample(n=5)

Unnamed: 0,state,county,dem_percent_12,dem_percent_16
131,Iowa,Howard,59.590958,36.784383
218,Minnesota,Renville,44.020752,27.83329
444,Texas,Titus,29.907386,27.571929
2,Alabama,Clay,26.673672,18.674517
202,Michigan,Ottawa,32.300169,31.50826


In [30]:
# Reload the shipments table. The column rename is repeated here on purpose:
# the sample-size cells below index 'freight_cost_group', so reloading the
# raw file (which names the column 'freight_cost_groups') would make them
# raise KeyError when the notebook is run top to bottom.
late_shipments = pd.read_feather('late_shipments.feather').rename(
    columns={
        'freight_cost_groups': 'freight_cost_group'
    })
# Preview five randomly drawn rows
late_shipments.sample(5)

Unnamed: 0,id,country,managed_by,fulfill_via,vendor_inco_term,shipment_mode,late_delivery,late,product_group,sub_classification,...,line_item_quantity,line_item_value,pack_price,unit_price,manufacturing_site,first_line_designation,weight_kilograms,freight_cost_usd,freight_cost_groups,line_item_insurance_usd
516,19246.0,Tanzania,PMO - US,Direct Drop,EXW,Air,0.0,No,ARV,Adult,...,5175.0,116437.5,22.5,0.75,Mylan (formerly Matrix) Nashik,Yes,775.0,4136.58,reasonable,144.03
57,6691.0,Vietnam,PMO - US,Direct Drop,EXW,Air,0.0,No,ARV,Adult,...,100.0,894.0,8.94,0.15,"Ranbaxy, Paonta Shahib, India",Yes,6.0,2794.34,reasonable,1.43
718,46209.0,Rwanda,PMO - US,Direct Drop,EXW,Air,0.0,No,HRDT,HIV test,...,5084.0,355880.0,70.0,0.7,Inverness Japan,Yes,1153.0,46638.41,expensive,697.52
602,9351.0,Tanzania,PMO - US,Direct Drop,EXW,Air,0.0,No,HRDT,HIV test,...,10000.0,200000.0,20.0,0.8,"Standard Diagnostics, Korea",Yes,3345.0,29470.0,expensive,320.0
126,38662.0,Malawi,PMO - US,Direct Drop,EXW,Air,0.0,No,HRDT,HIV test,...,246.0,19680.0,80.0,0.8,"Alere Medical Co., Ltd.",Yes,114.0,3278.11,reasonable,27.59


## Assumptions in hypothesis testing

### Testing sample size

In [15]:
# Tally how many rows fall into each freight_cost_group category
counts = late_shipments['freight_cost_group'].value_counts()

# Show the per-category tallies
print(counts)

# Prints True only when every category has at least 30 rows
print(counts.ge(30).all())

freight_cost_group
expensive     531
reasonable    455
Name: count, dtype: int64
True


In [17]:
# Tally how many rows carry each value of the 'late' flag
counts = late_shipments['late'].value_counts()

# Show the per-value tallies
print(counts)

# Prints True only when every value of 'late' occurs at least 10 times
print(counts.ge(10).all())

late
No     939
Yes     61
Name: count, dtype: int64
True


In [19]:
# Tally freight_cost_group categories separately within each vendor_inco_term
counts = (
    late_shipments
    .groupby('vendor_inco_term')['freight_cost_group']
    .value_counts()
)

# Show the tally for every (vendor_inco_term, freight_cost_group) pair
print(counts)

# Prints True only when every listed pair has at least 5 rows
print(counts.ge(5).all())

vendor_inco_term  freight_cost_group
CIP               reasonable             34
                  expensive              16
DDP               expensive              55
                  reasonable             45
DDU               reasonable              1
EXW               expensive             423
                  reasonable            302
FCA               reasonable             73
                  expensive              37
Name: count, dtype: int64
False


In [20]:
# Tally how many rows use each shipment_mode
counts = late_shipments['shipment_mode'].value_counts()

# Show the per-mode tallies
print(counts)

# Prints True only when every shipment_mode has at least 30 rows
print(counts.ge(30).all())

shipment_mode
Air            906
Ocean           88
Air Charter      6
Name: count, dtype: int64
False


## Non-parametric tests

### Wilcoxon signed-rank test

In [28]:
# Paired, two-sided t-test: each row's dem_percent_12 is compared with the
# same row's dem_percent_16.
paired_test_results = pingouin.ttest(
    sample_dem_data['dem_percent_12'],
    sample_dem_data['dem_percent_16'],
    alternative='two-sided',
    paired=True,
)

# Show the paired t-test results table
print(paired_test_results)

                T  dof alternative          p-val         CI95%   cohen-d  \
T-test  30.298384  499   two-sided  3.600634e-115  [6.39, 7.27]  0.454202   

              BF10 power  
T-test  2.246e+111   NaN  


In [29]:
# Two-sided Wilcoxon signed-rank test on the same pair of columns used for
# the paired t-test (dem_percent_12 vs. dem_percent_16).
wilcoxon_test_results = pingouin.wilcoxon(
    sample_dem_data['dem_percent_12'],
    sample_dem_data['dem_percent_16'],
    alternative='two-sided',
)

# Show the Wilcoxon results table
print(wilcoxon_test_results)

           W-val alternative         p-val       RBC      CLES
Wilcoxon  2401.0   two-sided  1.780396e-77  0.961661  0.644816


## Non-parametric ANOVA and unpaired t-tests

### Wilcoxon-Mann-Whitney

In [33]:
# Keep only the measurement column and the grouping flag
weight_vs_late = late_shipments[['weight_kilograms', 'late']]

# Reshape to wide form: one weight column per value of 'late'
# (the test below reads the resulting 'No' and 'Yes' columns)
weight_vs_late_wide = weight_vs_late.pivot(
    values='weight_kilograms',
    columns='late',
)

# Two-sided Wilcoxon-Mann-Whitney test comparing weights where late == 'No'
# against weights where late == 'Yes'
wmw_test = pingouin.mwu(
    weight_vs_late_wide['No'],
    weight_vs_late_wide['Yes'],
    alternative='two-sided',
)

# Show the test results table
print(wmw_test)

       U-val alternative     p-val       RBC      CLES
MWU  19134.0   two-sided  0.000014 -0.331902  0.334049


### Kruskal-Wallis

In [34]:
# Run a Kruskal-Wallis test on weight_kilograms vs. shipment_mode
kw_test = pingouin.kruskal(
    data=late_shipments,
    dv='weight_kilograms',
    between='shipment_mode'
)

# Print the results
print(kw_test)

                Source  ddof1           H         p-unc
Kruskal  shipment_mode      2  125.096618  6.848799e-28
