In [1]:
pip install scipy

Defaulting to user installation because normal site-packages is not writeable
Looking in links: /usr/share/pip-wheels
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from scipy.stats import shapiro

# Normality check for the "Volume" column: one Shapiro-Wilk test on the whole
# column, then one per "YOB" group.

# Significance level used for every normality decision in this cell.
ALPHA = 0.05
# Shapiro-Wilk needs at least three observations (see scipy.stats.shapiro docs).
MIN_SHAPIRO_N = 3

# Load data
df = pd.read_csv("YOB_Data.csv")

# Drop rows with a missing Volume so the tests only see real measurements
df = df.dropna(subset=["Volume"])

# Extract the numeric column to test
body_volume = df["Volume"]

# Overall Shapiro-Wilk test (null hypothesis: the sample comes from a normal
# distribution). Note this pools all YOB groups together; a mixture of groups
# can be rejected here even when each group on its own is not.
if len(body_volume) < MIN_SHAPIRO_N:
    print(f"Cannot run Shapiro-Wilk: only {len(body_volume)} non-missing Volume values (need at least {MIN_SHAPIRO_N}).")
else:
    stat, p = shapiro(body_volume)

    print(f"Shapiro-Wilk Test Statistic: {stat:.4f}")
    print(f"P-value: {p:.4f}")

    # Interpretation
    if p < ALPHA:
        print("The data is likely NOT normally distributed.")
    else:
        print("The data is likely normally distributed.")

# Grouped Shapiro-Wilk: repeat the test separately for each YOB value
for group, data in df.groupby("YOB"):
    group_volume = data["Volume"]  # missing values were already dropped above

    # Too-small groups cannot be tested; report them instead of failing or
    # printing a verdict based on a NaN p-value.
    if len(group_volume) < MIN_SHAPIRO_N:
        print(f"{group} - Shapiro-Wilk skipped (n = {len(group_volume)}, need at least {MIN_SHAPIRO_N})")
        continue

    # Separate names so the overall `stat` / `p` above are not overwritten.
    _, group_p = shapiro(group_volume)
    print(f"{group} - Shapiro-Wilk p = {group_p:.4f} ({'normal' if group_p >= ALPHA else 'not normal'})")

Shapiro-Wilk Test Statistic: 0.8375
P-value: 0.0197
The data is likely NOT normally distributed.
2017 - Shapiro-Wilk p = 0.1565 (normal)
2018 - Shapiro-Wilk p = 0.1173 (normal)


In [3]:
from scipy.stats import mannwhitneyu

# Two-sided Mann-Whitney U test comparing "Volume" between two "YOB" groups.
# Relies on `df` loaded in the previous cell.

# The two YOB values being compared.
YOB_A, YOB_B = 2017, 2018

# Select the Volume values of each group, ignoring missing entries
volume_a = df.loc[df["YOB"] == YOB_A, "Volume"].dropna()
volume_b = df.loc[df["YOB"] == YOB_B, "Volume"].dropna()

# Check that both groups have data before performing the test
if len(volume_a) > 0 and len(volume_b) > 0:
    # Perform Mann-Whitney U test
    u_stat, p_value = mannwhitneyu(volume_a, volume_b, alternative='two-sided')

    print(f"Mann-Whitney U statistic: {u_stat:.4f}")
    print(f"P-value: {p_value:.4f}")

    # Interpretation
    if p_value < 0.05:
        print(f"Statistically significant difference in body volumes between the {YOB_A} and {YOB_B} groups.")
    else:
        print("No statistically significant difference in body volumes between the groups.")
else:
    # Print diagnostic information
    print(f"Cannot perform test: {YOB_A} group has {len(volume_a)} samples, {YOB_B} group has {len(volume_b)} samples")
    print("Both groups must have at least one sample.")

    # Show the values actually present in the grouping column, to check
    # whether the YOB filters above match the data
    print("Unique values in YOB column:", df["YOB"].unique())

Mann-Whitney U statistic: 33.0000
P-value: 0.1014
No statistically significant difference in body volumes between the groups.
