In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import pandas as pd

In [None]:
# Step 2: Load Data as Numpy Matrices
# Each CSV is parsed into a 2-D float matrix. `skip_header=1` drops the first
# line of the file; fields that cannot be parsed as floats come back as NaN.
male_url = "https://raw.githubusercontent.com/gagolews/teaching-data/master/marek/nhanes_adult_male_bmx_2020.csv"
female_url = "https://raw.githubusercontent.com/gagolews/teaching-data/master/marek/nhanes_adult_female_bmx_2020.csv"


def _read_matrix(url):
    """Download one comma-separated table and return it as a float matrix."""
    return np.genfromtxt(url, delimiter=',', skip_header=1)


male = _read_matrix(male_url)
female = _read_matrix(female_url)

# Quick sanity check: rows = participants, columns = body measurements.
print("Male data shape:", male.shape)
print("Female data shape:", female.shape)

In [None]:
# Step 3: Histograms for Male and Female Weights (NaN-safe)
# Column 0 is used as the weight column; only non-NaN entries are kept.
male_column, female_column = male[:, 0], female[:, 0]
male_weights = male_column[~np.isnan(male_column)]
female_weights = female_column[~np.isnan(female_column)]

# Stop early if either group has no usable observation left.
if min(len(male_weights), len(female_weights)) == 0:
    raise ValueError("No valid weight data after NaN removal")

# A common x-range makes the two panels directly comparable.
global_min = min(np.nanmin(male_weights), np.nanmin(female_weights))
global_max = max(np.nanmax(male_weights), np.nanmax(female_weights))

# Two stacked panels: female on top, male below.
fig, (ax_female, ax_male) = plt.subplots(2, 1, figsize=(10, 6))

ax_female.hist(female_weights, bins=30, color='pink', edgecolor='black')
ax_female.set_title('Female Weights Distribution')
ax_female.set_ylabel('Frequency')
ax_female.set_xlim(global_min, global_max)

ax_male.hist(male_weights, bins=30, color='blue', edgecolor='black')
ax_male.set_title('Male Weights Distribution')
ax_male.set_xlabel('Weight (kg)')
ax_male.set_ylabel('Frequency')
ax_male.set_xlim(global_min, global_max)

fig.tight_layout()
plt.show()

In [None]:
# Step 4: Boxplot Comparing Male and Female Weights
# The two NaN-free weight vectors from Step 3 are drawn side by side.
plt.figure(figsize=(8, 6))
plt.boxplot([female_weights, male_weights])
# Boxes are drawn at x = 1, 2. The tick labels are set explicitly instead of
# through boxplot's `labels=` keyword, which is deprecated since Matplotlib 3.9
# (renamed to `tick_labels`); plt.xticks works on old and new versions alike.
plt.xticks([1, 2], ['Female', 'Male'])
plt.title('Weight Distribution Comparison')
plt.ylabel('Weight (kg)')
plt.show()

In [None]:
# Step 5: Numerical Aggregates
def compute_aggregates(data, name):
    """Print location, spread and shape statistics of a 1-D sample.

    Parameters
    ----------
    data : array-like of numbers
        The observations. Converted with ``np.asarray(..., dtype=float)``,
        so plain Python lists are accepted as well as NumPy arrays.
        NaN values are not filtered here; remove them beforehand.
    name : str
        Label used in the heading of the printed report.

    Returns
    -------
    None
        The report is written to standard output.

    Notes
    -----
    * ``Std`` is the sample standard deviation (``ddof=1``).
    * ``Skewness`` and ``Kurtosis`` are the moment-based estimators
      ``m3 / m2**1.5`` and ``m4 / m2**2 - 3`` (excess kurtosis), where ``m_k``
      is the k-th central moment with a 1/n normalisation. Numerator and
      denominator therefore use the same normalisation, which is also the
      default definition used by ``scipy.stats.skew`` / ``scipy.stats.kurtosis``.
    """
    values = np.asarray(data, dtype=float)

    mean = np.mean(values)
    median = np.median(values)
    std = np.std(values, ddof=1)

    # Central moments, all with the same 1/n normalisation.
    centred = values - mean
    m2 = np.mean(centred**2)
    m3 = np.mean(centred**3)
    m4 = np.mean(centred**4)

    skew = m3 / m2**1.5
    kurtosis = m4 / m2**2 - 3  # excess kurtosis: 0 for a normal distribution
    print(f"{name} Aggregates:\nMean: {mean:.2f}, Median: {median:.2f}, Std: {std:.2f}, Skewness: {skew:.2f}, Kurtosis: {kurtosis:.2f}\n")

# Report the same set of aggregates for both groups, female first.
for weights, label in (
    (female_weights, "Female Weights"),
    (male_weights, "Male Weights"),
):
    compute_aggregates(weights, label)

In [None]:
# Step 6: Add BMI Column to Female Matrix
# BMI = weight (kg) / (height (m))^2
# Column 1 (height) is divided by 100 before squaring; column 0 is the weight.
# NOTE: `female` is rebound to the widened matrix, so running this cell twice
# appends a second BMI column.
female_bmi = female[:, 0] / (female[:, 1] / 100)**2
female = np.column_stack((female, female_bmi))  # BMI becomes the last column
print("Female matrix with BMI:", female.shape)

In [None]:
# Step 7: Standardize Female Data (Z-scores)
# z = (x - column mean) / column sample standard deviation (ddof=1).
# `female` has not been cleaned at this point, so NaN-aware statistics are
# used: with plain mean()/std() a single NaN in a column makes that column's
# mean and std NaN and thereby turns the *whole* standardized column into NaN.
# For NaN-free columns nanmean/nanstd give exactly the same result as
# mean/std. Rows that contain NaN stay NaN in the corresponding z-score.
female_col_means = np.nanmean(female, axis=0)
female_col_stds = np.nanstd(female, axis=0, ddof=1)
zfemale = (female - female_col_means) / female_col_stds
print("ZFemale shape:", zfemale.shape)

In [None]:
# --------------------------
# Step 8: Handle Missing Values & Recompute Z-Scores
# --------------------------
# Remove rows with NaN in relevant columns BEFORE standardization
cols_to_keep = [0, 1, 5, 6]  # Weight, Height, Hip, Waist (indices from original female matrix)
complete_rows = ~np.isnan(female[:, cols_to_keep]).any(axis=1)

# Rebuild the matrix from the raw measurement columns only (indices 0-6, the
# layout assumed throughout this notebook) and append a freshly computed BMI.
# `female` may already carry derived columns appended by other cells; slicing
# them off guarantees that BMI ends up in column 7 exactly once instead of
# being appended a second time as a duplicate column.
female_clean = female[complete_rows][:, :7]
bmi_clean = female_clean[:, 0] / (female_clean[:, 1] / 100)**2
female_clean = np.hstack((female_clean, bmi_clean.reshape(-1, 1)))

# Z-scores on the cleaned data. NaN-aware statistics are used because only
# the columns in `cols_to_keep` were filtered; for NaN-free columns they are
# identical to mean()/std().
zfemale_clean = (
    (female_clean - np.nanmean(female_clean, axis=0))
    / np.nanstd(female_clean, axis=0, ddof=1)
)

# --------------------------
# Step 8 (continued): Scatterplot Matrix & Correlations (NaN-Free)
# --------------------------
selected_cols = zfemale_clean[:, [0, 1, 6, 5, 7]]  # Weight, Height, Waist, Hip, BMI
df = pd.DataFrame(selected_cols, columns=['Weight', 'Height', 'Waist', 'Hip', 'BMI'])

# Plot
scatter_matrix(df, figsize=(12, 12), diagonal='kde')
plt.show()

# Compute correlations (rows/columns follow the order of `df.columns`)
pearson_corr = np.corrcoef(selected_cols, rowvar=False)
spearman_corr = df.corr(method='spearman').values

print("Pearson Correlation:\n", np.round(pearson_corr, 2))
print("\nSpearman Correlation:\n", np.round(spearman_corr, 2))

In [None]:
# Step 9: Add Waist-to-Height and Waist-to-Hip Ratios
# Column indices used here: 1 = height, 5 = hip, 6 = waist.
# Both ratios are appended, in this order, as the last two columns of each
# matrix. NOTE: `male` and `female` are rebound to the widened matrices, so
# re-running this cell appends the two columns again.

# Male participants
male_wh = male[:, 6] / male[:, 1]    # Waist-to-height
male_whr = male[:, 6] / male[:, 5]   # Waist-to-hip
male = np.column_stack((male, male_wh, male_whr))

# Female participants
female_wh = female[:, 6] / female[:, 1]    # Waist-to-height
female_whr = female[:, 6] / female[:, 5]   # Waist-to-hip
female = np.column_stack((female, female_wh, female_whr))

print("Male matrix with ratios:", male.shape)
print("Female matrix with ratios:", female.shape)

In [None]:
# --------------------------
# Clean Data for Both Genders
# --------------------------
# Remove rows with NaN in critical columns (weight, height, hip, waist)
cols_to_clean = [0, 1, 5, 6]  # Indices: weight, height, hip, waist

male_clean = male[~np.isnan(male[:, cols_to_clean]).any(axis=1)]
female_clean = female[~np.isnan(female[:, cols_to_clean]).any(axis=1)]

# --------------------------
# Recompute Ratios on Cleaned Data
# --------------------------
# Add waist-to-height (WHtR) and waist-to-hip (WHR) ratios
male_wh = male_clean[:, 6] / male_clean[:, 1]    # Waist-to-height
male_whr = male_clean[:, 6] / male_clean[:, 5]   # Waist-to-hip
male_clean = np.hstack((male_clean, male_wh.reshape(-1, 1), male_whr.reshape(-1, 1)))

female_wh = female_clean[:, 6] / female_clean[:, 1]    # Waist-to-height
female_whr = female_clean[:, 6] / female_clean[:, 5]   # Waist-to-hip
female_clean = np.hstack((female_clean, female_wh.reshape(-1, 1), female_whr.reshape(-1, 1)))

# --------------------------
# Corrected Boxplot
# --------------------------
# Plot the ratio vectors computed just above rather than looking them up by
# hard-coded column position: the position of a ratio column inside
# `male_clean` / `female_clean` depends on how many derived columns earlier
# cells appended (and differs between the two matrices), whereas the vectors
# are always the ratios of the cleaned rows.
plt.figure(figsize=(10, 6))
plt.boxplot([
    female_wh,   # Female WHtR
    male_wh,     # Male WHtR
    female_whr,  # Female WHR
    male_whr     # Male WHR
])
# Boxes sit at x = 1..4. Labels are set via xticks because boxplot's `labels=`
# keyword is deprecated since Matplotlib 3.9 (renamed to `tick_labels`).
plt.xticks([1, 2, 3, 4], ['F WHtR', 'M WHtR', 'F WHR', 'M WHR'])
plt.title('Waist-to-Height (WHtR) and Waist-to-Hip (WHR) Ratios')
plt.ylabel('Ratio')
plt.grid(True)
plt.show()

In [None]:
# --------------------------
# Clean Data (same procedure as in Step 8)
# --------------------------
# Remove NaN from critical columns (weight, height, hip, waist)
cols_to_clean = [0, 1, 5, 6]  # Indices: weight, height, hip, waist
complete_rows = ~np.isnan(female[:, cols_to_clean]).any(axis=1)

# Keep only the raw measurement columns (indices 0-6, the layout assumed
# throughout this notebook) and append a freshly computed BMI. `female` may
# already carry derived columns (BMI, ratios) appended by other cells;
# slicing them off guarantees that column 7 is BMI, present exactly once,
# regardless of which cells were run before this one.
female_clean = female[complete_rows][:, :7]
bmi_clean = female_clean[:, 0] / (female_clean[:, 1] / 100)**2
female_clean = np.hstack((female_clean, bmi_clean.reshape(-1, 1)))

# Z-scores on the cleaned data. NaN-aware statistics are used because only
# the columns in `cols_to_clean` were filtered; for NaN-free columns they are
# identical to mean()/std().
zfemale_clean = (
    (female_clean - np.nanmean(female_clean, axis=0))
    / np.nanstd(female_clean, axis=0, ddof=1)
)

# --------------------------
# Step 12: Extreme BMIs (NaN-Free)
# --------------------------
# Row order by ascending BMI (column 7 of the cleaned matrix)
bmi_indices = np.argsort(female_clean[:, 7])
# First five = lowest BMI, last five = highest BMI
extreme_indices = np.concatenate([bmi_indices[:5], bmi_indices[-5:]])

# Print standardized data from CLEANED zfemale
# (columns: the 7 raw measurements followed by BMI)
print("Standardized Data for Extreme BMIs (Cleaned):")
print(zfemale_clean[extreme_indices])

References:

Re-watch the video lectures for each topic on the LMS as well as on YouTube.