In [None]:
#from src.io_utils import load_data, coerce_numeric 

#df = coerce_numeric(load_data("data/health_study_dataset.csv"))

#df.info()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from scipy import stats

# Load the health study dataset; the CSV path is relative to the working directory.
df = pd.read_csv("data/health_study_dataset.csv")

In [None]:
# Summary statistics (mean, median, min, max) for age, weight, height,
# systolic_bp and cholesterol, shown with readable column labels.
#
# The intermediate frame is deliberately NOT called `stats`: that name is the
# scipy module imported at the top of the notebook, and rebinding it to a
# DataFrame would break any later `stats.<function>` call.

summary_labels = {
    "age": "Age (years)",
    "weight": "Weight (kg)",
    "height": "Height (cm)",
    "systolic_bp": "Systolic Blood Pressure (mmHg)",
    "cholesterol": "Cholesterol (mmol/L)",
}

# dict order is preserved, so the columns appear in the order listed above.
summary_raw = df[list(summary_labels)].agg(["mean", "median", "min", "max"])
stats_summary = summary_raw.rename(columns=summary_labels)
stats_summary.round(1)


In [None]:
# Histogram of systolic blood pressure (30 bins) over all participants.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df["systolic_bp"], bins=30, edgecolor="black")
ax.set_title("Distribution of Systolic Blood Pressure")
ax.set_xlabel("Systolic Blood Pressure (mmHg)")
ax.set_ylabel("Number of Participants")
fig.tight_layout()


In [None]:
# Bar plot comparing the number of smokers and non-smokers.

total_smokers = df["smoker"].value_counts()

# rot=0 keeps the category labels horizontal.
ax = total_smokers.plot(kind="bar", edgecolor="black", rot=0)
ax.set_xlabel("")
ax.set_ylabel("Number of Participants")
ax.set_title("Smoking Status of Participants")
ax.figure.tight_layout()

In [None]:
# Boxplot of the weight distribution, one box per value of the `sex` column.

# Row subsets for each group, then the weight column of each subset.
F_weight = df.loc[df["sex"] == "F"]
M_weight = df.loc[df["sex"] == "M"]
female_weight, male_weight = F_weight["weight"], M_weight["weight"]

fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(
    [female_weight, male_weight],
    tick_labels=["Female", "Male"],
    patch_artist=True,
)
ax.set_title("Weight Distribution by Gender")
ax.set_ylabel("Weight (kg)")
fig.tight_layout()

In [None]:
# Observed proportion of participants with the disease.
# Note: despite its name, `disease_count` holds the mean of the `disease`
# column, which is formatted as a percentage below.

disease_count = df["disease"].mean()
print(f"Actual proportion of participants with the disease: {disease_count:.2%}")

# Simulate the disease status of 1000 participants: each uniform draw below
# the observed proportion counts as a case (1), otherwise 0.

np.random.seed(42)
n = 1000

uniform_draws = np.random.random(n)
simulated = (uniform_draws < disease_count).astype(int)
simulated_count = simulated.mean()
print(f"Simulated proportion of participants with the disease: {simulated_count:.2%}")

# Gap between the observed and the simulated proportion.

diff = disease_count - simulated_count
print(f"The difference: {diff:.2%}")

In [None]:
# Mean systolic blood pressure over the full dataset, used later as the
# reference value for the sample-based estimates.

# The global seed is reset here although this cell draws no random numbers.
np.random.seed(123)
sbp = df["systolic_bp"]
true_mean = float(sbp.mean())
true_mean

In [None]:
# Draw a random sample of 40 systolic blood pressure values (with replacement)
# and compute its mean, sample standard deviation and standard error.

np.random.seed(35)
n = 40
x = np.random.choice(sbp, size=n, replace=True)

mean_x = float(x.mean())
std = float(x.std(ddof=1))  # ddof=1: sample (n - 1) standard deviation
se = std / np.sqrt(n)
mean_x, std, n, se

In [None]:
# Confidence interval for systolic blood pressure

from math import sqrt

def ci_mean_normal(x, confidence=0.95):
    """Normal-approximation confidence interval for the mean of a sample.

    Parameters
    ----------
    x : array-like
        Sample observations; converted to a float array. At least two values
        are needed for the sample standard deviation to be defined.
    confidence : float, default 0.95
        Two-sided confidence level, strictly between 0 and 1.

    Returns
    -------
    tuple
        ``(lo, hi, mean_x, std, n)``: the interval bounds, the sample mean,
        the sample standard deviation (``ddof=1``) and the sample size.

    Raises
    ------
    ValueError
        If ``confidence`` is not strictly between 0 and 1.
    """
    # Local import keeps the function self-contained.
    from statistics import NormalDist

    if not 0 < confidence < 1:
        raise ValueError(
            f"confidence must be strictly between 0 and 1, got {confidence!r}"
        )

    x = np.asarray(x, dtype=float)
    mean_x = float(np.mean(x))
    std = float(np.std(x, ddof=1))
    n = len(x)

    # Two-sided critical value derived from the requested level
    # (about 1.96 for 0.95) instead of a hard-coded constant.
    z_critical = NormalDist().inv_cdf(0.5 + confidence / 2)
    half_width = z_critical * std / sqrt(n)
    lo, hi = mean_x - half_width, mean_x + half_width
    return lo, hi, mean_x, std, n

# Interval for the sample `x` at the function's default confidence level.
# Note: this rebinds the globals mean_x, std and n with the returned values.
lo, hi, mean_x, std, n = ci_mean_normal(x)
# Show the interval next to the sample statistics and the full-data mean for comparison.
(lo, hi), mean_x, std, n , true_mean

In [None]:
# Systolic blood pressure for each smoking group and the observed
# difference in group means (smokers minus non-smokers).
smokers = df.loc[df["smoker"] == "Yes", "systolic_bp"]
nonsmokers = df.loc[df["smoker"] == "No", "systolic_bp"]

difference = smokers.mean() - nonsmokers.mean()
print(f"Difference: {difference:.2f} mmHg")

In [None]:
# Bootstrap test of the one-sided hypothesis that smokers have a higher mean
# systolic blood pressure than non-smokers (H0: the group means are equal).

np.random.seed(2024)

n_boot = 10_000
obs_diff = smokers.mean() - nonsmokers.mean()

# Resample each group with replacement and record the difference in means.
boot_diffs = np.empty(n_boot)
for i in range(n_boot):
    smokers_star = np.random.choice(smokers, size=len(smokers), replace=True)
    nonsmokers_star = np.random.choice(nonsmokers, size=len(nonsmokers), replace=True)
    boot_diffs[i] = smokers_star.mean() - nonsmokers_star.mean()

# The raw bootstrap differences are centred on obs_diff, so comparing them with
# obs_diff directly yields p of about 0.5 regardless of the data. Shifting them
# by obs_diff gives the distribution of the difference under H0 (equivalent to
# re-centring both groups on a common mean before resampling).
null_diffs = boot_diffs - obs_diff

# One-sided p-value: share of null differences at least as large as the
# observed one. The +1 terms keep the estimate from being exactly zero.
p_boot = (np.sum(null_diffs >= obs_diff) + 1) / (n_boot + 1)

# Percentile confidence interval for the difference itself (uncentred draws).
ci_low, ci_high = np.percentile(boot_diffs, [2.5, 97.5])

# NOTE(review): the written interpretation that follows this cell should be
# re-checked against the p-value printed here.
print(obs_diff)
print(p_boot)
print(ci_low, ci_high)

## Förklaring av resultatet:

- Om p < 0.05 finns det stöd för skillnad
- I detta fall är p-värdet stort
- Det visar inte på en signifikant skillnad i blodtryck mellan rökare och icke-rökare
- Hypotesen att rökare har högre medel-blodtryck än icke-rökare stöds därmed inte