### 1

# Import Libraries

In [1]:
import os # to create the reports directory

import numpy as np # to handle arrays and numbers
import pandas as pd # to handle dataset files
import statsmodels.api as sm # for linear regression
from statsmodels.formula.api import ols # for linear regression
import seaborn as sns # for visualization

from sklearn.metrics import mean_squared_error # to compute MSE
import matplotlib.pyplot as plt # for vizualization
from matplotlib.pyplot import subplots # for vizualization subplots

Part 1

In [2]:
# Load the raw frailty dataset; the path is relative to the working directory.
raw_csv_path = "frailty_females_raw.csv"
df = pd.read_csv(raw_csv_path)

# Show the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,Height,Weight,Age,Grip_strength,Frailty
0,65.8,112,30,30,N
1,71.5,136,19,31,N
2,69.4,153,45,29,N
3,68.2,142,22,28,Y
4,67.8,144,29,24,Y


a. Unit standardization

In [3]:
# Standardise units: Height from inches to metres, Weight from pounds to kilograms.
metres_per_inch = 0.0254
kg_per_pound = 0.45359237

df["Height_m"] = df["Height"].mul(metres_per_inch)
df["Weight_kg"] = df["Weight"].mul(kg_per_pound)

# Show the first rows, now including the two converted columns.
df.head()

Unnamed: 0,Height,Weight,Age,Grip_strength,Frailty,Height_m,Weight_kg
0,65.8,112,30,30,N,1.67132,50.802345
1,71.5,136,19,31,N,1.8161,61.688562
2,69.4,153,45,29,N,1.76276,69.399633
3,68.2,142,22,28,Y,1.73228,64.410117
4,67.8,144,29,24,Y,1.72212,65.317301


b. Feature engineering

In [4]:
# i. BMI = weight (kg) divided by height (m) squared, kept to 2 decimals
height_squared = df["Height_m"].pow(2)
df["BMI"] = df["Weight_kg"].div(height_squared).round(2)

# ii. Create AgeGroup column
def age_group(age):
    """Return the age-bracket label for an age given in years.

    The brackets are contiguous: each one runs up to (but not including)
    the first whole year of the next bracket. Whole-number ages therefore
    map exactly as the labels read, and fractional ages stay in the bracket
    of the completed year (45.5 -> "30–45", 60.5 -> "46–60") instead of
    falling through to ">60".

    Parameters
    ----------
    age : int or float
        Age in years. May be missing (NaN / None / pd.NA).

    Returns
    -------
    str or float
        One of "<30", "30–45", "46–60", ">60"; NaN when ``age`` is missing,
        so that a missing age is not silently labelled ">60".
    """
    # Every comparison with NaN is False, which would otherwise send a
    # missing age down to the final ">60" branch.
    if pd.isna(age):
        return np.nan
    if age < 30:
        return "<30"
    if age < 46:
        return "30–45"
    if age < 61:
        return "46–60"
    return ">60"

# Label each row with the bracket its Age falls into.
age_labels = df["Age"].map(age_group)
df["AgeGroup"] = age_labels

# Show the first rows with the engineered BMI / AgeGroup columns.
df.head()


Unnamed: 0,Height,Weight,Age,Grip_strength,Frailty,Height_m,Weight_kg,BMI,AgeGroup
0,65.8,112,30,30,N,1.67132,50.802345,18.19,30–45
1,71.5,136,19,31,N,1.8161,61.688562,18.7,<30
2,69.4,153,45,29,N,1.76276,69.399633,22.33,30–45
3,68.2,142,22,28,Y,1.73228,64.410117,21.46,<30
4,67.8,144,29,24,Y,1.72212,65.317301,22.02,<30


c. Categorical → numeric encoding

In [5]:
# i. Frailty flag: "Y" becomes 1 and "N" becomes 0, stored as int8
frailty_codes = {"Y": 1, "N": 0}
df["Frailty_binary"] = df["Frailty"].map(frailty_codes).astype("int8")

# ii. One 0/1 indicator column per age bracket, named AgeGroup_<label>
age_brackets = ("<30", "30–45", "46–60", ">60")
for group in age_brackets:
    in_bracket = df["AgeGroup"].eq(group)
    df[f"AgeGroup_{group}"] = in_bracket.astype("int8")

# Show the first rows with the encoded columns appended.
df.head()

Unnamed: 0,Height,Weight,Age,Grip_strength,Frailty,Height_m,Weight_kg,BMI,AgeGroup,Frailty_binary,AgeGroup_<30,AgeGroup_30–45,AgeGroup_46–60,AgeGroup_>60
0,65.8,112,30,30,N,1.67132,50.802345,18.19,30–45,0,0,1,0,0
1,71.5,136,19,31,N,1.8161,61.688562,18.7,<30,0,1,0,0,0
2,69.4,153,45,29,N,1.76276,69.399633,22.33,30–45,0,0,1,0,0
3,68.2,142,22,28,Y,1.73228,64.410117,21.46,<30,1,1,0,0,0
4,67.8,144,29,24,Y,1.72212,65.317301,22.02,<30,1,1,0,0,0


In [6]:
# Summarise every numeric column and start the findings report with that table.

# Select numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Compute summary: mean, median and sample standard deviation (ddof=1),
# one row per numeric column, rounded to 4 decimals.
summary = pd.DataFrame({
    "mean": df[numeric_cols].mean(),
    "median": df[numeric_cols].median(),
    "std": df[numeric_cols].std(ddof=1)
}).round(4)

# Create the 'reports' directory if it doesn't exist. exist_ok=True makes the
# call a no-op on re-runs, with no separate existence check needed.
os.makedirs("reports", exist_ok=True)

# Save summary to Markdown file. Mode "w" starts the report from scratch on
# every run. The encoding is pinned to UTF-8 because the text contains
# non-ASCII characters (the em dash in the title, en dashes in the AgeGroup
# column names); the platform default encoding would make the output
# locale-dependent.
# NOTE: DataFrame.to_markdown() needs the optional `tabulate` package.
with open("reports/findings.md", "w", encoding="utf-8") as f:
    f.write("# Question 1 — EDA & Findings\n\n")
    f.write("## Summary statistics (numeric columns)\n")
    f.write(summary.to_markdown())
    f.write("\n\n")

In [7]:
# Correlation between grip strength and frailty: Pearson correlation
# (the Series.corr default) against the 0/1 frailty flag.
corr_val = df["Grip_strength"].corr(df["Frailty_binary"])

# Append (mode "a") to the findings report instead of overwriting it.
# The encoding is pinned to UTF-8 because the heading contains "↔", which
# legacy locale encodings such as cp1252 cannot encode; without it this
# write raises UnicodeEncodeError on a default Windows setup.
with open("reports/findings.md", "a", encoding="utf-8") as f:
    f.write("## Grip strength ↔ Frailty (binary) correlation\n")
    f.write(f"Pearson correlation: **{corr_val:.4f}**\n\n")
    f.write("**Interpretation:** A negative correlation means that higher grip "
            "strength is associated with lower frailty.\n")