In [17]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.ticker as ticker
import seaborn as sns
import numpy as np

In [18]:
# Read the raw measurements file and report the frame's (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer="/content/data.csv")
print(df.shape)

(10, 5)


In [19]:
# Bare expression: the notebook renders the whole frame as this cell's output.
df


Unnamed: 0,Height,Weight,Age,Grip strength,Frailty
0,65.8,112,30,30,N
1,71.5,136,19,31,N
2,69.4,153,45,29,N
3,68.2,142,22,28,Y
4,67.8,144,29,24,Y
5,68.7,123,50,26,N
6,69.8,141,51,22,Y
7,70.1,136,23,20,Y
8,67.9,112,17,19,N
9,66.8,120,39,31,N


Cleaning data

In [20]:
# Show the header names exactly as pandas parsed them, so stray whitespace
# or odd characters in a name become visible.
print("Columns in your file:", df.columns.tolist(), sep="\n")

Columns in your file:
['Height ', 'Weight ', 'Age\xa0', 'Grip strength\xa0', 'Frailty\xa0']


In [21]:
# Normalise the header names: trim surrounding whitespace, then remove any
# non-breaking space (\xa0) that is still embedded in a name.
cleaned_names = df.columns.str.strip().str.replace("\xa0", "", regex=True)
df.columns = cleaned_names

print(list(df.columns))

['Height', 'Weight', 'Age', 'Grip strength', 'Frailty']


In [22]:
# Coerce Height and Weight to numeric dtypes.
for col in ["Height", "Weight"]:
    # A single pass is enough: removing every character that is not a digit,
    # a dot or a minus sign also removes commas, quotes and all whitespace,
    # so no separate strip / comma-removal step is needed.
    cleaned = df[col].astype(str).str.replace(r"[^\d\.\-]", "", regex=True)
    # Anything that still fails to parse (e.g. an empty string) becomes NaN.
    df[col] = pd.to_numeric(cleaned, errors="coerce")

# Each column should report float64 or int64 (int64 when every value is a
# whole number). The trailing "dtype: object" in the printout describes the
# Series returned by .dtypes itself, not the Height/Weight columns.
print(df[["Height", "Weight"]].dtypes)

Height    float64
Weight      int64
dtype: object


In [23]:
# Clean the Age column.
# Keep the decimal point along with digits and the minus sign: stripping "."
# would silently turn a value such as "30.0" into 300. This also matches the
# pattern used for the other numeric columns in this notebook.
df["Age"] = df["Age"].astype(str).str.replace(r"[^\d\.\-]", "", regex=True)
# Values that cannot be parsed become NaN instead of raising.
df["Age"] = pd.to_numeric(df["Age"], errors="coerce")

print(df["Age"].head())   # check values
print(df["Age"].dtypes)   # int64 if every value is a whole number, float64 if any is fractional or NaN


0    30
1    19
2    45
3    22
4    29
Name: Age, dtype: int64
int64


In [24]:
# Second cleaning pass over Age, and the first one for Grip strength:
# text -> trimmed -> only digits / dot / minus kept -> numeric.
for col in ("Age", "Grip strength"):
    as_text = df[col].astype(str).str.strip()
    numeric_chars = as_text.str.replace(r"[^\d\.\-]", "", regex=True)
    # Unparseable leftovers are coerced to NaN rather than raising.
    df[col] = pd.to_numeric(numeric_chars, errors="coerce")

print(df[["Age", "Grip strength"]].dtypes)



Age              int64
Grip strength    int64
dtype: object


A. Unit standardization

In [25]:
# Conversion factors: 0.0254 is metres per inch and 0.45359237 is kilograms
# per pound, so this assumes Height is recorded in inches and Weight in pounds.
METERS_PER_INCH = 0.0254
KG_PER_POUND = 0.45359237

df["Height_m"] = df["Height"].mul(METERS_PER_INCH)
df["Weight_kg"] = df["Weight"].mul(KG_PER_POUND)

# Original and converted columns side by side for a quick sanity check.
preview_cols = ["Height", "Height_m", "Weight", "Weight_kg"]
print(df[preview_cols].head())


   Height  Height_m  Weight  Weight_kg
0    65.8   1.67132     112  50.802345
1    71.5   1.81610     136  61.688562
2    69.4   1.76276     153  69.399633
3    68.2   1.73228     142  64.410117
4    67.8   1.72212     144  65.317301


B. Feature engineering

In [26]:
# BMI = weight (kg) / height (m) squared, rounded to two decimals.
height_squared = df["Height_m"].pow(2)
df["BMI"] = df["Weight_kg"].div(height_squared).round(2)

# Age brackets. pd.cut uses right-closed intervals, so the edges below give
# (-1, 29], (29, 45], (45, 60] and (60, inf).
bins = [-1, 29, 45, 60, np.inf]
labels = ["<30", "30–45", "46–60", ">60"]
df["AgeGroup"] = pd.cut(df["Age"], bins=bins, labels=labels, right=True)

print(df[["Height_m", "Weight_kg", "BMI", "Age", "AgeGroup"]].head())

   Height_m  Weight_kg    BMI  Age AgeGroup
0   1.67132  50.802345  18.19   30    30–45
1   1.81610  61.688562  18.70   19      <30
2   1.76276  69.399633  22.33   45    30–45
3   1.73228  64.410117  21.46   22      <30
4   1.72212  65.317301  22.02   29      <30


First CSV file: raw + engineered columns (the engineered-only file is written further down)

In [27]:
# Save the whole working frame (raw columns plus the converted and
# engineered ones) without the row index.
df.to_csv(path_or_buf="/content/data_full.csv", index=False)

C. Categorical → numeric encoding


In [28]:
#standardize
df["Frailty"] = df["Frailty"].astype(str).str.strip().str.lower()

#Map to binary: Y/Yes/Frail → 1, N/No/Non-frail → 0
frailty_map = {
    "y": 1, "yes": 1, "frail": 1, "1": 1,
    "n": 0, "no": 0, "non-frail": 0, "0": 0
}
df["Frailty_binary"] = df["Frailty"].map(frailty_map).astype("int8")

print(df[["Frailty", "Frailty_binary"]].head())


  Frailty  Frailty_binary
0       n               0
1       n               0
2       n               0
3       y               1
4       y               1


In [29]:
#trimmed version for grading
keep = ["Height_m","Weight_kg","BMI","Age","AgeGroup","Frailty_binary","Grip strength"]
df_proc = df[keep].copy()
df_proc.to_csv("/content/processed_data.csv", index=False)
#only engineered

In [30]:
#One-hot encode AgeGroup, add as new columns
age_dummies = pd.get_dummies(df["AgeGroup"], prefix="AgeGroup")

#Concatenate with main dataframe
df = pd.concat([df, age_dummies], axis=1)

print(df[["Age", "AgeGroup"] + age_dummies.columns.tolist()].head())


   Age AgeGroup  AgeGroup_<30  AgeGroup_30–45  AgeGroup_46–60  AgeGroup_>60
0   30    30–45         False            True           False         False
1   19      <30          True           False           False         False
2   45    30–45         False            True           False         False
3   22      <30          True           False           False         False
4   29      <30          True           False           False         False


D. EDA & Reporting

In [31]:
def _count_rows(values, section):
    """Return value frequencies (NaN included) as long-format report rows."""
    tallies = (
        values.value_counts(dropna=False)
        .rename_axis("variable")
        .reset_index(name="value")
    )
    tallies.insert(0, "section", section)
    tallies["stat"] = "count"
    return tallies[["section", "variable", "stat", "value"]]


# Summary statistics: one row per (variable, stat) pair.
num_cols = ["Height_m", "Weight_kg", "BMI", "Age", "Grip strength"]
summary = (
    df[num_cols]
    .agg(["mean", "median", "std"])
    .T
    .reset_index()
    .rename(columns={"index": "variable"})
)
summary_long = pd.melt(summary, id_vars="variable", var_name="stat", value_name="value")
summary_long.insert(0, "section", "summary_stats")

# Pearson correlation between grip strength and the binary frailty flag.
corr_val = df["Grip strength"].corr(df["Frailty_binary"])
corr_df = pd.DataFrame(
    [
        {
            "section": "correlation",
            "variable": "Grip strength vs Frailty_binary",
            "stat": "pearson_corr",
            "value": corr_val,
        }
    ]
)

# Category counts, kept in the same four-column layout as the rows above.
age_counts = _count_rows(df["AgeGroup"], "agegroup_counts")
frailty_counts = _count_rows(df["Frailty_binary"], "frailty_counts")

# Stack every section into a single long table and write it out.
reports = pd.concat([summary_long, corr_df, age_counts, frailty_counts], ignore_index=True)
reports.to_csv("/content/Full_report.csv", index=False)



In [32]:
# Plain-text dump of the whole working frame (raw, converted, engineered and
# one-hot columns); wide frames wrap across several output blocks.
print(df)


   Height  Weight  Age  Grip strength Frailty  Height_m  Weight_kg    BMI  \
0    65.8     112   30             30       n   1.67132  50.802345  18.19   
1    71.5     136   19             31       n   1.81610  61.688562  18.70   
2    69.4     153   45             29       n   1.76276  69.399633  22.33   
3    68.2     142   22             28       y   1.73228  64.410117  21.46   
4    67.8     144   29             24       y   1.72212  65.317301  22.02   
5    68.7     123   50             26       n   1.74498  55.791862  18.32   
6    69.8     141   51             22       y   1.77292  63.956524  20.35   
7    70.1     136   23             20       y   1.78054  61.688562  19.46   
8    67.9     112   17             19       n   1.72466  50.802345  17.08   
9    66.8     120   39             31       n   1.69672  54.431084  18.91   

  AgeGroup  Frailty_binary  AgeGroup_<30  AgeGroup_30–45  AgeGroup_46–60  \
0    30–45               0         False            True           False   


In [33]:
from tabulate import tabulate


# Load the combined report CSV. The path must match the file written by the
# reporting cell's reports.to_csv(...) call ("Full_report.csv", singular);
# the previous "Full_reports.csv" does not exist on a fresh run.
reports = pd.read_csv("/content/Full_report.csv")

# --- Split by section ---
summary = reports[reports["section"]=="summary_stats"]
corr    = reports[reports["section"]=="correlation"]
agec    = reports[reports["section"]=="agegroup_counts"]
frailc  = reports[reports["section"]=="frailty_counts"]

# --- Build markdown content ---
md_path = "/content/findings.md"

# The report text contains non-ASCII characters (→, ², –), so the encoding is
# set explicitly instead of relying on the platform default.
with open(md_path, "w", encoding="utf-8") as f:
    f.write("# Q1 Findings Report\n\n")

    # Summary stats: pivot the long rows back to one row per variable.
    f.write("## Summary Statistics\n\n")
    summary_table = summary.pivot(index="variable", columns="stat", values="value")
    f.write(tabulate(summary_table, headers="keys", tablefmt="github"))
    f.write("\n\n")

    # Correlation: the section holds a single row.
    f.write("## Correlation Analysis\n\n")
    corr_val = corr["value"].iloc[0]
    f.write(f"- Pearson correlation between **Grip strength** and **Frailty_binary**: **{corr_val:.3f}**\n\n")

    # AgeGroup counts
    f.write("## Age Group Distribution\n\n")
    f.write(tabulate(agec[["variable","value"]].rename(columns={"variable":"AgeGroup","value":"Count"}), headers="keys", tablefmt="github"))
    f.write("\n\n")

    # Frailty counts
    f.write("## Frailty Distribution\n\n")
    f.write(tabulate(frailc[["variable","value"]].rename(columns={"variable":"Frailty_binary","value":"Count"}), headers="keys", tablefmt="github"))
    f.write("\n\n")

    # Notes
    f.write("## Notes\n\n")
    f.write("- Units standardized: Height → meters, Weight → kilograms.\n")
    f.write("- BMI derived as kg/m², rounded to 2 decimals.\n")
    f.write("- Age grouped into: <30, 30–45, 46–60, >60.\n")
    f.write("- Frailty encoded to binary: Y→1, N→0.\n")

print("Markdown report saved to:", md_path)


Markdown report saved to: /content/findings.md


The analysis shows that people in this small dataset are mostly young adults with low but normal BMI values and similar body sizes. Frailty was present in 40% of the sample, and a negative correlation (-0.476) was observed between grip strength and frailty. This means that individuals with lower grip strength were more likely to be classified as frail, which aligns with the existing view that grip strength is a simple biomarker of physical frailty. The dataset is limited in size and lacks older participants, but the findings support the importance of muscle strength in evaluating frailty, and grip strength measurements could perhaps play a role in early detection.

Assignment one:
 part two

In [34]:
# Read the student performance dataset, then show its shape and column index.
data = pd.read_csv(filepath_or_buffer="/content/StudentsPerformance.csv")
print(data.shape, data.columns, sep="\n")

(1000, 8)
Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')


In [35]:
# Count missing values per column (isnull is the pandas alias of isna).
missing_per_column = data.isnull().sum()
print(missing_per_column)

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64


In [36]:
import plotly.express as px

# Reshape to long format: one row per (student, subject) so that the subject
# can drive the box colour.
df_melted = pd.melt(
    data,
    id_vars="gender",
    value_vars=["math score", "reading score"],
    var_name="subject",
    value_name="score",
)

# Side-by-side math / reading boxes for each gender.
axis_labels = {"score": "Score", "gender": "Gender", "subject": "Subject"}
fig = px.box(
    df_melted,
    x="gender",
    y="score",
    color="subject",
    title="Math vs Reading Scores by Gender",
    labels=axis_labels,
)
fig.show()


This chart compares math and reading scores by gender. Female students appear to score higher in reading than in math, while male students score higher in math than in reading. The results suggest a gender pattern: girls tend to perform better in reading, and boys tend to perform better in math.

In [37]:

# Distribution of math scores for each test-preparation status; the same
# column drives both the x-axis grouping and the colour.
prep_col = "test preparation course"
prep_labels = {prep_col: "Test Prep", "math score": "Math Score"}

fig = px.box(
    data,
    x=prep_col,
    y="math score",
    color=prep_col,
    title="Math Scores by Test Preparation Course",
    labels=prep_labels,
)
fig.show()


This shows the effect of test preparation on math scores. Students who completed the test preparation course scored higher in math compared to those with no prep. The median and upper range of scores are higher for the “completed” group, meaning test prep may have a positive impact on performance, though it is a relatively small jump on average.

In [38]:
# Per-student overall average across the three subject scores.
score_cols = ["math score", "reading score", "writing score"]
data["overall_avg"] = data[score_cols].mean(axis="columns")

# Mean of that average within each lunch type (lunch stays a regular column).
avg_scores = data.groupby("lunch", as_index=False)["overall_avg"].mean()

# Bar chart with the mean printed above each bar, to two decimals.
fig = px.bar(
    avg_scores,
    x="lunch",
    y="overall_avg",
    color="lunch",
    text="overall_avg",
    title="Average Overall Performance by Lunch Type",
    labels={"lunch": "Lunch Type", "overall_avg": "Average Score"},
)
fig.update_traces(texttemplate='%{text:.2f}', textposition="outside")
fig.show()


This chart compares average overall performance by lunch type. Students receiving a standard lunch scored higher when compared to those with free/reduced lunch. The results could mean that students with access to standard lunch may perform better academically, possibly reflecting outside differences in resources or support.

In [40]:
# Pairwise Pearson correlations between the three subject scores (2 decimals).
subject_cols = ["math score", "reading score", "writing score"]
corr_matrix = data[subject_cols].corr().round(2)

# Long-format copy: one row per (Subject1, Subject2) pair.
# The heatmap below is drawn from corr_matrix and does not use corr_long.
corr_long = corr_matrix.reset_index().melt(id_vars="index")
corr_long.columns = ["Subject1", "Subject2", "Correlation"]

# Heatmap on a fixed -1..1 colour scale with the values printed in each cell.
fig = px.imshow(
    corr_matrix,
    zmin=-1,
    zmax=1,
    color_continuous_scale="RdBu",
    text_auto=True,
    title="Correlation Between Subject Scores",
)
fig.update_layout(xaxis_title="Subject 1", yaxis_title="Subject 2")
fig.show()


This heatmap shows strong positive correlations among math, reading, and writing scores. The highest is between reading and writing (0.95). Math is also strongly correlated with both reading (0.82) and writing (0.80). From this, it seems academic performance is consistent across subjects.

In [42]:
# Number of students in each test-preparation group, keyed by group name.
counts = data["test preparation course"].value_counts().to_dict()


def _with_sample_size(group):
    """Build the legend label: the group name followed by its sample size."""
    return f"{group} (n={counts[group]})"


# Legend label column: group name + sample size.
data["test_prep_label"] = data["test preparation course"].map(_with_sample_size)

# Scatter of math vs reading, with one OLS trend line fitted per group.
fig = px.scatter(
    data,
    x="reading score",
    y="math score",
    color="test_prep_label",
    trendline="ols",
    trendline_scope="trace",
    title="Math vs Reading Scores by Test Prep Status",
    labels={
        "reading score": "Reading Score",
        "math score": "Math Score",
        "test_prep_label": "Test Prep",
    },
)
fig.update_layout(
    legend_title_text="Test Prep",
    xaxis={"title": "Reading Score"},
    yaxis={"title": "Math Score"},
)
fig.show()

This shows the relationship between reading and math scores, with separate trend lines for students who completed test preparation vs those who did not. Both groups show a strong positive relationship since higher reading scores are linked to higher math scores. The trend lines are similar in slope, but students who completed test prep scored slightly higher in math for the same reading level. This could mean that while reading and math performance are strongly related, test prep may provide a slight benefit.