In [None]:
import os
import sys

# Make the project-local `src` package importable: the parent of the current
# working directory is treated as the project root and added to sys.path once.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.clean_data import clean_data
from src.load_data import load_data

# Keep the raw frame separate from the cleaned one so both stay inspectable.
df_raw = load_data()
df_clean = clean_data(df_raw)

In [None]:
# Structural overview of the cleaned frame via its info() report.
df_clean.info()

In [None]:
# Summary statistics of the cleaned frame; shown as the cell's output.
df_clean.describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style="darkgrid", palette="muted")

# Numeric columns whose distributions are plotted, one panel per column.
num_features = [
    "Age", "Weight (kg)", "Height (m)",
    "BMI", "Fat_Percentage",
    "Avg_BPM", "Session_Duration (hours)", "Calories_Burned",
    "Workout_Frequency (days/week)", "Experience_Level",
]

# Colour of the dashed vertical line marking each feature's mean.
mean_color = "red"

# Size the grid from the number of features instead of a fixed 4x4 layout,
# so no row is left completely empty and each panel gets more room.
n_cols = 4
n_rows = -(-len(num_features) // n_cols)  # ceiling division

plt.figure(figsize=(14, 10))

# enumerate(..., 1) yields the 1-based subplot position together with the column name.
for i, feature in enumerate(num_features, 1):
    plt.subplot(n_rows, n_cols, i)
    data = df_clean[feature]

    # Histogram with a kernel-density overlay.
    sns.histplot(data, kde=True, bins=30, color="skyblue")

    # Mean marker, labelled with its value in the legend.
    mean_val = data.mean()
    plt.axvline(mean_val, color=mean_color, linestyle='--', linewidth=2, label=f'Mean: {mean_val:.2f}')

    plt.title(f"Distribution of {feature}")
    plt.xlabel(feature)
    plt.ylabel("Frequency")
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Categorical columns shown side by side as count plots.
categorical_features = ["Gender", "Workout_Type"]
plt.figure(figsize=(12, 5))

for i, feature in enumerate(categorical_features, 1):
    ax = plt.subplot(1, 2, i)

    if feature == "Workout_Type":
        # Second panel: split each workout type by gender (one colour per gender).
        sns.countplot(data=df_clean, x=feature, hue="Gender", palette="Set2", ax=ax)
    else:
        # Passing `palette` without `hue` is deprecated in recent seaborn releases.
        # Mapping hue to the x variable keeps one colour per category; dodge=False
        # keeps full-width bars on versions that would otherwise dodge them.
        sns.countplot(data=df_clean, x=feature, hue=feature, palette="Set2", dodge=False, ax=ax)
        # Drop the legend if this seaborn version drew one; it would only
        # repeat the x tick labels.
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()

    ax.set_title(f"Countplot of {feature}")
    ax.set_xlabel(feature)
    ax.set_ylabel("Count")

plt.tight_layout()
plt.show()

In [None]:
# Compute the numeric correlation matrix once; it feeds both the ranked
# listing and the heatmap below.
corr_matrix = df_clean.corr(numeric_only=True)

# Flatten to a Series indexed by (feature_a, feature_b), strongest first.
corr = corr_matrix.unstack().sort_values(ascending=False)

# Keep each unordered pair once: requiring the first label to sort before the
# second drops the diagonal (self-correlation) and the mirrored duplicate
# without relying on a floating-point `< 1` comparison. Undefined (NaN)
# correlations are dropped as before.
first_label = corr.index.get_level_values(0)
second_label = corr.index.get_level_values(1)
low_corr = corr[first_label < second_label].dropna()
print("Low correlations:\n", low_corr)

sns.heatmap(corr_matrix, cmap="YlGnBu", annot=False)
plt.title("Correlation Heatmap (Preview)")
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Switch the global seaborn theme to a white grid for the remaining figures.
# set_theme is the preferred interface (sns.set is only an alias for it) and
# matches the call used for the distribution plots earlier in this notebook.
sns.set_theme(style="whitegrid", palette="muted")

# Metrics compared across workout types, one boxplot panel each.
variables = ["Calories_Burned", "Avg_BPM", "Session_Duration (hours)", "Fat_Percentage", "BMI"]

plt.figure(figsize=(16, 12))
for i, var in enumerate(variables, 1):
    # 3x2 grid; the sixth slot stays unused because there are five metrics.
    plt.subplot(3, 2, i)
    sns.boxplot(x="Workout_Type", y=var, data=df_clean)
    plt.title(f"{var} across Workout Types")
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Points coloured by workout type, with one overall regression line in black.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df_clean, x="Avg_BPM", y="Calories_Burned", hue="Workout_Type", ax=ax)
sns.regplot(data=df_clean, x="Avg_BPM", y="Calories_Burned", scatter=False, color="black", ax=ax)
ax.set_title("Calories Burned vs Avg Heart Rate")
plt.show()


In [None]:
# Points coloured by workout type, with one overall regression line in black.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df_clean, x="Session_Duration (hours)", y="Calories_Burned", hue="Workout_Type", ax=ax)
sns.regplot(data=df_clean, x="Session_Duration (hours)", y="Calories_Burned", scatter=False, color="black", ax=ax)
ax.set_title("Calories Burned vs Session Duration")
plt.show()


In [None]:
# Filled density curves per workout type, one panel per metric in a 2x2 grid.
variables = ["Avg_BPM", "Fat_Percentage", "Calories_Burned", "BMI"]
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
for ax, var in zip(axes.flat, variables):
    sns.kdeplot(data=df_clean, x=var, hue="Workout_Type", fill=True, ax=ax)
    ax.set_title(f"{var} Distribution by Workout Type")
fig.tight_layout()
plt.show()


In [None]:
# Bar height is the mean of Calories_Burned within each workout type.
ax = sns.barplot(
    data=df_clean,
    x='Workout_Type',
    y='Calories_Burned',
    estimator=lambda values: values.mean(),
)
ax.set_title('Average Calories Burned by Workout Type')
plt.show()

In [None]:
# Derived efficiency columns: calories per hour of session and calories per
# unit of average heart rate.
# Non-positive denominators are masked to NaN first, so a zero duration or
# heart rate produces a missing value instead of +/-inf, which would distort
# the correlation matrices and regression fits computed later.
duration_hours = df_clean['Session_Duration (hours)']
avg_bpm = df_clean['Avg_BPM']
df_clean['Calories_per_hour'] = df_clean['Calories_Burned'] / duration_hours.where(duration_hours > 0)
df_clean['Calories_per_heartbeat'] = df_clean['Calories_Burned'] / avg_bpm.where(avg_bpm > 0)

In [None]:
import seaborn as sns, matplotlib.pyplot as plt
# Pairwise correlations (rounded to 3 decimals) among the core metrics and
# the two derived efficiency columns, drawn as an annotated heatmap.
cols = [
    'Calories_Burned',
    'Avg_BPM',
    'Session_Duration (hours)',
    'Fat_Percentage',
    'BMI',
    'Calories_per_hour',
    'Calories_per_heartbeat',
]
corr = df_clean[cols].corr().round(3)
fig, ax = plt.subplots(figsize=(8, 6))
# Colour scale is pinned to [-1, 1].
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation matrix (overall)')
plt.show()

In [None]:
import seaborn as sns, matplotlib.pyplot as plt
# Pairwise correlations (rounded to 3 decimals) among the core metrics plus
# Age, drawn as an annotated heatmap.
cols = [
    'Calories_Burned',
    'Avg_BPM',
    'Session_Duration (hours)',
    'Fat_Percentage',
    "Age",
    'BMI',
]
corr = df_clean[cols].corr().round(3)
fig, ax = plt.subplots(figsize=(8, 6))
# Colour scale is pinned to [-1, 1].
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation matrix (overall)')
plt.show()

In [None]:
import seaborn as sns, matplotlib.pyplot as plt
# Semi-transparent points coloured by workout type, with a thin overall
# regression line in black.
fig, ax = plt.subplots(figsize=(9, 6))
sns.scatterplot(data=df_clean, x='Avg_BPM', y='Calories_Burned', hue='Workout_Type', alpha=0.6, ax=ax)
sns.regplot(
    data=df_clean,
    x='Avg_BPM',
    y='Calories_Burned',
    scatter=False,
    color='black',
    line_kws={'linewidth': 1},
    ax=ax,
)
ax.set_title('Calories Burned vs Avg BPM (by Workout Type)')
plt.show()


In [None]:
import seaborn as sns, matplotlib.pyplot as plt
# Semi-transparent points coloured by workout type, with a thin overall
# regression line in black.
fig, ax = plt.subplots(figsize=(9, 6))
sns.scatterplot(data=df_clean, x='Session_Duration (hours)', y='Calories_Burned', hue='Workout_Type', alpha=0.6, ax=ax)
sns.regplot(
    data=df_clean,
    x='Session_Duration (hours)',
    y='Calories_Burned',
    scatter=False,
    color='black',
    line_kws={'linewidth': 1},
    ax=ax,
)
ax.set_title('Calories Burned vs Session_Duration (by Workout Type)')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One regression panel per workout type (2x2 grid): Avg_BPM vs Calories_Burned.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
workouts = ['Strength', 'Cardio', 'HIIT', 'Yoga']

for ax, workout in zip(axes.flat, workouts):
    # Restrict the fit to rows of this workout type only.
    subset = df_clean[df_clean['Workout_Type'] == workout]
    sns.regplot(
        data=subset,
        x='Avg_BPM',
        y='Calories_Burned',
        scatter_kws={'alpha': 0.5, 's': 20},
        line_kws={'color': 'black'},
        ax=ax,
    )
    ax.set_title(f"{workout}", fontsize=14)
    ax.set_xlabel("Avg_BPM")
    ax.set_ylabel("Calories_Burned")

fig.suptitle("Calories Burned vs Avg BPM", fontsize=16, y=0.95)
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One regression panel per workout type (2x2 grid): session duration vs Calories_Burned.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
workouts = ['Strength', 'Cardio', 'HIIT', 'Yoga']

for ax, workout in zip(axes.flat, workouts):
    # Restrict the fit to rows of this workout type only.
    subset = df_clean[df_clean['Workout_Type'] == workout]
    sns.regplot(
        data=subset,
        x='Session_Duration (hours)',
        y='Calories_Burned',
        scatter_kws={'alpha': 0.5, 's': 20},
        line_kws={'color': 'black'},
        ax=ax,
    )
    ax.set_title(f"{workout}", fontsize=14)
    ax.set_xlabel("Session_Duration (hours)")
    ax.set_ylabel("Calories_Burned")

fig.suptitle("Calories Burned vs Session_Duration", fontsize=16, y=0.95)
fig.tight_layout()
plt.show()

In [None]:
# Faceted regression of Calories_Burned on Fat_Percentage: one panel per
# workout type, wrapped two per row.
facet_layout = {"col": "Workout_Type", "col_wrap": 2}
marker_style = {'alpha': 0.3, 's': 15}
fit_line_style = {'color': 'black'}
sns.lmplot(
    data=df_clean,
    x="Fat_Percentage",
    y="Calories_Burned",
    scatter_kws=marker_style,
    line_kws=fit_line_style,
    **facet_layout,
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One regression panel per workout type (2x2 grid): Calories_per_hour vs Calories_Burned.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
workouts = ['Strength', 'Cardio', 'HIIT', 'Yoga']

for ax, workout in zip(axes.flat, workouts):
    # Restrict the fit to rows of this workout type only.
    subset = df_clean[df_clean['Workout_Type'] == workout]
    sns.regplot(
        data=subset,
        x='Calories_per_hour',
        y='Calories_Burned',
        scatter_kws={'alpha': 0.5, 's': 20},
        line_kws={'color': 'black'},
        ax=ax,
    )
    ax.set_title(f"{workout}", fontsize=14)
    ax.set_xlabel('Calories_per_hour')
    ax.set_ylabel("Calories_Burned")
    # Fix the y-axis range so the four panels share the same scale.
    ax.set_ylim(0, 3000)

fig.suptitle("Calories Burned vs Calories_per_hour", fontsize=16, y=0.95)
fig.tight_layout()
plt.show()

In [None]:
# Faceted regression of Calories_Burned on Calories_per_hour: one panel per
# workout type, wrapped two per row.
facet_layout = {"col": "Workout_Type", "col_wrap": 2}
marker_style = {'alpha': 0.3, 's': 15}
fit_line_style = {'color': 'black'}
sns.lmplot(
    data=df_clean,
    x="Calories_per_hour",
    y="Calories_Burned",
    scatter_kws=marker_style,
    line_kws=fit_line_style,
    **facet_layout,
)



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One regression panel per workout type (2x2 grid): Calories_per_heartbeat vs Calories_Burned.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
workouts = ['Strength', 'Cardio', 'HIIT', 'Yoga']

for ax, workout in zip(axes.flat, workouts):
    # Restrict the fit to rows of this workout type only.
    subset = df_clean[df_clean['Workout_Type'] == workout]
    sns.regplot(
        data=subset,
        x='Calories_per_heartbeat',
        y='Calories_Burned',
        scatter_kws={'alpha': 0.5, 's': 20},
        line_kws={'color': 'black'},
        ax=ax,
    )
    ax.set_title(f"{workout}", fontsize=14)
    ax.set_xlabel('Calories_per_heartbeat')
    ax.set_ylabel("Calories_Burned")

fig.suptitle("Calories Burned vs Calories_per_heartbeat", fontsize=16, y=0.95)
fig.tight_layout()
plt.show()

In [None]:
# Faceted regression of Calories_Burned on Calories_per_heartbeat: one panel
# per workout type, wrapped two per row.
facet_layout = {"col": "Workout_Type", "col_wrap": 2}
marker_style = {'alpha': 0.3, 's': 15}
fit_line_style = {'color': 'black'}
sns.lmplot(
    data=df_clean,
    x="Calories_per_heartbeat",
    y="Calories_Burned",
    scatter_kws=marker_style,
    line_kws=fit_line_style,
    **facet_layout,
)

In [None]:
# Per-workout-type summary table. Calories_per_hour uses the median; every
# other metric uses the mean. Values are rounded to two decimals.
aggregations = {
    'Calories_Burned': 'mean',
    'Calories_per_hour': 'median',
    'Avg_BPM': 'mean',
    'Session_Duration (hours)': 'mean',
    'Fat_Percentage': 'mean',
    'BMI': 'mean',
}
summary = df_clean.groupby('Workout_Type').agg(aggregations).round(2)

summary

In [None]:
import pandas as pd
# Bucket Age into three labelled bands using the bin edges 0, 25, 45 and 120
# (pd.cut's default right-closed intervals).
df_clean['Age_Group'] = pd.cut(df_clean['Age'], bins=[0,25,45,120], labels=['Youth','Adult','Senior'])
# Age_Group is categorical. Passing `observed` explicitly keeps today's
# behaviour (all categories listed, even empty ones) and silences the pandas
# FutureWarning about the default changing.
df_clean.groupby("Age_Group", observed=False)["Age"].describe()

族群差異：Gender × Age_Group（效率差異）

In [None]:
# Violin plots of Calories_per_hour by workout type, faceted into a grid with
# one row per Gender and one column per Age_Group.
g = sns.catplot(
    data=df_clean,
    x="Workout_Type",
    y="Calories_per_hour",
    row="Gender",
    col="Age_Group",
    kind="violin",
    cut=0,
    inner="quartile",
    height=4,
    aspect=1.2,
)
# In every facet, recolour the middle line of each group of three lines in
# red and thicken it.
# NOTE(review): assumes each violin adds exactly three quartile lines in order,
# so indices 1, 4, 7, ... are the medians; confirm for the seaborn version used.
for ax in g.axes.flat:
    for median_line in ax.lines[1::3]:
        median_line.set_color("red")
        median_line.set_linewidth(2)
    

In [None]:
# Violin plots of Calories_per_heartbeat by workout type, faceted into a grid
# with one row per Gender and one column per Age_Group.
g = sns.catplot(
    data=df_clean,
    x="Workout_Type",
    y="Calories_per_heartbeat",
    row="Gender",
    col="Age_Group",
    kind="violin",
    cut=0,
    inner="quartile",
    height=4,
    aspect=1.2,
)
# In every facet, recolour the middle line of each group of three lines in
# red and thicken it.
# NOTE(review): assumes each violin adds exactly three quartile lines in order,
# so indices 1, 4, 7, ... are the medians; confirm for the seaborn version used.
for ax in g.axes.flat:
    for median_line in ax.lines[1::3]:
        median_line.set_color("red")
        median_line.set_linewidth(2)