[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/indicium15/ml-workshop/blob/main/guerrero-et-al.ipynb)

In [None]:
! pip install pandas numpy openpyxl scikit-learn matplotlib seaborn gdown

In [None]:
# Download the workshop files from Google Drive into the data/ folder
import gdown
import os

# Make sure the data folder exists before anything is written into it
os.makedirs("data", exist_ok=True)

# Google Drive file IDs
excel_file_id = "1p195_dWUin95MP8oZEpa_rFDKeVighQmhUtzo45gzns"
pdf_file_id = "1jKC_mAWeSCvrPeTyLJaMi5w90_axClsH"

# Download each file and fail fast if it did not arrive. The downloads run with
# quiet=True, so without this check a failed download would only surface later
# as a confusing error when the workbook is read.
# NOTE(review): depending on the gdown version a failed download may return
# None instead of raising, hence the explicit check of the return value.
for file_id, output_path in (
    (excel_file_id, "data/guerrero-et-al-data.xlsx"),
    (pdf_file_id, "data/guerrero-et-al.pdf"),
):
    downloaded_path = gdown.download(id=file_id, output=output_path, quiet=True)
    if downloaded_path is None or not os.path.isfile(output_path):
        raise FileNotFoundError(
            f"Could not download Google Drive file {file_id} to {output_path}"
        )

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
import matplotlib.pyplot as plt

In [None]:
# === Step 1: Load Data ===
# Read the "normalized" sheet of the downloaded Excel workbook, discard its
# first three columns, and replace every missing value with 0.
# NOTE(review): because missing values become 0 here, the dropna() in Step 4
# should have nothing left to drop, and a missing score is treated as a real 0
# -- confirm that zero-imputation is intended.
workbook_path = "./data/guerrero-et-al-data.xlsx"
df = pd.read_excel(workbook_path, sheet_name="normalized").iloc[:, 3:].fillna(0)
df.head()

In [None]:
# List every remaining column name, one per line, so the exact spellings can be
# copied when choosing predictors and a target.
column_names = df.columns.to_list()
for column_name in column_names:
    print(column_name)

In [None]:
# === Step 2: Derive Features ===
# 'Average physical activity' is the mean of two time-use columns:
# household chores and physical play (indoor & outdoor).
chores_time = df['Child average time spent on household chores']
physical_play_time = df['Child total time spent on physical play (indoor & outdoor)']
df['Average physical activity'] = (chores_time + physical_play_time) / 2
df.head()

In [None]:
# === Step 3: Define Predictors and Target ===
# Every name below must match a column of df exactly.

# Outcome the trees are trained to predict
target_col = 'Total Health QofL'

# Inputs, in order: physical activity, screen time, sleep duration
predictor_cols = [
    'Average physical activity',
    'Child average total media use',
    'Average sleep duration',
]

In [None]:
# === Step 4: Clean Data ===
# Keep only the rows that have a value for every predictor and for the target
required_cols = [*predictor_cols, target_col]
df = df.dropna(subset=required_cols)

# Convert the numeric QoL score into categories (Low/Medium/High) so it can be
# used as the target of a classification tree.
def convert_to_categories(score, low_cutoff=65, high_cutoff=85):
    """Bin a numeric quality-of-life score into 'Low', 'Medium' or 'High'.

    Args:
        score: Numeric score to classify.
        low_cutoff: Scores strictly below this value are 'Low' (default 65).
        high_cutoff: Scores from ``low_cutoff`` up to, but not including, this
            value are 'Medium'; scores at or above it are 'High' (default 85).

    Returns:
        'Low', 'Medium' or 'High', or None when ``score`` is missing
        (None or NaN).
    """
    # NaN compares False against every number, so without this guard a missing
    # score would fall through both comparisons and be labelled 'High'.
    if pd.isna(score):
        return None
    if score < low_cutoff:
        return 'Low'
    if score < high_cutoff:
        return 'Medium'
    return 'High'

# Label every row with the category derived from its numeric target score
df['QoL_category'] = df[target_col].map(convert_to_categories)

In [None]:
# === Step 5: Split into Train/Test ===
# Hold out 20% of the rows for evaluation. The predictors and both targets are
# split in a single call, so every train/test piece covers the same rows.
X = df[predictor_cols]
y_reg = df[target_col]          # numeric target (regression tree)
y_class = df['QoL_category']    # categorical target (classification tree)

(
    X_train, X_test,
    y_train_reg, y_test_reg,
    y_train_class, y_test_class,
) = train_test_split(X, y_reg, y_class, test_size=0.2, random_state=42)

In [None]:
# Both trees share the same settings: depth capped at 4 and a fixed seed
tree_settings = {"max_depth": 4, "random_state": 42}

# === Step 5a: Train Regression Tree ===
reg_tree = DecisionTreeRegressor(**tree_settings).fit(X_train, y_train_reg)

# === Step 5b: Train Classification Tree ===
class_tree = DecisionTreeClassifier(**tree_settings).fit(X_train, y_train_class)
class_tree

In [None]:
# === Step 6a: Evaluate Regression ===

y_pred_reg = reg_tree.predict(X_test)
# mean_squared_error returns the MSE (in squared target units); the square root
# turns it into the RMSE that the label below promises, in the target's units.
rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))
print("Regression RMSE:", rmse)

# === Step 6b: Evaluate Classification ===
y_pred_class = class_tree.predict(X_test)
# Fraction of held-out rows whose predicted category matches the true one
acc = accuracy_score(y_test_class, y_pred_class)
print("Classification Accuracy:", acc)
# Per-class precision / recall / F1 on the held-out rows
print("Classification Report:\n", classification_report(y_test_class, y_pred_class))

In [None]:
# === Step 7: Visualise the Trees ===

# Regression Tree
# Render at 300 dpi (this rcParams setting also applies to figures created later)
plt.rcParams['figure.dpi'] = 300
fig_reg, ax_reg = plt.subplots(figsize=(24, 12))
plot_tree(reg_tree, feature_names=predictor_cols, filled=True, ax=ax_reg)
ax_reg.set_title("Decision Tree (Regression)")
plt.show()

In [None]:
# Classification Tree
# Render at 300 dpi (this rcParams setting also applies to figures created later)
plt.rcParams['figure.dpi'] = 300
plt.figure(figsize=(24, 12))
# classes_ is a NumPy array holding the labels in the order the tree uses them.
# It is converted to a plain list because a list is accepted by every
# plot_tree release, while stricter parameter validation may reject an array.
# NOTE(review): scikit-learn 1.3.0 appears to accept only a list here -- confirm.
plot_tree(
    class_tree,
    feature_names=predictor_cols,
    class_names=class_tree.classes_.tolist(),
    filled=True,
)
plt.title("Decision Tree (Classification)")
plt.show()

In [None]:
# === Step 8: Feature Importance ===
# For each fitted tree, print the importance of every predictor, largest first
for heading, fitted_tree in (
    ("\nFeature Importance (Regression):", reg_tree),
    ("\nFeature Importance (Classification):", class_tree),
):
    importances = pd.Series(fitted_tree.feature_importances_, index=predictor_cols)
    print(heading)
    print(importances.sort_values(ascending=False))