<a href="https://colab.research.google.com/github/laurrennamber/Machine-Learning-for-Insights-on-a-Coffee-Health-Dataset/blob/main/SUBMISSION_Machine_Learning_Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 2.2 - Implementation (30%)


Implement the exact Machine Learning workflow from Task 2.1 using Python and Scikit-Learn in a Jupiter
Notebook. Additional Python libraries may be used for specific ML models, e.g., XGBoost. Your submitted
Jupiter Notebook should run successfully on Google Colab.

##Technical Requirements:
• Implement all algorithms specified in Task 2.1 (minimum 3)

• Apply your chosen data quality, class imbalance, and validation strategies

• Code must run without errors and produce meaningful results

• Save notebook with outputs visible (cleared notebooks suggest fabricated results in Task 2.3)

##Complexity expectations:
• Basic implementations alone will not achieve high marks

• Code sophistication should match the complexity claimed in Task 2.1

• Advanced techniques attempted in Task 2.1 must be properly implemented

• Professional code structure with clear comments linking to Task 2.1 decisions

##Mismatch penalties:
Implementations that do not match Task 2.1 decisions are subject to mark reductions.

Your code is your proof of understanding, though we will consider cases where certain things from Task 2.1 turned out to be too complex to implement. Ensure you comment code accordingly.

#Imports and Setting Up

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
from sklearn.pipeline import Pipeline
#Chosen models from Decision Point 3
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


!pip -q install imbalanced-learn
from imblearn.over_sampling import SMOTE #Decision Point 6
smote = SMOTE(random_state=42)

#Loading the Data

In [None]:
df = pd.read_csv("coffee_health.csv")
df.head()

Unnamed: 0,ID,Age,Gender,Country,Daily Coffees,Caffeine Intake,Sleep Hours,Sleep Quality,BMI,Heart Rate,Physical Activity,Health Issues,Occupation,Smoker,Drinks Alcohol,Stress Level
0,1.0,40.0,Male,Germany,3.5,328.1,7.5,Good,24.9,78.0,14.5,,Other,No,No,Low
1,2.0,33.0,Male,Germany,1.0,94.1,,Good,20.0,67.0,11.0,,Service,No,No,Low
2,3.0,42.0,Male,Brazil,5.3,503.7,5.9,Fair,22.7,59.0,11.2,Mild,Office,No,No,Medium
3,4.0,53.0,Male,Germany,2.6,249.2,7.3,Good,24.7,71.0,6.6,Mild,Other,No,No,Low
4,5.0,32.0,Female,Spain,3.1,298.0,5.3,Fair,24.1,76.0,8.5,Mild,Student,No,Yes,Medium


# Feature Groups and Targets

In [None]:
#The outcome that I am trying to predict
TARGET_COL = "Stress Level"

# FEATURE SELECTION
#Dropping ID, Occupation etc
drop = ["ID", "Country"]
numeric_cols = [
    "Age",
    "Daily Coffees",
    "Caffeine Intake",
    "Sleep Hours",
    "BMI",
    "Heart Rate"
]
categorical_cols = [
    "Gender",
    "Sleep Quality",
    "Physical Activity",
    "Health Issues",
    "Smoker",
    "Drinks Alcohol"
]

X = df.drop(columns=[TARGET_COL] + drop)
y = df[TARGET_COL]

In [None]:

#Preventing data type errors
# Make categorical columns safe for sklearn
for col in categorical_cols:
    X[col] = X[col].where(X[col].notna(), "Missing")  # replace NaN with a string label
    X[col] = X[col].astype(str)                       # force everything to string


for col in numeric_cols:
    X[col] = pd.to_numeric(X[col], errors="coerce")
#Duplicates were not dropped blindly, and noise (found in EDA) was manageable.

# Stratified Split 80:20
Decision Point 6

In [None]:
# Decision Point 6
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,   # ~80/20
    stratify=y,
    random_state=42
)

print("Train class balance:\n", y_train.value_counts(normalize=True))
print("Test class balance:\n", y_test.value_counts(normalize=True))


Train class balance:
 Stress Level
Low       0.699601
Medium    0.204461
High      0.095938
Name: proportion, dtype: float64
Test class balance:
 Stress Level
Low       0.699552
Medium    0.204285
High      0.096163
Name: proportion, dtype: float64


# Handling Missingness

In [None]:
# Decision Points 1 and 2
#Imputating Median values for numerical
numeric_transformer = SimpleImputer(strategy="median")
#Imputing Categorical as "Missing"
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
## Preprocessing for the tree models RF and GB
preprocess_tree = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop"
)


## scaling numeric pipeline ONLY for Logistic Regression (Decision Point 2)
numeric_transformer_lr = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preprocess_lr = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer_lr, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop"
)

In [None]:
#PREVENTING ERRORS - section advised + code syntax provided by AI
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def score_on_validation(model, X_val_proc, y_val):
    y_pred = model.predict(X_val_proc)
    return {
        "accuracy": accuracy_score(y_val, y_pred),
        "precision": precision_score(y_val, y_pred, average="weighted", zero_division=0),
        "recall": recall_score(y_val, y_pred, average="weighted", zero_division=0),
        "f1": f1_score(y_val, y_pred, average="weighted", zero_division=0),
    }


# Defining Final Eval Function

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
## The following is a function suggested by AI to avoid repetitiveness that was in original code
def final_train_and_test(
    model_name: str,
    model,
    preprocess,
    X_train, y_train,
    X_test, y_test,
    random_state: int = 42
):
    # Fit preprocessing on FULL training set
    X_train_proc = preprocess.fit_transform(X_train)
    X_test_proc  = preprocess.transform(X_test)

    # SMOTE on training only
    smote = SMOTE(random_state=random_state)
    X_train_sm, y_train_sm = smote.fit_resample(X_train_proc, y_train)

    # Train final model
    model.fit(X_train_sm, y_train_sm)

    # Test predictions
    y_pred = model.predict(X_test_proc)

    # Metrics for Validation - Decision Point 7
    results = {
        "Model": model_name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="weighted", zero_division=0),
        "Recall": recall_score(y_test, y_pred, average="weighted", zero_division=0),
        "F1": f1_score(y_test, y_pred, average="weighted", zero_division=0),
        "ConfusionMatrix": confusion_matrix(y_test, y_pred) #Decision Point 7
    }

    print(f"\n=== {model_name} (Test Set) ===")
    print(f"Accuracy : {results['Accuracy']:.4f}")
    print(f"Precision: {results['Precision']:.4f}")
    print(f"Recall   : {results['Recall']:.4f}")
    print(f"F1       : {results['F1']:.4f}")
    print("Confusion matrix:\n", results["ConfusionMatrix"])

    return results


# Splitting and Preparing for Models


In [None]:
# validation split from training only
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)



In [None]:
#Prep for LR and tree models
#Using Logistic Regression preprocessing, with scaling
X_tr_proc_lr = preprocess_lr.fit_transform(X_tr)
X_val_proc_lr = preprocess_lr.transform(X_val)
X_tr_sm_lr, y_tr_sm_lr = smote.fit_resample(X_tr_proc_lr, y_tr)

# Using tree preprocessing, no scaling
X_tr_proc_tree = preprocess_tree.fit_transform(X_tr)
X_val_proc_tree = preprocess_tree.transform(X_val)

X_tr_sm_tree, y_tr_sm_tree = smote.fit_resample(X_tr_proc_tree, y_tr)
#Using SMOTE on the training data addresses Class Imbalance - Decision Point 6

# ***Experiment 1: Logistic Regression***
Decision Point 3

# Logistic Regression


In [None]:
#Tuning C
# Logistic Regression hyperparameter tuning (on training only)
C_values = [0.1, 1.0, 10.0]  # small and controlled range as defined in
#... Decision point 5
best_lr = None
best_f1 = -1
best_C = None
#Validation set being used to tune hyperparameters - Decision Point 5
for C in C_values:
    lr = LogisticRegression(C=C, max_iter=2000)
    lr.fit(X_tr_sm_lr, y_tr_sm_lr)
    scores = score_on_validation(lr, X_val_proc_lr, y_val)
    if scores["f1"] > best_f1:
        best_f1 = scores["f1"]
        best_lr = lr
        best_C = C

best_C, best_f1

print('Best C:', best_C)
print('Best F1 Score:', best_f1)


Best C: 0.1
Best F1 Score: 0.977873496865571


# Training and Evaluation

In [None]:
final_lr = LogisticRegression(C=best_C, max_iter=2000)  # using found best C
lr_results = final_train_and_test(
    model_name="Logistic Regression",
    model=final_lr,
    preprocess=preprocess_lr,
    X_train=X_train, y_train=y_train, #Training Set
    X_test=X_test, y_test=y_test #Test Set - only used once
)


=== Logistic Regression (Test Set) ===
Accuracy : 0.9751
Precision: 0.9756
Recall   : 0.9751
F1       : 0.9753
Confusion matrix:
 [[ 182    2    9]
 [   6 1381   17]
 [  11    5  394]]


# ***Experiment 2: Random Forest***
Decision Point 3

# Random Forest

In [None]:
#TUNING
n_estimators_list = [100, 200]
max_depth_list = [None, 10]  # small, controlled range

best_rf = None
best_rf_params = None
best_rf_f1 = -1
#Using Validation set for Hyperparameter tuning - Decision Point 5
for n_estimators in n_estimators_list:
    for max_depth in max_depth_list:
        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42
        )
        rf.fit(X_tr_sm_tree, y_tr_sm_tree)
        scores = score_on_validation(rf, X_val_proc_tree, y_val)
        if scores["f1"] > best_rf_f1:
            best_rf_f1 = scores["f1"]
            best_rf = rf
            best_rf_params = {"n_estimators": n_estimators, "max_depth": max_depth}

best_rf_params, best_rf_f1

print('Best Random Forest Params:', best_rf_params)
print('Best Random Forest F1 Score:', best_rf_f1)

Best Random Forest Params: {'n_estimators': 100, 'max_depth': 10}
Best Random Forest F1 Score: 0.9975021282039545


# Training and Evaluation

In [None]:
final_rf = RandomForestClassifier(
    n_estimators=best_rf_params["n_estimators"],
    max_depth=best_rf_params["max_depth"],
    random_state=42
)

rf_results = final_train_and_test(
    model_name="Random Forest",
    model=final_rf,
    preprocess=preprocess_tree,
    X_train=X_train, y_train=y_train, #Training Set
    X_test=X_test, y_test=y_test #Test Set - only used once
)



=== Random Forest (Test Set) ===
Accuracy : 0.9910
Precision: 0.9911
Recall   : 0.9910
F1       : 0.9910
Confusion matrix:
 [[ 182    3    8]
 [   0 1402    2]
 [   2    3  405]]


# ***Experiment 3: Gradient Boosting***
Decision Point 3

# Gradient Boosting


In [None]:
#Gradient boosting is sequential and so takes longer than RF and LR
#TUNING
n_estimators_list = [100, 200]
learning_rates = [0.05, 0.1]  # small range as defined in Decision Point 5

best_gb = None
best_gb_params = None
best_gb_f1 = -1
#Using Validation Set to tune hyper parameters - Decision Point 5
for n_estimators in n_estimators_list:
    for lr in learning_rates:
        gb = GradientBoostingClassifier(
            n_estimators=n_estimators,
            learning_rate=lr,
            random_state=42
        )
        gb.fit(X_tr_sm_tree, y_tr_sm_tree)
        scores = score_on_validation(gb, X_val_proc_tree, y_val)
        if scores["f1"] > best_gb_f1:
            best_gb_f1 = scores["f1"]
            best_gb = gb
            best_gb_params = {"n_estimators": n_estimators, "learning_rate": lr}

best_gb_params, best_gb_f1
print('Best Gradient Boosting Params:', best_gb_params)
print('Best Gradient Boosting F1 Score:', best_gb_f1)

Best Gradient Boosting Params: {'n_estimators': 100, 'learning_rate': 0.05}
Best Gradient Boosting F1 Score: 0.9956295085995177


# Training and Evaluation

In [None]:
#Training
final_gb = GradientBoostingClassifier(
    n_estimators=best_gb_params["n_estimators"],
    learning_rate=best_gb_params["learning_rate"],
    random_state=42
)

gb_results = final_train_and_test(
    model_name="Gradient Boosting",
    model=final_gb,
    preprocess=preprocess_tree,
    X_train=X_train, y_train=y_train, #Training set
    X_test=X_test, y_test=y_test #Test Set - only used once
)



=== Gradient Boosting (Test Set) ===
Accuracy : 0.9935
Precision: 0.9935
Recall   : 0.9935
F1       : 0.9935
Confusion matrix:
 [[ 185    3    5]
 [   0 1404    0]
 [   2    3  405]]


# Task 2.3: Results Table

In [None]:
#Syntax for creating data frame generated by AI, variables etc modified by me
results_df = pd.DataFrame([
    {k: v for k, v in lr_results.items() if k != "ConfusionMatrix"},
    {k: v for k, v in rf_results.items() if k != "ConfusionMatrix"},
    {k: v for k, v in gb_results.items() if k != "ConfusionMatrix"},
])

results_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.975087,0.975611,0.975087,0.975277
1,Random Forest,0.991031,0.991051,0.991031,0.99098
2,Gradient Boosting,0.993523,0.993503,0.993523,0.993487
