In [1]:


import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_validate, RandomizedSearchCV
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    balanced_accuracy_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report, make_scorer
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.inspection import permutation_importance

import joblib


## 1) Konfiguracja

In [2]:
class Config:
    excel_path: str = "prawidowy_excel.xlsx"
    sheet_name: str | None = None

    stress_high_threshold: float = 8.0
    test_size: float = 0.35
    random_state: int = 42

cfg = Config()


## 2) Funkcje pomocnicze + testy jednostkowe (czytelne, z liczbami)

In [4]:
import re

def find_stress_column(df: pd.DataFrame) -> str:
    cols_lower = {c: str(c).lower() for c in df.columns}
    stress_candidates = [c for c in df.columns if "stres" in cols_lower[c]]
    if not stress_candidates:
        raise ValueError("Nie znalazłem kolumny stresu (musi zawierać 'stres').")

    for c in stress_candidates:
        cl = cols_lower[c]
        if "poziom" in cl or "skali" in cl or "skala" in cl:
            return c
    return stress_candidates[0]

def coerce_numeric_commas(series: pd.Series) -> pd.Series:
    s = series.astype(str).str.replace(",", ".", regex=False).str.strip()
    s = s.replace({"nan": np.nan, "None": np.nan, "none": np.nan, "": np.nan})
    return pd.to_numeric(s, errors="coerce")

def make_target_from_stress(stress_num: pd.Series, thr: float) -> pd.Series:
    return pd.Series(np.where(stress_num >= thr, "HIGH", "NOT_HIGH"), index=stress_num.index)

def make_preprocess():
    num_sel = make_column_selector(dtype_include=np.number)
    cat_sel = make_column_selector(dtype_exclude=np.number)

    num_pipe = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="median")),
    ])

    cat_pipe = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ])

    return ColumnTransformer(
        transformers=[("num", num_pipe, num_sel), ("cat", cat_pipe, cat_sel)],
        remainder="drop"
    )

#testy
def _test_find_stress_column_verbose():
    print("TEST: find_stress_column\n" + "-"*35)
    df0 = pd.DataFrame({"Poziom stresu (skala 1-10)": [1, 2], "wiek": [20, 21]})
    res0 = find_stress_column(df0)
    print("Kolumny:", list(df0.columns))
    print("Wybrana:", res0)
    assert res0 == "Poziom stresu (skala 1-10)"
    print("OK\n")

    df1 = pd.DataFrame({"stres": [1, 2], "x": [0, 1]})
    res1 = find_stress_column(df1)
    print("Kolumny:", list(df1.columns))
    print("Wybrana:", res1)
    assert res1 == "stres"
    print("OK\n")

    try:
        find_stress_column(pd.DataFrame({"abc": [1]}))
        raise AssertionError("Powinien być wyjątek, a nie był.")
    except ValueError as e:
        print("Oczekiwany wyjątek:", repr(e))
        print("OK\n")

def _test_coerce_numeric_commas_verbose():
    print("TEST: coerce_numeric_commas\n" + "-"*35)
    s = pd.Series(["1,5", "2", "abc", None, ""])
    out = coerce_numeric_commas(s)
    df_show = pd.DataFrame({"in": s, "out": out})
    print(df_show)
    assert np.isclose(out.iloc[0], 1.5)
    assert np.isclose(out.iloc[1], 2.0)
    assert np.isnan(out.iloc[2])
    assert np.isnan(out.iloc[3])
    assert np.isnan(out.iloc[4])
    print("OK\n")

def _test_make_target_from_stress_verbose():
    print("TEST: make_target_from_stress\n" + "-"*35)
    s = pd.Series([7.9, 8.0, 10.0])
    y = make_target_from_stress(s, 8.0)
    df_show = pd.DataFrame({"stress": s, "thr": 8.0, "class": y})
    print(df_show)
    assert list(y) == ["NOT_HIGH", "HIGH", "HIGH"]
    print("OK\n")

_test_find_stress_column_verbose()
_test_coerce_numeric_commas_verbose()
_test_make_target_from_stress_verbose()

print("OK — testy funkcji pomocniczych przeszły.")


TEST: find_stress_column
-----------------------------------
Kolumny: ['Poziom stresu (skala 1-10)', 'wiek']
Wybrana: Poziom stresu (skala 1-10)
OK

Kolumny: ['stres', 'x']
Wybrana: stres
OK

Oczekiwany wyjątek: ValueError("Nie znalazłem kolumny stresu (musi zawierać 'stres').")
OK

TEST: coerce_numeric_commas
-----------------------------------
     in  out
0   1,5  1.5
1     2  2.0
2   abc  NaN
3  None  NaN
4        NaN
OK

TEST: make_target_from_stress
-----------------------------------
   stress  thr     class
0     7.9  8.0  NOT_HIGH
1     8.0  8.0      HIGH
2    10.0  8.0      HIGH
OK

OK — testy funkcji pomocniczych przeszły.


## 3) Wczytanie i przygotowanie danych (X, y)

In [5]:
import os

if not os.path.exists(cfg.excel_path):
    raise FileNotFoundError(f"Nie widzę pliku: {cfg.excel_path}. Ustaw cfg.excel_path poprawnie.")

raw = pd.read_excel(cfg.excel_path, sheet_name=cfg.sheet_name, engine="openpyxl")
df = raw[list(raw.keys())[0]].copy() if isinstance(raw, dict) else raw.copy()
df = df.dropna(how="all")

stress_col = find_stress_column(df)
stress_num = coerce_numeric_commas(df[stress_col])
mask = stress_num.notna()

df = df.loc[mask].copy()
stress_num = stress_num.loc[mask].copy()

y = make_target_from_stress(stress_num, cfg.stress_high_threshold)
y.name = "stress_class"

X = df.drop(columns=[stress_col], errors="ignore").copy()

# Konwersja "liczb z przecinkami" w kolumnach tekstowych, jeśli >=70% to liczby
for c in X.columns:
    if X[c].dtype == "object":
        maybe = coerce_numeric_commas(X[c])
        if maybe.notna().mean() >= 0.70:
            X[c] = maybe

print("Dane X:", X.shape, " | y:", y.shape)
print("Rozkład klas:\n", y.value_counts())
print("Kolumna stresu:", stress_col)


Dane X: (107, 7)  | y: (107,)
Rozkład klas:
 stress_class
NOT_HIGH    90
HIGH        17
Name: count, dtype: int64
Kolumna stresu: na_ile_oceniasz_swoj_poziom_stresu_w_skali_110_1_brak_stresu_10_bardzo_wysoki


## 4) Podział train/test (holdout)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=cfg.test_size,
    random_state=cfg.random_state,
    stratify=y
)

print("Train:", X_train.shape, "Test:", X_test.shape)
print("Train klasy:\n", y_train.value_counts())
print("Test  klasy:\n", y_test.value_counts())


Train: (69, 7) Test: (38, 7)
Train klasy:
 stress_class
NOT_HIGH    58
HIGH        11
Name: count, dtype: int64
Test  klasy:
 stress_class
NOT_HIGH    32
HIGH         6
Name: count, dtype: int64


## 5) Modele bazowe: Logistic Regression vs ExtraTrees (holdout)

In [7]:
f1_high_scorer = make_scorer(f1_score, pos_label="HIGH")

pipe_lr = Pipeline(steps=[
    ("preprocess", make_preprocess()),
    ("model", LogisticRegression(max_iter=5000, class_weight="balanced", random_state=cfg.random_state))
])

pipe_etc = Pipeline(steps=[
    ("preprocess", make_preprocess()),
    ("model", ExtraTreesClassifier(
        n_estimators=600,
        random_state=cfg.random_state,
        n_jobs=1,
        class_weight="balanced",
        min_samples_leaf=2
    ))
])

def eval_holdout(pipe, name):
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)

    ba = balanced_accuracy_score(y_test, pred)
    f1h = f1_score(y_test, pred, pos_label="HIGH")
    cm = confusion_matrix(y_test, pred, labels=["HIGH","NOT_HIGH"])

    print(f"\n{name} — HOLDOUT")
    print("BA:", round(float(ba), 4), " | F1(HIGH):", round(float(f1h), 4))
    print("CM (rows=true [HIGH, NOT_HIGH], cols=pred [HIGH, NOT_HIGH]):\n", cm)
    print(classification_report(y_test, pred))

eval_holdout(pipe_lr, "LogisticRegression")
eval_holdout(pipe_etc, "ExtraTreesClassifier")



LogisticRegression — HOLDOUT
BA: 0.8177  | F1(HIGH): 0.7273
CM (rows=true [HIGH, NOT_HIGH], cols=pred [HIGH, NOT_HIGH]):
 [[ 4  2]
 [ 1 31]]
              precision    recall  f1-score   support

        HIGH       0.80      0.67      0.73         6
    NOT_HIGH       0.94      0.97      0.95        32

    accuracy                           0.92        38
   macro avg       0.87      0.82      0.84        38
weighted avg       0.92      0.92      0.92        38


ExtraTreesClassifier — HOLDOUT
BA: 0.8333  | F1(HIGH): 0.8
CM (rows=true [HIGH, NOT_HIGH], cols=pred [HIGH, NOT_HIGH]):
 [[ 4  2]
 [ 0 32]]
              precision    recall  f1-score   support

        HIGH       1.00      0.67      0.80         6
    NOT_HIGH       0.94      1.00      0.97        32

    accuracy                           0.95        38
   macro avg       0.97      0.83      0.88        38
weighted avg       0.95      0.95      0.94        38



## 6) Stabilna ocena: RepeatedStratifiedKFold 5×5 (CV)

In [8]:
cv_rep = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=cfg.random_state)

scoring = {
    "bal_acc": "balanced_accuracy",
    "f1_high": f1_high_scorer,
    "roc_auc": "roc_auc"
}

def cv_report(pipe, name):
    out = cross_validate(pipe, X, y, cv=cv_rep, scoring=scoring, n_jobs=1, return_train_score=False)
    ba = out["test_bal_acc"]; f1h = out["test_f1_high"]; roc = out["test_roc_auc"]
    print(f"\n{name} (Repeated 5×5 CV)")
    print(f"BA      : {ba.mean():.4f} ± {ba.std(ddof=1):.4f}")
    print(f"F1(HIGH): {f1h.mean():.4f} ± {f1h.std(ddof=1):.4f}")
    print(f"ROC-AUC : {roc.mean():.4f} ± {roc.std(ddof=1):.4f}")
    return out

out_lr = cv_report(pipe_lr, "LOGISTIC REGRESSION")
out_etc = cv_report(pipe_etc, "ETC (wstępne parametry)")



LOGISTIC REGRESSION (Repeated 5×5 CV)
BA      : 0.6094 ± 0.1413
F1(HIGH): 0.3246 ± 0.1458
ROC-AUC : 0.6478 ± 0.1733

ETC (wstępne parametry) (Repeated 5×5 CV)
BA      : 0.6844 ± 0.1171
F1(HIGH): 0.4732 ± 0.1921
ROC-AUC : 0.6313 ± 0.1943


## 7) Strojenie hiperparametrów ETC (RandomizedSearchCV na Repeated CV)

In [9]:
param_dist = {
    "model__n_estimators": [400, 800, 1200],
    "model__max_depth": [None, 5, 8, 12, 16, 20],
    "model__min_samples_split": [2, 5, 10, 20, 40],
    "model__min_samples_leaf": [1, 2, 4, 8, 12, 20],
    "model__max_features": ["sqrt", "log2", 0.3, 0.5, 0.8, None],
    "model__bootstrap": [False, True],
    "model__class_weight": [None, "balanced"],
}

search = RandomizedSearchCV(
    estimator=pipe_etc,
    param_distributions=param_dist,
    n_iter=40,
    scoring=scoring,
    refit="f1_high",
    cv=cv_rep,
    n_jobs=1,
    random_state=cfg.random_state,
    verbose=1,
    return_train_score=False
)

search.fit(X, y)

best_pipe = search.best_estimator_

print("\nNajlepszy wynik (CV) F1(HIGH):", round(float(search.best_score_), 4))
print("Najlepsze parametry:")
for k, v in search.best_params_.items():
    print(" -", k, "=", v)

# raport stabilności dla best_pipe
out_best = cv_report(best_pipe, "BEST ETC")


Fitting 25 folds for each of 40 candidates, totalling 1000 fits

Najlepszy wynik (CV) F1(HIGH): 0.4951
Najlepsze parametry:
 - model__n_estimators = 1200
 - model__min_samples_split = 10
 - model__min_samples_leaf = 2
 - model__max_features = sqrt
 - model__max_depth = 12
 - model__class_weight = balanced
 - model__bootstrap = True

BEST ETC (Repeated 5×5 CV)
BA      : 0.6961 ± 0.1190
F1(HIGH): 0.4951 ± 0.1949
ROC-AUC : 0.6433 ± 0.1890


## 8) Wpływ zmiennych: permutation importance (na holdout)

In [18]:
best_pipe.fit(X_train, y_train)

perm = permutation_importance(
    best_pipe,
    X_test, y_test,
    scoring=f1_high_scorer,
    n_repeats=100,
    random_state=cfg.random_state,
    n_jobs=1
)

feat_names = best_pipe.named_steps["preprocess"].get_feature_names_out()
imp_perm = pd.DataFrame({
    "feature": feat_names,
    "importance_mean": perm.importances_mean,
    "importance_std": perm.importances_std
}).sort_values("importance_mean", ascending=False)

display(imp_perm.head(20))


Unnamed: 0,feature,importance_mean,importance_std
5,num__jak_czesto_palisz_papierosy,0.381667,0.174134
3,num__ile_dni_w_tygodniu_cwiczysz,0.134048,0.119031
1,num__ile_kaw_napojow_energetycznych_250_ml_spo...,0.130543,0.122583
6,num__ile_razy_w_miesiacu_uczestniczysz_w_aktyw...,0.082333,0.091747
0,num__ile_godzin_spisz_srednio_na_dob,0.049889,0.096577
4,num__jak_czesto_spozywasz_alkohol,0.003333,0.042557
2,num__ile_ile_godzin_dziennie_poswiecasz_na_nauke,-0.06,0.066332


## 9) Zapis modelu do użycia np. w Streamlit

In [38]:
import joblib
joblib.dump(best_pipe, "results/best_model.joblib")


['results/best_model.joblib']

In [42]:
MODEL_PATH = "results/best_model.joblib"

USE_THRESHOLD = True
THRESHOLD = 0.40

FEATURES = [
    "ile_godzin_spisz_srednio_na_dob",
    "ile_kaw_napojow_energetycznych_250_ml_spozywasz_w_ciagu_dnia",
    "ile_ile_godzin_dziennie_poswiecasz_na_nauke",
    "ile_dni_w_tygodniu_cwiczysz",
    "jak_czesto_spozywasz_alkohol",
    "jak_czesto_palisz_papierosy",
    "ile_razy_w_miesiacu_uczestniczysz_w_aktywnosciach_odstresowujacych_npkino_zakupy_spacery_restauracja_kregle",
]

def ask_option(question: str, options: list[str]) -> tuple[int, str]:
    legend = " ".join([f"[{i+1}={opt}]" for i, opt in enumerate(options)])
    base_prompt = f"{question} {legend}: "
    prompt = base_prompt
    while True:
        s = input(prompt).strip()
        if s.isdigit():
            k = int(s)
            if 1 <= k <= len(options):
                return k, options[k - 1]
        prompt = f"BŁĄD!: WPISZ NUMER Z PODANYCH. Pytanie: {base_prompt}"

def risk_level(p_high: float) -> str:
    if p_high < 0.20:
        return "niskie"
    if p_high < 0.40:
        return "umiarkowane"
    if p_high < 0.60:
        return "podwyższone"
    return "wysokie"

def main():
    pipe = joblib.load(MODEL_PATH)
    print("KALKULATOR: Predykcja wysokiego stresu (HIGH vs NOT_HIGH)\n")

    sleep_opts = ["Mniej niż 5", "5-6", "7-8", "Więcej niż 8"]
    caffeine_opts = ["0", "1", "2", "3", "4 lub więcej"]
    study_opts = ["Mniej niż 1 godzinę", "1-2 godziny", "3-4 godziny", "5 lub więcej"]
    exercise_opts = ["0", "1-2 dni", "3-4 dni", "5-6 dni", "Codziennie"]
    alc_opts = ["Nigdy", "Sporadycznie (raz w miesiącu lub rzadziej)", "Kilka razy w miesiącu", "Regularnie (kilka razy w tygodniu)"]
    smoke_opts = ["Nigdy", "Sporadycznie (np. przy okazji imprezy)", "Kilka razy w tygodniu", "Codziennie"]
    relax_opts = ["W ogóle (0 razy w miesiącu)", "Rzadko (1-2 razy w miesiącu)", "Kilka razy w miesiącu (3-5 razy)", "Często (6 lub więcej razy w miesiącu)"]

    SLEEP_MAP = {1: 4.5, 2: 5.5, 3: 7.5, 4: 8.5}
    CAFFEINE_MAP = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}
    STUDY_MAP = {1: 0.5, 2: 1.5, 3: 3.5, 4: 5.0}
    EXERCISE_MAP = {1: 0, 2: 1.5, 3: 3.5, 4: 5.5, 5: 7}
    ALC_MAP = {1: 1, 2: 2, 3: 3, 4: 4}
    SMOKE_MAP = {1: 1, 2: 2, 3: 3, 4: 4}
    RELAX_MAP = {1: 0.0, 2: 1.5, 3: 4.0, 4: 6.0}

    questions = [
        ("ile_godzin_spisz_srednio_na_dob", "1/7 Ile godzin śpisz średnio na dobę?", sleep_opts, SLEEP_MAP),
        ("ile_kaw_napojow_energetycznych_250_ml_spozywasz_w_ciagu_dnia", "2/7 Ile kaw/ napojów energetycznych (250 ml) spożywasz w ciągu dnia?", caffeine_opts, CAFFEINE_MAP),
        ("ile_ile_godzin_dziennie_poswiecasz_na_nauke", "3/7 Ile godzin dziennie poświęcasz na naukę?", study_opts, STUDY_MAP),
        ("ile_dni_w_tygodniu_cwiczysz", "4/7 Ile dni w tygodniu ćwiczysz?", exercise_opts, EXERCISE_MAP),
        ("jak_czesto_spozywasz_alkohol", "5/7 Jak często spożywasz alkohol?", alc_opts, ALC_MAP),
        ("jak_czesto_palisz_papierosy", "6/7 Jak często palisz papierosy?", smoke_opts, SMOKE_MAP),
        ("ile_razy_w_miesiacu_uczestniczysz_w_aktywnościach_odstresowujacych_npkino_zakupy_spacery_restauracja_kregle".replace("ą", "a").replace("ł", "l"), "", [], {}),  # placeholder to keep code compact
    ]
    # poprawny ostatni rekord (bez kombinowania z polskimi znakami)
    questions[-1] = (
        "ile_razy_w_miesiacu_uczestniczysz_w_aktywnosciach_odstresowujacych_npkino_zakupy_spacery_restauracja_kregle",
        "7/7 Ile razy w miesiącu uczestniczysz w aktywnościach odstresowujących (np. kino, zakupy, spacery, restauracja, kręgle)?",
        relax_opts,
        RELAX_MAP,
    )

    x = {}
    summary = []

    for col, q, opts, mapper in questions:
        k, label = ask_option(q, opts)
        x[col] = mapper[k]
        summary.append((q.split(" ", 1)[1], label))  # bez "1/7 "

    df = pd.DataFrame([x], columns=FEATURES)

    pred = pipe.predict(df)[0]
    p_high = None

    if hasattr(pipe, "predict_proba"):
        proba = pipe.predict_proba(df)[0]
        classes = list(pipe.classes_)
        if "HIGH" in classes:
            p_high = float(proba[classes.index("HIGH")])
        if USE_THRESHOLD and p_high is not None:
            pred = "HIGH" if p_high >= THRESHOLD else "NOT_HIGH"

    print("\nTwoje odpowiedzi:")
    for q, label in summary:
        print(f"- {q}: {label}")

    print(f"\nWynik: {pred}")
    if p_high is not None:
        print(f"Prawdopodobieństwo HIGH: {p_high:.3f}")
        if USE_THRESHOLD:
            print(f"Próg HIGH: {THRESHOLD:.2f}")
        print(f"Ocena ryzyka HIGH: {risk_level(p_high)}")

if __name__ == "__main__":
    main()


KALKULATOR: Predykcja wysokiego stresu (HIGH vs NOT_HIGH)


Twoje odpowiedzi:
- Ile godzin śpisz średnio na dobę?: Więcej niż 8
- Ile kaw/ napojów energetycznych (250 ml) spożywasz w ciągu dnia?: 3
- Ile godzin dziennie poświęcasz na naukę?: 5 lub więcej
- Ile dni w tygodniu ćwiczysz?: 5-6 dni
- Jak często spożywasz alkohol?: Regularnie (kilka razy w tygodniu)
- Jak często palisz papierosy?: Codziennie
- Ile razy w miesiącu uczestniczysz w aktywnościach odstresowujących (np. kino, zakupy, spacery, restauracja, kręgle)?: Często (6 lub więcej razy w miesiącu)

Wynik: HIGH
Prawdopodobieństwo HIGH: 0.412
Próg HIGH: 0.40
Ocena ryzyka HIGH: podwyższone
