In [4]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
import matplotlib.pyplot as plt

In [5]:
# Load the raw dataset from "data.csv" (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="data.csv")

In [11]:
# Step 3: Data cleaning
# Two case-insensitive drop rules are applied to the column names:
#   * drop_patterns - a column is removed when its name CONTAINS any of these substrings
#   * explicit_drop - a column is removed when its name EQUALS one of these names
# NOTE(review): "ID" and "NO" are matched as plain substrings, so any column whose
# name merely contains those letters is dropped as well - confirm this is intended.
drop_patterns = ["ID", "NO", "COUNTRY", "AGE_MONTHS", "THINK_BODY", "HEALTH"]
explicit_drop = ["BMI", "Z_SCORE", "BODY_WEIGHT", "BODY_HIGHT"]

# Normalise both rule lists once so every column name is only compared against them.
patterns_lower = [pat.lower() for pat in drop_patterns]
explicit_upper = {name.upper() for name in explicit_drop}

cols_to_keep = [
    col for col in df.columns
    if col.upper() not in explicit_upper
    and all(pat not in col.lower() for pat in patterns_lower)
]

# Take an independent copy so later modifications do not touch the raw frame.
df_clean = df[cols_to_keep].copy()

In [12]:
# Step 4: Define the target variable
# X holds every cleaned column except the target; y is the target column itself.
target = "OVERWEIGHT"
X = df_clean.drop(columns=[target])
y = df_clean[target]

# If y is stored as text (e.g. Yes/No), encode it as 0/1.
if y.dtype == 'object':
    label_map = {'Yes': 1, 'No': 0, '1': 1, '0': 0}
    # Series.map leaves NaN for every value that is not a key of label_map
    # (this includes missing labels, which astype(str) turns into the text 'nan').
    y_mapped = y.astype(str).str.strip().map(label_map)
    unmapped = y_mapped.isna()
    if unmapped.any():
        # A row without a usable label must not be silently counted as class 0:
        # that would add false negatives to the training data. Drop such rows
        # from both X and y (keeping them aligned) and report what was dropped.
        print(
            f"Dropping {int(unmapped.sum())} rows with a missing or unrecognised "
            f"'{target}' label; examples: {y[unmapped].unique()[:10].tolist()}"
        )
        X = X.loc[~unmapped]
        y_mapped = y_mapped.loc[~unmapped]
    y = y_mapped.astype(int)

In [13]:
# Step 6: Replace missing values (step 5 was skipped).
# Every remaining NaN in the feature matrix is filled with the sentinel value -1.
X = X.fillna(value=-1)

In [14]:
# Step 7: Train/test split
# 30 % of the rows are held out for testing. The split is stratified on y and
# uses a fixed random_state so that it is reproducible.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [15]:
# Train the random forest.
# RandomForestClassifier is already imported in the notebook's import cell, so it
# is not re-imported here.
#   * n_estimators / max_depth: 100 trees, each limited to depth 10
#   * random_state: fixed seed for reproducible trees
#   * n_jobs: number of parallel workers used for fit/predict
#   * class_weight="balanced": weights classes inversely to their frequency in
#     y_train, which compensates for the imbalanced target
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=4,
    class_weight="balanced"
)
# Last expression of the cell: fit returns the estimator, so its repr is displayed.
rf.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
# Evaluate the fitted forest on the held-out test set.
# The metric functions are already imported in the notebook's import cell, so
# they are not re-imported here.
y_pred = rf.predict(X_test)              # hard class labels, used for accuracy / F1 / report
y_prob = rf.predict_proba(X_test)[:, 1]  # probability column at index 1, used as the ROC AUC score

print("\n=== Výsledky Random Forest (balanced) ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"F1 score: {f1_score(y_test, y_pred):.3f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_prob):.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



=== Výsledky Random Forest (balanced) ===
Accuracy: 0.615
F1 score: 0.371
ROC AUC: 0.652

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.62      0.72    200584
           1       0.27      0.60      0.37     46972

    accuracy                           0.62    247556
   macro avg       0.57      0.61      0.55    247556
weighted avg       0.75      0.62      0.66    247556



In [17]:
# Class balance of the target: absolute counts first, then relative shares.
class_counts = y.value_counts()
class_shares = y.value_counts(normalize=True)

print("Rozdělení tříd v cíli:")
print(class_counts)
print("\nRelativní četnosti:")
print(class_shares)


Rozdělení tříd v cíli:
OVERWEIGHT
0    668613
1    156572
Name: count, dtype: int64

Relativní četnosti:
OVERWEIGHT
0    0.810258
1    0.189742
Name: proportion, dtype: float64


In [18]:
# Compute the importance of every feature.
# feature_importances_ has one value per training column; pairing it with the
# column names and sorting gives a ranking from most to least important.
importances = rf.feature_importances_
feature_names = X_train.columns

feat_imp_all = pd.Series(importances, index=feature_names).sort_values(ascending=False)

# Print the whole list (it can be long). The row limit is lifted only inside this
# block, so the global pandas display settings are left unchanged for the
# rest of the session.
with pd.option_context("display.max_rows", None):
    print(feat_imp_all)



SEX                      0.228335
SWEETS                   0.133218
TOOTH_BRUSHING           0.116908
AGE                      0.052911
BREAKFAST_WEEKDAYS       0.050039
FIGHT_YEAR               0.043289
PHYS_ACT_60              0.034412
BUL_BEEN                 0.031969
LIFESAT                  0.029669
TIME_EXE                 0.025310
YEAR                     0.021112
BREAKFAST_WEEKEND        0.019434
TALK_FATHER              0.013740
SOFT_DRINKS              0.012016
FRIEND_TALK              0.011839
SLEEP_DIF                0.011488
VEGETABLES               0.011309
HEADACHE                 0.010895
NERVOUS                  0.010534
FRUITS                   0.010481
FAM_CAR                  0.010412
DRUNK_30                 0.010118
FEEL_LOW                 0.010095
FAMILY_MEALS_TOGETHER    0.009399
BUL_OTHERS               0.008878
TALK_MOTHER              0.008737
SCHOOL_PRESSURE          0.008570
INJURED_YEAR             0.008015
STUD_TOGETHER            0.007915
STOMACHACHE   

In [19]:
# Same report as before, with the feature list reduced to the top 20.
# The sklearn metric functions and pandas are already imported in the notebook's
# import cell, so they are not re-imported here.

# === Model evaluation ===

# Predictions on the test set
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]  # probability column at index 1, used as the ROC AUC score

# Performance metrics
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print("=== Výsledky modelu ===")
print(f"Accuracy: {acc:.3f}")
print(f"F1 score: {f1:.3f}")
print(f"ROC AUC: {auc:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# === Feature importances ===
# One importance value per training column, sorted from most to least important.
importances = rf.feature_importances_
feature_names = X_train.columns
feat_imp_all = pd.Series(importances, index=feature_names).sort_values(ascending=False)

# Print the 20 most important features
print("\n=== Top 20 nejdůležitějších proměnných ===")
print(feat_imp_all.head(20))


=== Výsledky modelu ===
Accuracy: 0.615
F1 score: 0.371
ROC AUC: 0.652

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.62      0.72    200584
           1       0.27      0.60      0.37     46972

    accuracy                           0.62    247556
   macro avg       0.57      0.61      0.55    247556
weighted avg       0.75      0.62      0.66    247556


=== Top 20 nejdůležitějších proměnných ===
SEX                   0.228335
SWEETS                0.133218
TOOTH_BRUSHING        0.116908
AGE                   0.052911
BREAKFAST_WEEKDAYS    0.050039
FIGHT_YEAR            0.043289
PHYS_ACT_60           0.034412
BUL_BEEN              0.031969
LIFESAT               0.029669
TIME_EXE              0.025310
YEAR                  0.021112
BREAKFAST_WEEKEND     0.019434
TALK_FATHER           0.013740
SOFT_DRINKS           0.012016
FRIEND_TALK           0.011839
SLEEP_DIF             0.011488
VEGETABLES            0.011309
HEADACH