In [96]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
import matplotlib.pyplot as plt


pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

In [97]:
df = pd.read_csv("data.csv") 

In [98]:
# Step 3: Vyčištění dat
# Odstraníme všechny ID, NO, BMI a další zadané proměnné
drop_patterns = ["ID", "NO","AGE_MONTHS","THINK_BODY","HEALTH"]
explicit_drop = [
    "BMI",
    "Z_SCORE",
    "BODY_WEIGHT",
    "BODY_HIGHT"
]

cols_to_keep = [
    col for col in df.columns
    if not any(pat.lower() in col.lower() for pat in drop_patterns)
    and col.upper() not in [e.upper() for e in explicit_drop]
]

df_clean = df[cols_to_keep].copy()

In [99]:
# Step 4: Definice cílové proměnné
target = "OVERWEIGHT"
X = df_clean.drop(columns=[target])
y = df_clean[target]


""" 
není nutné, není kategorická, nejsou tam null hodnoty
# Pokud je y kategorická (např. Yes/No) 
if y.dtype == 'object':
    y = y.astype(str).str.strip().map({'Yes': 1, 'No': 0, '1': 1, '0': 0}).fillna(0).astype(int)
"""

" \nnení nutné, není kategorická, nejsou tam null hodnoty\n# Pokud je y kategorická (např. Yes/No) \nif y.dtype == 'object':\n    y = y.astype(str).str.strip().map({'Yes': 1, 'No': 0, '1': 1, '0': 0}).fillna(0).astype(int)\n"

In [100]:
X = pd.get_dummies(X, drop_first=True)

In [101]:
# Step 6: Nahrazení chybějících hodnot, krok 5 byl preskocen
X = X.fillna(0)

In [102]:
X.head()

Unnamed: 0,SEX,YEAR,FRUITS,AGE,SOFT_DRINKS,SWEETS,VEGETABLES,LIKE_SCHOOL,TOOTH_BRUSHING,HEADACHE,STUD_TOGETHER,STOMACHACHE,SCHOOL_PRESSURE,SLEEP_DIF,DIZZY,NERVOUS,INJURED_YEAR,PHYS_ACT_60,FEEL_LOW,FAM_CAR,FIGHT_YEAR,LIFESAT,OWN_BEDROOM_FLAG,MOTHER_HOME_FLAG,FATHER_HOME_FLAG,TALK_MOTHER,TALK_FATHER,SOCIAL_MEDIA_FLAG,BUL_BEEN,BREAKFAST_WEEKEND,BUL_OTHERS,BREAKFAST_WEEKDAYS,FRIEND_TALK,DRUNK_30,TIME_EXE,FAMILY_MEALS_TOGETHER,COUNTRY_NAME_Armenia,COUNTRY_NAME_Austria,COUNTRY_NAME_Azerbaijan,COUNTRY_NAME_Belgium (Flemish),COUNTRY_NAME_Belgium (French),COUNTRY_NAME_Bulgaria,COUNTRY_NAME_Canada,COUNTRY_NAME_Croatia,COUNTRY_NAME_Czech Republic,COUNTRY_NAME_Denmark,COUNTRY_NAME_England,COUNTRY_NAME_Estonia,COUNTRY_NAME_Finland,COUNTRY_NAME_France,COUNTRY_NAME_Georgia,COUNTRY_NAME_Germany,COUNTRY_NAME_Greece,COUNTRY_NAME_Greenland,COUNTRY_NAME_Hungary,COUNTRY_NAME_Iceland,COUNTRY_NAME_Ireland,COUNTRY_NAME_Israel,COUNTRY_NAME_Italy,COUNTRY_NAME_Kazakhstan,COUNTRY_NAME_Latvia,COUNTRY_NAME_Lithuania,COUNTRY_NAME_Luxembourg,COUNTRY_NAME_Macedonia,COUNTRY_NAME_Malta,COUNTRY_NAME_Netherlands,COUNTRY_NAME_Norway,COUNTRY_NAME_Poland,COUNTRY_NAME_Portugal,COUNTRY_NAME_Republic of Moldova,COUNTRY_NAME_Romania,COUNTRY_NAME_Russia,COUNTRY_NAME_Scotland,COUNTRY_NAME_Serbia,COUNTRY_NAME_Slovakia,COUNTRY_NAME_Slovenia,COUNTRY_NAME_Spain,COUNTRY_NAME_Sweden,COUNTRY_NAME_Switzerland,COUNTRY_NAME_Turkey,COUNTRY_NAME_USA,COUNTRY_NAME_Ukraine,COUNTRY_NAME_Wales
0,2,2010,6.0,15,5.0,3.0,7.0,1.0,1.0,5.0,1.0,4.0,1.0,3.0,5.0,4.0,3.0,2.0,5.0,2.0,1.0,6.0,0.0,1.0,1.0,1.0,2.0,-1.0,1.0,3.0,1.0,4.0,1.0,0.0,4.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,2,2010,6.0,16,5.0,5.0,3.0,3.0,2.0,3.0,3.0,5.0,3.0,4.0,5.0,2.0,2.0,2.0,2.0,2.0,2.0,4.0,0.0,1.0,1.0,0.0,0.0,-1.0,1.0,3.0,1.0,2.0,0.0,1.0,5.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
2,2,2010,5.0,13,6.0,5.0,4.0,2.0,1.0,4.0,3.0,3.0,3.0,2.0,4.0,3.0,1.0,5.0,2.0,3.0,2.0,8.0,0.0,1.0,1.0,1.0,2.0,-1.0,1.0,3.0,1.0,2.0,1.0,0.0,3.0,0.0,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,1,2010,7.0,15,4.0,5.0,7.0,3.0,2.0,4.0,2.0,5.0,2.0,4.0,5.0,4.0,1.0,3.0,5.0,2.0,2.0,7.0,1.0,1.0,1.0,2.0,2.0,-1.0,1.0,3.0,2.0,2.0,1.0,1.0,6.0,0.0,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,1,2010,2.0,14,7.0,6.0,6.0,3.0,1.0,4.0,1.0,5.0,2.0,4.0,1.0,3.0,1.0,3.0,1.0,3.0,1.0,9.0,1.0,1.0,1.0,3.0,3.0,-1.0,1.0,3.0,1.0,6.0,4.0,1.0,2.0,0.0,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [103]:
# Step 7: Split dat
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

KeyboardInterrupt: 

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    random_state=42,
    n_jobs=4,
    class_weight="balanced"
)
rf.fit(X_train, y_train)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [108]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
param_grid = {
"n_estimators": [100, 200],
"max_depth": [10, None],
"max_features": ["sqrt", "log2"],
"min_samples_split": [2, 4],
"min_samples_leaf": [1,4],
"bootstrap": [False],
"class_weight": ["balanced", "balanced_subsample"]
}
rf = RandomForestClassifier(
random_state=42,
n_jobs=5
)

grid = GridSearchCV(
estimator=rf,
param_grid=param_grid,
cv=4,
scoring="f1_macro",
n_jobs=5,
verbose=3
)

grid.fit(X_train, y_train)
best_model = grid.best_estimator_
best_params = grid.best_params_
best_score = grid.best_score_

print(best_params)
print(best_score)

Fitting 4 folds for each of 64 candidates, totalling 256 fits
[CV 1/4] END bootstrap=False, class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.550 total time=  42.4s
[CV 3/4] END bootstrap=False, class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.553 total time=  42.4s
[CV 2/4] END bootstrap=False, class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.549 total time=  42.4s
[CV 4/4] END bootstrap=False, class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.550 total time=  42.6s
[CV 1/4] END bootstrap=False, class_weight=balanced, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.550 total time= 1.5min
[CV 1/4] END bootstrap=False, class_weight=balanced, 



[CV 1/4] END bootstrap=False, class_weight=balanced, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200;, score=0.461 total time= 4.9min
[CV 2/4] END bootstrap=False, class_weight=balanced, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200;, score=0.459 total time= 4.9min
[CV 3/4] END bootstrap=False, class_weight=balanced, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200;, score=0.460 total time= 4.9min
[CV 4/4] END bootstrap=False, class_weight=balanced, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200;, score=0.461 total time= 5.0min
[CV 2/4] END bootstrap=False, class_weight=balanced, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=100;, score=0.557 total time= 2.3min
[CV 3/4] END bootstrap=False, class_weight=balanced, max_depth=None, max_features=sqrt, min_samples_leaf=

KeyboardInterrupt: 

In [None]:
# ulozeni vysledneho modelu

import joblib

# save
joblib.dump(rf, "model.pkl") 

# load
rf2 = joblib.load("model.pkl")

x = rf2.predict(X[100:101])

In [None]:
x

array([1])

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

print("\n=== Výsledky Random Forest (balanced) ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"F1 score: {f1_score(y_test, y_pred):.3f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_prob):.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



=== Výsledky Random Forest (balanced) ===
Accuracy: 0.756
F1 score: 0.306
ROC AUC: 0.654

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.87      0.85    200584
           1       0.33      0.28      0.31     46972

    accuracy                           0.76    247556
   macro avg       0.58      0.57      0.58    247556
weighted avg       0.74      0.76      0.75    247556



In [None]:
print("Rozdělení tříd v cíli:")
print(y.value_counts())
print("\nRelativní četnosti:")
print(y.value_counts(normalize=True))


Rozdělení tříd v cíli:
OVERWEIGHT
0    668613
1    156572
Name: count, dtype: int64

Relativní četnosti:
OVERWEIGHT
0    0.810258
1    0.189742
Name: proportion, dtype: float64


In [None]:
# Spočítání důležitostí pro všechny proměnné
importances = rf.feature_importances_
feature_names = X_train.columns

feat_imp_all = pd.Series(importances, index=feature_names).sort_values(ascending=False)

# Vypiš celý seznam (může být dlouhý)
pd.set_option("display.max_rows", None)
print(feat_imp_all)



SEX                      0.228335
SWEETS                   0.133218
TOOTH_BRUSHING           0.116908
AGE                      0.052911
BREAKFAST_WEEKDAYS       0.050039
FIGHT_YEAR               0.043289
PHYS_ACT_60              0.034412
BUL_BEEN                 0.031969
LIFESAT                  0.029669
TIME_EXE                 0.025310
YEAR                     0.021112
BREAKFAST_WEEKEND        0.019434
TALK_FATHER              0.013740
SOFT_DRINKS              0.012016
FRIEND_TALK              0.011839
SLEEP_DIF                0.011488
VEGETABLES               0.011309
HEADACHE                 0.010895
NERVOUS                  0.010534
FRUITS                   0.010481
FAM_CAR                  0.010412
DRUNK_30                 0.010118
FEEL_LOW                 0.010095
FAMILY_MEALS_TOGETHER    0.009399
BUL_OTHERS               0.008878
TALK_MOTHER              0.008737
SCHOOL_PRESSURE          0.008570
INJURED_YEAR             0.008015
STUD_TOGETHER            0.007915
STOMACHACHE   

In [None]:
#zmenseni na 20

# === Vyhodnocení modelu ===
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
import pandas as pd

# Predikce na testovací sadě
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

# Metriky výkonu
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print("=== Výsledky modelu ===")
print(f"Accuracy: {acc:.3f}")
print(f"F1 score: {f1:.3f}")
print(f"ROC AUC: {auc:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# === Feature importances ===
importances = rf.feature_importances_
feature_names = X_train.columns
feat_imp_all = pd.Series(importances, index=feature_names).sort_values(ascending=False)

# Výpis top 20 nejdůležitějších proměnných
print("\n=== Top 20 nejdůležitějších proměnných ===")
print(feat_imp_all.head(20))


=== Výsledky modelu ===
Accuracy: 0.615
F1 score: 0.371
ROC AUC: 0.652

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.62      0.72    200584
           1       0.27      0.60      0.37     46972

    accuracy                           0.62    247556
   macro avg       0.57      0.61      0.55    247556
weighted avg       0.75      0.62      0.66    247556


=== Top 20 nejdůležitějších proměnných ===
SEX                   0.228335
SWEETS                0.133218
TOOTH_BRUSHING        0.116908
AGE                   0.052911
BREAKFAST_WEEKDAYS    0.050039
FIGHT_YEAR            0.043289
PHYS_ACT_60           0.034412
BUL_BEEN              0.031969
LIFESAT               0.029669
TIME_EXE              0.025310
YEAR                  0.021112
BREAKFAST_WEEKEND     0.019434
TALK_FATHER           0.013740
SOFT_DRINKS           0.012016
FRIEND_TALK           0.011839
SLEEP_DIF             0.011488
VEGETABLES            0.011309
HEADACH