In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
import matplotlib.pyplot as plt

In [2]:
# 1. Load the data ("data.csv" is resolved relative to the working directory)
df = pd.read_csv("data.csv") 

In [3]:
# 2. Data transformation

# Unify country labels: regional entries are folded into one parent label.
df["COUNTRY_NAME"] = df["COUNTRY_NAME"].replace(
    dict.fromkeys(["Belgium (Flemish)", "Belgium (French)"], "Belgium")
)

# Every listed label is rewritten to "United Kingdom".
uk_map = dict.fromkeys(
    [
        "England",
        "Scotland",
        "Wales",
        "Northern Ireland",
        "Great Britain",
        "UK (England)",
        "UK (Wales)",
        "UK (Scotland)",
    ],
    "United Kingdom",
)
df["COUNTRY_NAME"] = df["COUNTRY_NAME"].replace(uk_map)

In [4]:
def reverse_scale(series, min_val=1, max_val=7):
    """Mirror the values of a bounded scale.

    Every value ``v`` is mapped to ``(max_val + min_val) - v``, so ``min_val``
    becomes ``max_val`` and vice versa.

    Parameters
    ----------
    series : values to mirror; anything that supports subtraction from a
        number works (the notebook passes a pandas Series).
    min_val : lowest value of the scale (default 1).
    max_val : highest value of the scale (default 7).

    Returns
    -------
    The mirrored values, in the same container type that subtraction yields.
    """
    mirror_sum = max_val + min_val
    return mirror_sum - series


In [7]:
# Reverse scales: names of the columns whose scale is to be flipped.
to_reverse = {
    "HEADACHE",
    "NERVOUS",
    "SLEEP_DIF",
    "DIZZY",
    "FEEL_LOW",
    "BREAKFAST_WEEKDAYS",
    "BREAKFAST_WEEKEND",
    "FRIEND_TALK",
    "FRUITS",
    "PHYS_ACT_60",
    "VEGETABLES",
}

In [8]:
# Flip the scale of every listed column that is present in the frame.
#
# The flip is its own inverse, so running this cell a second time on the same
# frame would silently undo it. The names of the columns that were already
# flipped are therefore recorded on the frame itself (``df.attrs``), which
# makes the cell safe to re-run; a freshly loaded ``df`` starts with an empty
# record again.
#
# NOTE(review): reverse_scale is called with its default bounds (1 and 7) for
# every column — confirm that all of these columns really use a 1–7 scale.
_reversed_cols = df.attrs.setdefault("reversed_columns", [])
for col in sorted(to_reverse):
    if col in df.columns and col not in _reversed_cols:
        df[col] = reverse_scale(df[col])
        _reversed_cols.append(col)

In [9]:

# Remove the predictors with less influence -> keep only the top predictors.

# A column is dropped when its name contains any of these fragments
# (case-insensitive substring match) ...
# NOTE(review): short fragments such as "ID" and "NO" match anywhere inside a
# column name — confirm that no wanted column is removed by accident.
drop_patterns = ["ID", "NO", "AGE_MONTHS", "THINK_BODY", "HEALTH"]
# ... or when its whole name equals one of these (case-insensitive).
explicit_drop = ["BMI", "Z_SCORE", "BODY_WEIGHT", "BODY_HIGHT"]

# Normalise the case once instead of once per column.
_fragments = [pat.lower() for pat in drop_patterns]
_exact_names = {name.upper() for name in explicit_drop}

cols_to_keep = []
for col in df.columns:
    lowered = col.lower()
    if any(fragment in lowered for fragment in _fragments):
        continue
    if col.upper() in _exact_names:
        continue
    cols_to_keep.append(col)

# Independent copy, so later edits do not write back into df.
df_clean = df[cols_to_keep].copy()




In [10]:
# Split the data into the feature matrix X and the target y.
target = "OVERWEIGHT"  # name of the target column

y = df_clean[target]
# All remaining columns become features; get_dummies one-hot encodes them with
# drop_first=True (the first level of each encoded column gets no dummy).
X = pd.get_dummies(df_clean.drop(columns=[target]), drop_first=True)



In [11]:
# Step 6: Replace missing values (step 5 was skipped)
# Every missing entry in X becomes 0.
# NOTE(review): reverse_scale defaults to a 1–7 scale, so 0 is presumably
# outside the valid answer range — confirm that 0 is the intended code for
# "missing".
X = X.fillna(0)

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Random forest fitted on every row of X / y.
# NOTE(review): no hold-out set or evaluation is visible in this notebook.
rf = RandomForestClassifier(
    class_weight='balanced',
    max_depth=None,
    max_features='log2',
    min_samples_leaf=4,
    min_samples_split=4,
    n_estimators=200,
    # Fixed seed: bootstrap sampling and feature sub-sampling are random, so
    # without it every re-run trains (and later saves) a different model.
    random_state=42,
)
rf.fit(X, y)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,4
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,'log2'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
# save the model to the repo
import joblib



In [14]:
# Column labels of X after get_dummies, in their current order.
feature_names = X.columns

# Bundle the fitted model together with its feature names so both are saved
# and loaded as one object.
bundle = dict(model=rf, features=feature_names)

In [15]:
# Save the bundle (model + feature names) to "model.pkl" in the working directory
joblib.dump(bundle, "model.pkl") 

['model.pkl']

In [16]:
# Read the saved bundle back and unpack it. From here on the notebook uses the
# reloaded model (rf2) and the reloaded feature names.
bundle = joblib.load("model.pkl")
rf2, feature_names = bundle["model"], bundle["features"]

In [17]:
# Columns of X (after get_dummies — ideally the ones that are directly
# scale-valued / ordinal) that the user is allowed to set.
controlled_features = [
    "SEX", "AGE",
    "SOFT_DRINKS", "SWEETS", "VEGETABLES",
    "FRIEND_TALK", "PHYS_ACT_60", "BREAKFAST_WEEKDAYS",
    "TOOTH_BRUSHING", "FEEL_LOW", "TALK_FATHER",
    # NOTE: COUNTRY_NAME is better handled through get_dummies, see earlier.
    # ... add whatever else should be adjustable
]



In [18]:
# Display label for each input, keyed by its column name.
feature_alias = dict(
    SOFT_DRINKS="Soft drinks",
    SWEETS="Sweets",
    VEGETABLES="No vegetables",
    FRIEND_TALK="No friends talk",
    PHYS_ACT_60="Below 60 min/day",
    BREAKFAST_WEEKDAYS="No breakfast (weekdays)",
    TOOTH_BRUSHING="Poor teeth care",
    FEEL_LOW="Feels low",
    TALK_FATHER="No dad talk",
    COUNTRY_NAME="Country",
    SEX="Gender",
    AGE="Age (years)",
)

In [19]:
# Model features the user cannot set: every feature name that is not listed
# in controlled_features. Not referenced again in the visible notebook.
Frozen_features = [col for col in feature_names if col not in controlled_features]

In [20]:
# Prefix that build_input_row puts in front of a country name to find that
# country's one-hot column among the model features.
COUNTRY_PREFIX = "COUNTRY_NAME_"

In [21]:
def build_input_row(user_input: dict, feature_names, controlled_features,
                    country_prefix=None):
    """Build a one-row DataFrame in the model's feature layout.

    Every feature starts at 0; the values supplied in ``user_input`` are then
    written on top, and the optional ``"COUNTRY_NAME"`` entry is translated
    into its one-hot column.

    Parameters
    ----------
    user_input : dict
        Maps feature name -> value. The special key ``"COUNTRY_NAME"`` holds a
        country name instead of a feature value.
    feature_names : sequence of str
        Feature names, in the column order of the returned frame.
    controlled_features : collection of str
        Names the caller is allowed to set.
    country_prefix : str, optional
        Prefix of the one-hot country columns. When omitted, the module-level
        ``COUNTRY_PREFIX`` is used, so existing three-argument calls behave as
        before.

    Returns
    -------
    pandas.DataFrame
        One row, columns equal to ``feature_names``, float values.

    Raises
    ------
    ValueError
        If a key (other than ``"COUNTRY_NAME"``) is not in
        ``controlled_features`` or not in ``feature_names``.
    """
    row = pd.Series(0, index=feature_names, dtype="float")

    # Plain features.
    for col, val in user_input.items():
        if col == "COUNTRY_NAME":
            # Handled separately below.
            continue

        if col not in controlled_features:
            raise ValueError(f"{col} není v controlled_features.")
        if col not in feature_names:
            raise ValueError(f"{col} není v feature_names.")
        row.loc[col] = val

    # Country handling (one-hot).
    if "COUNTRY_NAME" in user_input:
        # The global is only looked up when no explicit prefix was passed.
        prefix = COUNTRY_PREFIX if country_prefix is None else country_prefix
        country = user_input["COUNTRY_NAME"]
        col_name = f"{prefix}{country}"

        if col_name in feature_names:
            row.loc[col_name] = 1
        else:
            # No column for this country: all country columns stay at 0 and
            # the caller is only warned, not stopped.
            print(f"⚠ Country '{country}' nemá vlastní sloupec (asi je referenční / drop_first).")

    X_new = pd.DataFrame([row], columns=feature_names)
    return X_new

In [22]:
# Sample answers for one child, used for the demo predictions.
user_input = dict(
    SEX=1,
    AGE=13,
    SOFT_DRINKS=3,
    SWEETS=5,
    VEGETABLES=2,
    FRIEND_TALK=4,
    PHYS_ACT_60=2,
    BREAKFAST_WEEKDAYS=2,
    TOOTH_BRUSHING=2,
    FEEL_LOW=3,
    TALK_FATHER=2,
    COUNTRY_NAME="Czech Republic",
)

In [23]:
# Turn the sample answers into a one-row frame in the model's feature layout.
X_user = build_input_row(user_input, feature_names, controlled_features)

In [24]:
# Sanity check: list the one-hot country columns that are switched on in the
# input row. Uses the shared COUNTRY_PREFIX constant so this check cannot
# drift from the prefix build_input_row uses.
print([c for c in X_user.columns if c.startswith(COUNTRY_PREFIX) and X_user[c].iloc[0] == 1])

['COUNTRY_NAME_Czech Republic']


In [25]:
# Rebuild the input row and score it with the reloaded model.
X_user = build_input_row(user_input, feature_names, controlled_features)

pred_class = rf2.predict(X_user)[0]
# Second column of predict_proba for the single input row.
pred_proba = rf2.predict_proba(X_user)[0][1]

print("Predikovaná třída (0 = bez nadváhy, 1 = nadváha):", pred_class)
print("Pravděpodobnost nadváhy (%):", format(pred_proba, ".1%"))

Predikovaná třída (0 = bez nadváhy, 1 = nadváha): 0
Pravděpodobnost nadváhy (%): 42.3%


In [26]:
def predict_child(user_input: dict):
    """Score one set of answers with the reloaded model.

    ``user_input`` is a dictionary of values, for example::

        {
            "SEX": 1,
            "AGE": 13,
            "SOFT_DRINKS": 3,
            "SWEETS": 5,
            ...
            "COUNTRY_NAME": "Czech Republic"
        }

    The row is built with ``build_input_row`` from the notebook-level
    ``feature_names`` and ``controlled_features`` and scored with ``rf2``.

    Returns
    -------
    tuple
        ``(predicted class, second column of predict_proba)`` for the row.
    """
    features_row = build_input_row(user_input, feature_names, controlled_features)
    label = rf2.predict(features_row)[0]
    positive_proba = rf2.predict_proba(features_row)[0, 1]
    return label, positive_proba

In [27]:
# Score the same sample answers through the predict_child wrapper.
cls, proba = predict_child(user_input)
print("Class:", cls)
print("Proba:", f"{proba:.1%}")

Class: 0
Proba: 42.3%


In [None]:
# In Streamlit, use this:
# cls, proba = predict_child(user_input)
# st.write(f"Odhadovaná pravděpodobnost nadváhy: {proba:.0%}")

In [None]:
# Load the model from GitHub.
# The repository must be public.

from io import BytesIO
import pickle
import joblib
import requests

# Placeholder — replace with the raw GitHub URL of the saved file.
mLink = 'link_github + raw=true'

# Fail loudly on HTTP errors instead of handing an error page to the
# unpickler, and do not hang forever on a dead connection.
response = requests.get(mLink, timeout=30)
response.raise_for_status()
mfile = BytesIO(response.content)

# model.pkl is written with joblib.dump in this notebook, so it is read back
# with joblib.load; a joblib file is not guaranteed to be readable by plain
# pickle.load.
# SECURITY: un-pickling runs arbitrary code from the file — only load from a
# repository you trust.
# NOTE(review): if the link points at the model.pkl saved above, this is the
# bundle dict ("model" + "features"), not a KNN model — consider renaming.
model_knn = joblib.load(mfile)