In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np


In [3]:
# ------------------------------------------------------------
# LOAD DATA
# ------------------------------------------------------------
# Read the whole survey table from the CSV next to the notebook.
df = pd.read_csv("data.csv")

# Unify Belgium: both regional labels are collapsed into the single
# country name "Belgium".
belgium_labels = dict.fromkeys(
    ["Belgium (Flemish)", "Belgium (French)"],
    "Belgium",
)
df["COUNTRY_NAME"] = df["COUNTRY_NAME"].replace(belgium_labels)

# Turn the value 999 in BUL_BEEN into NaN.
# NOTE(review): 999 is presumably a "missing answer" code; only BUL_BEEN is
# cleaned here — confirm whether other columns carry the same code.
bul_been_is_999 = df["BUL_BEEN"].eq(999)
df.loc[bul_been_is_999, "BUL_BEEN"] = np.nan

In [4]:
factors = [
    "FRUITS", "SOFT_DRINKS", "SWEETS", "VEGETABLES", "FRIEND_TALK",
    "TIME_EXE", "PHYS_ACT_60", "DRUNK_30",
    "FAMILY_MEALS_TOGETHER", "BREAKFAST_WEEKDAYS", "BREAKFAST_WEEKEND",
    "TOOTH_BRUSHING", "STUD_TOGETHER", "BUL_OTHERS", "BUL_BEEN",
    "FIGHT_YEAR", "INJURED_YEAR", "HEADACHE", "FEEL_LOW",
    "NERVOUS", "SLEEP_DIF", "DIZZY",
    "TALK_MOTHER", "TALK_FATHER",
    "LIKE_SCHOOL", "SCHOOL_PRESSURE", "COMPUTER_NO"
]

dictionary = {
    "HEADACHE": 5, "NERVOUS": 5, "SLEEP_DIF": 5, "FEEL_LOW": 5,
    "DIZZY": 5, "TALK_FATHER": 5, "TALK_MOTHER": 5,
    "FAMILY_MEALS_TOGETHER": 6, "TIME_EXE": 7,
    "TOOTH_BRUSHING": 5, "LIKE_SCHOOL": 4, "STUD_TOGETHER": 5,
    "FRUITS": 7, "SOFT_DRINKS": 7, "SWEETS": 7, "VEGETABLES": 7,
    "FRIEND_TALK": 7, "PHYS_ACT_60": 7, "DRUNK_30": 7,
    "BREAKFAST_WEEKDAYS": 6, "BREAKFAST_WEEKEND": 3,
    "BUL_OTHERS": 5, "BUL_BEEN": 5,
    "FIGHT_YEAR": 5, "INJURED_YEAR": 5,
    "SCHOOL_PRESSURE": 4, "COMPUTER_NO": 4
}

reverse_scales = {
    "HEADACHE", "NERVOUS", "SLEEP_DIF", "DIZZY", "FEEL_LOW",
    "BREAKFAST_WEEKDAYS", "BREAKFAST_WEEKEND",
    "FRIEND_TALK", "FRUITS", "PHYS_ACT_60", "VEGETABLES"
}


In [5]:
# ------------------------------------------------------------
# CHART 1 — LINE CHART (development of obesity in all countries)
# ------------------------------------------------------------
# The plotted quantity is the mean of the OVERWEIGHT column for every
# (YEAR, COUNTRY_NAME) pair; each country becomes one coloured line.

overweight_by_year_country = df.groupby(["YEAR", "COUNTRY_NAME"], as_index=False)
df_line = overweight_by_year_country["OVERWEIGHT"].mean()

fig_line = px.line(
    data_frame=df_line,
    x="YEAR",
    y="OVERWEIGHT",
    color="COUNTRY_NAME",
    title="Vývoj prevalence obezity ve všech zemích",
    markers=True,
)

# Horizontal legend, centred and pushed below the plotting area.
legend_below_plot = {
    "orientation": "h",
    "xanchor": "center",
    "x": 0.5,
    "yanchor": "bottom",
    "y": -0.3,
}

fig_line.update_layout(
    width=1200,
    height=700,
    hovermode="x unified",
    legend=legend_below_plot,
)

fig_line.show()

In [6]:
# ------------------------------------------------------------
# 2018 DATA + NORMALISATION
# ------------------------------------------------------------
df_2018 = df[df["YEAR"] == 2018].copy()
df_norm = df_2018.copy()

# Every factor that has a divisor in `dictionary` is rescaled in df_norm:
#   * factors in `reverse_scales` are flipped first: (divisor + 1 - value) / divisor
#   * all other factors are simply divided:           value / divisor
for factor in factors:
    if factor not in dictionary:
        continue
    scale_max = dictionary[factor]
    if factor in reverse_scales:
        df_norm[factor] = df_norm[factor].rsub(scale_max + 1).div(scale_max)
    else:
        df_norm[factor] = df_norm[factor].div(scale_max)

# ------------------------------------------------------------
# OVERWEIGHT vs NON-OVERWEIGHT (reuses df_norm built above)
# ------------------------------------------------------------
ow = df_norm.loc[df_norm["OVERWEIGHT"].eq(1)]
non = df_norm.loc[df_norm["OVERWEIGHT"].eq(0)]

ow_mean = ow[factors].mean()
non_mean = non[factors].mean()

# Signed difference of the group means, largest first. The five factors kept
# are those where the OVERWEIGHT == 1 group mean exceeds the other the most.
effect = ow_mean.sub(non_mean).sort_values(ascending=False)
top5_diff = effect.index[:5].tolist()
print("TOP 5 podle rozdílu OW vs non-OW:", top5_diff)

# ------------------------------------------------------------
# Prepare the data for the chart
# ------------------------------------------------------------
# Per-country mean of the five selected factors ...
country_factor_means = df_norm.groupby("COUNTRY_NAME")[top5_diff].mean()
df_diff_grouped = country_factor_means.reset_index()

# ... reshaped to long form: one row per (country, factor).
df_diff_long = df_diff_grouped.melt(
    id_vars="COUNTRY_NAME",
    value_vars=top5_diff,
    var_name="FEATURE",
    value_name="VALUE",
)

# ------------------------------------------------------------
# CHART — "rainbow carpet" of the top-difference factors
# ------------------------------------------------------------
fig_diff = px.bar(
    data_frame=df_diff_long,
    x="VALUE",
    y="FEATURE",
    color="COUNTRY_NAME",
    barmode="group",
    orientation="h",
    title="TOP 5 faktorů podle rozdílu OW vs non-OW (2018) — Rainbow Carpet",
)

fig_diff.update_layout(
    width=1200,
    height=700,
    template="simple_white",
)

fig_diff.show()


TOP 5 podle rozdílu OW vs non-OW: ['BREAKFAST_WEEKDAYS', 'PHYS_ACT_60', 'FIGHT_YEAR', 'FRIEND_TALK', 'TOOTH_BRUSHING']


In [7]:
# ------------------------------------------------------------
# 1) Keep only the rows for year 2018
# ------------------------------------------------------------
df_2018 = df[df["YEAR"] == 2018].copy()

# ------------------------------------------------------------
# 2) European Union in 2018 (the UK is included)
# ------------------------------------------------------------
# Luxembourg is an EU member and was missing from this list although the other
# EU list in this notebook contains it; without it the EU average and the set
# of plotted countries differ between the EU charts.
# NOTE(review): Cyprus was also an EU member in 2018 but appears in neither
# list — add it to both if the dataset contains it.
eu_countries = [
    "Austria", "Belgium", "Bulgaria", "Croatia", "Czech Republic", "Denmark",
    "Estonia", "Finland", "France", "Germany", "Greece", "Hungary",
    "Ireland", "Italy", "Latvia", "Lithuania", "Luxembourg", "Malta",
    "Netherlands", "Poland", "Portugal", "Romania", "Slovakia",
    "Slovenia", "Spain", "Sweden", "United Kingdom"
]

# ------------------------------------------------------------
# 3) Keep only EU countries
# ------------------------------------------------------------
# Names from the list that do not occur in the data are simply ignored by isin.
df_eu = df_2018[df_2018["COUNTRY_NAME"].isin(eu_countries)]

# ------------------------------------------------------------
# 4) EU average (EU countries only)
# ------------------------------------------------------------
# Mean of OVERWEIGHT over all rows of df_eu, i.e. countries with more rows
# weigh more than in a plain mean of country means.
eu_avg = df_eu["OVERWEIGHT"].mean()

# ------------------------------------------------------------
# 5) Per-country means and their deviation from the EU average
# ------------------------------------------------------------
df_dev = (
    df_eu.groupby("COUNTRY_NAME", as_index=False)["OVERWEIGHT"]
    .mean()
)

df_dev["DEVIATION"] = df_dev["OVERWEIGHT"] - eu_avg
df_dev = df_dev.sort_values("DEVIATION", ascending=True)

# ------------------------------------------------------------
# 6) Chart of deviations from the EU average
# ------------------------------------------------------------
fig_dev_eu = px.bar(
    df_dev,
    x="DEVIATION",
    y="COUNTRY_NAME",
    orientation="h",
    color="DEVIATION",
    color_continuous_scale="RdBu_r",
    # Pin the neutral colour of the diverging scale to zero so that the two
    # hues always correspond to below / above the EU average, even when the
    # deviations are not symmetric around zero.
    color_continuous_midpoint=0,
    title=f"Odchylka od průměru EU (2018), EU average = {eu_avg:.3f}"
)

fig_dev_eu.update_layout(
    xaxis_title="Odchylka od EU průměru",
    yaxis_title="Země",
    height=650,
    template="simple_white"
)

# Dashed reference line at zero deviation (= the EU average itself).
fig_dev_eu.add_vline(x=0, line_width=2, line_dash="dash", line_color="black")

fig_dev_eu.show()


In [None]:
# ------------------------------------------------------------
# 1) EU list for 2018 (UK included)
# ------------------------------------------------------------
eu_countries = [
    "Austria","Belgium","Bulgaria","Croatia","Czech Republic","Denmark",
    "Estonia","Finland","France","Germany","Greece","Hungary","Ireland",
    "Italy","Latvia","Lithuania","Luxembourg","Malta","Netherlands",
    "Poland","Portugal","Romania","Slovakia","Slovenia","Spain","Sweden",
    "United Kingdom"
]

# ------------------------------------------------------------
# 2) 2018 data
# ------------------------------------------------------------
df_2018 = df[df["YEAR"] == 2018].copy()

# ------------------------------------------------------------
# 3) Keep only the EU countries that are present in the dataset
# ------------------------------------------------------------
# The set of country names is built once instead of calling unique() for
# every candidate country.
countries_in_data = set(df_2018["COUNTRY_NAME"].unique())
eu_in_data = [c for c in eu_countries if c in countries_in_data]

# .copy() makes the filtered frame independent, so adding SEX_LABEL below
# does not write into a slice of another frame (SettingWithCopyWarning).
df_2018 = df_2018[df_2018["COUNTRY_NAME"].isin(eu_in_data)].copy()

# ------------------------------------------------------------
# 4) Sex
# ------------------------------------------------------------
# SEX code 1 -> "Boys", 2 -> "Girls"; any other value becomes NaN and is
# therefore left out of the groupby below.
df_2018["SEX_LABEL"] = df_2018["SEX"].map({1: "Boys", 2: "Girls"})

# ------------------------------------------------------------
# 5) Mean OVERWEIGHT for Boys / Girls
# ------------------------------------------------------------
# Result: one row per country with the columns COUNTRY_NAME, Boys, Girls.
df_gender = (
    df_2018.groupby(["COUNTRY_NAME", "SEX_LABEL"], as_index=False)["OVERWEIGHT"]
    .mean()
    .pivot(index="COUNTRY_NAME", columns="SEX_LABEL", values="OVERWEIGHT")
    .reset_index()
)

# Drop countries where Boys or Girls is missing.
df_gender = df_gender.dropna(subset=["Boys", "Girls"])

# ------------------------------------------------------------
# 6) Difference Girls - Boys (used for sorting)
# ------------------------------------------------------------
df_gender["DIFF"] = df_gender["Girls"] - df_gender["Boys"]
df_gender = df_gender.sort_values("DIFF")

# ------------------------------------------------------------
# 7) DUMBBELL CHART
# ------------------------------------------------------------
fig_intro = go.Figure()

# Connecting lines: one short segment per country from its Boys value to its
# Girls value. Each segment is followed by None so the line is broken there;
# concatenating all Boys and then all Girls values would instead draw a single
# zig-zag through every point.
connector_x = []
connector_y = []
for country, boys, girls in zip(
    df_gender["COUNTRY_NAME"], df_gender["Boys"], df_gender["Girls"]
):
    connector_x.extend([boys, girls, None])
    connector_y.extend([country, country, None])

# Added first so the markers are drawn on top of the connectors.
fig_intro.add_trace(go.Scatter(
    x=connector_x,
    y=connector_y,
    mode='lines', line=dict(color="gray", width=1.5),
    hoverinfo='skip', showlegend=False
))

# Girls
fig_intro.add_trace(go.Scatter(
    x=df_gender["Girls"], y=df_gender["COUNTRY_NAME"],
    mode='markers', marker=dict(color="hotpink", size=10), name="Girls"
))

# Boys
fig_intro.add_trace(go.Scatter(
    x=df_gender["Boys"], y=df_gender["COUNTRY_NAME"],
    mode='markers', marker=dict(color="cornflowerblue", size=10), name="Boys"
))

fig_intro.update_layout(
    title="Overweight Boys vs Girls (EU only, 2018)",
    xaxis_title="Overweight",
    yaxis_title="Countries (EU)",
    height=900,
    font=dict(size=11),
    margin=dict(l=40, r=20, t=60, b=20),
    legend_title="Gender"
)

fig_intro.show()


In [9]:
# Girls minus Boys, stored on df_gender as the DIFF column.
df_gender["DIFF"] = df_gender["Girls"].sub(df_gender["Boys"])

# Where is the difference largest? Take the row whose DIFF has the greatest
# absolute value, so the direction of the gap does not matter.
widest_gap_label = df_gender["DIFF"].abs().idxmax()
max_diff_row = df_gender.loc[widest_gap_label]

report_lines = [
    "Země s největším rozdílem mezi Boys a Girls:",
    f"Country: {max_diff_row['COUNTRY_NAME']}",
    f"Boys overweight:  {max_diff_row['Boys']:.3f}",
    f"Girls overweight: {max_diff_row['Girls']:.3f}",
    f"Difference (Girls - Boys): {max_diff_row['DIFF']:.3f}",
]
print("\n".join(report_lines))


Země s největším rozdílem mezi Boys a Girls:
Country: Italy
Boys overweight:  0.316
Girls overweight: 0.155
Difference (Girls - Boys): -0.161
