In [28]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
# Make "vscode" the default Plotly renderer for every figure shown in this notebook.
pio.renderers.default = "vscode"
# Use the "plotly" template as the default look for Plotly Express figures.
px.defaults.template = "plotly"
import plotly.graph_objects as go

In [2]:
# Read the raw dataset from "data.csv" (path resolved against the working directory).
df = pd.read_csv(filepath_or_buffer="data.csv")

In [3]:
# Merge the two Belgian labels, "Belgium (Flemish)" and "Belgium (French)",
# into a single "Belgium" value in COUNTRY_NAME.
df["COUNTRY_NAME"] = df["COUNTRY_NAME"].replace(
    to_replace=["Belgium (Flemish)", "Belgium (French)"],
    value="Belgium",
)

In [4]:
# Mean OVERWEIGHT for every (YEAR, COUNTRY_NAME) pair, as a flat frame with
# the columns YEAR, COUNTRY_NAME and OVERWEIGHT. No rows are filtered here;
# every country present in df is kept.
df_filtr = (
    df.groupby(["YEAR", "COUNTRY_NAME"])["OVERWEIGHT"]
    .mean()
    .reset_index()
)

In [None]:
# Display the aggregated frame (mean OVERWEIGHT per YEAR and COUNTRY_NAME).
df_filtr

Unnamed: 0,YEAR,COUNTRY_NAME,OVERWEIGHT
0,2002,Austria,0.173717
1,2002,Belgium,0.138671
2,2002,Canada,0.232289
3,2002,Croatia,0.168396
4,2002,Czech Republic,0.137074
...,...,...,...
188,2018,Sweden,0.185174
189,2018,Switzerland,0.157220
190,2018,Turkey,0.247632
191,2018,Ukraine,0.162013


In [5]:
# Line chart of the mean OVERWEIGHT over YEAR, one coloured line per country.
fig = px.line(
    data_frame=df_filtr,
    x="YEAR",
    y="OVERWEIGHT",
    color="COUNTRY_NAME",
    title="Overweight Czech republic and selected countries",
)

In [6]:
# Render the line chart built from df_filtr.
fig.show()

In [21]:
# Keep only the rows from 2018 whose OVERWEIGHT value equals 1.
df_ow_2018 = df.loc[df["YEAR"].eq(2018) & df["OVERWEIGHT"].eq(1)]

In [22]:
# Factor columns that are averaged per country and then scaled by their
# maximum value. Every name must appear exactly once: a repeated name produces
# duplicate columns after the groupby selection and makes the per-column
# normalisation loop process that column twice. ("FAMILY_MEALS_TOGETHER" was
# previously listed twice; the second occurrence has been removed.)
list_columns = [
    "FRUITS", "SOFT_DRINKS", "SWEETS", "VEGETABLES", "FRIEND_TALK",
    "TIME_EXE", "PHYS_ACT_60", "DRUNK_30", "LIFESAT",
    "FAMILY_MEALS_TOGETHER", "BREAKFAST_WEEKDAYS", "BREAKFAST_WEEKEND",
    "TOOTH_BRUSHING", "STUD_TOGETHER", "BUL_OTHERS", "BUL_BEEN",
    "FIGHT_YEAR", "INJURED_YEAR", "HEADACHE", "FEEL_LOW",
    "NERVOUS", "SLEEP_DIF", "DIZZY",
    "TALK_MOTHER", "TALK_FATHER",
    "LIKE_SCHOOL", "SCHOOL_PRESSURE", "COMPUTER_NO",
]

In [23]:
# Horizontal bar chart: most recent state, year 2018.

# Maps a column name to its maximum value in the dataset (per the author's
# note); the entries are grouped by that maximum. The normalisation cells
# divide each column's mean by this value.
dictionary = {
    column: scale_max
    for scale_max, columns in (
        (7, (
            "FRUITS", "SOFT_DRINKS", "SWEETS", "VEGETABLES",
            "FRIEND_TALK", "TIME_EXE", "PHYS_ACT_60", "DRUNK_30",
        )),
        (10, ("LIFESAT",)),
        (6, ("FAMILY_MEALS_TOGETHER", "BREAKFAST_WEEKDAYS", "BREAKFAST_WEEKEND")),
        (5, (
            "TOOTH_BRUSHING", "STUD_TOGETHER", "BUL_OTHERS", "BUL_BEEN",
            "FIGHT_YEAR", "INJURED_YEAR", "HEADACHE", "FEEL_LOW",
            "NERVOUS", "SLEEP_DIF", "DIZZY", "THINK_BODY",
            "TALK_MOTHER", "TALK_FATHER",
        )),
        (4, ("HEALTH", "LIKE_SCHOOL", "SCHOOL_PRESSURE", "COMPUTER_NO")),
    )
    for column in columns
}



In [24]:
# ------------------------------------------------------------
#  Reversed factors - the more, the better (they have to be flipped)
# ------------------------------------------------------------
# NOTE(review): this set is only defined here; none of the cells visible in
# this notebook read it yet, so no flipping is actually applied.

reverse_scales = {
    # symptom / complaint columns
    "DIZZY",
    "FEEL_LOW",
    "HEADACHE",
    "NERVOUS",
    "SLEEP_DIF",
    "STOMACHACHE",
    # habit / lifestyle columns
    "BREAKFAST_WEEKDAYS",
    "BREAKFAST_WEEKEND",
    "FRIEND_TALK",
    "FRUITS",
    "LIFESAT",
    "PHYS_ACT_60",
    "VEGETABLES",
}


In [25]:
# Per-country mean of every column in list_columns.
# The input is df_ow_2018, the subset built earlier in this notebook
# (YEAR == 2018 and OVERWEIGHT == 1). This cell previously read `df_2018`,
# a name that no cell defines, so it raised NameError on a fresh kernel
# (Restart Kernel -> Run All).
df_2018_grouped = df_ow_2018.groupby("COUNTRY_NAME", as_index=False)[list_columns].mean()

In [26]:
# Divide each per-country mean by that column's maximum value from `dictionary`.
# The frame is updated in place, so running this cell again divides again.
for column_name in list_columns:
    df_2018_grouped[column_name] = df_2018_grouped[column_name].div(dictionary[column_name])

In [None]:
# Display the per-country factor means for 2018.
df_2018_grouped

In [None]:
# Horizontal bar chart of the factors that differ most between countries.
#
# Inputs, all defined in earlier cells:
#   list_columns     - names of the factor columns
#   df_2018_grouped  - 2018 data aggregated per country (COUNTRY_NAME + factor
#                      means), already divided by the scale maxima in the
#                      normalisation cell above.
#
# The values are deliberately NOT divided by the scale maxima again here:
# repeating the division shrank every value a second time and made the result
# depend on how many times the cells had been executed.

# 1) Variability between countries for each factor (std across the country rows).
feature_std = df_2018_grouped[list_columns].std()

# 2) The 6 factors with the largest between-country variability,
#    ordered from the largest std to the smallest.
top6_features = (
    feature_std
    .sort_values(ascending=False)
    .head(6)
    .index
    .tolist()
)

# The label is derived from the list length so it cannot drift from the
# number of selected factors (it used to say "TOP 5" while 6 were selected).
print(f"TOP {len(top6_features)} faktorů s největší variabilitou mezi zeměmi:", top6_features)

# 3) Long format restricted to the selected factors:
#    one row per (COUNTRY_NAME, FEATURE) pair.
df_country_long = df_2018_grouped.melt(
    id_vars=["COUNTRY_NAME"],
    value_vars=top6_features,
    var_name="FEATURE",
    value_name="VALUE"
)

# Combined "<feature> - <country>" label; the chart below does not reference it.
df_country_long["Y_LABEL"] = df_country_long["FEATURE"] + " - " + df_country_long["COUNTRY_NAME"]

# Order of the factors on the Y axis: by variability, largest std first.
# top6_features is already sorted that way, so a copy is sufficient.
feature_order = list(top6_features)

fig_horizontal = px.bar(
    df_country_long,
    x="VALUE",
    y="FEATURE",
    color="COUNTRY_NAME",
    barmode="group",
    orientation="h",
    category_orders={"FEATURE": feature_order},
    title="Overweight: Czech Republic vs other country in 2018"
)

fig_horizontal.show()

TOP 5 faktorů s největší variabilitou mezi zeměmi: ['COMPUTER_NO', 'SCHOOL_PRESSURE', 'LIKE_SCHOOL', 'NERVOUS', 'FEEL_LOW', 'SLEEP_DIF']


In [22]:
# Vertical chart: all rows from 2018, plus a different selection of features.
# NOTE(review): the original note mentioned 10 features; the list below has 12.
df_2018_ver = df.loc[df["YEAR"].eq(2018)]

list_columns_2 = [
    "TOOTH_BRUSHING",
    "SOFT_DRINKS",
    "FRUITS",
    "VEGETABLES",
    "TALK_FATHER",
    "TALK_MOTHER",
    "FRIEND_TALK",
    "LIFESAT",
    "TIME_EXE",
    "FIGHT_YEAR",
    "HEADACHE",
    "BUL_BEEN",
]


In [23]:
# Per-country mean of every column in list_columns_2, with COUNTRY_NAME kept
# as a regular column.
df_2018_ver_grouped = (
    df_2018_ver.groupby("COUNTRY_NAME")[list_columns_2]
    .mean()
    .reset_index()
)


In [24]:
# Divide each per-country mean by that column's maximum value from `dictionary`.
# The frame is updated in place, so running this cell again divides again.
for column_name in list_columns_2:
    df_2018_ver_grouped[column_name] = df_2018_ver_grouped[column_name].div(dictionary[column_name])

In [25]:
# Long format: one row per (COUNTRY_NAME, FEATURE) pair with the scaled mean in VALUE.
# This rebinds df_country_long, the name also used for the horizontal chart's data.
df_country_long = pd.melt(
    df_2018_ver_grouped,
    id_vars=["COUNTRY_NAME"],
    value_vars=list_columns_2,
    var_name="FEATURE",
    value_name="VALUE",
)

# Grouped vertical bars: features on the X axis, one bar per country.
# NOTE(review): df_2018_ver is built from all 2018 rows without an OVERWEIGHT
# filter, while the title says "Overweight" - confirm which one is intended.
fig_ver = px.bar(
    data_frame=df_country_long,
    x="FEATURE",
    y="VALUE",
    color="COUNTRY_NAME",
    barmode="group",
    title="Overweight: Czech Republic vs other country in 2018",
)

In [26]:
# Render the grouped vertical bar chart.
fig_ver.show()