In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "vscode"
px.defaults.template = "plotly"
import plotly.graph_objects as go

In [2]:
# Load the survey data from a CSV file in the working directory.
# Later cells read the YEAR, SEX and OVERWEIGHT columns from this frame,
# plus the factor columns named in list_columns.
df = pd.read_csv("data.csv")

In [3]:
# Factor columns analysed in every chart below. Per the author's note they are
# roughly the top 20 features of a random forest model, with TALK_MOTHER added
# as the counterpart of TALK_FATHER.
# Every entry must also be a key of dict_scales, because the normalization
# loops look up dict_scales[feature] for each name in this list.
list_columns = [
    "SWEETS", 
    "TOOTH_BRUSHING", 
    "BREAKFAST_WEEKDAYS", 
    "BREAKFAST_WEEKEND", 
    "SOFT_DRINKS", 
    "PHYS_ACT_60", 
    "NERVOUS", 
    "FRUITS", 
    "VEGETABLES", 
    "TALK_FATHER",
    "TALK_MOTHER", 
    "FRIEND_TALK", 
    "SLEEP_DIF", 
    "TIME_EXE", 
    "FIGHT_YEAR", 
    "HEADACHE", 
    "BUL_BEEN",
    "FAMILY_MEALS_TOGETHER", # test
    "DRUNK_30", # test
    "LIKE_SCHOOL" # test
]


   
# Maps each factor column to the value passed to normalize_feature as
# `max_scale`, i.e. the divisor applied to the raw answers (presumably the
# highest answer code of that question - confirm against the codebook).
# It contains more factors than list_columns; only the listed ones are
# normalized in the cells below.
dict_scales = {
    # Symptoms (1=bad → 5=good)
    "HEADACHE": 5,
    "NERVOUS": 5,
    "SLEEP_DIF": 5,
    "FEEL_LOW": 5,
    "STOMACHACHE": 5,
    "DIZZY": 5,
    # Communication with parents (1=good → 5=bad)
    "TALK_FATHER": 5,
    "TALK_MOTHER": 5,
    "FAMILY_MEALS_TOGETHER": 5,
    # Lifestyle habits (1=good → max=bad)
    "TIME_EXE": 7,
    "TOOTH_BRUSHING": 5,
    "HEALTH": 4,
    "LIKE_SCHOOL": 4,
    "STUD_TOGETHER": 5,
    # Diet & lifestyle
    "FRUITS": 7,
    "VEGETABLES": 7,
    "FRIEND_TALK": 7,
    "BREAKFAST_WEEKDAYS": 5,
    "BREAKFAST_WEEKEND": 3,
    "PHYS_ACT_60": 7,
    "LIFESAT": 10,
    # Risk behaviour
    "SWEETS": 7,
    "SOFT_DRINKS": 7,
    "DRUNK_30": 5,
    "BUL_BEEN": 5,
    "BUL_OTHERS": 5,
    "FIGHT_YEAR": 5,
    "INJURED_YEAR": 5,
    "COMPUTER_NO": 4,
    "THINK_BODY": 5,
    "SCHOOL_PRESSURE": 4
}

# Factors where a higher raw value means healthier (e.g. 5 = never a headache,
# 6 = breakfast every day, 7 = a lot of sport). They are reversed during
# normalization so that the maximum always means worst / least healthy.
# NOTE(review): the "6 = breakfast every day" example does not match the
# BREAKFAST_WEEKDAYS (5) / BREAKFAST_WEEKEND (3) entries in dict_scales -
# confirm the real answer ranges.
reverse_scales = {
    "HEADACHE",
    "NERVOUS",
    "SLEEP_DIF",
    "DIZZY",
    "FEEL_LOW",
    "STOMACHACHE",
    "BREAKFAST_WEEKDAYS",
    "BREAKFAST_WEEKEND",
    "FRIEND_TALK",
    "FRUITS",
    "LIFESAT",
    "PHYS_ACT_60",
    "VEGETABLES"
}

# Normalization onto a shared "risk" axis:
#   - low result  = best (healthiest)
#   - result of 1 = worst (riskiest)
def normalize_feature(series, max_scale, reverse=False, min_scale=0):
    """Rescale raw survey answers so that different answer scales are comparable.

    Computes ``(series - min_scale) / (max_scale - min_scale)`` and, when
    ``reverse`` is true, flips the result with ``1 - value``.

    With the default ``min_scale=0`` this is a plain division by ``max_scale``
    (the historical behaviour). Note that answers coded ``1..max_scale`` then
    land in ``[1/max_scale, 1]`` and never reach 0; pass ``min_scale=1`` to get
    a true min-max scaling onto ``[0, 1]`` for such scales.

    Parameters
    ----------
    series : pandas.Series or array-like
        Raw answer codes. Missing values stay missing.
    max_scale : number
        Upper end of the answer scale.
    reverse : bool, default False
        Flip the direction so that the former maximum becomes the minimum.
    min_scale : number, default 0
        Lower end of the answer scale.

    Returns
    -------
    Same type as ``series`` after arithmetic, holding the rescaled values.

    Raises
    ------
    ValueError
        If ``max_scale`` is not strictly greater than ``min_scale``; dividing
        by a zero or negative width would silently yield inf or a flipped axis.
    """
    span = max_scale - min_scale
    if span <= 0:
        raise ValueError(
            f"max_scale ({max_scale}) must be greater than min_scale ({min_scale})"
        )

    norm = (series - min_scale) / span
    if reverse:
        norm = 1 - norm
    return norm

### GRAPH 1
### Overweight in Time: Boys vs Girls
Story: Nadváha roste, je to problém, kluci jsou na tom hůř než holky.


In [4]:
### Graph 1
### Overview: overweight over time, boys vs girls

# Aggregate first, plot afterwards: mean of OVERWEIGHT per survey year and sex
# (a share of overweight children if OVERWEIGHT is a 0/1 flag - confirm),
# with the numeric SEX codes replaced by legend labels.
df_filter = (
    df.groupby(["YEAR", "SEX"], as_index=False)["OVERWEIGHT"]
    .mean()
    .assign(SEX=lambda frame: frame["SEX"].map({2: "Girls", 1: "Boys"}))
)

# One fixed colour per gender so all charts in the notebook look alike.
colors = dict(
    Girls="#eb8fbd",  # pink
    Boys="#3b8ee1",   # blue
)

fig = px.line(
    df_filter,
    x="YEAR",
    y="OVERWEIGHT",
    color="SEX",
    color_discrete_map=colors,
    title="Overweight in Time",
)

# Fixed y-range, one tick per listed survey year, area filled down to zero.
(
    fig.update_yaxes(range=[0, 0.5])
    .update_xaxes(tickvals=[2002, 2006, 2010, 2014, 2018])
    .update_traces(fill="tozeroy")
)
fig.show()



### GRAPH 2
### TOP Gender behaviour differences of overweight children
Story: V jakých faktorech / typech chování se liší kluci s nadváhou od holek s nadváhou – top chování, která jsou nejzajímavější / mají největší dopad.

*How the features were selected:* feature engineering / random forest, regrese a R-squared, kontrolní grafy třeba v Tableau.


In [5]:
## TODO: maybe sort by gender gap, otherwise polish

# 1) private copy of the 2018 rows, so the raw `df` is never modified
df_2018 = df.loc[df["YEAR"] == 2018].copy()

# keys of the scale dictionary that exist as columns
# (kept for inspection; not used further in this cell)
factor_cols = [name for name in dict_scales if name in df_2018.columns]

# 2) normalize every factor in list_columns (reversed where required)
for feature in list_columns:
    df_2018[feature] = normalize_feature(
        df_2018[feature],
        dict_scales[feature],
        feature in reverse_scales,
    )

# 3) + 4) keep overweight children only (taken from the normalized data) and
#         average each factor per sex: 1 = boys, 2 = girls
df_h = (
    df_2018.loc[df_2018["OVERWEIGHT"] == 1]
    .copy()
    .groupby("SEX", as_index=False)[list_columns]
    .mean()
)

# 5) wide -> long: one row per (sex, factor)
df_sex_long = pd.melt(
    df_h,
    id_vars=["SEX"],
    value_vars=list_columns,
    var_name="FEATURE",
    value_name="VALUE",
)

# 6) numeric SEX codes -> text labels
sex_map = {1: "Boys", 2: "Girls"}
df_sex_long["SEX_STRING"] = df_sex_long["SEX"].map(sex_map)

# 7) gender gap per factor (Girls - Boys) and its absolute size
gap_table = (
    df_sex_long.groupby(["FEATURE", "SEX_STRING"])["VALUE"]
    .mean()
    .unstack("SEX_STRING")
)
gap_table["GIRLS_MINUS_BOYS"] = gap_table["Girls"] - gap_table["Boys"]
gap_table["ABS_GAP"] = gap_table["GIRLS_MINUS_BOYS"].abs()

# 8) the five factors with the largest absolute gap
top5_features = (
    gap_table.sort_values(by="ABS_GAP", ascending=False).index[:5].tolist()
)

print("TOP 5 faktorů pro graf 2:", top5_features)

df_top5 = df_sex_long.loc[df_sex_long["FEATURE"].isin(top5_features)].copy()

# display order of the bars: largest absolute gap first
top5_gaps = gap_table.loc[top5_features, "ABS_GAP"]
feature_order = top5_gaps.sort_values(ascending=False).index.tolist()

colors = dict(Girls="#eb8fbd", Boys="#3b8ee1")

fig_horizontal_barchart = px.bar(
    df_top5,
    x="VALUE",
    y="FEATURE",
    color="SEX_STRING",
    orientation="h",
    barmode="group",
    color_discrete_map=colors,
    category_orders={
        "FEATURE": feature_order,
        "SEX_STRING": ["Boys", "Girls"],
    },
    title="Overweight Children - Boys vs Girls (scaled 0–1, 2018)",
)

fig_horizontal_barchart.update_layout(
    xaxis_title="Average (0–1 scale, higher = worse)",
    yaxis_title="Factor",
    legend_title="Gender",
)

fig_horizontal_barchart.show()

TOP 5 faktorů pro graf 2: ['FIGHT_YEAR', 'TALK_FATHER', 'HEADACHE', 'NERVOUS', 'PHYS_ACT_60']


### GRAPH 3
### Komplexní přehled - Gender differences in all relevant factors
- širší obrázek ke grafu 2 (tam jen highlights), tady vše

In [9]:
# TODO(author): the gender gap is easiest to see here, but the interpretation
# after reversing scales is still unclear.
# Ideally combine: which factors matter most AND where the boys/girls
# difference is largest.
# Add at least % labels to the bars.


# GRAPH 3_v03
# GENDER GAP - a single bar per factor = difference girls/boys, sorted DESC
# another option for graph no. 3

# NOTE: nothing is normalized in this cell. The chart is built from
# df_sex_long (created in the Graph 2 cell), which already holds the
# normalized 2018 means per factor and sex. An earlier version re-ran
# normalize_feature on the already normalized df_2018 here; the result was
# never used below and rescaled df_2018 again on every execution of the cell.

# one row per factor, one column per gender
gap_table = (
    df_sex_long
    .pivot(index="FEATURE", columns="SEX_STRING", values="VALUE")
    .reset_index()
)

# gender gap: Girls - Boys (+ girls higher, - boys higher)
gap_table["GAP"] = gap_table["Girls"] - gap_table["Boys"]

# size of the gap regardless of direction
gap_table["ABS_GAP"] = gap_table["GAP"].abs()

# factors ordered by absolute gap, largest first
# (kept for reference; the chart below orders bars by signed GAP instead)
feature_order = (
    gap_table
    .sort_values("ABS_GAP", ascending=False)["FEATURE"]
    .tolist()
)

# bar order: most positive (girls higher) to most negative (boys higher)
gap_plot_df = gap_table.sort_values("GAP", ascending=False)

# custom colours: pink where Girls > Boys, blue otherwise
gap_colors = np.where(gap_plot_df["GAP"] > 0, "#eb8fbd", "#3b8ee1")

fig_gap = px.bar(
    gap_plot_df,
    x="FEATURE",
    y="GAP",
    title="Gender gap by factor (Girls − Boys, overweight children, 2018)",
)

# single trace, so the per-bar colour array lines up with gap_plot_df rows
fig_gap.update_traces(marker_color=gap_colors)

fig_gap.update_layout(
    xaxis_title="Factor",
    yaxis_title="Gender gap (Girls − Boys, scaled 0–1)",
    showlegend=False
)

fig_gap.show()

### GRAPH 4
### Behaviour differences: Overweight vs Non-Overweight children

Doplňkový graf ke srovnání kluci vs. holky: jak se liší děti s nadváhou od těch, které nadváhu nemají.

In [10]:
# SCATTER PLOT: overweight vs non-overweight, boys vs girls
# TODO: try to make it easier to read

# Rebuild the normalized 2018 frame from the raw data, so this cell does not
# depend on whatever state earlier cells left in df_2018.
df_2018 = df.loc[df["YEAR"] == 2018].copy()

for feature in list_columns:
    df_2018[feature] = normalize_feature(
        df_2018[feature],
        dict_scales[feature],
        feature in reverse_scales,
    )

# Split by weight status and average every factor within each group.
ow = df_2018[df_2018["OVERWEIGHT"] == 1]
non = df_2018[df_2018["OVERWEIGHT"] == 0]

ow_means = ow[list_columns].mean()
non_means = non[list_columns].mean()

# Overweight children only, averaged per sex (1 = boys, 2 = girls).
boys_ow = ow.loc[ow["SEX"] == 1, list_columns].mean()
girls_ow = ow.loc[ow["SEX"] == 2, list_columns].mean()

# One row per factor: x = OW vs Non-OW difference, y = gender gap within OW.
# Marker size = how strongly the factor differs between OW and Non-OW.
overview = pd.DataFrame(
    {
        "FACTOR": list_columns,
        "OW_minus_NonOW": ow_means.to_numpy() - non_means.to_numpy(),
        "Girls_minus_Boys_OW": girls_ow.to_numpy() - boys_ow.to_numpy(),
    }
).assign(ABS_OW_DIFF=lambda table: table["OW_minus_NonOW"].abs())

fig_meta = px.scatter(
    overview,
    x="OW_minus_NonOW",
    y="Girls_minus_Boys_OW",
    text="FACTOR",
    size="ABS_OW_DIFF",
    size_max=18,
    color="OW_minus_NonOW",
    color_continuous_scale=px.colors.diverging.RdYlGn_r,
    title="Mapa faktorů: rozdíl OW vs Non-OW × gender gap (jen OW, 2018)",
)

# Hover box: factor name plus both differences, without the trace-name box.
hover_lines = [
    "<b>%{text}</b>",
    "OW–Non-OW: %{x:.3f}",
    "Girls–Boys (OW): %{y:.3f}<extra></extra>",
]
fig_meta.update_traces(
    textposition="top center",
    hovertemplate="<br>".join(hover_lines),
)

# Grey zero lines on both axes split the plot into four quadrants.
zero_axis = dict(zeroline=True, zerolinecolor="grey")
fig_meta.update_layout(
    xaxis_title="Difference OW − Non-OW (normalized 0–1)",
    yaxis_title="Difference Girls − Boys (OW children)",
    coloraxis_colorbar_title="OW − Non-OW",
    plot_bgcolor="white",
    xaxis=dict(zero_axis),
    yaxis=dict(zero_axis),
)

fig_meta.show()

In [12]:
# Horizontal bar chart: overweight vs non-overweight difference per factor,
# coloured on a diverging green-red scale.
# (author's note: not quite satisfied with this chart yet)

# 2018 only
df_2018 = df[df["YEAR"] == 2018].copy()

# Normalize every factor before comparing, as the Graph 2 and Graph 4 cells do.
# The factors use different answer scales (3 to 7 points here) and some are
# reversed, so raw means are not comparable across factors: the ranking would
# favour the wider scales and the sign of the difference would mean "worse" for
# some factors and "better" for others.
for feature in list_columns:
    max_scale = dict_scales[feature]
    reverse = feature in reverse_scales
    df_2018[feature] = normalize_feature(df_2018[feature], max_scale, reverse)

# per-factor means for overweight and non-overweight children
ow_means  = df_2018[df_2018["OVERWEIGHT"] == 1][list_columns].mean()
non_means = df_2018[df_2018["OVERWEIGHT"] == 0][list_columns].mean()

# difference OW - NON, kept in list_columns order
diff = ow_means.reindex(list_columns) - non_means.reindex(list_columns)

df_diff = diff.reset_index()
df_diff.columns = ["FACTOR", "DIFFERENCE"]

# a factor with no data in one of the groups is drawn as "no difference"
df_diff["DIFFERENCE"] = df_diff["DIFFERENCE"].fillna(0.0)
df_diff["ABS_DIFF"] = df_diff["DIFFERENCE"].abs()

# largest absolute difference first
df_diff = df_diff.sort_values("ABS_DIFF", ascending=False)

# Continuous diverging colour scale. It has an odd number of evenly spaced
# colours so the neutral tone sits exactly in the middle; together with
# color_continuous_midpoint=0 below, "no difference" is drawn neutral.
diverging_scale = [
    "#00441b",  # dark green
    "#1b7837",
    "#a6dba0",
    "#f7f7f7",  # neutral (difference = 0)
    "#fdae61",
    "#f46d43",
    "#d7191c"   # dark orange/red
]

fig = px.bar(
    df_diff,
    x="DIFFERENCE",
    y="FACTOR",
    orientation="h",
    color="DIFFERENCE",
    color_continuous_scale=diverging_scale,
    color_continuous_midpoint=0,  # centre the colour scale on zero
    category_orders={"FACTOR": df_diff["FACTOR"].tolist()},
    title="Overweight vs Non-overweight — Behaviour Differences (diverging scale)"
)

fig.update_layout(
    xaxis_title="Difference OW − Non-OW (normalized 0–1)",
    yaxis_title="Factor",
    coloraxis_colorbar_title="Difference",
)

fig.show()