In [15]:
import matplotlib.pyplot as plt
import polars as pl
import pandas as pd
import seaborn as sns
from ydata_profiling import ProfileReport

from tqdm import tqdm
from loguru import logger

from machine_learning.config import EXTERNAL_DATA_DIR, MPLSTYLE_DIR, REPORTS_DIR

# Apply the project's matplotlib style sheet to every figure in this notebook.
mplstyle_path = MPLSTYLE_DIR / "iragca_ml.mplstyle"
plt.style.use(mplstyle_path)

In [2]:
DATA_PATH = EXTERNAL_DATA_DIR / "kaggle" / "imtkaggleteam"

# Maps each CSV's filename prefix (the text before the first "-", e.g. "1" for
# "1- mental-illnesses-prevalence.csv") to the polars DataFrame read from it.
DATASETS = {}

# Path.iterdir() yields entries in arbitrary order; sort for a stable listing.
for file in sorted(DATA_PATH.iterdir()):
    if file.suffix != ".csv":
        continue
    print(file.name)
    # Key on the whole prefix rather than only the first character, so that a
    # file such as "10- ....csv" cannot silently overwrite "1- ....csv".
    key = file.stem.split("-", 1)[0].strip()
    if key in DATASETS:
        raise ValueError(f"Duplicate dataset key {key!r} from {file.name}")
    DATASETS[key] = pl.read_csv(file)

1- mental-illnesses-prevalence.csv
2- burden-disease-from-each-mental-illness.csv
3- adult-population-covered-in-primary-data-on-the-prevalence-of-major-depression.csv
4- adult-population-covered-in-primary-data-on-the-prevalence-of-mental-illnesses.csv
5- anxiety-disorders-treatment-gap.csv
6- depressive-symptoms-across-us-population.csv
7- number-of-countries-with-primary-data-on-prevalence-of-mental-illnesses-in-the-global-burden-of-disease-study.csv


In [3]:
# Preview five random rows of dataset 1 (mental-illnesses-prevalence).
# No seed is passed, so the rows shown differ between runs.
DATASETS["1"].sample(n=5)

Entity,Code,Year,Schizophrenia disorders (share of population) - Sex: Both - Age: Age-standardized,Depressive disorders (share of population) - Sex: Both - Age: Age-standardized,Anxiety disorders (share of population) - Sex: Both - Age: Age-standardized,Bipolar disorders (share of population) - Sex: Both - Age: Age-standardized,Eating disorders (share of population) - Sex: Both - Age: Age-standardized
str,str,i64,f64,f64,f64,f64,f64
"""Low-income countries""",,2012,0.215539,4.659665,3.625562,0.548664,0.08283
"""Mauritius""","""MUS""",2011,0.3028433,4.023178,3.8565311,0.337485,0.125921
"""American Samoa""","""ASM""",2016,0.305845,2.6280975,4.0254774,0.2797356,0.128426
"""Haiti""","""HTI""",2018,0.241357,3.6294103,4.6204476,0.843951,0.121482
"""Saudi Arabia""","""SAU""",2006,0.261447,4.0944386,4.4302883,0.7653696,0.251602


In [4]:
# Preview five random rows of dataset 2 (burden-disease-from-each-mental-illness).
# No seed is passed, so the rows shown differ between runs.
DATASETS["2"].sample(n=5)

Entity,Code,Year,DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Depressive disorders,DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Schizophrenia,DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Bipolar disorder,DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Eating disorders,DALYs (rate) - Sex: Both - Age: Age-standardized - Cause: Anxiety disorders
str,str,i64,f64,f64,f64,f64,f64
"""Vietnam""","""VNM""",1999,382.3727,202.63898,73.11277,14.86394,217.76917
"""Central African Republic""","""CAF""",1995,1091.8298,122.60952,111.911865,16.651552,393.87048
"""Congo""","""COG""",2005,1073.3224,135.15547,116.30052,25.225908,353.14032
"""Nauru""","""NRU""",2012,496.54883,177.13698,56.286434,19.621428,383.22137
"""Algeria""","""DZA""",1995,750.99445,160.22092,163.86661,39.559963,445.68146


In [5]:
# Preview five random rows of dataset 3
# (adult-population-covered-in-primary-data-on-the-prevalence-of-major-depression).
# No seed is passed, so the rows shown differ between runs.
DATASETS["3"].sample(n=5)

Entity,Code,Year,Major depression
str,str,i64,f64
"""Southern Latin America""",,2008,16.5
"""Caribbean""",,2008,9.1
"""World""","""OWID_WRL""",2008,35.4
"""Central Asia""",,2008,0.0
"""East Sub-Saharan Africa""",,2008,1.3


In [6]:
# Preview five random rows of dataset 4
# (adult-population-covered-in-primary-data-on-the-prevalence-of-mental-illnesses).
# No seed is passed, so the rows shown differ between runs.
# NOTE(review): the recorded output lists "Schizophrenia" as str while the other
# measure columns are f64 -- confirm whether it needs casting before analysis.
DATASETS["4"].sample(n=5)


Entity,Code,Year,Major depression,Bipolar disorder,Eating disorders,Dysthymia,Schizophrenia,Anxiety disorders
str,str,i64,f64,f64,f64,f64,str,f64
"""West Sub-Saharan Africa""",,2008,46.6,47.0,0.0,46.6,"""0""",46.6
"""Central Latin America""",,2008,49.8,34.5,71.0,46.0,"""0.7""",69.7
"""Oceania""",,2008,0.0,0.0,0.0,0.0,"""0.4""",0.0
"""Eastern Europe""",,2008,23.6,1.7,0.0,22.9,"""1.3""",22.3
"""Tropical Latin America""",,2008,9.7,6.4,21.1,6.4,"""0""",6.4


In [7]:
# Preview five random rows of dataset 5 (anxiety-disorders-treatment-gap).
# No seed is passed, so the rows shown differ between runs.
DATASETS["5"].sample(n=5)


Entity,Code,Year,"Potentially adequate treatment, conditional","Other treatments, conditional","Untreated, conditional"
str,str,i64,f64,f64,f64
"""Italy""","""ITA""",2002,9.1,20.6,70.3
"""Medellin, Colombia""",,2012,3.8,15.0,81.2
"""Mexico""","""MEX""",2002,3.3,12.8,83.9
"""United States""","""USA""",2003,16.1,26.2,57.7
"""Peru""","""PER""",2005,1.1,16.8,82.1


In [8]:
# Preview five random rows of dataset 6 (depressive-symptoms-across-us-population).
# No seed is passed, so the rows shown differ between runs.
DATASETS["6"].sample(n=5)


Entity,Code,Year,Nearly every day,More than half the days,Several days,Not at all
str,str,i64,f64,f64,f64,f64
"""Difficulty concentrating""",,2014,3.5,3.6,10.9,82.1
"""Low energy""",,2014,9.0,7.8,34.0,49.1
"""Average across symptoms""",,2014,4.4,4.3,15.0,76.3
"""Loss of interest""",,2014,4.4,5.4,16.3,73.8
"""Appetite change""",,2014,4.6,5.1,15.5,74.8


In [9]:
# Preview five random rows of dataset 7 (number-of-countries-with-primary-data-
# on-prevalence-of-mental-illnesses-in-the-global-burden-of-disease-study).
# No seed is passed, so the rows shown differ between runs.
DATASETS["7"].sample(n=5)

Entity,Code,Year,Number of countries with primary data on prevalence of mental disorders
str,str,i64,i64
"""Autism spectrum disorders""",,2019,34
"""Opioid use disorders""",,2019,31
"""Bipolar disorder""",,2019,41
"""Personality disorders""",,2019,2
"""Other drug use disorders""",,2019,2


## Profiling

In [30]:
SAVE_PATH = REPORTS_DIR / "ydata" / "group2"

# exist_ok=True already makes mkdir a no-op when the directory is present,
# so a separate exists() check is unnecessary.
SAVE_PATH.mkdir(parents=True, exist_ok=True)

logger.info("Profiling datasets")
# Path.iterdir() yields entries in arbitrary order; sort so the reports are
# generated (and logged) in a stable order.
for file in sorted(DATA_PATH.iterdir()):
    if file.suffix != ".csv":
        continue
    logger.debug(f"Profiling {file.stem}")
    # Each CSV is read with pandas, profiled, and written to an HTML report
    # named after the file's stem.
    report = ProfileReport(pd.read_csv(file), title=file.stem)
    report.to_file(SAVE_PATH / f"{file.stem}.html")
logger.success("Profiling complete")

[32m2025-02-21 15:36:52.866[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mProfiling datasets[0m
[32m2025-02-21 15:36:52.867[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [34m[1mProfiling 1- mental-illnesses-prevalence[0m


Summarize dataset: 100%|██████████| 53/53 [00:04<00:00, 12.12it/s, Completed]                                                                                                                                                                   
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.72s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 18.47it/s]


[32m2025-02-21 15:37:01.612[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [34m[1mProfiling 2- burden-disease-from-each-mental-illness[0m


Summarize dataset: 100%|██████████| 53/53 [00:03<00:00, 13.93it/s, Completed]                                                                                                                                                             
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.46s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.04s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 90.87it/s]


[32m2025-02-21 15:37:09.203[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [34m[1mProfiling 3- adult-population-covered-in-primary-data-on-the-prevalence-of-major-depression[0m


Summarize dataset: 100%|██████████| 14/14 [00:00<00:00, 49.82it/s, Completed]                                 
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.10it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  8.80it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 333.60it/s]


[32m2025-02-21 15:37:10.828[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [34m[1mProfiling 4- adult-population-covered-in-primary-data-on-the-prevalence-of-mental-illnesses[0m


Summarize dataset: 100%|██████████| 43/43 [00:02<00:00, 18.18it/s, Completed]                                   
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.70s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.70it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 200.10it/s]


[32m2025-02-21 15:37:15.548[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [34m[1mProfiling 5- anxiety-disorders-treatment-gap[0m


Summarize dataset: 100%|██████████| 31/31 [00:01<00:00, 20.97it/s, Completed]                                                                                       
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.42s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  3.57it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 200.06it/s]


[32m2025-02-21 15:37:18.985[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [34m[1mProfiling 6- depressive-symptoms-across-us-population[0m


Summarize dataset: 100%|██████████| 32/32 [00:01<00:00, 25.06it/s, Completed]                                               
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.03s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  3.97it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 333.46it/s]


[32m2025-02-21 15:37:21.830[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [34m[1mProfiling 7- number-of-countries-with-primary-data-on-prevalence-of-mental-illnesses-in-the-global-burden-of-disease-study[0m


Summarize dataset: 100%|██████████| 14/14 [00:00<00:00, 49.22it/s, Completed]                                                                                                                                               
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  9.13it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 333.62it/s]

[32m2025-02-21 15:37:23.066[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [32m[1mProfiling complete[0m





### Widgets

In [28]:
# Build a profiling report for dataset 1 from its pandas conversion and
# render it as interactive notebook widgets.
profile_1 = ProfileReport(DATASETS["1"].to_pandas())
profile_1.to_widgets()

Summarize dataset: 100%|██████████| 53/53 [00:03<00:00, 13.35it/s, Completed]                                                                                                                                                                   
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  3.05s/it]
Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

                                                             

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [29]:
# Build a profiling report for dataset 2 from its pandas conversion and
# render it as interactive notebook widgets.
profile_2 = ProfileReport(DATASETS["2"].to_pandas())
profile_2.to_widgets()

Summarize dataset: 100%|██████████| 53/53 [00:04<00:00, 11.59it/s, Completed]                                                                                                                                                             
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.73s/it]
Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

                                                             

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…