In [3]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px



In [5]:
# Relative locations of the CSV inputs.
mental_health_data_path = 'data/mental_health_dataset.csv'
# NOTE(review): not referenced by any cell visible in this section — confirm it is still needed.
students_mental_health_data_path = 'data/mental_health_poll_updated.csv'

In [16]:
# Load the survey and keep only rows without missing values.
df = pd.read_csv(mental_health_data_path).dropna()
df

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
3,8/27/2014 11:37,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,8/27/2014 11:43,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
5,8/27/2014 11:49,Female,Poland,Corporate,No,No,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Not sure
6,8/27/2014 11:51,Female,Australia,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
7,8/27/2014 11:52,Female,United States,Corporate,No,No,No,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292359,7/27/2015 23:25,Male,United States,Business,Yes,Yes,Yes,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,Maybe,Not sure
292360,8/17/2015 9:38,Male,South Africa,Business,No,Yes,Yes,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,No,Yes
292361,8/25/2015 19:59,Male,United States,Business,No,Yes,No,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,No,No
292362,9/26/2015 1:07,Male,United States,Business,No,Yes,Yes,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,No,Yes


In [17]:
# (rows, columns) of the frame after incomplete rows were dropped.
df.shape

(287162, 17)

In [18]:
# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 287162 entries, 3 to 292363
Data columns (total 17 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   Timestamp                287162 non-null  object
 1   Gender                   287162 non-null  object
 2   Country                  287162 non-null  object
 3   Occupation               287162 non-null  object
 4   self_employed            287162 non-null  object
 5   family_history           287162 non-null  object
 6   treatment                287162 non-null  object
 7   Days_Indoors             287162 non-null  object
 8   Growing_Stress           287162 non-null  object
 9   Changes_Habits           287162 non-null  object
 10  Mental_Health_History    287162 non-null  object
 11  Mood_Swings              287162 non-null  object
 12  Coping_Struggles         287162 non-null  object
 13  Work_Interest            287162 non-null  object
 14  Social_Weakness          

In [28]:
# Print the frequency of each value for every column except Timestamp.
var_list = df.columns.drop('Timestamp').tolist()

for var in var_list:
    print(df[var].value_counts())

Gender
Male      235950
Female     51212
Name: count, dtype: int64
Country
United States             168056
United Kingdom             50624
Canada                     17556
Australia                   6026
Netherlands                 5894
Ireland                     5548
Germany                     4680
Sweden                      2818
India                       2774
France                      2340
Brazil                      2340
New Zealand                 1994
South Africa                1994
Switzerland                 1560
Israel                      1560
Italy                       1560
Belgium                      824
Poland                       824
Russia                       780
Denmark                      780
Singapore                    780
Greece                       780
Czech Republic               390
Georgia                      390
Colombia                     390
Moldova                      390
Mexico                       390
Croatia                      390
T

In [None]:
# opciones del dashboard



# 1. Visualización exploratoria: volumen de datos


* Fijar zona: 'Total' muestra todos los países; al elegir un país se filtra solo ese país. Incluir siempre la opción 'Total'.
* Mostrar 3 KPIs: personas encuestadas, porcentaje de hombres y porcentaje de mujeres.

* Gráficas por variable: gráfico de barras con las categorías, separadas por hombre y mujer.
* Otra opción: incluir un gráfico de barras con el total de hombres y mujeres, y luego dos gráficos de dona, uno para cada género.


In [76]:
class DataPipeline:
    """Load the survey CSV once and serve filtered / aggregated views of it.

    Rows containing any missing value are dropped at load time. Every method
    reads from ``self.df`` only, so the class does not depend on notebook
    globals.
    """

    def __init__(self, data_path):
        """Read ``data_path`` with pandas and discard rows with missing values."""
        self.df = pd.read_csv(data_path).dropna()

    def get_data(self):
        """Return the cleaned DataFrame held by this pipeline."""
        return self.df

    def get_column_list(self, only_desc=False):
        """Return the column names, always excluding ``Timestamp``.

        Args:
            only_desc: when True, also exclude ``Gender`` and ``Country``.

        Returns:
            list of column names in their original order.
        """
        column_list = self.df.columns.drop('Timestamp')

        if only_desc:
            column_list = column_list.drop(['Gender', 'Country'])

        return list(column_list)

    def get_country_list(self):
        """Return the distinct ``Country`` values in order of first appearance."""
        return list(self.df['Country'].drop_duplicates())

    def filter_country(self, country):
        """Return the rows for ``country``, or every row when it is ``'Total'``.

        Args:
            country: a value of the ``Country`` column, or the literal ``'Total'``.

        Returns:
            DataFrame restricted to the requested country (empty if no row matches).
        """
        if country == 'Total':
            return self.df

        # Filter the instance's own frame, not a global `df`.
        return self.df[self.df['Country'] == country]

    def get_kpi_obs(self, country):
        """Return ``(respondents, male respondents, female respondents)``.

        A gender that does not occur in the selection is reported as 0
        instead of raising ``KeyError``.
        """
        df_country = self.filter_country(country)
        gender_counts = df_country['Gender'].value_counts()

        people_requested = df_country.shape[0]
        man_requested = gender_counts.get('Male', 0)
        woman_requested = gender_counts.get('Female', 0)

        return people_requested, man_requested, woman_requested

    def get_agg_data(self, country, column, use_gender=False):
        """Count rows per category of ``column`` for the selected country.

        Args:
            country: passed to :meth:`filter_country`.
            column: name of the column whose categories are counted.
            use_gender: when True, counts are additionally split by ``Gender``.

        Returns:
            DataFrame with the grouping column(s) followed by ``count_values``.
        """
        df_country = self.filter_country(country)

        group_keys = [column]
        # Avoid listing Gender twice as a grouping key when it is the counted column.
        if use_gender and column != 'Gender':
            group_keys.append('Gender')

        # Group size equals the non-null count of the key column, because
        # groupby excludes rows whose key is missing.
        return (
            df_country
            .groupby(group_keys)
            .size()
            .reset_index(name='count_values')
        )

        

        

In [77]:
# Build the pipeline and preview the rows returned for a single country.
data_pipeline = DataPipeline(mental_health_data_path)

country = "United States"

# Ad-hoc sanity checks, kept disabled:
# print(data_pipeline.get_data().shape)
# print(data_pipeline.filter_country(country).shape)
# print(data_pipeline.get_kpi_obs(country))

df_country = data_pipeline.filter_country(country)
df_country

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
3,8/27/2014 11:37,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,8/27/2014 11:43,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
7,8/27/2014 11:52,Female,United States,Corporate,No,No,No,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No
8,8/27/2014 12:18,Female,United States,Corporate,No,No,No,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
9,8/27/2014 12:37,Female,United States,Corporate,No,No,No,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292358,5/6/2015 16:55,Male,United States,Business,No,No,No,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,Maybe,Not sure
292359,7/27/2015 23:25,Male,United States,Business,Yes,Yes,Yes,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,Maybe,Not sure
292361,8/25/2015 19:59,Male,United States,Business,No,Yes,No,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,No,No
292362,9/26/2015 1:07,Male,United States,Business,No,Yes,Yes,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,No,Yes


In [80]:
# Re-create the pipeline (this re-reads the CSV) and aggregate one column by gender.
data_pipeline = DataPipeline(mental_health_data_path)

country = "United States"
# [1] picks the second descriptive column (shown as `self_employed` in the output below).
column = data_pipeline.get_column_list(only_desc=True)[1]

df_agg = data_pipeline.get_agg_data(country, column, use_gender=True)

df_agg

Unnamed: 0,self_employed,Gender,count_values
0,No,Female,33418
1,No,Male,125190
2,Yes,Female,868
3,Yes,Male,8580


In [None]:
# Stacked bars of the aggregated counts. df_agg is long-form (one row per
# category/gender pair), so the per-gender split comes from `color`; the
# previous x="nation" / y=["gold", "silver", "bronze"] are not columns of df_agg.
fig = px.bar(df_agg, x=column, y="count_values", color="Gender", title="")

In [65]:
# Distinct countries, in order of first appearance in the data.
data_pipeline.get_country_list()

['United States',
 'Poland',
 'Australia',
 'Canada',
 'United Kingdom',
 'South Africa',
 'Sweden',
 'New Zealand',
 'Netherlands',
 'India',
 'Belgium',
 'Ireland',
 'France',
 'Portugal',
 'Brazil',
 'Costa Rica',
 'Russia',
 'Germany',
 'Switzerland',
 'Finland',
 'Israel',
 'Italy',
 'Bosnia and Herzegovina',
 'Singapore',
 'Nigeria',
 'Croatia',
 'Thailand',
 'Denmark',
 'Mexico',
 'Greece',
 'Moldova',
 'Colombia',
 'Georgia',
 'Czech Republic',
 'Philippines']

In [75]:
# Grouped bars: one bar per gender for every category of `column`.
fig = px.bar(df_agg, x=column, y="count_values", color="Gender", barmode="group")
fig.show()

In [None]:
# NOTE(review): `wide_df` is not defined in any cell visible here; plotly's bundled
# sample `px.data.medals_wide()` has these columns — confirm the intended source.
# `barmode` must be passed to px.bar; as a separate statement it only created an
# unused variable and the bars stayed stacked.
fig = px.bar(
    wide_df,
    x="nation",
    y=["gold", "silver", "bronze"],
    title="Wide-Form Input",
    barmode="group",
)