In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px

from src.params import mental_health_data_path
from src.data_pipeline import DataPipeline
from src.data_viz import PlotBuilder



In [2]:
# Build the pipeline from mental_health_data_path and preview the first rows of its data.
data_pipeline = DataPipeline(mental_health_data_path)
data_pipeline.get_data().head()

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
3,8/27/2014 11:37,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,8/27/2014 11:43,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
5,8/27/2014 11:49,Female,Poland,Corporate,No,No,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Not sure
6,8/27/2014 11:51,Female,Australia,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
7,8/27/2014 11:52,Female,United States,Corporate,No,No,No,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No


In [3]:
# First five entries of the pipeline's country list.
data_pipeline.get_country_list()[:5]

['United States', 'Poland', 'Australia', 'Canada', 'United Kingdom']

In [4]:
# Compare the column list with and without only_desc=True (first five names of each).
print(data_pipeline.get_column_list()[:5])
print(data_pipeline.get_column_list(only_desc=True)[:5])

['Gender', 'Country', 'Occupation', 'self_employed', 'family_history']
['Occupation', 'self_employed', 'family_history', 'treatment', 'Days_Indoors']


In [5]:
# Rows for a single country ("Canada"); show only the first few.
df_country = data_pipeline.filter_country(country="Canada")
df_country.head()

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
13,8/27/2014 12:49,Female,Canada,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
21,8/27/2014 13:31,Female,Canada,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
49,8/27/2014 17:37,Female,Canada,Corporate,Yes,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
54,8/27/2014 19:05,Female,Canada,Corporate,No,No,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
69,8/28/2014 9:55,Female,Canada,Corporate,No,No,No,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure


In [6]:
# KPI tuple for Canada; a later cell unpacks the same call as
# (people_requested, man_requested, woman_requested).
data_pipeline.get_kpi_obs(country='Canada')

(17556, 13650, 3906)

In [7]:
# Category counts of the first descriptive column for the first country,
# without splitting by gender.
country = data_pipeline.get_country_list()[0]
column = data_pipeline.get_column_list(only_desc=True)[0]

data_pipeline.get_agg_data(country, column, use_gender = False)

Unnamed: 0,Occupation,count_values
0,Business,28879
1,Corporate,35001
2,Housewife,38111
3,Others,30461
4,Student,35604


In [8]:
# Same aggregation as the previous cell, this time split by gender
# (one row per category/gender pair).
country = data_pipeline.get_country_list()[0]
column = data_pipeline.get_column_list(only_desc=True)[0]

data_pipeline.get_agg_data(country, column, use_gender = True)

Unnamed: 0,Occupation,Gender,count_values
0,Business,Female,6241
1,Business,Male,22638
2,Corporate,Female,5846
3,Corporate,Male,29155
4,Housewife,Female,7584
5,Housewife,Male,30527
6,Others,Female,6794
7,Others,Male,23667
8,Student,Female,7821
9,Student,Male,27783


In [9]:
# Category counts for the first country/descriptive column, restricted to one gender.
country = data_pipeline.get_country_list()[0]
column = data_pipeline.get_column_list(only_desc=True)[0]
gender = "Male"
data_pipeline.get_agg_by_gender(country, column, gender)

Unnamed: 0,Occupation,count_values
0,Business,22638
1,Corporate,29155
2,Housewife,30527
3,Others,23667
4,Student,27783


# 1. Visualización exploratoria: volumen de datos


* Fijar zona: «Total» (todos los países) o un país concreto; incluir siempre la opción «Total».
* Mostrar 3 KPIs: personas encuestadas, porcentaje de hombres y porcentaje de mujeres.

* Gráficas por variable: gráfico de barras con las categorías, separadas por hombre y mujer.
* Otra opción: un gráfico de barras con el total de hombres y mujeres, seguido de dos donuts, uno por cada género.


In [10]:

# Fresh pipeline plus the selection (country, column) that the plotting cells below reuse.
data_pipeline = DataPipeline(mental_health_data_path)

list_country = data_pipeline.get_country_list()
list_column = data_pipeline.get_column_list(only_desc=True)

# Default selection: first country and first descriptive column.
country = list_country[0]
column = list_column[0]
print(f"country: {country}\ncolumn: {column}")

# KPI tuple unpacked as (total respondents, men, women) for the selected country.
people_requested, man_requested, woman_requested =  data_pipeline.get_kpi_obs(country=country)
print(f"people_requested: {people_requested}, man_requested: {man_requested}, woman_requested: {woman_requested}")

country: United States
column: Occupation
people_requested: 168056, man_requested: 133770, woman_requested: 34286


In [16]:
# Men as a percentage of all respondents in the selected country, rounded to two decimals.
round((man_requested / people_requested)*100, 2)

79.6

In [11]:
# Bar chart of the category counts for the selected country/column.
plot_builder = PlotBuilder(data_pipeline)

barplot_fig = plot_builder.build_barplot(country, column)
barplot_fig.show()


In [12]:
# Donut chart of the category counts for one gender in the selected country/column.
gender = "Male"

plot_builder = PlotBuilder(data_pipeline)

donut_fig = plot_builder.build_dount_gender(country, column, gender)
donut_fig.show()

In [14]:
# Category bar chart: the aggregated counts it is built from (no gender split).

df_agg = data_pipeline.get_agg_data(country, column, use_gender = False)
df_agg

Unnamed: 0,Occupation,count_values
0,Business,28879
1,Corporate,35001
2,Housewife,38111
3,Others,30461
4,Student,35604


In [29]:
class PlotBuilder:
    """Builds Plotly Express figures from the aggregations served by a ``DataPipeline``.

    NOTE(review): the notebook also imports ``PlotBuilder`` from ``src.data_viz``;
    running this cell rebinds the name to this local definition.
    """

    def __init__(self, data_pipeline: DataPipeline):
        """Keep a reference to the pipeline that supplies the aggregated data."""
        self.data_pipeline = data_pipeline

    def build_barplot(self, country, column):
        """Return a bar chart with the row count of each category of ``column``.

        Args:
            country: Country passed through to ``DataPipeline.get_agg_data``.
            column: Column whose categories go on the x axis.

        Returns:
            The Plotly figure; the caller decides when to ``show()`` it.
        """
        df_agg = self.data_pipeline.get_agg_data(country, column, use_gender=False)

        return px.bar(
            df_agg,
            x=column,
            y="count_values",
            title=f"Frecuencia por {column} (País: {country})",
        )

    def build_donut_gender(self, country, column, gender):
        """Return a donut chart with the category counts of ``column`` for one gender.

        Args:
            country: Country passed through to ``DataPipeline.get_agg_by_gender``.
            column: Column whose categories become the donut slices.
            gender: Gender passed through to the pipeline. ``"Female"`` selects
                the women's title; any other value gets the men's title.

        Returns:
            The Plotly figure; the caller decides when to ``show()`` it.
        """
        df_agg = self.data_pipeline.get_agg_by_gender(country, column, gender)

        if gender == "Female":
            donut_title = f'Frecuencia de {column} en mujeres (País: {country})'
        else:
            donut_title = f'Frecuencia de {column} en hombres (País: {country})'

        return px.pie(
            df_agg,
            values='count_values',
            names=column,
            hole=.5,  # a non-zero hole turns the pie into a donut
            title=donut_title,
        )

    # Backward-compatible alias: the method was originally published under this
    # misspelled name, and existing callers still use it.
    build_dount_gender = build_donut_gender



In [30]:
# Rebuild the bar chart, now through the PlotBuilder class defined in this notebook.
plot_builder = PlotBuilder(data_pipeline)

barplot_fig = plot_builder.build_barplot(country, column)
barplot_fig.show()

In [34]:
# Rebuild the per-gender donut, now through the PlotBuilder class defined in this notebook.
gender = "Male"

plot_builder = PlotBuilder(data_pipeline)

donut_fig = plot_builder.build_dount_gender(country, column, gender)
donut_fig.show()


In [28]:
# Donut of the category frequencies among women. The aggregation is built in
# this cell from the female respondents so the plotted data matches the title
# regardless of which `df_agg` an earlier cell left in the kernel.
df_agg_female = data_pipeline.get_agg_by_gender(country, column, "Female")

fig = px.pie(
    df_agg_female,
    values='count_values',
    names=column,
    hole=.5,
    title=f'Frecuencia de {column} en mujeres (País: {country})'
)
fig.show()

In [None]:
# Grouped bar chart: one bar per gender inside each category.
# color="Gender" requires a Gender column, so the aggregation is built here
# with use_gender=True instead of reusing whatever `df_agg` is in the kernel.
df_agg_gender = data_pipeline.get_agg_data(country, column, use_gender=True)

fig = px.bar(
    df_agg_gender,
    x=column,
    y="count_values",
    color="Gender",
    barmode="group",
)
fig.show()

In [8]:
class DataPipeline():
    """Loads a survey CSV and serves filtered and aggregated views of it.

    Rows containing any missing value are dropped at load time, so every
    accessor works on complete records only.
    """

    def __init__(self, data_path):
        """Read the CSV at ``data_path`` and drop the rows that contain NaN values."""
        self.df = pd.read_csv(data_path).dropna()

    def get_data(self) -> pd.DataFrame:
        """Return the loaded DataFrame (the stored object itself, not a copy)."""
        return self.df

    def get_column_list(self, only_desc=False) -> list:
        """Return the column names, always excluding ``Timestamp``.

        Args:
            only_desc: When true, ``Gender`` and ``Country`` are excluded too.
        """
        column_list = self.df.columns.drop('Timestamp')

        if only_desc:
            column_list = column_list.drop(['Gender', 'Country'])

        return list(column_list)

    def get_country_list(self) -> list:
        """Return the distinct ``Country`` values in order of first appearance."""
        return list(self.df.Country.drop_duplicates())

    def filter_country(self, country) -> pd.DataFrame:
        """Return the rows of one country, or every row when ``country`` is ``'Total'``.

        The data is read from ``self.df``; the previous implementation read a
        global ``df`` and therefore failed (or used stale data) outside the
        kernel session it was written in.
        """
        if country == 'Total':
            return self.df

        return self.df[self.df['Country'] == country]

    def get_kpi_obs(self, country) -> tuple:
        """Return ``(respondents, men, women)`` counts for ``country``.

        A gender with no respondents in the selection counts as 0 instead of
        raising ``KeyError``.
        """
        df_country = self.filter_country(country)
        gender_counts = df_country['Gender'].value_counts()

        people_requested = df_country.shape[0]
        man_requested = int(gender_counts.get('Male', 0))
        woman_requested = int(gender_counts.get('Female', 0))

        return people_requested, man_requested, woman_requested

    @staticmethod
    def _count_rows(frame, keys) -> pd.DataFrame:
        """Count the rows of ``frame`` per combination of ``keys`` into ``count_values``."""
        return frame.groupby(keys).size().reset_index(name='count_values')

    def get_agg_data(self, country, column, use_gender=False) -> pd.DataFrame:
        """Return the row count of each category of ``column`` within ``country``.

        Args:
            country: Country to filter on, or ``'Total'`` for all rows.
            column: Column whose categories are counted.
            use_gender: When true, counts are split by ``Gender`` as well.

        Returns:
            A DataFrame with ``column``, optionally ``Gender``, and ``count_values``.
        """
        df_country = self.filter_country(country)
        keys = [column, 'Gender'] if use_gender else [column]

        return self._count_rows(df_country, keys)

    def get_agg_by_gender(self, country, column, gender) -> pd.DataFrame:
        """Return the category counts of ``column`` within ``country`` for one gender.

        Returns:
            A DataFrame with ``column`` and ``count_values`` (no ``Gender`` column).
        """
        df_country = self.filter_country(country)
        df_gender = df_country[df_country['Gender'] == gender]

        return self._count_rows(df_gender, [column])

        

        

In [9]:
# Check filter_country on a fixed country; the bare last line renders the filtered frame.
data_pipeline = DataPipeline(mental_health_data_path)

country = "United States"

# print(data_pipeline.get_data().shape)
# print(data_pipeline.filter_country(country).shape)
# print(data_pipeline.get_kpi_obs(country))

df_country = data_pipeline.filter_country(country)
df_country

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
3,8/27/2014 11:37,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,8/27/2014 11:43,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
7,8/27/2014 11:52,Female,United States,Corporate,No,No,No,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No
8,8/27/2014 12:18,Female,United States,Corporate,No,No,No,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
9,8/27/2014 12:37,Female,United States,Corporate,No,No,No,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292358,5/6/2015 16:55,Male,United States,Business,No,No,No,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,Maybe,Not sure
292359,7/27/2015 23:25,Male,United States,Business,Yes,Yes,Yes,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,Maybe,Not sure
292361,8/25/2015 19:59,Male,United States,Business,No,Yes,No,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,No,No
292362,9/26/2015 1:07,Male,United States,Business,No,Yes,Yes,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,No,Yes


In [10]:
# Gender-split counts for the second descriptive column in the United States;
# the result feeds the grouped bar chart further down.
data_pipeline = DataPipeline(mental_health_data_path)

country = "United States"
column = data_pipeline.get_column_list(only_desc=True)[1]

df_agg = data_pipeline.get_agg_data(country, column, use_gender=True)

df_agg

Unnamed: 0,self_employed,Gender,count_values
0,No,Female,33418
1,No,Male,125190
2,Yes,Female,868
3,Yes,Male,8580


In [11]:
# df_agg
# fig = px.bar(df_agg, x="nation", y=["gold", "silver", "bronze"], title="")

In [12]:
# Full list of distinct countries returned by the pipeline.
data_pipeline.get_country_list()

['United States',
 'Poland',
 'Australia',
 'Canada',
 'United Kingdom',
 'South Africa',
 'Sweden',
 'New Zealand',
 'Netherlands',
 'India',
 'Belgium',
 'Ireland',
 'France',
 'Portugal',
 'Brazil',
 'Costa Rica',
 'Russia',
 'Germany',
 'Switzerland',
 'Finland',
 'Israel',
 'Italy',
 'Bosnia and Herzegovina',
 'Singapore',
 'Nigeria',
 'Croatia',
 'Thailand',
 'Denmark',
 'Mexico',
 'Greece',
 'Moldova',
 'Colombia',
 'Georgia',
 'Czech Republic',
 'Philippines']

In [13]:
# Grouped bar chart of the gender-split aggregation: one bar per gender inside
# each category. The title follows the wording used by the other figures.
fig = px.bar(
    df_agg,
    x=column,
    y="count_values",
    color="Gender",
    barmode="group",
    title=f"Frecuencia por {column} y género (País: {country})",
)
fig.show()

In [None]:
# Wide-form bar chart example on Plotly's bundled medals sample dataset.
# `wide_df` is defined here so the cell runs on a fresh kernel, and
# barmode="group" is passed to px.bar (as a bare statement it had no effect
# on the figure) so the medal columns are drawn side by side.
wide_df = px.data.medals_wide()

fig = px.bar(
    wide_df,
    x="nation",
    y=["gold", "silver", "bronze"],
    title="Wide-Form Input",
    barmode="group",
)
fig.show()